In [1]:
import numpy as np
import pandas as pd
import os
from IPython import display
import datasets
from datasets.info import DatasetInfosDict
from datasets import load_dataset,concatenate_datasets, DatasetDict, Dataset, Audio

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root directory that holds every raw corpus used in this notebook.
# Set the TELUGU_ASR_DATA_DIR environment variable to run the notebook on
# another machine; when it is unset the original location is used.
BASE_PATH = os.environ.get("TELUGU_ASR_DATA_DIR", "/raid/cs20mds14030/telugu_asr/data")
BASE_PATH

'/raid/cs20mds14030/telugu_asr/data'

In [3]:
# Dataset directories, all relative to BASE_PATH, grouped by source corpus.
# NOTE: the name is spelled "SUPREB"; it is kept as-is because other cells
# reference it under this exact name.
INDIC_SUPREB_DATASETS = [
    f"indic_superb/{split}"
    for split in ("clean_train", "clean_valid", "clean_test_known", "clean_test_unknown")
]

OPENSLR_DATASETS = [f"open_slr/{speaker_set}" for speaker_set in ("te_in_female", "te_in_male")]

ULCA_DATASETS = [
    "ulca/BBC_News_Telugu_17-08-2021_00-57",
    "ulca/Chai_Bisket_Stories_16-08-2021_14-17",
    "ulca/Telangana_Sahitya_Akademi_16-08-2021_14-40",
]

MUCS_DATASETS = [
    "mucs/te-in-Test/Audios",
    "mucs/te-in-Train/Audios",
]

In [4]:
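# Preprocessing steps required before the datasets can be combined:
#   1. convert the .m4a audio files to .wav
#   2. update the file extensions in each metadata file accordingly
#   3. add a duration column to the metadata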

In [5]:
# Dataset directories (relative to BASE_PATH):

# indic_superb/clean_train/
# indic_superb/clean_valid/
# indic_superb/clean_test_known/
# indic_superb/clean_test_unknown/

# open_slr/te_in_female/
# open_slr/te_in_male/

# ulca/BBC_News_Telugu_17-08-2021_00-57/
# ulca/Chai_Bisket_Stories_16-08-2021_14-17/
# ulca/Telangana_Sahitya_Akademi_16-08-2021_14-40/

# mucs/te-in-Test/Audios/
# mucs/te-in-Train/Audios/

In [6]:
from pydub import AudioSegment
def get_audio_len(file_name):
    """Return the length of an audio file in seconds, rounded to 3 decimals.

    Args:
        file_name: Path of the audio file; it is handed to
            ``AudioSegment.from_file``.

    Returns:
        float: ``duration_seconds`` of the loaded segment, rounded to
        millisecond precision.
    """
    sound = AudioSegment.from_file(file_name)
    # Report the full clip length. The previous ``% 60`` wrapped every clip of
    # one minute or longer (e.g. 75 s became 15 s), which corrupted both the
    # per-file duration column and the summed totals.
    return round(sound.duration_seconds, 3)

In [7]:
def create_metadata_for_indicsuperb_datasets():
    """Write a ``metadata.csv`` for every entry in ``INDIC_SUPREB_DATASETS``.

    For each dataset directory the tab-separated ``transcription_n2w.txt`` is
    read as (file_name, transcription) pairs, ``.m4a`` in the file names is
    replaced by ``.wav``, a ``duration`` column is computed with
    ``get_audio_len`` and the result is saved next to the audio files.
    A sample row and a size/duration summary are printed per dataset.
    """
    for dataset in INDIC_SUPREB_DATASETS:
        metadata = pd.read_csv(
            f'{BASE_PATH}/{dataset}/transcription_n2w.txt',
            header=None,
            sep="\t",
            names=['file_name', 'transcription'],
        )

        # The metadata must reference the converted .wav files.
        metadata["file_name"] = metadata["file_name"].apply(
            lambda name: name.replace(".m4a", ".wav")
        )

        # One audio decode per row to measure its length.
        metadata["duration"] = metadata["file_name"].apply(
            lambda file_name: get_audio_len(f"{BASE_PATH}/{dataset}/{file_name}")
        )

        metadata.to_csv(f'{BASE_PATH}/{dataset}/metadata.csv', index=False)

        # Report the first row as a quick sanity check.
        sample_audio_path = f"{BASE_PATH}/{dataset}/{metadata.iloc[0, 0]}"
        sample_audio_transcript = metadata.iloc[0, 1]
        print(f"sample_audio_path --> {sample_audio_path}")
        print(f"sample_audio_transcript --> {sample_audio_transcript}")
        print(f'created metadata.csv for {dataset}; contains --> {metadata.shape[0]}, {round(metadata.duration.sum(),2)} sec')
        print("\n")

In [8]:
# Generate metadata.csv for every IndicSUPERB split; a sample row and a summary are printed per split.
create_metadata_for_indicsuperb_datasets()

sample_audio_path --> /raid/cs20mds14030/telugu_asr/data/indic_superb/clean_train/844424932551735-484-f.wav
sample_audio_transcript --> గతంలో స్థానిక ప్రజా ప్రతినిధులు మెట్రో రైలును పొడిగించేందుకు హామీలు ఇవ్వడంతో కోటి ఆశలు పెంచుకున్నారు
created metadata.csv for indic_superb/clean_train; contains --> 70692, 510370.05 sec


sample_audio_path --> /raid/cs20mds14030/telugu_asr/data/indic_superb/clean_valid/844424931162336-381-f.wav
sample_audio_transcript --> మలయాళ వర్షన్ ని ప్రముఖ గాయకుడు యేసుదాస్ కొడుకు విజయ్ యేసుదాస్ చే పాడించారు
created metadata.csv for indic_superb/clean_valid; contains --> 2379, 18028.44 sec


sample_audio_path --> /raid/cs20mds14030/telugu_asr/data/indic_superb/clean_test_known/844424932576755-134-f.wav
sample_audio_transcript --> ఇదిలా ఉంటే ఆస్తుల నమోదుపై అనుమానాల్ని నివృత్తి చేసే వారు లేక ప్రజలు సైతం అయోమయంలో పడుతున్నారు
created metadata.csv for indic_superb/clean_test_known; contains --> 2410, 18025.85 sec


sample_audio_path --> /raid/cs20mds14030/telugu_asr/dat

In [9]:
def create_metadata_for_openslr_datasets():
    """Write a ``metadata.csv`` for every entry in ``OPENSLR_DATASETS``.

    For each dataset directory the tab-separated ``line_index.tsv`` is read as
    (file_name, transcription) pairs, ``.wav`` is appended to every file name,
    a ``duration`` column is computed with ``get_audio_len`` and the result is
    saved next to the audio files. A sample row and a size/duration summary
    are printed per dataset.
    """
    for dataset in OPENSLR_DATASETS:
        metadata = pd.read_csv(
            f'{BASE_PATH}/{dataset}/line_index.tsv',
            header=None,
            sep="\t",
            names=['file_name', 'transcription'],
        )

        # The index lists bare ids; the audio files carry a .wav suffix.
        metadata["file_name"] = metadata["file_name"] + ".wav"

        # One audio decode per row to measure its length.
        metadata["duration"] = metadata["file_name"].apply(
            lambda file_name: get_audio_len(f"{BASE_PATH}/{dataset}/{file_name}")
        )

        metadata.to_csv(f'{BASE_PATH}/{dataset}/metadata.csv', index=False)

        # Report the first row as a quick sanity check.
        sample_audio_path = f"{BASE_PATH}/{dataset}/{metadata.iloc[0, 0]}"
        sample_audio_transcript = metadata.iloc[0, 1]
        print(f"sample_audio_path --> {sample_audio_path}")
        print(f"sample_audio_transcript --> {sample_audio_transcript}")
        print(f'created metadata.csv for {dataset}; contains --> {metadata.shape[0]}, {round(metadata.duration.sum(),2)} sec')
        print("\n")

In [10]:
# Generate metadata.csv for both OpenSLR speaker sets; a sample row and a summary are printed per set.
create_metadata_for_openslr_datasets()

sample_audio_path --> /raid/cs20mds14030/telugu_asr/data/open_slr/te_in_female/tef_01033_00351357063.wav
sample_audio_transcript --> ఈ వివాదం సెప్టెంబర్ రెండు వేల తొమ్మిదిన పరిష్కారమైనది
created metadata.csv for open_slr/te_in_female; contains --> 2294, 9819.64 sec


sample_audio_path --> /raid/cs20mds14030/telugu_asr/data/open_slr/te_in_male/tem_07220_01981175708.wav
sample_audio_transcript --> దీనిని తటస్థీకరణము అందురు
created metadata.csv for open_slr/te_in_male; contains --> 2154, 10732.32 sec




In [11]:
# Play back the sample clip reported for open_slr/te_in_female.
display.Audio("/raid/cs20mds14030/telugu_asr/data/open_slr/te_in_female/tef_01033_00351357063.wav",rate=16000)

In [12]:
# Play back the sample clip reported for open_slr/te_in_male.
display.Audio("/raid/cs20mds14030/telugu_asr/data/open_slr/te_in_male/tem_07220_01981175708.wav",rate=16000)

In [13]:
def create_metadata_for_ulca_datasets():
    """Write a ``metadata.csv`` for every entry in ``ULCA_DATASETS``.

    Each dataset directory contains a ``data.json``; its ``text`` and
    ``audioFilename`` fields are renamed to ``transcription`` and
    ``file_name``, and only file_name, duration and transcription are kept.
    The duration is taken from the JSON as-is (no audio is decoded here).
    A sample row and a size/duration summary are printed per dataset.
    """
    column_names = {'text': 'transcription', 'audioFilename': 'file_name'}
    kept_columns = ['file_name', 'duration', 'transcription']

    for dataset in ULCA_DATASETS:
        metadata = (
            pd.read_json(f'{BASE_PATH}/{dataset}/data.json')
            .rename(columns=column_names)[kept_columns]
        )
        metadata.to_csv(f'{BASE_PATH}/{dataset}/metadata.csv', index=False)

        # Report the first row; the transcription is the third column here.
        sample_audio_path = f"{BASE_PATH}/{dataset}/{metadata.iloc[0, 0]}"
        sample_audio_transcript = metadata.iloc[0, 2]
        print(f"sample_audio_path --> {sample_audio_path}")
        print(f"sample_audio_transcript --> {sample_audio_transcript}")
        print(f'created metadata.csv for {dataset}; contains --> {metadata.shape[0]}, {round(metadata.duration.sum(),2)} sec')
        print("\n")

In [14]:
# Generate metadata.csv for every ULCA collection; a sample row and a summary are printed per collection.
create_metadata_for_ulca_datasets()

sample_audio_path --> /raid/cs20mds14030/telugu_asr/data/ulca/BBC_News_Telugu_17-08-2021_00-57/16_479file-id9RKLtQ7GDF0.wav
sample_audio_transcript --> క్రౌన్ తీసుకున్నారు  అది అంతా ఎలా వచ్చింది
created metadata.csv for ulca/BBC_News_Telugu_17-08-2021_00-57; contains --> 15000, 76220.55 sec


sample_audio_path --> /raid/cs20mds14030/telugu_asr/data/ulca/Chai_Bisket_Stories_16-08-2021_14-17/63_639file-idbU4bWLbaKYA.wav
sample_audio_transcript --> స్పీకింగ్
created metadata.csv for ulca/Chai_Bisket_Stories_16-08-2021_14-17; contains --> 585, 3216.78 sec


sample_audio_path --> /raid/cs20mds14030/telugu_asr/data/ulca/Telangana_Sahitya_Akademi_16-08-2021_14-40/227_2886file-id3QmBhOso7Fs.wav
sample_audio_transcript --> చెమట శక్తిని మోసిన  గది ప్రస్తుతం కంపు వాసన వస్తుంది
created metadata.csv for ulca/Telangana_Sahitya_Akademi_16-08-2021_14-40; contains --> 176, 685.92 sec




In [15]:
# Play back the sample clip reported for the ULCA BBC News Telugu collection.
display.Audio("/raid/cs20mds14030/telugu_asr/data/ulca/BBC_News_Telugu_17-08-2021_00-57/16_479file-id9RKLtQ7GDF0.wav")

In [16]:
# Play back the sample clip reported for the ULCA Chai Bisket Stories collection.
display.Audio("/raid/cs20mds14030/telugu_asr/data/ulca/Chai_Bisket_Stories_16-08-2021_14-17/63_639file-idbU4bWLbaKYA.wav")

In [17]:
# Play back the sample clip reported for the ULCA Telangana Sahitya Akademi collection.
display.Audio("/raid/cs20mds14030/telugu_asr/data/ulca/Telangana_Sahitya_Akademi_16-08-2021_14-40/227_2886file-id3QmBhOso7Fs.wav")

In [18]:
def create_metadata_for_mucs_datasets():
    """Write a ``metadata.csv`` for every entry in ``MUCS_DATASETS``.

    For each dataset directory the tab-separated ``transcription.txt`` is read
    as (file_name, transcription) pairs with every value kept as a string,
    ``.wav`` is appended to every file name, a ``duration`` column is computed
    with ``get_audio_len`` and the result is saved next to the audio files.
    A sample row and a size/duration summary are printed per dataset.
    """
    for dataset in MUCS_DATASETS:
        # dtype=str keeps the ids exactly as written in the file.
        metadata = pd.read_csv(
            f'{BASE_PATH}/{dataset}/transcription.txt',
            dtype=str,
            header=None,
            sep="\t",
            names=['file_name', 'transcription'],
        )
        metadata["file_name"] = metadata["file_name"].apply(lambda audio_id: f"{audio_id}.wav")

        # One audio decode per row to measure its length.
        metadata["duration"] = metadata["file_name"].apply(
            lambda file_name: get_audio_len(f"{BASE_PATH}/{dataset}/{file_name}")
        )

        metadata.to_csv(f'{BASE_PATH}/{dataset}/metadata.csv', index=False)

        # Report the first row as a quick sanity check.
        sample_audio_path = f"{BASE_PATH}/{dataset}/{metadata.iloc[0, 0]}"
        sample_audio_transcript = metadata.iloc[0, 1]
        print(f"sample_audio_path --> {sample_audio_path}")
        print(f"sample_audio_transcript --> {sample_audio_transcript}")
        print(f'created metadata.csv for {dataset}; contains --> {metadata.shape[0]}, {round(metadata.duration.sum(),2)} sec')
        print("\n")

In [19]:
# Generate metadata.csv for the MUCS test and train sets; a sample row and a summary are printed per set.
create_metadata_for_mucs_datasets()

sample_audio_path --> /raid/cs20mds14030/telugu_asr/data/mucs/te-in-Test/Audios/001050238.wav
sample_audio_transcript --> ఈ సందర్భంగా చంద్రబాబు అమరావతి నగర అభివృద్ధిపై తన ఆలోచనలను వారికి వివరించారు
created metadata.csv for mucs/te-in-Test/Audios; contains --> 3040, 18000.07 sec


sample_audio_path --> /raid/cs20mds14030/telugu_asr/data/mucs/te-in-Train/Audios/TE2406-TE2408_1-A.089.wav
sample_audio_transcript --> కచ్చితంగా చూపిస్తుంది కదా మరి
created metadata.csv for mucs/te-in-Train/Audios; contains --> 44882, 144000.29 sec




In [20]:
# Play back the sample clip reported for mucs/te-in-Test.
display.Audio('/raid/cs20mds14030/telugu_asr/data/mucs/te-in-Test/Audios/001050238.wav',rate=16000)

In [21]:
# Play back the sample clip reported for mucs/te-in-Train.
display.Audio('/raid/cs20mds14030/telugu_asr/data/mucs/te-in-Train/Audios/TE2406-TE2408_1-A.089.wav',rate=16000)

In [29]:
def create_dataset_from_metadata(dataset_name):
    """Load one dataset's ``metadata.csv`` and make its file names absolute.

    Args:
        dataset_name: Dataset directory relative to ``BASE_PATH``.

    Returns:
        The ``train`` split produced by the CSV loader, with each
        ``file_name`` prefixed by ``BASE_PATH`` and ``dataset_name``.
    """
    def _with_full_path(example):
        # Prefix the per-dataset relative name with its directory.
        return {'file_name': "/".join((BASE_PATH, dataset_name, example["file_name"]))}

    loaded = load_dataset('csv', data_files=f"{BASE_PATH}/{dataset_name}/metadata.csv")
    return loaded.map(_with_full_path)['train']

In [25]:
# Every dataset directory, in the order in which the corpora are combined.
DATASETS = [
    *INDIC_SUPREB_DATASETS,
    *OPENSLR_DATASETS,
    *ULCA_DATASETS,
    *MUCS_DATASETS,
]
DATASETS

['indic_superb/clean_train',
 'indic_superb/clean_valid',
 'indic_superb/clean_test_known',
 'indic_superb/clean_test_unknown',
 'open_slr/te_in_female',
 'open_slr/te_in_male',
 'ulca/BBC_News_Telugu_17-08-2021_00-57',
 'ulca/Chai_Bisket_Stories_16-08-2021_14-17',
 'ulca/Telangana_Sahitya_Akademi_16-08-2021_14-40',
 'mucs/te-in-Test/Audios',
 'mucs/te-in-Train/Audios']

In [30]:
# Load the first dataset on its own; it seeds `ds`, which later cells extend.
print(DATASETS[0])
ds = create_dataset_from_metadata(DATASETS[0])

indic_superb/clean_train


Using custom data configuration default-532e0d89d5b5caac


Downloading and preparing dataset csv/default to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-532e0d89d5b5caac/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2041.02it/s]
Extracting data files: 100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 671.52it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                                    

Dataset csv downloaded and prepared to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-532e0d89d5b5caac/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 368.57it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 70692/70692 [00:03<00:00, 17746.03ex/s]


In [36]:
# Total duration of `ds` as currently loaded, converted from seconds to hours.
(np.sum(ds['duration'])/60)/60

141.76945694444444

In [38]:
# Build the combined dataset from scratch so this cell is idempotent.
# It used to extend whatever `ds` already held, so running it a second time
# appended every dataset again and silently duplicated rows.
ds = create_dataset_from_metadata(DATASETS[0])
print(ds)
for dataset_name in DATASETS[1:]:
    temp_ds = create_dataset_from_metadata(dataset_name)
    ds = concatenate_datasets([ds, temp_ds])
    # Show the running size after each dataset is appended.
    print(ds)

Dataset({
    features: ['file_name', 'transcription', 'duration'],
    num_rows: 70692
})


Using custom data configuration default-468b4ecfefbfd25e


Downloading and preparing dataset csv/default to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-468b4ecfefbfd25e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3297.41it/s]
Extracting data files: 100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 689.51it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                        

Dataset csv downloaded and prepared to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-468b4ecfefbfd25e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 449.74it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 2379/2379 [00:00<00:00, 17460.24ex/s]


Dataset({
    features: ['file_name', 'transcription', 'duration'],
    num_rows: 73071
})


Using custom data configuration default-b0d552ea2e7939b2


Downloading and preparing dataset csv/default to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-b0d552ea2e7939b2/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 4578.93it/s]
Extracting data files: 100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 897.56it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                        

Dataset csv downloaded and prepared to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-b0d552ea2e7939b2/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 642.21it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 2410/2410 [00:00<00:00, 17588.32ex/s]


Dataset({
    features: ['file_name', 'transcription', 'duration'],
    num_rows: 75481
})


Using custom data configuration default-c4b5f2d9278ebf50


Downloading and preparing dataset csv/default to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-c4b5f2d9278ebf50/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 4744.69it/s]
Extracting data files: 100%|█████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1173.89it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                        

Dataset csv downloaded and prepared to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-c4b5f2d9278ebf50/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 498.85it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1492/1492 [00:00<00:00, 18036.01ex/s]


Dataset({
    features: ['file_name', 'transcription', 'duration'],
    num_rows: 76973
})


Using custom data configuration default-d68039ae23c6209e


Downloading and preparing dataset csv/default to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-d68039ae23c6209e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1358.70it/s]
Extracting data files: 100%|█████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1193.26it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                        

Dataset csv downloaded and prepared to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-d68039ae23c6209e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 495.60it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 2294/2294 [00:00<00:00, 17064.47ex/s]


Dataset({
    features: ['file_name', 'transcription', 'duration'],
    num_rows: 79267
})


Using custom data configuration default-8e50d83afce11997


Downloading and preparing dataset csv/default to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-8e50d83afce11997/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3905.31it/s]
Extracting data files: 100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 875.09it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                        

Dataset csv downloaded and prepared to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-8e50d83afce11997/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 489.93it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 2154/2154 [00:00<00:00, 23868.61ex/s]


Dataset({
    features: ['file_name', 'transcription', 'duration'],
    num_rows: 81421
})


Using custom data configuration default-5b87688d0d08ad08


Downloading and preparing dataset csv/default to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-5b87688d0d08ad08/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 4262.50it/s]
Extracting data files: 100%|█████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1047.27it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                                    

Dataset csv downloaded and prepared to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-5b87688d0d08ad08/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 404.47it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 15000/15000 [00:00<00:00, 21432.12ex/s]


Dataset({
    features: ['file_name', 'transcription', 'duration'],
    num_rows: 96421
})


Using custom data configuration default-7956806fe110f261


Downloading and preparing dataset csv/default to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-7956806fe110f261/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 4443.12it/s]
Extracting data files: 100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 865.16it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                        

Dataset csv downloaded and prepared to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-7956806fe110f261/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 568.49it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 585/585 [00:00<00:00, 30388.99ex/s]


Dataset({
    features: ['file_name', 'transcription', 'duration'],
    num_rows: 97006
})


Using custom data configuration default-5ea047a5e02a4379


Downloading and preparing dataset csv/default to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-5ea047a5e02a4379/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3953.16it/s]
Extracting data files: 100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 852.33it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                        

Dataset csv downloaded and prepared to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-5ea047a5e02a4379/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 794.23it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 176/176 [00:00<00:00, 27119.67ex/s]


Dataset({
    features: ['file_name', 'transcription', 'duration'],
    num_rows: 97182
})


Using custom data configuration default-6f72d14d2ba0e3c7


Downloading and preparing dataset csv/default to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-6f72d14d2ba0e3c7/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3765.08it/s]
Extracting data files: 100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 923.25it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                        

Dataset csv downloaded and prepared to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-6f72d14d2ba0e3c7/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 509.45it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 3040/3040 [00:00<00:00, 16946.05ex/s]


Dataset({
    features: ['file_name', 'transcription', 'duration'],
    num_rows: 100222
})


Using custom data configuration default-6c088cff11fd8dff


Downloading and preparing dataset csv/default to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-6c088cff11fd8dff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3437.95it/s]
Extracting data files: 100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 730.59it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                                    

Dataset csv downloaded and prepared to /raid/cs20mds14030/.cache/huggingface/datasets/csv/default-6c088cff11fd8dff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 486.07it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 44882/44882 [00:02<00:00, 21382.16ex/s]


Dataset({
    features: ['file_name', 'transcription', 'duration'],
    num_rows: 145104
})


In [39]:
# Show the combined dataset (features and row count).
ds

Dataset({
    features: ['file_name', 'transcription', 'duration'],
    num_rows: 145104
})

In [40]:
# Total duration of the combined dataset in seconds (divide by 3600 for hours).
# The previous expression had an unbalanced ")" and raised a SyntaxError; the
# recorded output corresponds to the plain sum.
np.sum(ds['duration'])

820261.137