In [17]:
# Notebook for exploring CommonVoice dataset

import requests
import tarfile
from tqdm import tqdm
import torchaudio
import soundfile as sf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from IPython.display import Audio, display
from torchaudio.transforms import MelSpectrogram

#from zoraspeech.utils import visualization

In [18]:
# set up autoreload

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
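# location of the CommonVoice archive and the local directory it is stored in
# NOTE: this is a signed, time-limited URL (X-Goog-Expires=43200 s, i.e. 12 h after
# X-Goog-Date=20241126T182209Z); generate a fresh one before re-running the download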
COMMON_VOICE_URL = "https://storage.googleapis.com/common-voice-prod-prod-datasets/cv-corpus-19.0-2024-09-13/cv-corpus-19.0-2024-09-13-en.tar.gz?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gke-prod%40moz-fx-common-voice-prod.iam.gserviceaccount.com%2F20241126%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20241126T182209Z&X-Goog-Expires=43200&X-Goog-SignedHeaders=host&X-Goog-Signature=389bbafbdb2025b4cf737570dfa19792367e1e2438008f425e6581c02c74b499d52e7bab4d547a0e4b4d4c7054c6ac80dadabcda19bceb24828d9efe56ca8a192bc0ac791230c104e0e3ea983e07f0d49aad3f0364ac52a6be406e0cb2391718f9fab85615747336ebbaa2f670a80afdeb402b8ae55b5c1518da771d69c43ec3c9a01966d28474d82c8546010e624131411b4652bfa12b7e7958d0cc9ea8d3886116b16fcfd44e83ee22688e13bb74e78e4cee6d5554772ca4871916188a3902010f7aed7e4919e1d7cb32e08ddb5a7dad558fc19178d14886935d9428338798e3f52c0f1d6d6ea4114f2b1f8f7033b608ddeac701554acf9e484e287df9d172"

COMMON_VOICE_DIRECTORY = "/data/jo/commonvoice"

In [None]:
# create the directory if it doesn't exist
os.makedirs(COMMON_VOICE_DIRECTORY, exist_ok=True)

def download_file(url, destination, timeout=(10, 60)):
    """Stream ``url`` to ``destination`` on disk while showing a progress bar.

    The body is written to ``<destination>.part`` and only moved onto
    ``destination`` once the transfer has finished, so an interrupted or
    failed download never leaves a truncated file under the final name.

    Args:
        url: HTTP(S) URL to fetch.
        destination: Path of the file to create (str or os.PathLike).
        timeout: ``(connect, read)`` timeout in seconds passed to
            ``requests.get``; without one a stalled connection blocks forever.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeouts.
        OSError: if the file cannot be written or moved into place.
    """
    block_size = 8192
    partial_path = os.fspath(destination) + ".part"

    try:
        # the context manager releases the connection even if writing fails
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            total_size = int(response.headers.get('content-length', 0))

            # total=None gives an open-ended counter when the size is unknown;
            # unit_divisor=1024 makes the binary 'iB' unit label accurate
            with open(partial_path, 'wb') as f, tqdm(
                total=total_size or None,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as progress_bar:
                for data in response.iter_content(block_size):
                    f.write(data)
                    progress_bar.update(len(data))

        # publish the file under its final name only after a complete transfer
        os.replace(partial_path, destination)
    finally:
        # still present only when the download did not complete
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Download the file (skipped when the archive is already on disk, so that
# re-running the notebook does not fetch the whole corpus again; delete the
# file to force a fresh download)
output_file = os.path.join(COMMON_VOICE_DIRECTORY, "commonvoice.tar.gz")
if os.path.exists(output_file):
    print(f"{output_file} already exists, skipping download")
else:
    download_file(COMMON_VOICE_URL, output_file)


In [None]:
output_file

In [None]:
COMMON_VOICE_DIRECTORY

In [None]:
# extract the dataset using a python library, making sure temporary files are uncompressed only in the commonvoice directory

# this takes a long time over nfs! try doing this locally instead

#with tarfile.open(output_file, "r:gz") as tar:
#    tar.extractall(path=COMMON_VOICE_DIRECTORY)

In [20]:
COMMON_VOICE_PATH = os.path.join(COMMON_VOICE_DIRECTORY, "cv-corpus-19.0-2024-09-13/en/")

print(COMMON_VOICE_PATH)

/data/jo/commonvoice/cv-corpus-19.0-2024-09-13/en/


In [21]:
# read the tab-separated metadata of the training split into a DataFrame
train_tsv_path = os.path.join(COMMON_VOICE_PATH, "train.tsv")
df = pd.read_csv(train_tsv_path, sep="\t")

# show the first 5 rows as the cell output
df.head()

  df = pd.read_csv(COMMON_VOICE_PATH + "train.tsv", sep="\t")


Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,e6e86bf2343dfd28fc27eb798c1f12d42a909c2a9eb931...,common_voice_en_35390714.mp3,c6fa7671e6084d554916f618a5c99be045d027b544ff29...,Some differences exist between eukaryotes and ...,,2,0,,,,,en,
1,e6e86bf2343dfd28fc27eb798c1f12d42a909c2a9eb931...,common_voice_en_35390716.mp3,c6d44521804dd6d7861f1ef62724564ab2d412b2a16899...,"‘You don’t seem to mind it,’ observed Bob.",,2,0,,,,,en,
2,e6e86bf2343dfd28fc27eb798c1f12d42a909c2a9eb931...,common_voice_en_35390717.mp3,c6f474c62c583b90e66f8d11eda514b067683ce7d6b676...,"But this time, it's different.",,2,0,,,,,en,
3,e6ee693358a24450e38185ea324c48a9626b2e7515d4ea...,common_voice_en_17280426.mp3,f2daecd0ccb0e31dda418615f9b72a5a,The convenience store manager was mad at the u...,,2,0,,,,,en,
4,e6ef88404179e958b61f65ffdac36c2a357306a53765d7...,common_voice_en_23361615.mp3,6a292889c808671942445e7e4e7557dd2cff5b24dd47f7...,Elevators are characteristic to skyscrapers.,,2,1,,,,,en,


In [22]:
# get the number of rows in the dataframe
len(df)

1116234

In [23]:
# preview the first 5 rows: transcript, clip path and an inline audio player
# (waveform / specgram plots are disabled until zoraspeech.utils.visualization is importable)

clips_root = COMMON_VOICE_PATH + 'clips/'
# alternative source: COMMON_VOICE_PATH + 'clips_wav/' with "mp3" replaced by "wav" in the file name

for i in range(5):
    row = df.iloc[i]
    sentence = row['sentence']
    path = clips_root + row['path']

    # transcript on one line, clip location on the next
    print(sentence, path, sep="\n")
    display(Audio(path))

    # decode the clip so the waveform and sample rate are available for plotting
    audio, sr = torchaudio.load(path)

    #visualization.plot_waveform(audio, sr)
    #visualization.plot_specgram(audio, sr)

Some differences exist between eukaryotes and prokaryotes.
/data/jo/commonvoice/cv-corpus-19.0-2024-09-13/en/clips/common_voice_en_35390714.mp3


‘You don’t seem to mind it,’ observed Bob.
/data/jo/commonvoice/cv-corpus-19.0-2024-09-13/en/clips/common_voice_en_35390716.mp3


But this time, it's different.
/data/jo/commonvoice/cv-corpus-19.0-2024-09-13/en/clips/common_voice_en_35390717.mp3


The convenience store manager was mad at the unfriendly cashier.
/data/jo/commonvoice/cv-corpus-19.0-2024-09-13/en/clips/common_voice_en_17280426.mp3


Elevators are characteristic to skyscrapers.
/data/jo/commonvoice/cv-corpus-19.0-2024-09-13/en/clips/common_voice_en_23361615.mp3


In [25]:
clips = os.listdir(COMMON_VOICE_PATH + 'clips')

len(clips)

2459129

In [27]:
# convert all the mp3 clips to wav files resampled to 16 kHz
#
# NOTE: wav files written by an earlier version of this cell were saved with a
# 16 kHz header but WITHOUT resampling (wrong speed/pitch); delete them before
# re-running, because files that already exist are skipped.

sample_rate = 16000

# create clips_wav directory if it doesn't exist
wav_dir = os.path.join(COMMON_VOICE_PATH, 'clips_wav')
os.makedirs(wav_dir, exist_ok=True)

clips_dir = os.path.join(COMMON_VOICE_PATH, 'clips')

# list the finished conversions once instead of one os.path.exists() call per
# clip -- per-file stat calls are slow on network storage
existing_wavs = set(os.listdir(wav_dir))

# iterate through all mp3 files in clips directory
for mp3_file in tqdm(os.listdir(clips_dir)):
    if not mp3_file.endswith('.mp3'):
        continue

    # swap only the extension (str.replace would rewrite every '.mp3' in the name)
    wav_name = os.path.splitext(mp3_file)[0] + '.wav'

    # skip if wav file already exists
    if wav_name in existing_wavs:
        continue

    mp3_path = os.path.join(clips_dir, mp3_file)
    wav_path = os.path.join(wav_dir, wav_name)

    # best-effort conversion: report the failing clip and carry on with the rest
    try:
        audio, sr = torchaudio.load(mp3_path)

        # the samples are at the clip's native rate `sr`; resample them so the
        # data actually matches the sample rate written to the wav header
        if sr != sample_rate:
            audio = torchaudio.functional.resample(audio, sr, sample_rate)

        torchaudio.save(wav_path, audio, sample_rate)
    except Exception as e:
        print(f"Error converting {mp3_file}: {str(e)}")


  0%|          | 1132/2459129 [02:10<78:43:39,  8.67it/s] 


KeyboardInterrupt: 

In [None]:
# print the sample rate reported for the first 10 files in the wav directory
first_wav_files = os.listdir(wav_dir)[:10]

for wav_file in tqdm(first_wav_files):
    sample_path = os.path.join(wav_dir, wav_file)
    audio, sr = torchaudio.load(sample_path)
    print(sr)