In [1]:
import os, sys
import json
from unidecode import unidecode
import pandas as pd

In [2]:
# Input locations of the Common Voice dataset.
# NOTE(review): file_path has no './data/' prefix while clips_path does --
# confirm both point at the same dataset checkout.
file_path = './CommonVoice2_dataset/validated.tsv'
clips_path = './data/CommonVoice2_dataset/clips'

# Output directories for the per-accent manifests and the label map.
manifest_path, labels_path = './manifests', './labels'

# Clip file names that are excluded from every manifest.
invalid_clips = [
    'common_voice_en_18406522.mp3',
    'common_voice_en_18406523.mp3',
    'common_voice_en_18406525.mp3',
]

In [3]:
# Read data table
df = pd.read_table(file_path)
print(df.shape)

# Filter NA (a row is dropped when ANY of its columns is missing)
df = df.dropna()
print(df.shape)

# Filter out invalid clips
df = df[~df['path'].isin(invalid_clips)]
print(df.shape)

# Filter unnecessary accent.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the filtered frame (SettingWithCopyWarning).
df = df[df['accent'] != 'other'].copy()
print(df.shape)

# Add prefix path to clips path
df['path'] = df['path'].apply(lambda path: '{}/{}'.format(clips_path, path))

# Clean up sentence text.
# Transliterate first and lowercase afterwards: unidecode can emit uppercase
# ASCII for some inputs, so lowercasing before the call would leave capitals
# in the "cleaned" text.
df['sentence'] = df['sentence'].apply(lambda sentence: unidecode(sentence).lower())

# Save manifest per accent: one headerless CSV of (path, sentence) rows each.
# Create the output directory first so a fresh checkout does not fail on write.
os.makedirs(manifest_path, exist_ok=True)
# A single groupby pass replaces one full boolean-mask scan per accent;
# row order inside each accent is preserved.
for accent, accent_rows in df.groupby('accent'):
    accent_rows[['path', 'sentence']].to_csv(
        '{}/cv_20190612_{}.csv'.format(manifest_path, accent),
        index=False,
        header=False,
    )
    
# Prepare label list
# Collect the characters exactly as they appear in the sentences stored in df
# (the same text that is written to the manifests), so the label set cannot
# drift from the manifest content through an extra normalisation step here.
char_set = set()
for sentence in df['sentence']:
    char_set.update(sentence)

# '_' always occupies index 0. discard() tolerates its absence from the data,
# where list.remove() would raise ValueError.
char_set.discard('_')

# Sort the remaining characters: iteration order of a set of strings can differ
# between interpreter runs (hash randomisation), which would silently change
# the index assigned to each character every time the notebook is re-run.
char_list = ['_'] + sorted(char_set)

char_dict = dict(enumerate(char_list))

# Save label list (JSON object keys are written as strings: "0", "1", ...)
os.makedirs(labels_path, exist_ok=True)
with open('{}/cv_labels.json'.format(labels_path), 'w') as f:
    json.dump(char_dict, f)

(644119, 8)
(298721, 8)
(298718, 8)
(288560, 8)


In [5]:
# Test: for every accent, print the characters used in its sentences that are
# missing from the label list (an empty set means full coverage).
# The sentences are compared as stored -- lowercasing them here would hide any
# uppercase character that is in the manifest but not in the labels.
label_set = set(char_list)
for accent, accent_sentences in df.groupby('accent', sort=False)['sentence']:
    acc_char_list = set()
    for sentence in accent_sentences:
        acc_char_list.update(sentence)
    print(acc_char_list - label_set)

set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
