# Insight
- There are 780 total validated hours based on Mozilla (https://voice.mozilla.org/en/datasets)
- Only 46.3% of the validated data contains demographic information, which is around 340 hours
- After dropping the rows with missing demographic information, there are fewer than 2 hours of test data, about 5 hours of dev data, and 62 hours of train data left
| dataset | # data | duration (hr) | # w/ demo | duration w/ demo (hr) |
|------|------|------|------|------|
| train | 63330 | - | 43922 | 62 |
| dev | 13178 | - | 3619 | 5.24 |
| test | 13178 | - | 1277 | 1.85 |
| other | 173800 | ~210 | - | - |
| invalidated | 78904 | ~90 | - | - |
| validated | 644119 | 780 | 298721 | ~340 |

In [1]:
import os, sys
import librosa
import pandas as pd

In [2]:
# Root folder of the extracted Common Voice archive; the metadata tables and
# the clips/ folder are addressed relative to it.
base_path = './CommonVoice2_dataset'

# One tab-separated metadata table per split, named '<split>.tsv'.
file_paths = [
    '{}.tsv'.format(split_name)
    for split_name in ('train', 'dev', 'test', 'other', 'invalidated', 'validated')
]

In [3]:
# Count the entries in the clips folder (`ls` prints one entry per line when piped).
# NOTE(review): the folder is spelled out here instead of being derived from base_path.
!ls ./CommonVoice2_dataset/clips | wc -l

896823


In [4]:
# Load every metadata table, tag each row with the file it came from, and print
# a short profile per table: shape, missing values per column, and the distinct
# values of the vote and demographic columns.
dfs = []
for file_path in file_paths:
    df = pd.read_table('{}/{}'.format(base_path, file_path))
    df['file'] = file_path  # provenance column that the later cells filter on
    dfs.append(df)

    print('=={}=='.format(file_path))
    print('df.shape', df.shape)
    print()
    print('MISSING')
    print(df.isna().sum())
    print()
    for column in ('up_votes', 'down_votes', 'age', 'gender', 'accent'):
        distinct_values = df[column].unique().tolist()
        print(column, df[column].nunique(), distinct_values)
    print()

# Stack all tables into one frame. The "all" count is taken over
# other/invalidated/validated only; the overlap checks in the following cells
# show that every train/dev/test path also appears in validated.tsv.
df = pd.concat(dfs)
counted_files = ['other.tsv', 'invalidated.tsv', 'validated.tsv']
print('all shape', df[df['file'].isin(counted_files)].shape)

==train.tsv==
df.shape (63330, 9)

MISSING
client_id         0
path              0
sentence          0
up_votes          0
down_votes        0
age            7199
gender         7073
accent        18982
file              0
dtype: int64

up_votes 11 [2, 3, 4, 5, 19, 7, 6, 8, 15, 20, 17]
down_votes 5 [0, 1, 2, 3, 4]
age 9 [nan, 'twenties', 'thirties', 'teens', 'sixties', 'fifties', 'fourties', 'seventies', 'nineties', 'eighties']
gender 3 [nan, 'male', 'female', 'other']
accent 15 [nan, 'us', 'england', 'hongkong', 'indian', 'african', 'australia', 'newzealand', 'canada', 'scotland', 'ireland', 'philippines', 'wales', 'singapore', 'malaysia', 'other']

==dev.tsv==
df.shape (13178, 9)

MISSING
client_id        0
path             0
sentence         0
up_votes         0
down_votes       0
age           7886
gender        7884
accent        9494
file             0
dtype: int64

up_votes 8 [2, 3, 4, 6, 8, 5, 86, 94]
down_votes 6 [0, 1, 2, 3, 4, 6]
age 8 [nan, 'thirties', 'twenties', 'teens', 

In [5]:
# How many train.tsv clip paths also occur in validated / invalidated / other?
train_paths = df.loc[df['file'] == 'train.tsv', 'path']
for source_file in ('validated.tsv', 'invalidated.tsv', 'other.tsv'):
    source_paths = df.loc[df['file'] == source_file, 'path']
    print(train_paths.isin(source_paths).sum())

63330
0
0


In [6]:
# How many dev.tsv clip paths also occur in validated / invalidated / other?
dev_paths = df.loc[df['file'] == 'dev.tsv', 'path']
for source_file in ('validated.tsv', 'invalidated.tsv', 'other.tsv'):
    source_paths = df.loc[df['file'] == source_file, 'path']
    print(dev_paths.isin(source_paths).sum())

13178
0
0


In [7]:
# How many test.tsv clip paths also occur in validated / invalidated / other?
test_paths = df.loc[df['file'] == 'test.tsv', 'path']
for source_file in ('validated.tsv', 'invalidated.tsv', 'other.tsv'):
    source_paths = df.loc[df['file'] == source_file, 'path']
    print(test_paths.isin(source_paths).sum())

13178
0
0


In [8]:
# Shape of validated.tsv after dropping every row with a missing value.
is_validated = df['file'] == 'validated.tsv'
print(df[is_validated].dropna().shape)

(298721, 9)


In [9]:
# Shape of each official split after dropping every row with a missing value.
for split_file in ('train.tsv', 'dev.tsv', 'test.tsv'):
    print(df[df['file'] == split_file].dropna().shape)

(43922, 9)
(3619, 9)
(1277, 9)


In [10]:
# validated.tsv rows in which no column (age/gender/accent included) is missing.
yes_df = df.loc[df['file'] == 'validated.tsv'].dropna()

In [11]:
def print_accent_age_gender(yes_df):
    """Print the demographic make-up of ``yes_df``.

    The first printed line holds the number of distinct accents, ages and
    genders. It is followed, for each of those three columns in that order,
    by the share of rows per category, expressed as a percentage of all rows
    in ``yes_df``.

    Args:
        yes_df: DataFrame with ``accent``, ``age`` and ``gender`` columns.

    Returns:
        None; everything is written to stdout.
    """
    demographic_columns = ('accent', 'age', 'gender')
    row_count = yes_df.shape[0]

    print(*(yes_df[column].nunique() for column in demographic_columns))
    for column in demographic_columns:
        print(yes_df.groupby(column).size() / row_count * 100)

In [12]:
# Demographic breakdown of the complete rows of validated.tsv.
validated_complete = df[df['file'] == 'validated.tsv'].dropna()
print_accent_age_gender(validated_complete)

17 9 3
accent
african            1.360802
australia          6.569675
bermuda            0.121518
canada             5.832198
england           19.508839
hongkong           0.395352
indian             7.993412
ireland            1.144881
malaysia           0.282203
newzealand         2.031996
other              3.400497
philippines        0.441214
scotland           1.464912
singapore          0.231989
southatlandtic     0.070969
us                48.771931
wales              0.377610
dtype: float64
age
eighties      0.215251
fifties       9.295630
fourties     15.359148
nineties      0.018747
seventies     1.379213
sixties       7.653630
teens         7.585674
thirties     27.346922
twenties     31.145785
dtype: float64
gender
female    19.910887
male      77.706623
other      2.382491
dtype: float64


In [13]:
# Demographic breakdown of the complete rows of train.tsv.
train_complete = df[df['file'] == 'train.tsv'].dropna()
print_accent_age_gender(train_complete)

15 9 3
accent
african         1.006329
australia       8.626656
canada          7.178635
england        11.898365
hongkong        0.045535
indian         10.372934
ireland         0.585128
malaysia        0.259551
newzealand      1.331907
other           0.257274
philippines     0.733118
scotland        0.853786
singapore       0.669368
us             56.174582
wales           0.006830
dtype: float64
age
eighties      0.443969
fifties       7.970948
fourties     11.224443
nineties      0.120668
seventies     1.395656
sixties      18.878922
teens         8.458176
thirties     20.584217
twenties     30.923000
dtype: float64
gender
female    15.725604
male      81.155230
other      3.119166
dtype: float64


In [14]:
# Demographic breakdown of the complete rows of dev.tsv.
dev_complete = df[df['file'] == 'dev.tsv'].dropna()
print_accent_age_gender(dev_complete)

17 8 3
accent
african            1.353965
australia          2.735562
bermuda            0.303951
canada             6.797458
england           12.600166
hongkong           0.663167
indian            13.733075
ireland            1.132910
malaysia           0.994750
newzealand         0.856590
other              1.436861
philippines        1.464493
scotland           0.221056
singapore          1.132910
southatlandtic     0.055264
us                53.605968
wales              0.911854
dtype: float64
age
eighties      0.055264
fifties       6.769826
fourties     10.638298
seventies     1.353965
sixties       2.569771
teens        12.931749
thirties     22.271346
twenties     43.409782
dtype: float64
gender
female    14.562034
male      84.056369
other      1.381597
dtype: float64


In [15]:
# Demographic breakdown of the complete rows of test.tsv.
test_complete = df[df['file'] == 'test.tsv'].dropna()
print_accent_age_gender(test_complete)

17 9 3
accent
african            1.957713
australia          1.566171
bermuda            0.783085
canada             4.463587
england           12.059514
hongkong           0.861394
indian            22.552858
ireland            1.487862
malaysia           0.861394
newzealand         0.861394
other              2.584182
philippines        0.783085
scotland           0.861394
singapore          0.156617
southatlandtic     0.234926
us                47.611590
wales              0.313234
dtype: float64
age
eighties      0.234926
fifties       6.186374
fourties      9.710258
nineties      0.078309
seventies     1.018011
sixties       2.584182
teens        12.607674
thirties     21.221613
twenties     46.358653
dtype: float64
gender
female    14.565388
male      84.886453
other      0.548160
dtype: float64


In [16]:
# Per table, keep only the rows without any missing value; these are the
# frames whose clip durations get measured.
train_df, dev_df, test_df, validated_df = (
    df[df['file'] == table_name].dropna()
    for table_name in ('train.tsv', 'dev.tsv', 'test.tsv', 'validated.tsv')
)

In [19]:
# Measure the duration of every clip listed in each demographic-complete table,
# print the per-table total, and write the table joined with its measured
# durations to ./out_<table name without '.tsv'>.csv.
total_duration = 0
# table file name -> (rows to measure, accumulator for the measured paths/durations)
data = {
    'dev.tsv':(dev_df, {'path':[], 'duration':[]}),
    'test.tsv':(test_df, {'path':[], 'duration':[]}),
    'train.tsv':(train_df, {'path':[], 'duration':[]}),
    'validated.tsv':(validated_df, {'path':[], 'duration':[]})
}
# NOTE(review): the loop targets rebind the notebook-level names `df` and
# `data` (the latter is the very dict being iterated). `data.items()` is
# evaluated once, so iteration is unaffected, but once the loop stops both
# names refer to the table that was being processed, and later cells read them.
# NOTE(review): there is no error handling around the duration call, so an
# exception from it ends the whole loop (the recorded run shows an EOFError).
for file_name, (df, data) in data.items():
    print('file_name', file_name, flush=True)
    # restart the running total for each table
    total_duration = 0
    for path in df.path:
        # presumably seconds: the printed dev total (18853.1) corresponds to
        # the 5.24 hr listed in the summary table
        duration = librosa.get_duration(filename='{}/clips/{}'.format(base_path,path))
        total_duration += duration
        data['path'].append(path)
        data['duration'].append(duration)
    # file_name[:-4] drops the '.tsv' suffix
    print('{} duration : {}'.format(file_name[:-4], total_duration), flush=True)
    # inner join on 'path' attaches each measured duration to its metadata row
    df.merge(pd.DataFrame(data), on='path', how='inner').to_csv('./out_{}.csv'.format(file_name[:-4]), index=False)

file_name dev.tsv
dev duration : 18853.099999999984
file_name test.tsv
test duration : 6656.399999999999
file_name train.tsv
train duration : 223320.9999999992
file_name validated.tsv


EOFError: 

In [20]:
# Running total of the table that was being processed when the loop stopped
# (the counter is reset at the start of every table).
total_duration

583104.4999999955

In [25]:
# Number of clips already measured for the current table.
len(data['path'])

137190

In [24]:
# Materialise the clip paths of the current table as a plain list so that the
# clips still to be measured can be addressed by position.
paths = df['path'].tolist()
len(paths)

298721

In [27]:
# Table the interrupted loop was working on.
file_name

'validated.tsv'

In [29]:
# Resume the duration measurement of the interrupted table. `data`, `df`,
# `file_name` and `total_duration` still hold the state left behind by the
# interrupted loop; `paths` is the full path list of that table.
resume_index = 137190  # number of clips already measured (len(data['path']) at the time)
print('continue file_name', file_name, flush=True)
# start=resume_index makes `idx` a position in `paths` rather than an offset
# into the slice, so a reported failure can be looked up directly.
for idx, path in enumerate(paths[resume_index:], start=resume_index):
    try:
        duration = librosa.get_duration(filename='{}/clips/{}'.format(base_path, path))
    except Exception as err:
        # Skip clips that cannot be read, but report which one and why.
        # Catching Exception instead of using a bare `except:` keeps
        # KeyboardInterrupt (kernel interrupt) able to stop this long loop.
        print('error on index {} ({}): {!r}'.format(idx, path, err))
        continue
    total_duration += duration
    data['path'].append(path)
    data['duration'].append(duration)
print('{} duration : {}'.format(file_name[:-4], total_duration), flush=True)
# inner join on 'path': skipped clips have no duration entry and drop out
df.merge(pd.DataFrame(data), on='path', how='inner').to_csv('./out_{}.csv'.format(file_name[:-4]), index=False)

continue file_name validated.tsv
error on index 0
error on index 1
error on index 2
validated duration : 1224209.0999999186


In [30]:
# Shape of the metadata/duration join; clips without a measured duration do
# not survive the inner join.
measured = pd.DataFrame(data)
df.merge(measured, on='path', how='inner').shape

(298718, 10)

In [31]:
# Total measured duration of the current table (file name without '.tsv').
table_label = file_name[:-4]
print('{} duration : {}'.format(table_label, total_duration), flush=True)

validated duration : 1224209.0999999186


In [32]:
# Rebuild validated_df from the current table joined with its measured
# durations, so that it carries a 'duration' column.
clip_durations = pd.DataFrame(data)
validated_df = df.merge(clip_durations, on='path', how='inner')

In [44]:
# Total duration and number of clips for every accent/gender/age combination.
demographic_keys = ['accent', 'gender', 'age']
stats_df = (
    validated_df
    .groupby(demographic_keys)
    .agg({'duration': 'sum', 'path': 'count'})
    .reset_index()
    .rename(columns={'path': 'count'})  # the counted paths are the clip count
)
stats_df.head()

Unnamed: 0,accent,gender,age,duration,count
0,african,female,fifties,506.7,88
1,african,female,fourties,202.4,41
2,african,female,sixties,832.9,183
3,african,female,teens,164.3,47
4,african,female,thirties,1821.3,416


In [45]:
# Persist the per-group statistics; index=False leaves the row index out of the file.
stats_df.to_csv('statistics.csv', index=False)

# Insight
- There are 780 total validated hours based on Mozilla (https://voice.mozilla.org/en/datasets)
- Only 46.3% of the validated data contains demographic information, which is around 340 hours
- After dropping the rows with missing demographic information, there are fewer than 2 hours of test data, about 5 hours of dev data, and 62 hours of train data left
| dataset | # data | duration (hr) | # w/ demo | duration w/ demo (hr) |
|------|------|------|------|------|
| train | 63330 | - | 43922 | 62 |
| dev | 13178 | - | 3619 | 5.24 |
| test | 13178 | - | 1277 | 1.85 |
| other | 173800 | ~210 | - | - |
| invalidated | 78904 | ~90 | - | - |
| validated | 644119 | 780 | 298721 | 340 |