In [None]:
import os
import random
from collections import Counter

import numpy as np
import pandas as pd
import seaborn as sn

In [None]:
# Root directory of the local data. Set the MSWC_DATA_PATH environment variable to point the
# notebook at another machine's copy instead of editing this cell (on ps4: /data/mswc/).
# os.path.join(..., '') guarantees the trailing separator that the string concatenations
# in the cells below (data_path + 'mswc/...') rely on.
data_path = os.path.join(
    os.environ.get('MSWC_DATA_PATH', '/media/wiebke/248535ce-9ac0-4041-bcc5-7b5fd94a36bb/home/wiebke/data/'),
    '',
)
mswc_metadata = data_path+'mswc/metadata.json' # get from https://mlcommons.org/en/multilingual-spoken-words/

In [None]:
# Load the MSWC metadata file into a DataFrame.
df = pd.read_json(mswc_metadata)

# Overview table: drop the last column, flip the frame so every remaining column
# becomes a row, and order those rows by descending 'number_of_words'.
df.iloc[:, :-1].transpose().sort_values('number_of_words', ascending=False)

In [None]:
def _by_descending_count(wordcounts):
    """Return the word -> count mapping as a dict ordered from highest to lowest count."""
    ranked = sorted(wordcounts.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)


# Word counts of each language of interest, most frequent word first.
de_kws = _by_descending_count(df['de']['wordcounts'])
en_kws = _by_descending_count(df['en']['wordcounts'])
fr_kws = _by_descending_count(df['fr']['wordcounts'])
rw_kws = _by_descending_count(df['rw']['wordcounts'])

In [None]:
# Per-clip split table for 'en', read from the local MSWC download.
en_splits_csv = data_path + 'mswc/en_full/en_splits.csv'
en_splits = pd.read_csv(en_splits_csv)

In [None]:
# Number of 'en' rows with a non-null VALID entry for every (GENDER, SET) combination.
en_gender_stats = en_splits.groupby(['GENDER', 'SET'])['VALID'].count()
# Collapse the SET level to obtain one total per GENDER.
en_gender_totals = en_gender_stats.groupby(level='GENDER').sum()

# First the counts divided by the per-gender totals, then divided by the overall total.
en_share_within_gender = en_gender_stats / en_gender_totals
en_share_overall = en_gender_stats / en_gender_stats.sum()
print(en_share_within_gender)
print(en_share_overall)

In [None]:
# Per-clip split table for 'de', read from the local MSWC download.
# download from https://mlcommons.org/en/multilingual-spoken-words/ - same goes for all other splits
de_splits_csv = data_path + 'mswc/de_full/de_splits.csv'
de_splits = pd.read_csv(de_splits_csv)

In [None]:
# Number of 'de' rows with a non-null VALID entry for every (GENDER, SET) combination.
de_gender_stats = de_splits.groupby(['GENDER', 'SET'])['VALID'].count()
# Collapse the SET level to obtain one total per GENDER.
de_gender_totals = de_gender_stats.groupby(level='GENDER').sum()

# First the counts divided by the per-gender totals, then divided by the overall total.
de_share_within_gender = de_gender_stats / de_gender_totals
de_share_overall = de_gender_stats / de_gender_stats.sum()
print(de_share_within_gender)
print(de_share_overall)

In [None]:
# Per-clip split table for 'fr', read from the local MSWC download.
fr_splits_csv = data_path + 'mswc/fr_full/fr_splits.csv'
fr_splits = pd.read_csv(fr_splits_csv)

In [None]:
# Number of 'fr' rows with a non-null VALID entry for every (GENDER, SET) combination.
fr_gender_stats = fr_splits.groupby(['GENDER', 'SET'])['VALID'].count()
# Collapse the SET level to obtain one total per GENDER.
fr_gender_totals = fr_gender_stats.groupby(level='GENDER').sum()

# First the counts divided by the per-gender totals, then divided by the overall total.
fr_share_within_gender = fr_gender_stats / fr_gender_totals
fr_share_overall = fr_gender_stats / fr_gender_stats.sum()
print(fr_share_within_gender)
print(fr_share_overall)

In [None]:
# Per-clip split table for 'rw', read from the local MSWC download.
rw_splits_csv = data_path + 'mswc/rw_full/rw_splits.csv'
rw_splits = pd.read_csv(rw_splits_csv)

In [None]:
# Number of 'rw' rows with a non-null VALID entry for every (GENDER, SET) combination.
rw_gender_stats = rw_splits.groupby(['GENDER', 'SET'])['VALID'].count()
# Collapse the SET level to obtain one total per GENDER.
rw_gender_totals = rw_gender_stats.groupby(level='GENDER').sum()

# First the counts divided by the per-gender totals, then divided by the overall total.
rw_share_within_gender = rw_gender_stats / rw_gender_totals
rw_share_overall = rw_gender_stats / rw_gender_stats.sum()
print(rw_share_within_gender)
print(rw_share_overall)

Rules for constructing evaluation dataset:
- language selection: en, de, fr, rw - most resourced languages
- we need train, dev, test splits where male and female speakers are equally represented across keywords and where keywords are equally represented
- all 4 languages should be equally represented
- we want each language dataset to resemble Google Speech Commands - so roughly 100k keyword samples per dataset

In [None]:
def get_filtered_kw_list(dataset, top_n=100, min_length=3, prefix_length=3):
    """
    Filter dataset to only include keywords that meet the following criteria:
    * in the ``top_n`` most frequent occurrences (counted as non-null ``VALID`` entries per ``WORD``)
    * keyword length is greater than ``min_length`` characters
    * if there are multiple keywords that start with the same ``prefix_length`` characters,
      only the first occurring (i.e. most frequent) keyword is taken

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with at least the columns ``WORD`` and ``VALID``.
    top_n : int, default 100
        Number of most frequent words considered as candidates.
    min_length : int, default 3
        Words must have strictly more characters than this to be kept.
    prefix_length : int, default 3
        Number of leading characters that have to be unique among the kept words.

    Returns
    -------
    pandas.Series
        ``VALID`` count of every kept word, indexed by word and sorted by word.
    """
    # select the top_n most spoken keywords, most frequent first
    word_list = dataset.groupby(['WORD'])['VALID'].count().sort_values(ascending=False).iloc[:top_n]

    seen_prefixes = set()
    kept_words = []
    for word in word_list.index:
        if len(word) <= min_length:  # only take sufficiently long words, following the MSWC paper
            continue
        prefix = word[:prefix_length]
        if prefix in seen_prefixes:  # a more frequent word with the same prefix was already kept
            continue
        seen_prefixes.add(prefix)
        kept_words.append(word)

    # subselect only those words that meet the criteria
    return word_list[word_list.index.isin(kept_words)].sort_index()



def get_train_val_test_splits(dataset):
    """
    Split the clips of ``dataset`` into training, validation and testing lists per gender.

    The split is made on unique (WORD, SPEAKER) pairs, separately for 'MALE' and 'FEMALE':
    80% of the pairs (rounded) are sampled for training, 10% of the pairs (rounded) are
    sampled from the rest for validation, and whatever remains is used for testing.
    --> partially follows protocol as described in MSWC paper

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with the columns ``WORD``, ``SPEAKER``, ``GENDER`` and ``LINK``.
        It is not modified.

    Returns
    -------
    dict
        Maps 'training_list_<gender>', 'validation_list_<gender>' and 'testing_list_<gender>'
        (gender in lower case) to the list of ``LINK`` values of the matching clips.

    Notes
    -----
    The global ``random`` generator is seeded with 4 on every call.
    """
    # (keyword, speaker) pair of every row; kept local so the caller's frame is left untouched
    word_speaker = pd.Series(list(zip(dataset.WORD, dataset.SPEAKER)), index=dataset.index)
    train_val_test_lists = {}
    random.seed(4)

    for g in ['MALE', 'FEMALE']:
        is_gender = dataset.GENDER == g
        # unique (keyword, speaker) pairs in order of first appearance
        unique_pairs = list(word_speaker[is_gender].unique())
        n_pairs = len(unique_pairs)

        # randomly sample 80% of (keyword, speaker) pairs for TRAINING
        train_pairs = random.sample(unique_pairs, round(0.8 * n_pairs))
        # The leftovers are collected in first-appearance order, not by iterating a set:
        # the iteration order of a set of string tuples can change between interpreter runs
        # (hash randomisation), which would make the seeded sampling below irreproducible.
        train_lookup = set(train_pairs)
        remaining_pairs = [pair for pair in unique_pairs if pair not in train_lookup]
        # randomly sample 10% of (keyword, speaker) pairs for VALIDATION from the leftovers
        val_pairs = random.sample(remaining_pairs, round(0.1 * n_pairs))
        # use the remaining (keyword, speaker) pairs for TESTING
        val_lookup = set(val_pairs)
        test_pairs = [pair for pair in remaining_pairs if pair not in val_lookup]

        # get file links for all pairs, restricted to rows of the current gender
        for split_name, split_pairs in (('training', train_pairs),
                                        ('validation', val_pairs),
                                        ('testing', test_pairs)):
            in_split = is_gender & word_speaker.isin(split_pairs)
            train_val_test_lists[split_name + '_list_' + g.lower()] = list(dataset.loc[in_split, 'LINK'].values)

    return train_val_test_lists



def generate_mswc_data_lists_gender_balanced(splits, n_kws):
    """
    Build gender-balanced train/validation/test lists for the ``n_kws`` largest keywords.

    Steps:
    1. keep only clips whose GENDER is 'MALE' or 'FEMALE';
    2. pick candidate keywords with ``get_filtered_kw_list``;
    3. per candidate, sample the same number of clips for both genders (the smaller of the
       two non-null ``VALID`` counts); candidates without clips for one gender are skipped;
    4. keep the ``n_kws`` keywords with the most sampled clips;
    5. split the result with ``get_train_val_test_splits``.

    Parameters
    ----------
    splits : pandas.DataFrame
        Frame with the columns ``WORD``, ``GENDER``, ``VALID``, ``SPEAKER`` and ``LINK``.
    n_kws : int
        Maximum number of keywords to keep.

    Returns
    -------
    (dict, numpy.ndarray)
        The dict returned by ``get_train_val_test_splits`` and the array of kept keywords,
        ordered by descending number of sampled clips.

    Raises
    ------
    ValueError
        If no candidate keyword has clips for both genders.
    """
    splits_mf = splits[splits.GENDER.isin(['MALE','FEMALE'])] #only use audio clips where gender metadata is known
    filtered_word_list = get_filtered_kw_list(splits_mf)

    # create dataset balanced by gender across keywords
    candidates = splits_mf[splits_mf.WORD.isin(filtered_word_list.index)]
    gender_counts = candidates.groupby(['WORD','GENDER'])['VALID'].count()
    data_gen = []
    for word in filtered_word_list.index:
        # .get() yields 0 when a keyword has no clips at all for one gender (no IndexError)
        counter = int(min(gender_counts.get((word, 'FEMALE'), 0),
                          gender_counts.get((word, 'MALE'), 0)))
        if counter == 0:
            continue  # nothing can be sampled in a balanced way for this keyword

        data_gen.append(splits_mf[splits_mf.WORD==word].groupby(['GENDER']).sample(n=counter, random_state=1))

    if not data_gen:
        raise ValueError('No keyword has clips for both MALE and FEMALE speakers.')

    dataset = pd.concat(data_gen, axis=0, ignore_index=True)
    # Rank whole keywords rather than (GENDER, WORD) rows: cutting a row ranking at 2*n_kws can
    # split tied keywords and return more than n_kws of them. mergesort keeps ties deterministic.
    clips_per_word = dataset.groupby('WORD')['SPEAKER'].count().sort_values(ascending=False, kind='mergesort')
    kw_list = np.array(clips_per_word.index[:n_kws], dtype=object)
    # .copy() so that downstream code works on an independent frame instead of a slice
    dataset = dataset.loc[dataset.WORD.isin(kw_list)].copy()

    train_val_test_dict = get_train_val_test_splits(dataset)

    # every clip must end up in exactly one of the lists
    assert(len(dataset)==sum([len(x) for x in train_val_test_dict.values()]))

    return train_val_test_dict, kw_list


def _opus_to_wav(link):
    """Swap a trailing '.opus' extension for '.wav'; any other link is returned unchanged."""
    root, ext = os.path.splitext(link)
    return root + '.wav' if ext == '.opus' else link


def save_mswc_data_lists(splits_dict, n_kws, save_dir=None):
    """
    Generate the gender-balanced data lists of every language and optionally write them to disk.

    Parameters
    ----------
    splits_dict : dict
        Maps a language key to its splits DataFrame; each frame is passed to
        ``generate_mswc_data_lists_gender_balanced``.
    n_kws : int
        Number of keywords per language, forwarded unchanged.
    save_dir : str, optional
        When given, every list is written to ``<save_dir>/<language>/<list name>.txt``,
        one link per line, in shuffled order and with the '.opus' extension replaced by
        '.wav'. When None (default) nothing is written.

    Returns
    -------
    dict
        ``{language: {'kw_list': ..., '<list name>': [links, ...], ...}}``. The returned
        lists keep their original order and '.opus' links whether or not files were written.
    """
    list_dict = {}

    for k, v in splits_dict.items():

        train_val_test_dict, kw_list = generate_mswc_data_lists_gender_balanced(v, n_kws)

        list_dict[k] = {'kw_list': kw_list}
        list_dict[k].update(train_val_test_dict)

        if save_dir is None:
            continue

        language_dir = save_dir+'/'+k
        os.makedirs(language_dir, exist_ok=True)
        for l, links in train_val_test_dict.items():
            write_to = language_dir+'/'+l+'.txt'
            # shuffle a copy so the lists handed back to the caller are not reordered
            save_list = list(links)
            random.shuffle(save_list)
            with open(write_to, 'w', encoding='utf-8') as f:
                for line in save_list:
                    # only the file extension is rewritten, never an "opus" inside the path itself
                    f.write("%s\n" % _opus_to_wav(line))
            print('Saved ', write_to)

    return list_dict


def generate_commands(kw_list, n_kws):
    """
    Print and return the keyword -> index mapping for a keyword list.

    The keywords are sorted and paired with 0, 1, ..., n_kws - 1; the pairing stops at
    the shorter of the two, so at most ``n_kws`` keywords are mapped.

    Parameters
    ----------
    kw_list : sequence of str
        Keywords to number. The sequence itself is left untouched.
    n_kws : int
        Number of indices to hand out.

    Returns
    -------
    dict
        Mapping from keyword to its index.
    """
    # sorted() works on a copy, so the caller's list/array keeps its order
    ordered_kws = sorted(kw_list)
    command_dict = dict(zip(ordered_kws, range(n_kws)))

    print(command_dict)
    return command_dict

In [None]:
# Number of keywords to keep per language.
n_kws = 35
# Output directory for the list files, suffixed with the keyword count.
save_dir = '/home/wiebke/data/mswc/wav_files' + str(n_kws)
# Splits table of every language, keyed by language code.
splits_dict = {
    'de': de_splits,
    'rw': rw_splits,
    'en': en_splits,
    'fr': fr_splits,
}

# save_dir=None is passed here, so the lists are only built in memory and no files are written.
list_dict = save_mswc_data_lists(splits_dict=splits_dict, n_kws=n_kws, save_dir=None)

In [None]:
# Print the keyword -> index mapping for the 'rw' keyword list.
rw_kw_list = list_dict['rw']['kw_list']
generate_commands(kw_list=rw_kw_list, n_kws=n_kws);

In [None]:
# Sanity check: number of distinct leading path components in the 'rw' male validation list.
# Each entry is split on '/' and its first component (presumably the keyword folder) is tallied.
# Counter is imported in the imports cell at the top of the notebook.
commands = dict(Counter(x.split('/')[0] for x in list_dict['rw']['validation_list_male']))
len(commands)

In [None]:
def dataset_info(list_dict, language, agg_list=('count', 'nunique'), splits=None):
    """
    Summarise, per keyword, the SPEAKER column of every train/validation/test list of a language.

    Parameters
    ----------
    list_dict : dict
        ``{language: {'kw_list': ..., '<list name>': [links, ...]}}`` as returned by
        ``save_mswc_data_lists``.
    language : str
        Key of the language to summarise.
    agg_list : sequence of str, default ('count', 'nunique')
        Names of the aggregations applied to ``SPEAKER`` within each keyword.
    splits : dict, optional
        ``{language: splits DataFrame}`` with the columns ``LINK``, ``WORD`` and ``SPEAKER``.
        Defaults to the notebook-level ``splits_dict``.

    Returns
    -------
    pandas.DataFrame
        One row per keyword of ``kw_list`` and one column ``<list name>_<aggregation>`` per
        list and aggregation, plus a ``total`` column holding the row sum; rows are sorted
        by descending ``total``.
    """
    if splits is None:
        splits = splits_dict  # fall back to the mapping defined in the notebook

    lists = ['training_list_female', 'training_list_male', 'validation_list_female', 'validation_list_male', 'testing_list_female', 'testing_list_male']

    language_splits = splits[language]
    agg_names = list(agg_list)

    kw_count = pd.DataFrame(index=list_dict[language]['kw_list'])
    for l in lists:
        in_list = language_splits.LINK.isin(list_dict[language][l])
        stats = language_splits[in_list].groupby('WORD')['SPEAKER'].agg(agg_names)
        stats.columns = [l+'_'+agg for agg in agg_names]
        kw_count = kw_count.join(stats)
    kw_count['total'] = kw_count.sum(axis=1)

    return kw_count.sort_values(by='total', ascending=False)

In [None]:
# Names of the six split lists, and generators of the matching count / nunique column names.
split_list_names = ['training_list_female', 'training_list_male', 'validation_list_female',
                    'validation_list_male', 'testing_list_female', 'testing_list_male']
group1 = (name + '_count' for name in split_list_names)
group2 = (name + '_nunique' for name in split_list_names)

# Stacked bar chart of the per-keyword clip counts of every 'de' split list.
de_kw_counts = dataset_info(list_dict, 'de', ['count']).drop(columns=['total'])
de_kw_counts.plot.bar(
    stacked=True,
    figsize=(10, 5),
    title='Histogram of MSWC-de keywords across train, test and validation splits for males and females',
);

In [None]:
# Column sums of the 'de' count table: one value per split list plus the 'total' column.
de_count_table = dataset_info(list_dict, 'de', agg_list=['count'])
de_count_table.sum()

In [None]:
# Column sums of the 'fr' count table: one value per split list plus the 'total' column.
fr_count_table = dataset_info(list_dict, 'fr', agg_list=['count'])
fr_count_table.sum()

In [None]:
# Relative difference between two hard-coded totals, expressed as a fraction of the smaller one.
# NOTE(review): the literals are presumably copied from the outputs of the `.sum()` cells above;
# they are not recomputed here, so they go stale when the lists change -- TODO confirm their origin.
(85572-75644)/75644

In [None]:
# Stacked bar chart of the per-keyword clip counts of every 'en' split list.
en_kw_counts = dataset_info(list_dict, 'en', ['count']).drop(columns=['total'])
en_kw_counts.plot.bar(
    stacked=True,
    figsize=(10, 5),
    title='Histogram of MSWC-en keywords across train, test and validation splits for males and females',
);

In [None]:
# Stacked bar chart of the per-keyword clip counts of every 'fr' split list.
fr_kw_counts = dataset_info(list_dict, 'fr', ['count']).drop(columns=['total'])
fr_kw_counts.plot.bar(
    stacked=True,
    figsize=(10, 5),
    title='Histogram of MSWC-fr keywords across train, test and validation splits for males and females',
);

In [None]:
# Stacked bar chart of the per-keyword clip counts of every 'rw' split list.
rw_kw_counts = dataset_info(list_dict, 'rw', ['count']).drop(columns=['total'])
rw_kw_counts.plot.bar(
    stacked=True,
    figsize=(10, 5),
    title='Histogram of MSWC-rw keywords across train, test and validation splits for males and females',
);

In [None]:
#create function that concats all lists to give stats of dataset

# Horizontal bar chart of the number of unique SPEAKER values per WORD, with one bar per GENDER.
# NOTE(review): `dataset` is not assigned by any notebook-level cell visible here (it only exists
# as a local variable inside the helper functions), so this cell presumably relies on leftover
# kernel state and would raise NameError after Restart & Run All -- TODO confirm and rebuild it.
dataset.groupby(['GENDER','WORD'])['SPEAKER'].nunique().unstack().T.plot.barh(figsize=(25,10), legend=True)

In [None]:
# Per GENDER: count and number of unique values of WORD, SPEAKER and WORD_SPEAKER for the rows
# whose WORD_SPEAKER value is contained in `dev_kws_speaker_pairs`.
# NOTE(review): neither `dataset` nor `dev_kws_speaker_pairs` is assigned by any notebook-level
# cell visible here, so this cell presumably depends on leftover kernel state -- TODO confirm.
dataset[dataset.WORD_SPEAKER.isin(dev_kws_speaker_pairs)].groupby(['GENDER'])[['WORD','SPEAKER', 'WORD_SPEAKER']].agg(['count','nunique'])