In [20]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

In [21]:
# Root folder of the corpus metadata, relative to the notebook's working directory.
# It is expected to hold one sub-directory per language code.
metadata_path = "../metadata"

In [22]:
# Create a dictionary of DataFrames, one for each language, with all the data provided
# (train and test splits stacked into a single frame per language).
#
# Only real, non-hidden sub-directories of metadata_path are treated as languages, so
# stray entries such as .DS_Store or .ipynb_checkpoints cannot break the load. The
# listing is sorted because directory listings carry no ordering guarantee.
lang_dirs = sorted(
    entry.name
    for entry in Path(metadata_path).iterdir()
    if entry.is_dir() and not entry.name.startswith(".")
)

lang_data = {
    lang_dir: pd.concat(
        [
            pd.read_csv(
                f'{metadata_path}/{lang_dir}/cv-corpus-15.0-2023-09-08/{lang_dir}/{set_name}.tsv',
                delimiter="\t",
                # Read each file in a single pass so every column gets one consistent
                # dtype (presumably the cause of the warning this cell used to emit).
                low_memory=False,
            )
            for set_name in ["train", "test"]
        ],
        # Both splits are numbered from 0; renumber so row labels stay unique.
        ignore_index=True,
    )
    for lang_dir in lang_dirs
}

  [pd.read_csv(f'{metadata_path}/{lang_dir}/cv-corpus-15.0-2023-09-08/{lang_dir}/{set_name}.tsv', delimiter="\t")


In [23]:
# Modify the dictionary of DataFrames to have also the duration.
# Each language frame is inner-joined with its clip_durations.tsv on the clip file name
# (the 'path' column of the split files is renamed to 'clip' to match).
for lang in lang_data:
    if "duration[ms]" in lang_data[lang].columns:
        # This language was already merged by an earlier run of the cell; joining again
        # on "clip" would only add suffixed duplicate duration columns.
        continue

    clip_durations = pd.read_csv(
        f'{metadata_path}/{lang}/cv-corpus-15.0-2023-09-08/{lang}/clip_durations.tsv',
        delimiter="\t",
    )
    lang_data[lang] = pd.merge(
        lang_data[lang].rename(columns={'path': 'clip'}),
        clip_durations,
        # Name the join key explicitly instead of relying on whichever columns overlap.
        on="clip",
        how="inner",
        # Fail loudly if a clip is listed twice in the durations file; a silent
        # row duplication would inflate every duration total computed later.
        validate="many_to_one",
    )

In [24]:
# Report how many distinct speakers (unique client_id values) each language contains
print("number of speakers for each language")
speaker_counts = {
    language: frame["client_id"].nunique()
    for language, frame in lang_data.items()
}
for language, n_speakers in speaker_counts.items():
    print(language, n_speakers)

number of speakers for each language
ar 982
de 12277
en 47861
es 10990
fr 12153
id 332
it 5098
ja 1073
lg 514
ru 2144
sw 956
ta 471
tr 1268
zh-CN 3454


In [25]:
# Report the summed clip duration of each language (values of the "duration[ms]" column)
print("total duration for each language")
for language in lang_data:
    clip_lengths = lang_data[language]["duration[ms]"]
    print(language, clip_lengths.sum())

total duration for each language
ar 162160035
de 3341755685
en 6174878911
es 1720018728
fr 2820768179
id 42566295
it 975913433
ja 57698436
lg 466082146
ru 192763014
sw 305649144
ta 329813232
tr 149813327
zh-CN 215156913


In [26]:
# Build a per-gender, per-language summary plus per-speaker duration statistics.
# A gender of None stands for the clips whose gender field is missing.
genders = ["female", "male", None]
languages = list(lang_data.keys())

data = []
# (language, gender) -> DataFrame indexed by client_id with the max / min / mean /
# median clip duration of that speaker, restricted to clips of that gender.
speaker_stats = {}

for gender in genders:
    for lang in languages:
        df = lang_data[lang]

        # Missing values are stored as NaN, and `== None` never matches NaN, so the
        # "no gender" group has to be selected with isna().
        if gender is None:
            gender_mask = df["gender"].isna()
        else:
            gender_mask = df["gender"] == gender
        gender_df = df[gender_mask]

        # Both figures are computed on the gender subset, not on the whole language.
        total_duration = gender_df["duration[ms]"].sum()
        num_speakers = gender_df["client_id"].nunique()
        data.append([gender, lang, total_duration, num_speakers])

        # For each language and each gender: max, min, mean and median clip duration
        # per speaker. One DataFrame per (language, gender), kept in speaker_stats
        # instead of being overwritten on every iteration.
        new_df = gender_df.groupby(by="client_id")["duration[ms]"].agg(['max', 'min', 'mean', 'median'])
        speaker_stats[(lang, gender)] = new_df
        # To export: new_df.to_csv(f"{lang}_{gender}.csv")

# For each language and each gender: total duration of recordings, the number of different speakers
result_df = pd.DataFrame(data, columns=["Gender", "Language", "Total Duration", "Number of Speakers"])



In [33]:
# Distinct age categories recorded for Arabic, ignoring clips without an age
lang_data["ar"]["age"].dropna().unique()

array(['twenties', 'thirties', 'teens', 'fourties', 'sixties', 'fifties',
       'nineties'], dtype=object)

In [28]:
# NOTE(review): this looks like an unfinished placeholder. Each iteration only rebinds
# fem_df to a new empty list; nothing is appended to it or read from it in this cell.
# Confirm whether it can be deleted or should be completed.
for lang,df in lang_data.items():
    fem_df = []

In [61]:
# For every language, look at the clips labelled with age "nineties" and show which
# speaker contributed the most of them, together with that speaker's share of those clips.
for lang, df in lang_data.items():
    print(lang)
    nineties_speakers = df.loc[df["age"] == "nineties", "client_id"]
    clips_per_speaker = nineties_speakers.value_counts()
    if clips_per_speaker.empty:
        # No clip in this age group: only the language code is printed
        continue
    top_speaker = clips_per_speaker.idxmax()
    top_share = clips_per_speaker.max()/clips_per_speaker.sum()
    print(top_speaker)
    print(top_share,'\n')

ar
0e68ec0efd84a7584b516991849470ecfdac7466619031cdf6d91d599835a50e5ed949520fcb99280d0b902f3a2f6089840eea2a2221bec276316800efef4c73
1.0 

de
7a20a1260801f4f5858d793478a1968346c32ee642c2fd6189b8ac33cd7c7d19b7663d1ff800468edf5c9f9f70dcb1bd64afa890200f1da33c23ca624cbca7cb
1.0 

en
1614deccdbfff77e69c0d1b186e5dfc4de1c54b80c6b71bc2fba88c80b279cc195addee8636b5a6aa8a5d4b5f086cce67f05b2d926d8c7aec97d99d0631a69d4
0.363013698630137 

es
9ef63c9b9fc1b47bd05e7a351efaed8fb08729b6c80d8cea74b8061bbe591c25617d3f6e80abb188905ce486a565e4eaa45e55c9d642012895bfdfdee22f1ffd
0.7215189873417721 

fr
id
it
c11a790afbae5008d5779765d01c0e0c658f54fc7be71f7b0d36edaa00255b4b61ac2357128e4ae66fe9b819667e5c17f1e0f5830bba62c24b746c5b9afd4c76
1.0 

ja
bae6f5691ffa93b244332d429bbe25a3c7d8db55662febffd58ee0f6ebaad1314ea7214bef381f0b550520b79d4f9f1c6b919c5521ee11a21c1a10b622d8117f
1.0 

lg
cb684886da071f9271fa154a6228cb11474bdceab3ecdf7b62adc89ad6674965d51e56ab67c219e372f499c2922feeb42c6c8e4e02a6b9eebdbd6155b87883d0
1.0 
