# Notebook for cleaning & processing the dataset before sampling

In [None]:
import pandas as pd
import sys
import os
import multiprocessing
import montreal_forced_aligner
import numpy as np
import ast
import engCDSutils as util
import importlib
importlib.reload(util)

In [2]:
# Replace the paths below to match the file structure on your local computer.

# Not referenced by any cell visible in this notebook.
directory_with_corpora = 'infoTheoryProj/allTextInputs'
# Directory holding the pronunciation dictionary; runNotebookp1 joins it with
# the dictionary file name it is given.
directory_of_mfa =  'MFA/pretrained_models/dictionary'
# Directory holding one "<corpus_name>.csv" per corpus, read by runNotebookp1.
directory_of_csv = 'infoTheoryProj/dataScrape'
# Directory where the pipeline functions write and read their intermediate
# and final files (unrecognised-word lists, second-pass results, IPA csv).
directory_of_output = 'infoTheoryProj/dictAlignment'
# Not referenced by any cell visible in this notebook.
directory_of_g2p = 'MFA/pretrained_models/g2p'

### Part 1 of the data cleaning pipeline : Basic pre-processing and first pass of g2p (dictionary based) 

In [3]:
def runNotebookp1(corpus_name, dict_name):
    """Run part 1 of the pipeline: pre-processing and dictionary-based g2p.

    Reads ``<directory_of_csv>/<corpus_name>.csv``, keeps the utterances whose
    ``speaker_role`` is neither ``Target_Child`` nor ``Investigator``, drops
    rows with a null ``gloss``, and hands the remaining glosses to
    ``util.preProcess`` together with a list of name variants read from
    ``dictAlignment/<corpus_name>_names.txt``.  The cleaned words are then
    passed to ``util.createTranscription`` with the dictionary
    ``<directory_of_mfa>/<dict_name>``.  Words whose transcription is the
    empty string are written, one per line, to
    ``<directory_of_output>/<corpus_name>_unrecog_errors.txt``.

    Parameters
    ----------
    corpus_name : str
        Base name of the corpus; used to build all input/output file names.
    dict_name : str
        File name of the pronunciation dictionary inside ``directory_of_mfa``.

    Returns
    -------
    transcribed_text : dict
        Mapping from word to transcription as returned by
        ``util.createTranscription``; unrecognised words map to ``''``.
    preprocessed_df
        Result of ``util.preProcessCSV`` on the non-null CDS rows.
    """
    df_path = os.path.join(directory_of_csv, corpus_name + ".csv")
    og_df = pd.read_csv(df_path)

    # Child-directed speech = everything not spoken by the child or investigator.
    is_cds = ~og_df['speaker_role'].isin(['Target_Child', 'Investigator'])
    cds_df = og_df[is_cds]
    print("Total number of CDS utterances in corpus (includes null values):", len(cds_df))

    # Use pandas' null test rather than an identity check against np.nan, so
    # every kind of missing value is dropped and the list stays aligned with
    # the DataFrame that is pre-processed further down.
    cds_df_nonnull = cds_df[cds_df['gloss'].notna()]
    cds_utt_list = list(cds_df_nonnull['gloss'])
    print("Total number of non-null CDS utterances in corpus:", len(cds_utt_list))

    # Build the list of name variants: the bare name, the possessive stem and
    # a de-pluralised form.
    # NOTE(review): this path is relative to the working directory, unlike the
    # other paths which go through directory_of_output — confirm it is intended.
    list_of_names = []
    with open('dictAlignment/' + corpus_name + '_names.txt', 'r') as file:
        for line in file:
            name = line.strip().strip(".,!?;:\"()[]{}").lower()
            # Only strip a *trailing* possessive; a "'s" in the middle of a
            # name must not cost the name its last two characters.
            if name.endswith("'s"):
                list_of_names.append(name[:-2])
            name = name.replace("'", "")
            list_of_names.append(name)
            if len(name) > 2 and name[-1] == "s":
                list_of_names.append(name[:-1])

    cleaned_words, _length, _thrown_out = util.preProcess(cds_utt_list, list_of_names)

    link_to_dict = os.path.join(directory_of_mfa, dict_name)
    transcribed_text, length_of_cleaned, unrecog_ctr = util.createTranscription(cleaned_words, link_to_dict)
    list_of_unrecognized_words = [
        word for word, transcription in transcribed_text.items() if transcription == ''
    ]

    # Guard against an empty corpus so the summary never divides by zero.
    percent_unrecognized = 100 * unrecog_ctr / length_of_cleaned if length_of_cleaned else 0.0
    print("Total number of word tokens which could not be recognized in dictionary-based g2p:", unrecog_ctr)
    print("Total number of UNIQUE word types which could not be recognized in dictionary-based g2p:", len(list_of_unrecognized_words))
    print("Percentage of total word tokens it could not transcribe: ", percent_unrecognized, "%")

    ## Making a text file of unrecognizable words which will go in next round of g2p (phonology based)
    unrecognized_df = pd.DataFrame(list_of_unrecognized_words)
    csvpath = os.path.join(directory_of_output, corpus_name + '_unrecog_errors.txt')
    unrecognized_df.to_csv(csvpath, index=False, header=False)

    # Pass the name list built above (the previous code referenced an
    # undefined, corpus-specific variable here).
    preprocessed_df = util.preProcessCSV(cds_df_nonnull, list_of_names)

    return transcribed_text, preprocessed_df
    

### Part 2 of the data cleaning pipeline : Treating some words manually and then second pass through g2p (phonology based)

In [4]:
def runNotebookp2(corpus_name, dict_name):
    """Submit the second (phonology-based) g2p pass for one corpus.

    The manually treated word list
    ``<directory_of_output>/<corpus_name>_unrecog_errors_treated.txt`` is
    handed to ``util.create_and_submit_job`` together with the model name and
    the path the job should write to.

    Returns the path of the second-pass result file.
    """
    treated_words_file = os.path.join(directory_of_output, corpus_name + '_unrecog_errors_treated.txt')
    second_pass_file = os.path.join(directory_of_output, corpus_name + '_secondpass_result.txt')
    # The model name is the dictionary file name minus its last five
    # characters (".dict" for the dictionary names used in this notebook).
    model_name = dict_name[:-5]
    util.create_and_submit_job(treated_words_file, model_name, second_pass_file)
    return second_pass_file

### Part 3 of the data cleaning pipeline : Preparing the final gloss in a clean csv

In [None]:
def runNotebookp3(corpus_name, dict_name, output_path,transcribed_text,preprocessed_df):
    """Run part 3 of the pipeline: build and save the final glossed csv.

    The second-pass dictionary read from ``output_path`` is merged into
    ``transcribed_text`` (in place, so the caller's dict is updated too), and
    ``util.prepareGlossedCSV`` produces the glossed DataFrame.  A
    ``phonemic_cleaned`` column is added that holds each row's
    ``phonemic_gloss`` with the empty-string entries removed.  The result is
    written to ``<directory_of_output>/<corpus_name>_ipa_result.csv``.

    Parameters
    ----------
    corpus_name : str
        Base name of the corpus; used for the output file name.
    dict_name : str
        Unused here; kept so all three pipeline steps share a call shape.
    output_path : str
        Path of the second-pass result, as returned by ``runNotebookp2``.
    transcribed_text : dict
        Word-to-transcription mapping from ``runNotebookp1``.
    preprocessed_df
        Pre-processed utterances from ``runNotebookp1``.

    Returns
    -------
    The glossed DataFrame, including the ``phonemic_cleaned`` column.
    """
    dictionary_from_second_pass = util.giveDictionary(output_path)

    # Second-pass transcriptions take precedence over first-pass entries.
    transcribed_text.update(dictionary_from_second_pass)

    _result_dict, result_df, _stillerror = util.prepareGlossedCSV(preprocessed_df, corpus_name,transcribed_text,directory_of_output)

    # Drop the '' placeholders left for words that have no transcription.
    result_df['phonemic_cleaned'] = [
        [word for word in gloss if word != '']
        for gloss in result_df['phonemic_gloss']
    ]

    ipa_output_path = os.path.join(directory_of_output, corpus_name+'_ipa_result.csv')
    result_df.to_csv(ipa_output_path,index=False)
    return result_df

Trying the pipeline on the first corpus, NewmanRatner

In [None]:
transcribed_text, preprocessed_df = runNotebookp1('NewmanRatner','english_us_mfa.dict')

In [51]:
path = runNotebookp2('NewmanRatner','english_us_mfa.dict')

Your job 4795573 ("job_script_item_name.sh") has been submitted




In [58]:
final_df = runNotebookp3('NewmanRatner','english_us_mfa.dict', path ,transcribed_text,preprocessed_df) 

146158


In [70]:
final_df 

Unnamed: 0,id,gloss,stem,actual_phonology,model_phonology,type,language,num_morphemes,num_tokens,utterance_order,...,media_unit,collection_name,collection_id,corpus_id,speaker_id,target_child_id,transcript_id,gloss_cleaned,phonemic_gloss,phonemic_cleaned
3,2647047,oh yeah sure,oh yeah sure,,,declarative,eng,3,3,4,...,,Eng-NA,2,76,4228,4226,10155,oh yeah sure,"[[ow], [j, æ], [ʃ, ʊ, ɹ]]","[[ow], [j, æ], [ʃ, ʊ, ɹ]]"
5,2647067,mhm,mhm,,,declarative,eng,1,1,6,...,s,Eng-NA,2,76,4228,4226,10155,mhm,[],[]
7,2647084,alright okay xxx,alright okay,,,declarative,eng,2,3,8,...,s,Eng-NA,2,76,4228,4226,10155,alright okay,"[[ɑ, ɫ, ɹ, aj, t], [cʰ, ej]]","[[ɑ, ɫ, ɹ, aj, t], [cʰ, ej]]"
8,2647089,what looks good,what look good,,,question,eng,4,3,9,...,,Eng-NA,2,76,4228,4226,10155,what looks good,"[[w, ɐ], [l, ʊ, k, s], [ɡ, ʊ, d]]","[[w, ɐ], [l, ʊ, k, s], [ɡ, ʊ, d]]"
10,2647108,what looks good,what look good,,,question,eng,4,3,11,...,s,Eng-NA,2,76,4228,4226,10155,what looks good,"[[w, ɐ], [l, ʊ, k, s], [ɡ, ʊ, d]]","[[w, ɐ], [l, ʊ, k, s], [ɡ, ʊ, d]]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208557,2861987,yeah,yeah,,,imperative_emphatic,eng,1,1,596,...,,Eng-NA,2,76,5683,5679,11239,yeah,"[[j, æ]]","[[j, æ]]"
208594,2862545,can you say pencil,can you say pencil,,,question,eng,4,4,633,...,s,Eng-NA,2,76,5683,5679,11239,can you say pencil,"[[k, n̩], [j, ʉː], [s, ej], [pʰ, ɛ, n, s, ə, ɫ]]","[[k, n̩], [j, ʉː], [s, ej], [pʰ, ɛ, n, s, ə, ɫ]]"
208603,2862655,that's when he usually signs,that when he usual sign,,,declarative,eng,9,5,642,...,s,Eng-NA,2,76,5683,5679,11239,thats when he usually signs,"[[ð, æ, t, s], [w, ɪ, n], [iː], [j, ʉː, ʒ, ʊ, ...","[[ð, æ, t, s], [w, ɪ, n], [iː], [j, ʉː, ʒ, ʊ, ..."
208604,2862668,so I'll be surprised if he actually does it,so I be surprise if he actual do it,,,declarative,eng,14,9,643,...,s,Eng-NA,2,76,5683,5679,11239,so i'll be surprised if he actually does it,"[[s, ow], [aj, ɫ], [bʲ, i], [s, ɚ, p, ɹ, aj, z...","[[s, ow], [aj, ɫ], [bʲ, i], [s, ɚ, p, ɹ, aj, z..."


## Some  descriptive analysis

In [23]:
# `ipa_output_path` is only a local variable inside runNotebookp3, so it has to
# be rebuilt here; this section analyses the NewmanRatner result written there.
ipa_output_path = os.path.join(directory_of_output, 'NewmanRatner' + '_ipa_result.csv')
result_df = pd.read_csv(ipa_output_path)

In [54]:
import ast

# Per-utterance counts derived from the stringified `phonemic_gloss` column:
#   how_many_words       - number of entries in the gloss
#   how_many_transcribed - number of entries that are not the '' placeholder
#   how_many_phonemes    - summed length of all entries
how_many_words = []
how_many_phonemes = []
how_many_transcribed = []
for raw_gloss in result_df['phonemic_gloss']:
    # The csv round trip stores each gloss as its repr, so parse it back.
    words = ast.literal_eval(raw_gloss)
    how_many_words.append(len(words))
    n_phonemes = sum(len(word) for word in words)
    n_transcribed = sum(1 for word in words if word != '')
    how_many_transcribed.append(n_transcribed)
    how_many_phonemes.append(n_phonemes)

In [58]:
result_df['length_of_result'] = how_many_words
result_df['num_relevant_words'] = how_many_transcribed
result_df['num_phonemes'] = how_many_phonemes

In [59]:
print("Total number of words we had from CHILDES originally", result_df['num_tokens'].sum())
print("Total number of words we had after initial cleaning", result_df['length_of_result'].sum())
print("Total number of words we have transcriptions for", result_df['num_relevant_words'].sum())
print("Total number of phonemes we have ", result_df['num_phonemes'].sum())

Total number of words we had from CHILDES originally 685315
Total number of words we had after initial cleaning 687991
Total number of words we have transcriptions for 685413
Total number of phonemes we have transcriptions for 1873661


In [60]:
subset_df = result_df[['gloss', 'target_child_age','target_child_sex','phonemic_gloss','num_relevant_words','num_phonemes']]
subset_df

Unnamed: 0,gloss,target_child_age,target_child_sex,phonemic_gloss,num_relevant_words,num_phonemes
0,oh yeah sure,7.000144,female,"[['ow'], ['j', 'æ'], ['ʃ', 'ʊ', 'ɹ']]",3,6
1,mhm,7.000144,female,[''],0,0
2,alright okay xxx,7.000144,female,"[['ɑ', 'ɫ', 'ɹ', 'aj', 't'], ['cʰ', 'ej']]",2,7
3,what looks good,7.000144,female,"[['w', 'ɐ'], ['l', 'ʊ', 'k', 's'], ['ɡ', 'ʊ', ...",3,9
4,what looks good,7.000144,female,"[['w', 'ɐ'], ['l', 'ʊ', 'k', 's'], ['ɡ', 'ʊ', ...",3,9
...,...,...,...,...,...,...
146153,yeah,24.000493,male,"[['j', 'æ']]",1,2
146154,can you say pencil,24.000493,male,"[['k', 'n̩'], ['j', 'ʉː'], ['s', 'ej'], ['pʰ',...",4,12
146155,that's when he usually signs,24.000493,male,"[['ð', 'æ', 't', 's'], ['w', 'ɪ', 'n'], ['iː']...",5,19
146156,so I'll be surprised if he actually does it,24.000493,male,"[['s', 'ow'], ['aj', 'ɫ'], ['bʲ', 'i'], ['s', ...",9,27


In [63]:
aggregated_df = subset_df.groupby(['target_child_age', 'target_child_sex']).agg({'num_relevant_words': 'sum','num_phonemes': 'sum'})

In [64]:
aggregated_df

Unnamed: 0_level_0,Unnamed: 1_level_0,num_relevant_words,num_phonemes
target_child_age,target_child_sex,Unnamed: 2_level_1,Unnamed: 3_level_1
7.000144,female,81324,221790
7.000144,male,68863,188129
10.000205,female,52330,143764
10.000205,male,45243,125237
11.000226,female,81838,225677
11.000226,male,61962,169780
18.00037,female,39664,108043
18.00037,male,38359,104802
24.000493,female,107671,292976
24.000493,male,106394,288701


In [68]:
result_df.to_csv(ipa_output_path,index=False)

In [67]:
result_df

Unnamed: 0,id,gloss,stem,actual_phonology,model_phonology,type,language,num_morphemes,num_tokens,utterance_order,...,collection_id,corpus_id,speaker_id,target_child_id,transcript_id,phonemic_gloss,gloss_cleaned,length_of_result,num_relevant_words,num_phonemes
0,2647047,oh yeah sure,oh yeah sure,,,declarative,eng,3,3,4,...,2,76,4228,4226,10155,"[['ow'], ['j', 'æ'], ['ʃ', 'ʊ', 'ɹ']]",oh yeah sure,3,3,6
1,2647067,mhm,mhm,,,declarative,eng,1,1,6,...,2,76,4228,4226,10155,[''],mhm,1,0,0
2,2647084,alright okay xxx,alright okay,,,declarative,eng,2,3,8,...,2,76,4228,4226,10155,"[['ɑ', 'ɫ', 'ɹ', 'aj', 't'], ['cʰ', 'ej']]",alright okay,2,2,7
3,2647089,what looks good,what look good,,,question,eng,4,3,9,...,2,76,4228,4226,10155,"[['w', 'ɐ'], ['l', 'ʊ', 'k', 's'], ['ɡ', 'ʊ', ...",what looks good,3,3,9
4,2647108,what looks good,what look good,,,question,eng,4,3,11,...,2,76,4228,4226,10155,"[['w', 'ɐ'], ['l', 'ʊ', 'k', 's'], ['ɡ', 'ʊ', ...",what looks good,3,3,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146153,2861987,yeah,yeah,,,imperative_emphatic,eng,1,1,596,...,2,76,5683,5679,11239,"[['j', 'æ']]",yeah,1,1,2
146154,2862545,can you say pencil,can you say pencil,,,question,eng,4,4,633,...,2,76,5683,5679,11239,"[['k', 'n̩'], ['j', 'ʉː'], ['s', 'ej'], ['pʰ',...",can you say pencil,4,4,12
146155,2862655,that's when he usually signs,that when he usual sign,,,declarative,eng,9,5,642,...,2,76,5683,5679,11239,"[['ð', 'æ', 't', 's'], ['w', 'ɪ', 'n'], ['iː']...",thats when he usually signs,5,5,19
146156,2862668,so I'll be surprised if he actually does it,so I be surprise if he actual do it,,,declarative,eng,14,9,643,...,2,76,5683,5679,11239,"[['s', 'ow'], ['aj', 'ɫ'], ['bʲ', 'i'], ['s', ...",so i'll be surprised if he actually does it,9,9,27


In [74]:
result_df.to_csv(ipa_output_path,index=False)

# Now trying this pipeline on other corpora

NewmanRatner : done above

### BrentSiskind corpus

In [None]:
transcribed_text, preprocessed_df = runNotebookp1('Brent','english_us_mfa.dict')

In [95]:
path = runNotebookp2('Brent','english_us_mfa.dict')

Your job 4795968 ("job_script_item_name.sh") has been submitted




In [96]:
final_df = runNotebookp3('Brent','english_us_mfa.dict', path ,transcribed_text,preprocessed_df) 

148994


In [97]:
final_df

Unnamed: 0,id,gloss,stem,actual_phonology,model_phonology,type,language,num_morphemes,num_tokens,utterance_order,...,media_unit,collection_name,collection_id,corpus_id,speaker_id,target_child_id,transcript_id,gloss_cleaned,phonemic_gloss,phonemic_cleaned
0,2138060,Morgan,Morgan,,,declarative,eng,1,1,1,...,s,Eng-NA,2,68,3838,3832,8268,,[],[]
1,2138061,pull it up yourself,pull it up yourself,,,imperative_emphatic,eng,4,4,1,...,s,Eng-NA,2,68,3837,3836,8267,pull it up yourself,"[[pʰ, ʊ, ɫ], [ɪ], [ɐ, p], [j, ʊ, ɹ, s, ɛ, ɫ, f]]","[[pʰ, ʊ, ɫ], [ɪ], [ɐ, p], [j, ʊ, ɹ, s, ɛ, ɫ, f]]"
2,2138063,see,see,,,imperative_emphatic,eng,1,1,2,...,s,Eng-NA,2,68,3838,3832,8268,see,"[[s, iː]]","[[s, iː]]"
3,2138064,hands up,hand up,,,imperative_emphatic,eng,3,2,2,...,s,Eng-NA,2,68,3837,3836,8267,hands up,"[[h, æ, n, z], [ɐ, p]]","[[h, æ, n, z], [ɐ, p]]"
4,2138066,hands up,hand up,,,imperative_emphatic,eng,3,2,3,...,s,Eng-NA,2,68,3837,3836,8267,hands up,"[[h, æ, n, z], [ɐ, p]]","[[h, æ, n, z], [ɐ, p]]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168014,2330240,xxx,,,,declarative,eng,-2147483648,1,1087,...,s,Eng-NA,2,68,3892,3890,8477,,[],[]
168015,2330241,you want,you want,,,question,eng,2,2,1088,...,s,Eng-NA,2,68,3892,3890,8477,you want,"[[j, ʉː], [w, ə, n]]","[[j, ʉː], [w, ə, n]]"
168016,2330242,that what you want,that what you want,,,question,eng,4,4,1089,...,s,Eng-NA,2,68,3892,3890,8477,that what you want,"[[ð, æ], [w, ɐ], [j, ʉː], [w, ə, n]]","[[ð, æ], [w, ɐ], [j, ʉː], [w, ə, n]]"
168017,2330243,you trying to snap your finger,you try to snap your finger,,,question,eng,7,6,1090,...,s,Eng-NA,2,68,3892,3890,8477,you trying to snap your finger,"[[j, ʉː], [t, ɹ, aj, ɪ, n], [tʰ, ʊ], [s, ɲ, æ,...","[[j, ʉː], [t, ɹ, aj, ɪ, n], [tʰ, ʊ], [s, ɲ, æ,..."


### Rollins corpus

In [None]:
transcribed_text, preprocessed_df = runNotebookp1('Rollins','english_us_mfa.dict')

In [99]:
path = runNotebookp2('Rollins','english_us_mfa.dict')

Your job 4796000 ("job_script_item_name.sh") has been submitted




In [100]:
final_df = runNotebookp3('Rollins','english_us_mfa.dict', path ,transcribed_text,preprocessed_df) 

17246


In [101]:
final_df

Unnamed: 0,id,gloss,stem,actual_phonology,model_phonology,type,language,num_morphemes,num_tokens,utterance_order,...,media_unit,collection_name,collection_id,corpus_id,speaker_id,target_child_id,transcript_id,gloss_cleaned,phonemic_gloss,phonemic_cleaned
0,906608,hello Chi,hello Chi,,,declarative,eng,2,2,1,...,s,Eng-NA,2,47,2532,2530,4523,hello chi,"[[h, ɛ, l, ow], [ʃ, aj]]","[[h, ɛ, l, ow], [ʃ, aj]]"
1,906617,how are you today,how be you today,,,declarative,eng,5,4,2,...,s,Eng-NA,2,47,2532,2530,4523,how are you today,"[[h, aw], [ɚ], [j, ʉː], [tʰ, ʊ, d, ej]]","[[h, aw], [ɚ], [j, ʉː], [tʰ, ʊ, d, ej]]"
4,906635,I love you,I love you,,,declarative,eng,3,3,5,...,s,Eng-NA,2,47,2532,2530,4523,i love you,"[[aj], [l, ɐ, v], [j, ʉː]]","[[aj], [l, ɐ, v], [j, ʉː]]"
6,906648,I love you,I love you,,,declarative,eng,3,3,7,...,s,Eng-NA,2,47,2532,2530,4523,i love you,"[[aj], [l, ɐ, v], [j, ʉː]]","[[aj], [l, ɐ, v], [j, ʉː]]"
7,906654,uh,,,,declarative,eng,-2147483648,1,8,...,s,Eng-NA,2,47,2532,2530,4523,uh,[[ə]],[[ə]]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27744,943620,what did you find,what do you find,,,question,eng,5,4,629,...,s,Eng-NA,2,47,2532,2530,4650,what did you find,"[[w, ɐ], [dʲ, ɪ, d], [j, ʉː], [f, aj, n]]","[[w, ɐ], [dʲ, ɪ, d], [j, ʉː], [f, aj, n]]"
27748,943662,can Chi make the birdie sing,can Chi make the bird sing,,,question,eng,7,6,633,...,s,Eng-NA,2,47,2532,2530,4650,can chi make the birdie sing,"[[k, n̩], [ʃ, aj], [m, ej, k], [d̪, iː], [b, ɝ...","[[k, n̩], [ʃ, aj], [m, ej, k], [d̪, iː], [b, ɝ..."
27750,943684,very good,very good,,,declarative,eng,2,2,635,...,s,Eng-NA,2,47,2532,2530,4650,very good,"[[v, ɛ, ɹ, i], [ɡ, ʊ, d]]","[[v, ɛ, ɹ, i], [ɡ, ʊ, d]]"
27753,943712,oh that rattle is nice,oh that rattle be nice,,,declarative,eng,6,5,638,...,s,Eng-NA,2,47,2532,2530,4650,oh that rattle is nice,"[[ow], [ð, æ], [ɹ, æ, t, ə, ɫ], [z], [n, aj, s]]","[[ow], [ð, æ], [ɹ, æ, t, ə, ɫ], [z], [n, aj, s]]"


### Providence corpus

In [None]:
transcribed_text, preprocessed_df = runNotebookp1('Providence','english_us_mfa.dict')

In [8]:
path = runNotebookp2('Providence','english_us_mfa.dict')

Your job 4980722 ("job_script_item_name.sh") has been submitted




In [9]:
final_df = runNotebookp3('Providence','english_us_mfa.dict', path ,transcribed_text,preprocessed_df) 

### Vankleeck corpus

In [7]:
transcribed_text, preprocessed_df = runNotebookp1('Vankleeck','english_us_mfa.dict')

Total number of CDS utterances in corpus (includes null values): 8756
Total number of non-null CDS utterances in corpus: 8756
Total number of word tokens originally in corpus: 38091
Total number of word tokens found during preprocessing: 38393
Total number of word tokens left after preprocessing: 38067
Total number of UNIQUE word types left after preprocessing: 1743
Total number of word tokens we threw out: 326
Total number of word tokens which could not be recognized in dictionary-based g2p: 1548
Total number of UNIQUE word types which could not be recognized in dictionary-based g2p: 129
Percentage of total word tokens it could not transcribe:  4.031984997265126 %


In [8]:
path = runNotebookp2('Vankleeck','english_us_mfa.dict')

Your job 4984984 ("job_script_item_name.sh") has been submitted




In [9]:
final_df = runNotebookp3('Vankleeck','english_us_mfa.dict', path ,transcribed_text,preprocessed_df) 

In [10]:
final_df

Unnamed: 0,id,gloss,stem,actual_phonology,model_phonology,type,language,num_morphemes,num_tokens,utterance_order,...,media_unit,collection_name,collection_id,corpus_id,speaker_id,target_child_id,transcript_id,gloss_cleaned,phonemic_gloss,phonemic_cleaned
0,809791,wanna play with the farm,want play with the farm,,,question,eng,6,5,1,...,,Eng-NA,2,44,2477,2476,4258,wanna play with the farm,"[[w, ə, n, ə], [p, l, ej], [w, ɪ, θ], [ð], [f,...","[[w, ə, n, ə], [p, l, ej], [w, ɪ, θ], [ð], [f,..."
1,809798,okay,okay,,,declarative,eng,1,1,2,...,,Eng-NA,2,44,2477,2476,4258,okay,"[[cʰ, ej]]","[[cʰ, ej]]"
2,809804,can I play too,can I play too,,,question,eng,4,4,3,...,,Eng-NA,2,44,2477,2476,4258,can i play too,"[[k, n̩], [aj], [p, l, ej], [tʰ, ʉ]]","[[k, n̩], [aj], [p, l, ej], [tʰ, ʉ]]"
3,809813,is it okay,be it okay,,,question,eng,4,3,4,...,,Eng-NA,2,44,2477,2476,4258,is it okay,"[[z], [ɪ], [cʰ, ej]]","[[z], [ɪ], [cʰ, ej]]"
4,809820,Amy what happened to the cow,Amy what happen to the cow,,,question,eng,7,6,5,...,,Eng-NA,2,44,2477,2476,4258,what happened to the cow,"[[w, ɐ], [h, æ, p, ə, n, d], [tʰ, ʊ], [ð], [kʰ...","[[w, ɐ], [h, æ, p, ə, n, d], [tʰ, ʊ], [ð], [kʰ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15424,837110,what now,what now,,,question,eng,2,2,571,...,,Eng-NA,2,44,2498,2495,4277,what now,"[[w, ɐ], [n, aw]]","[[w, ɐ], [n, aw]]"
15426,837140,oh we can't make some because we're going to s...,oh we can make some because we go to school I see,,,declarative,eng,15,12,573,...,,Eng-NA,2,44,2498,2495,4277,oh we can't make some because we're going to s...,"[[ow], [w, iː], [k, ə, n, t], [m, ej, k], [s, ...","[[ow], [w, iː], [k, ə, n, t], [m, ej, k], [s, ..."
15428,837174,what did you sing today,what do you sing today,,,question,eng,6,5,575,...,,Eng-NA,2,44,2498,2495,4277,what did you sing today,"[[w, ɐ], [dʲ, ɪ, d], [j, ʉː], [s, ɪ, n], [tʰ, ...","[[w, ɐ], [dʲ, ɪ, d], [j, ʉː], [s, ɪ, n], [tʰ, ..."
15429,837188,did you go to music,do you go to music,,,question,eng,6,5,576,...,,Eng-NA,2,44,2498,2495,4277,did you go to music,"[[dʲ, ɪ, d], [j, ʉː], [ɡ, ow], [tʰ, ʊ], [mʲ, ʉ...","[[dʲ, ɪ, d], [j, ʉː], [ɡ, ow], [tʰ, ʊ], [mʲ, ʉ..."


## Descriptive analyses of different corpora - SUMMARY STATS

## NewmanRatner

In [2]:
subset, aggr = util.tokenDescriptions(directory_of_output,'NewmanRatner')

Total number of words we had from CHILDES originally 685315
Total number of words we had after initial cleaning 687991
Total number of words we have transcriptions for 684703
Total number of phonemes we have  1871118
Total number of  unique child IDs detected: 503
Total number of  unique speaker roles detected: 1


In [48]:
uid_list = (list(set(subset['target_child_name'])))

In [49]:
# Bucket the string child names by the age suffix at the end of the name.
# The first four characters are kept as the child ID, the first six as the
# ID-plus-initials string.
processed_uids = []
only7s = []
only10s = []
only11s = []
only18s = []
only24s = []

# Two-character suffixes and the list each one feeds.
buckets_by_suffix = {"10": only10s, "11": only11s, "18": only18s, "24": only24s}

for uid in uid_list:
    if not isinstance(uid, str):
        continue  # non-string entries (e.g. missing names) are ignored
    processed_uids.append(str(uid)[:4])
    if uid[-1] == '7':
        only7s.append(uid[:6])
        continue
    bucket = buckets_by_suffix.get(uid[-2:])
    if bucket is not None:
        bucket.append(uid[:6])

print ('total number of unique child IDs now,', len(set(processed_uids)))
for message, bucket in (
    ('total number of unique child IDs sampled at 7  mos,', only7s),
    ('total number of unique child IDs sampled at 10 mos,', only10s),
    ('total number of unique child IDs sampled at 11 mos,', only11s),
    ('total number of unique child IDs sampled at 18 mos,', only18s),
    ('total number of unique child IDs sampled at 24 mos,', only24s),
):
    print (message, len(set(bucket)))

total number of unique child IDs now, 124
total number of unique child IDs sampled at 7  mos, 125
total number of unique child IDs sampled at 10 mos, 84
total number of unique child IDs sampled at 11 mos, 113
total number of unique child IDs sampled at 18 mos, 55
total number of unique child IDs sampled at 24 mos, 123


In [64]:
len(full_set)

124

In [3]:
subset['child_id_initials'] =  subset['target_child_name'].str[:6]
subset['child_id'] =  subset['target_child_name'].str[:4]

In [4]:
subset

Unnamed: 0,gloss,target_child_name,transcript_id,target_child_age,target_child_sex,target_child_id,phonemic_gloss,num_relevant_words,num_phonemes,child_id_initials,child_id
0,oh yeah sure,4269LP7,10155,7.000144,female,4226,"[['ow'], ['j', 'æ'], ['ʃ', 'ʊ', 'ɹ']]",3,6,4269LP,4269
1,mhm,4269LP7,10155,7.000144,female,4226,[''],0,0,4269LP,4269
2,alright okay xxx,4269LP7,10155,7.000144,female,4226,"[['ɑ', 'ɫ', 'ɹ', 'aj', 't'], ['cʰ', 'ej']]",2,7,4269LP,4269
3,what looks good,4269LP7,10155,7.000144,female,4226,"[['w', 'ɐ'], ['l', 'ʊ', 'k', 's'], ['ɡ', 'ʊ', ...",3,9,4269LP,4269
4,what looks good,4269LP7,10155,7.000144,female,4226,"[['w', 'ɐ'], ['l', 'ʊ', 'k', 's'], ['ɡ', 'ʊ', ...",3,9,4269LP,4269
...,...,...,...,...,...,...,...,...,...,...,...
146153,yeah,7300AP24,11239,24.000493,male,5679,"[['j', 'æ']]",1,2,7300AP,7300
146154,can you say pencil,7300AP24,11239,24.000493,male,5679,"[['k', 'n̩'], ['j', 'ʉː'], ['s', 'ej'], ['pʰ',...",4,12,7300AP,7300
146155,that's when he usually signs,7300AP24,11239,24.000493,male,5679,"[['ð', 'æ', 't', 's'], ['w', 'ɪ', 'n'], ['iː']...",5,19,7300AP,7300
146156,so I'll be surprised if he actually does it,7300AP24,11239,24.000493,male,5679,"[['s', 'ow'], ['aj', 'ɫ'], ['bʲ', 'i'], ['s', ...",9,27,7300AP,7300


In [5]:
print("Total no of unique child IDs found:", subset['child_id'].nunique())

Total no of unique child IDs found: 124


In [6]:
print("Total no of unique child ID-initial strings:", subset['child_id_initials'].nunique()) 

Total no of unique child ID-initial strings: 130


In [8]:
subset.to_csv(os.path.join(directory_of_output,'NewmanRatner_id_fixed.csv'))

In [56]:
newman_binned = util.binInto6(subset)

In [14]:
util.ageSexSummary(subset)

Unnamed: 0,target_child_age,target_child_sex,total_words,total_phonemes,no_unique_child
0,7.000144,female,81273,221594,62
1,7.000144,male,68840,188074,61
2,10.000205,female,52274,143573,42
3,10.000205,male,45214,125146,41
4,11.000226,female,81684,225043,60
5,11.000226,male,61906,169620,51
6,18.00037,female,39613,107854,27
7,18.00037,male,38304,104597,26
8,24.000493,female,107516,292371,61
9,24.000493,male,106314,288482,59


In [None]:
util.ageSexSummary(newman_binned)

In [None]:
corpusname = "NewmanRatner"
aggregation  = util.ageSexSummary(newman_binned)
aggregation.to_csv("aggregation_"+corpusname+".csv",index=False)

## BrentSiskind

In [9]:
subset, aggr = util.tokenDescriptions(directory_of_output,'Brent')

Total number of words we had from CHILDES originally 504910
Total number of words we had after initial cleaning 497307
Total number of words we have transcriptions for 495578
Total number of phonemes we have  1373980
Total number of  unique child IDs detected: 19
Total number of  unique speaker roles detected: 2


In [60]:
# Count distinct target-child IDs inline: the howManyUnique helper is only
# defined further down the notebook, so calling it here fails on a
# top-to-bottom run.
len(set(subset['target_child_id']))

19

#### Fixing Brent corpus scraping by working on ages and number of subjects

In [10]:
brent_transcripts = pd.read_csv('dictAlignment/brent_transcripts.csv')

In [11]:
# Each transcript filename is split on '/': component 2 is taken as the child
# ID and component 3, minus its last four characters, as the age string.
path_parts = [filename.split('/') for filename in brent_transcripts['filename']]
child_ids = [parts[2] for parts in path_parts]
agestrings = [(parts[3])[:-4] for parts in path_parts]

# Convert every age string to an age in months.
actualages = []
for agestring in agestrings:
    if agestring[:2] == 'm1':
        # Strings starting with 'm1' carry no parsable age; record 0.
        actualages.append(0)
    else:
        # A trailing 'a'/'b' marker is dropped before parsing.
        if agestring[-1] in ['a', 'b']:
            agestring = agestring[:-1]
        # Layout: two digits of years, two of months, the rest days.
        yrs, mos, days = int(agestring[:2]), int(agestring[2:4]), int(agestring[4:])
        actualages.append(12*yrs + mos + days/30.5)

In [12]:
brent_transcripts['target_child_age_old'] = brent_transcripts['target_child_age']
brent_transcripts['target_child_age'] = actualages
brent_transcripts['child_id'] = child_ids

In [13]:
brent_transcripts_relevant = brent_transcripts[['transcript_id','child_id', 'target_child_age','target_child_age_old']]

Now merge this data with the original utterance-level data and then produce summary stats. Note that the m1 child should be dropped, as its sessions were not transcribed.

In [14]:
merged_brent_results = pd.merge(subset,brent_transcripts_relevant,how='left',on='transcript_id').drop(['target_child_age_x'],axis=1)
merged_brent_results = merged_brent_results.rename(columns={"target_child_age_y": "target_child_age"})

In [66]:
binned_brent = util.binInto6(merged_brent_results)

In [53]:
merged_brent_results['child_id'].value_counts() 

child_id
s2    15983
v1    13255
s1    13194
i1    12827
d1    11070
c1    10366
f2    10322
w1    10171
f1     9546
w3     8876
v2     8398
t1     7709
j1     6944
s3     3745
m2     3355
q1     3233
Name: count, dtype: int64

Note: the utterance-level scrape did not contain any m1 results to begin with.
An age bracket such as 9-10 includes 9 but excludes 10.

In [16]:
util.ageSexSummary(merged_brent_results)

Unnamed: 0,target_child_age,target_child_sex,total_words,total_phonemes,no_unique_child
0,8.918033,male,5283,13951,2
1,9.032787,male,2910,7895,1
2,9.065574,male,3342,9260,1
3,9.131148,female,3953,11419,1
4,9.262295,male,1157,3111,1
...,...,...,...,...,...
113,15.131148,male,4430,11973,2
114,15.163934,male,948,2467,1
115,15.196721,male,4212,11722,1
116,15.262295,male,3530,9722,1


In [None]:
util.ageSexSummary(binned_brent)

In [None]:
corpusname = "Brent"
aggregation  = util.ageSexSummary(binned_brent)
aggregation.to_csv("aggregation_"+corpusname+".csv",index=False)

In [19]:
merged_brent_results

Unnamed: 0,gloss,target_child_name,transcript_id,target_child_sex,target_child_id,phonemic_gloss,num_relevant_words,num_phonemes,child_id,target_child_age,target_child_age_old
0,Morgan,,8268,,3832,[],0,0,c1,9.983607,
1,pull it up yourself,Morgan,8267,female,3836,"[['pʰ', 'ʊ', 'ɫ'], ['ɪ'], ['ɐ', 'p'], ['j', 'ʊ...",4,13,c1,9.557377,9.558718
2,see,,8268,,3832,"[['s', 'iː']]",1,2,c1,9.983607,
3,hands up,Morgan,8267,female,3836,"[['h', 'æ', 'n', 'z'], ['ɐ', 'p']]",2,6,c1,9.557377,9.558718
4,hands up,Morgan,8267,female,3836,"[['h', 'æ', 'n', 'z'], ['ɐ', 'p']]",2,6,c1,9.557377,9.558718
...,...,...,...,...,...,...,...,...,...,...,...
148989,xxx,Vas_Coleman,8477,male,3890,[],0,0,w3,14.950820,14.953079
148990,you want,Vas_Coleman,8477,male,3890,"[['j', 'ʉː'], ['w', 'ə', 'n']]",2,5,w3,14.950820,14.953079
148991,that what you want,Vas_Coleman,8477,male,3890,"[['ð', 'æ'], ['w', 'ɐ'], ['j', 'ʉː'], ['w', 'ə...",4,9,w3,14.950820,14.953079
148992,you trying to snap your finger,Vas_Coleman,8477,male,3890,"[['j', 'ʉː'], ['t', 'ɹ', 'aj', 'ɪ', 'n'], ['tʰ...",6,21,w3,14.950820,14.953079


In [20]:
merged_brent_results.to_csv((os.path.join(directory_of_output,'Brent_id_fixed.csv'))) 

## Rollins

In [None]:
subset, aggr = util.tokenDescriptions(directory_of_output,'Rollins')

#### Fixing Rollins corpus scraping by working on ages and number of subjects

In [22]:
def howManyUnique(l):
    """Return the number of distinct elements in the iterable ``l``."""
    distinct = set(l)
    return len(distinct)

In [23]:
rollins_transcripts = pd.read_csv('dictAlignment/rollins_transcripts.csv')
# Component 2 of each '/'-separated filename, minus its last four characters,
# starts with a two-character child ID followed by a two-digit age.
stems = [filename.split('/')[2][:-4] for filename in rollins_transcripts['filename']]
child_ids = [stem[:2] for stem in stems]
actualages = [int(stem[2:4]) for stem in stems]

In [24]:
rollins_transcripts['target_child_age_old'] = rollins_transcripts['target_child_age']
rollins_transcripts['target_child_age'] = actualages
rollins_transcripts['child_id'] = child_ids

In [25]:
rollins_transcripts_relevant = rollins_transcripts[['transcript_id','child_id', 'target_child_age','target_child_age_old']]
merged_rollins_results = pd.merge(subset,rollins_transcripts_relevant,how='left',on='transcript_id').drop(['target_child_age_x'],axis=1)
merged_rollins_results = merged_rollins_results.rename(columns={"target_child_age_y": "target_child_age"})

In [26]:
merged_rollins_results

Unnamed: 0,gloss,target_child_name,transcript_id,target_child_sex,target_child_id,phonemic_gloss,num_relevant_words,num_phonemes,child_id,target_child_age,target_child_age_old
0,hello Chi,,4523,,2530,"[['h', 'ɛ', 'l', 'ow'], ['ʃ', 'aj']]",2,6,cb,6,6.000123
1,how are you today,,4523,,2530,"[['h', 'aw'], ['ɚ'], ['j', 'ʉː'], ['tʰ', 'ʊ', ...",4,9,cb,6,6.000123
2,I love you,,4523,,2530,"[['aj'], ['l', 'ɐ', 'v'], ['j', 'ʉː']]",3,6,cb,6,6.000123
3,I love you,,4523,,2530,"[['aj'], ['l', 'ɐ', 'v'], ['j', 'ʉː']]",3,6,cb,6,6.000123
4,uh,,4523,,2530,[['ə']],1,1,cb,6,6.000123
...,...,...,...,...,...,...,...,...,...,...,...
17241,what did you find,,4650,,2530,"[['w', 'ɐ'], ['dʲ', 'ɪ', 'd'], ['j', 'ʉː'], ['...",4,10,nb,12,12.000246
17242,can Chi make the birdie sing,,4650,,2530,"[['k', 'n̩'], ['ʃ', 'aj'], ['m', 'ej', 'k'], [...",6,16,nb,12,12.000246
17243,very good,,4650,,2530,"[['v', 'ɛ', 'ɹ', 'i'], ['ɡ', 'ʊ', 'd']]",2,7,nb,12,12.000246
17244,oh that rattle is nice,,4650,,2530,"[['ow'], ['ð', 'æ'], ['ɹ', 'æ', 't', 'ə', 'ɫ']...",5,12,nb,12,12.000246


In [9]:
binned_rollins = util.binInto6(merged_rollins_results)

In [None]:
util.ageSexSummary(binned_rollins,False)

In [None]:
corpusname = "Rollins"
aggregation  = util.ageSexSummary(binned_rollins,False)
aggregation.to_csv("aggregation_"+corpusname+".csv",index=False)

In [28]:
merged_rollins_results.to_csv(os.path.join(directory_of_output,'Rollins_id_fixed.csv'))

## Providence

In [None]:
subset, aggr = util.tokenDescriptions(directory_of_output,'Providence')

In [30]:
subset['child_id']= subset['target_child_id']

In [79]:
binned_subset = util.binInto6(subset) 

In [None]:
util.ageSexSummary(binned_subset)

In [None]:
corpusname = "Providence"
aggregation  = util.ageSexSummary(binned_subset)
aggregation.to_csv("aggregation_"+corpusname+".csv",index=False)

## Vankleeck

In [31]:
subset, aggr = util.tokenDescriptions(directory_of_output,'Vankleeck')

Total number of words we had from CHILDES originally 38091
Total number of words we had after initial cleaning 38067
Total number of words we have transcriptions for 37863
Total number of phonemes we have  97470
Total number of  unique child IDs detected: 20
Total number of  unique speaker roles detected: 1


In [32]:
subset['child_id']= subset['target_child_id']

In [83]:
binned_subset = util.binInto6(subset) 

In [None]:
util.ageSexSummary(binned_subset)

In [None]:
corpusname = "Vankleeck"
aggregation  = util.ageSexSummary(binned_subset)
aggregation.to_csv("aggregation_"+corpusname+".csv",index=False)

## Total aggregation

In [89]:
# Stack the per-corpus aggregation files into one table, tagging every row
# with the corpus it came from.
corpora_names = ['Providence', 'NewmanRatner', 'Vankleeck', 'Rollins', 'Brent']
summaries = []
for corpus in corpora_names:
    df = pd.read_csv("aggregation_"+corpus+".csv")
    df['corpus'] = corpus
    summaries.append(df)
# ignore_index=True renumbers the rows 0..n-1 directly, replacing the
# reset_index()/drop('index') round trip and the redundant list copy.
full_merge = pd.concat(summaries, ignore_index=True)


In [96]:
full_merge

Unnamed: 0,age_bin,target_child_sex,total_words,total_phonemes,no_unique_child,corpus
0,3-9,female,0,0,0,Providence
1,3-9,male,0,0,0,Providence
2,9-15,female,46073,133548,3,Providence
3,9-15,male,27565,78968,1,Providence
4,15-21,female,179814,515536,3,Providence
...,...,...,...,...,...,...
67,33-39,male,0,0,0,Brent
68,39-45,female,0,0,0,Brent
69,39-45,male,0,0,0,Brent
70,45-51,female,0,0,0,Brent


In [98]:
tp = full_merge
tp['age_bin'] = pd.Categorical(full_merge['age_bin'], categories=[str(i)+"-"+str(i+6) for i in range(3,47,6)], ordered=True)


In [99]:
# `age_bin` is categorical, so state `observed` explicitly: this keeps the
# current behaviour (all category combinations are kept) and avoids the
# FutureWarning about the changing pandas default.
all_grouped = tp.groupby(['age_bin', 'target_child_sex'], observed=False).agg(total_words = ('total_words', 'sum'),total_phonemes = ('total_phonemes','sum'), no_unique_child = ('no_unique_child', 'sum')).reset_index()

  tp.groupby(['age_bin', 'target_child_sex']).agg(total_words = ('total_words', 'sum'),total_phonemes = ('total_phonemes','sum'), no_unique_child = ('no_unique_child', 'sum')).reset_index()


Unnamed: 0,age_bin,target_child_sex,total_words,total_phonemes,no_unique_child
0,3-9,female,81273,221594,62
1,3-9,male,75424,205491,64
2,9-15,female,331731,930054,72
3,9-15,male,355061,980234,72
4,15-21,female,221678,629824,31
5,15-21,male,145369,404059,36
6,21-27,female,310965,873592,64
7,21-27,male,214465,588780,62
8,27-33,female,206927,592396,3
9,27-33,male,131512,365550,3


In [None]:
# `age_bin` is categorical, so state `observed` explicitly: this keeps the
# current behaviour (all category combinations are kept) and avoids the
# FutureWarning about the changing pandas default.
all_grouped = tp.groupby(['age_bin', 'target_child_sex'], observed=False).agg(total_words = ('total_words', 'sum'),total_phonemes = ('total_phonemes','sum'), no_unique_child = ('no_unique_child', 'sum')).reset_index()

In [102]:
all_grouped.to_csv("grouped_aggregation.csv",index=False)

In [103]:
full_merge.to_csv("grouped_aggregation_corpuslevel.csv",index=False)