# Prep textgrids and convert to dataframes

In [1]:
import pandas as pd
import os
from audiolabel import read_label
import re
import numpy as np
from syltippy import syllabize

Import `syll_dict.txt` which contains GlobalPhone transcriptions for each syllable produced in the data set.

In [2]:
syllable_dictionary = pd.read_csv("data/syllable_dictionary.txt", sep = "\t", header = None)
syllable_dictionary = syllable_dictionary.rename(columns = {0: "each_syllable", 1: "GP_syll"})
syllable_dictionary.head()

Unnamed: 0,each_syllable,GP_syll
0,vos,b o s
1,nos,n o s
2,bar,b a rf
3,gua,G w a
4,ja,x a


In [121]:
# create function that will take in textgrid and output data frame including stress assignment
def tg2df_stress(filepath, wd_tier, ph_tier, syll):
    
    [wddf, phdf] = read_label(filepath,
        ftype='praat',
        tiers=[wd_tier, ph_tier]
    )

    # Throw an error if tiers are not strictly hierarchical.
    # words contain phones
    assert(wddf.t1.isin(phdf.t1).all())
    assert(wddf.t2.isin(phdf.t2).all())
    
    # Add phone duration and speaker
    phdf['dur_ph'] = phdf.t2 - phdf.t1

    # add filename to dataframe
    phdf['Participant'] = filepath[9:13]
    
    if filepath[9]=="s":
        phdf['Corpus'] = "DIMEx100"
    else:
        phdf['Corpus'] = "CBAS"
    
    # Merge phone and word tiers.
    newdf = pd.merge_asof(
        phdf.rename({'t1': 't1_ph', 't2': 't2_ph', "label": "phone"}, axis='columns'),
        wddf.drop(["fname"], axis = 1) \
        .rename({'t1': 't1_wd', 't2': 't2_wd', "label": "word"}, axis='columns'),
        left_on='t1_ph',
        right_on='t1_wd'
    )

    # Add word-init and -final columns
    newdf['is_wdinit_ph'] = newdf.t1_ph == newdf.t1_wd
    newdf['is_wdfin_ph'] = newdf.t2_ph == newdf.t2_wd
        
    # drop rows where word=="" or ".sil"
    newdf = newdf[(newdf["word"]!="") &
                (newdf["word"]!=".sil") &
             (newdf["word"]!=".bn")]

    # convert v_7 to accent over vowel
    # fix notation in dimex corpus, where V_7 yields accented V
    newdf['word'] = newdf['word'].apply(lambda x: re.sub("a_7", "á", x))
    newdf['word'] = newdf['word'].apply(lambda x: re.sub("i_7", "í", x))
    newdf['word'] = newdf['word'].apply(lambda x: re.sub("o_7", "ó", x))
    newdf['word'] = newdf['word'].apply(lambda x: re.sub("u_7", "ú", x))
    newdf['word'] = newdf['word'].apply(lambda x: re.sub("e_7", "é", x))

    # fix tildas
    newdf['word'] = newdf['word'].apply(lambda x: re.sub("n~", "ñ", x))
        
    # obtain syllables for each word
    newdf["syllable"] = newdf["word"].apply(lambda x: syllabize(x)[0])
    newdf["stress_ind"] = newdf['word'].apply(lambda x: syllabize(x)[1])
    newdf = newdf.reset_index(drop = True)
    len_nd = len(newdf)
    
    # one observation for each word
    words_sylls = newdf[['word', 't1_wd', 'stress_ind', 'syllable', 'Participant', 'phone']].copy()
    words_sylls = words_sylls.drop_duplicates(subset = ["word"])

    # create new df with each syllable contained in each word
    syll_cols = pd.DataFrame({'word':np.repeat(words_sylls.word.values, words_sylls.syllable.str.len()),
                          'stress_ind': np.repeat(words_sylls.stress_ind.values, words_sylls.syllable.str.len()),
                          't1_wd': np.repeat(words_sylls.t1_wd.values, words_sylls.syllable.str.len()),
                              'Participant': np.repeat(words_sylls.Participant.values, words_sylls.syllable.str.len()),
                        'each_syllable':np.concatenate(words_sylls.syllable.values)})
    syll_cols = syll_cols.reset_index(drop = True)

    # create new df that contains GP for each syllable
    stress_finder = syll_cols.merge(syll, how = "left", on = "each_syllable")

    # now create syllable index for each syllable
    stress_finder["syll_ind"] = stress_finder.groupby(["Participant", "t1_wd"]).cumcount()

    # now create column to indicate whether syllable is stressed
    stress_finder['is_stress'] = np.where(stress_finder.stress_ind == stress_finder.syll_ind, 1, 0)

    # create list from GP_syll column
    stress_finder['syll_list'] = stress_finder['GP_syll'].apply(lambda x: str(x).split(" "))

    # now create new columns
    phones_stressed = pd.DataFrame({'word':np.repeat(stress_finder.word.values, stress_finder.syll_list.str.len()),
                                'is_stress': np.repeat(stress_finder.is_stress.values, stress_finder.syll_list.str.len()),
                                    't1_wd': np.repeat(stress_finder.t1_wd.values, stress_finder.syll_list.str.len()),
                        'GP_ph':np.concatenate(stress_finder.syll_list.values)})  
    phones_stressed = phones_stressed.reset_index(drop = True)
    
    # count number of phones in each word
    counts = phones_stressed.groupby(['word', 't1_wd'])['GP_ph'].size().reset_index(name = 'num_ph')
    counts = counts.sort_values(by = ["t1_wd"], ignore_index = True)
    phones_stressed = phones_stressed.merge(counts, on = ["word", 't1_wd'])
    len_ph = len(phones_stressed)
    
    # add 'num_ph' to new df with info from newdf
    testdf = phones_stressed.drop(['t1_wd', 'is_stress', 'GP_ph'], axis = 1)
    testdf = testdf.drop_duplicates()
    test_3 = newdf.merge(testdf, on = "word")
    
    # count number of times each word appears
    word_counts = test_3.groupby(['word', 't1_wd']).size().reset_index(name = "count_word")
    word_counts = word_counts.sort_values(by = ["t1_wd"], ignore_index = True)
    test_3 = test_3.merge(word_counts, on = ["word", 't1_wd'])
    
    # define list of words to be removed and remove them
    removed = test_3[test_3['num_ph']!=test_3['count_word']]
    test_3 = test_3[test_3['num_ph']==test_3['count_word']]
    test_3 = test_3.reset_index(drop = True)
    rem_words = list(removed['word'].unique())
    
    # drop removed words from phones_stressed then merge
    phones_stressed = phones_stressed[~phones_stressed['word'].isin(rem_words)]
    phones_stressed = phones_stressed.reset_index(drop = True)
    test_3 = test_3.drop(['phone'], axis = 1)
    final_df = test_3.join(phones_stressed, how = "inner", rsuffix = "_dict")
    final_df = final_df.drop(['stress_ind', "syllable", "count_word", "num_ph", "word_dict", "t1_wd_dict", "num_ph_dict"], axis = 1)
    len_ph_rem = len(phones_stressed)
    
    #final_df = final_df.drop_duplicates()
    len_final = len(final_df)
    
    if len_ph != len_nd:
        print("length error with ", filepath)
        if len_final == len_ph_rem:
            print("resolved and following words dropped: ",rem_words)
            return(final_df)
        else:
            print('issue remains')
            return(final_df)
    
    else:
        return(final_df)

Now create a for loop to iterate through text grids and produce df with all data from all speakers.

In [122]:
tg_folder = "data/tgs/"
corpora = os.listdir(tg_folder)
df_list = []

for f in corpora:
    if f.endswith(".TextGrid"):
        filepath = tg_folder + f
    
        if f.startswith("p"):
            wd_tier = "default - words"
            ph_tier = "default - phones"
        else:
            wd_tier = "word"
            ph_tier = "phone"
    
        new_df = tg2df_stress(filepath, wd_tier, ph_tier, syllable_dictionary)
        df_list.append(new_df)

tg_data = pd.concat(df_list)

length error with  data/tgs/p111_spanish1.TextGrid
resolved and following words dropped:  ['hambriento', 'asado', 'hervir', 'payaso', 'curvado', 'oboe', 'guayabera', 'hembras']
length error with  data/tgs/p111_spanish2.TextGrid
resolved and following words dropped:  ['harto', 'convección', 'alcohol', 'hombre', 'hembras']
length error with  data/tgs/p111_spanish3.TextGrid
resolved and following words dropped:  ['hierba', 'hondos']
length error with  data/tgs/p113_spanish1.TextGrid
resolved and following words dropped:  ['hambriento', 'asado', 'hervir', 'payaso', 'emburjado', 'curvado', 'oboe', 'guayabera', 'hembras']
length error with  data/tgs/p113_spanish2.TextGrid
resolved and following words dropped:  ['harto', 'convección', 'alcohol', 'hombre', 'hembras']
length error with  data/tgs/p113_spanish3.TextGrid
resolved and following words dropped:  ['hierba', 'hondos']
length error with  data/tgs/p114_spanish1.TextGrid
resolved and following words dropped:  ['hambriento', 'asado', 'herv

In [125]:
vowels = ["a", "e", "i", "o", "u", "a+", "e+", "i+", "o+", "u+"]
tg_data["is_vowel"] = np.where(tg_data.GP_ph.isin(vowels), 1, 0)
tg_data.sample(25)

Unnamed: 0,t1_ph,t2_ph,fname,dur_ph,Participant,Corpus,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,is_stress,GP_ph,is_vowel
743,341.441,341.641,data/tgs/p114_spanish1.TextGrid,0.2,p114,CBAS,340.901,341.641,presente,False,True,0,l,0
24,1.56,1.668,data/tgs/s05122.TextGrid,0.108,s051,DIMEx100,1.443,2.138,legislación,False,False,0,x,0
486,189.02,189.17,data/tgs/p121_spanish3.TextGrid,0.15,p121,CBAS,188.61,189.17,cartas,False,True,0,d,0
7,0.461,0.514,data/tgs/s05317.TextGrid,0.053,s053,DIMEx100,0.296,0.661,entorno,False,False,1,o,1
67,5.297,5.407,data/tgs/s05530.TextGrid,0.11,s055,DIMEx100,4.739,5.407,universidad,False,True,1,e,1
426,169.45,169.49,data/tgs/p121_spanish3.TextGrid,0.04,p121,CBAS,169.45,169.74,pedí,True,False,1,l,0
180,86.079,86.259,data/tgs/p121_spanish2.TextGrid,0.18,p121,CBAS,85.619,86.259,cabeza,False,True,0,e,1
666,260.897,260.957,data/tgs/p113_spanish1.TextGrid,0.06,p113,CBAS,260.767,261.417,vanidad,False,False,1,p,0
17,1.891,1.973,data/tgs/s05627.TextGrid,0.082,s056,DIMEx100,1.81,2.283,mayoría,False,False,0,a,1
260,142.595,142.665,data/tgs/p118_spanish2.TextGrid,0.07,p118,CBAS,142.265,142.665,zanja,False,True,0,a,1


In [128]:
vowels = tg_data[tg_data["is_vowel"]==1].copy()
vowels.shape

(17105, 14)

In [129]:
vowels.groupby(['Corpus', "GP_ph"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,t1_ph,t2_ph,fname,dur_ph,Participant,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,is_stress,is_vowel
Corpus,GP_ph,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
CBAS,a,4003,4003,4003,4003,4003,4003,4003,4003,4003,4003,4003,4003
CBAS,a+,218,218,218,218,218,218,218,218,218,218,218,218
CBAS,e,2524,2524,2524,2524,2524,2524,2524,2524,2524,2524,2524,2524
CBAS,e+,75,75,75,75,75,75,75,75,75,75,75,75
CBAS,i,1414,1414,1414,1414,1414,1414,1414,1414,1414,1414,1414,1414
CBAS,i+,118,118,118,118,118,118,118,118,118,118,118,118
CBAS,o,2576,2576,2576,2576,2576,2576,2576,2576,2576,2576,2576,2576
CBAS,o+,284,284,284,284,284,284,284,284,284,284,284,284
CBAS,u,506,506,506,506,506,506,506,506,506,506,506,506
CBAS,u+,55,55,55,55,55,55,55,55,55,55,55,55


In [130]:
tg_data.to_csv("data/tg_data.csv", index = False)