# Processing and Normalization

Diphthongs and outlier productions will be excluded from analysis. Relative duration of vowels will be coded in addition to stress. Lastly, all formant data will be normalized following Delta F normalization (Johnson, 2018).

In [1]:
import pandas as pd

In [113]:
# Load the per-vowel formant measurements, print the number of rows,
# and preview ten randomly chosen rows.
data = pd.read_csv("data/allvowels.csv")
print(len(data))
data.sample(10)

9289


Unnamed: 0,Participant,Vowel,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,...,t2_ph,fname,dur_ph,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph
5739,s053,e,359.610176,1723.044003,2811.361109,362.519882,1896.896967,2817.180194,329.213845,1532.686448,...,1.451,textgrids/dime/s05310.TextGrid,0.052,1.354,1.451,de,False,True,d,u
3385,s002,e,364.069748,1802.047915,2796.002276,368.413191,1772.551087,2794.068064,351.844407,1801.310384,...,0.556,textgrids/dime/s00210.TextGrid,0.031,0.525,0.579,el,True,False,s,l
2646,s001,a,570.509263,1423.971354,2382.398603,536.985107,1416.977236,2398.249213,532.367787,1414.558438,...,1.089,textgrids/dime/s00122.TextGrid,0.068,0.933,1.57,oración,False,False,r(,s
4220,s002,i,371.500904,2026.723527,2597.802704,345.163445,2051.305982,2594.894225,400.155135,1987.013655,...,3.038,textgrids/dime/s00248.TextGrid,0.041,2.797,3.088,medio,False,False,d,o
4269,s002,a,591.910766,1584.471532,2633.136731,582.207064,1537.27567,2617.153691,526.929892,1560.772388,...,3.665,textgrids/dime/s00250.TextGrid,0.061,3.459,3.967,fraterna,False,False,r(,t
4813,s051,a,1032.19798,1605.769696,2284.92925,766.576204,1728.425643,2019.075392,908.189505,1087.802061,...,1.446,textgrids/dime/s05121.TextGrid,0.068,0.844,1.771,intrínsecamente,False,False,k,m
8315,s056,o,565.615632,1996.830335,3205.620405,569.754034,2093.151981,3156.151606,552.356108,1975.612712,...,1.45,textgrids/dime/s05613.TextGrid,0.049,0.959,1.506,difusión,False,False,i,n
4407,s051,e,415.927843,2161.575975,2276.770025,420.998527,2196.135311,2299.585363,400.066451,2103.176276,...,0.711,textgrids/dime/s05107.TextGrid,0.036,0.675,0.74,en,True,False,e,n
627,p119,e,578.214817,1199.950221,1872.33355,595.526339,1454.283082,2304.880872,562.007894,1099.124158,...,204.467,textgrids/cbas/p119.TextGrid,0.03,204.377,204.847,jueves,False,False,w,b
2280,s001,o,369.387871,898.166547,2472.647734,404.105122,956.595904,2481.309671,338.309993,943.391601,...,0.915,textgrids/dime/s00102.TextGrid,0.04,0.549,0.915,méxico,False,True,k,p


In [114]:
# first some basic checks: non-null counts per column for each participant/vowel pair
# (a column whose count is lower than the others has missing values in that group)
data.groupby(["Participant", "Vowel"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,F3.75,Gender,...,t2_ph,fname,dur_ph,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph
Participant,Vowel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
p112,a,145,145,145,145,145,145,145,145,145,145,...,145,145,145,145,145,145,145,145,145,145
p112,e,76,76,76,76,76,76,76,76,76,76,...,76,76,76,76,76,76,76,76,76,76
p112,i,52,52,52,52,52,52,52,52,52,52,...,52,52,52,52,52,52,52,52,52,52
p112,o,92,92,92,92,92,92,92,92,92,92,...,92,92,92,92,92,92,92,92,92,92
p112,u,12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,12
p113,a,145,145,145,145,145,145,145,145,145,145,...,145,145,145,145,145,145,145,145,143,144
p113,e,75,75,75,75,75,75,75,75,75,75,...,75,75,75,75,75,75,75,75,75,75
p113,i,51,51,51,51,51,51,51,51,51,51,...,51,51,51,51,51,51,51,51,51,51
p113,o,92,92,92,92,92,92,92,92,92,92,...,92,92,92,92,92,92,92,92,92,88
p113,u,12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,12


## Removal of diphthongs

In [115]:
# takes df input with columns `next_ph` and `prev_ph` and returns df without diphthongs

def rem_diph(df):
    """Return `df` without the rows whose neighbouring phone is a vowel.

    A row is dropped when its `next_ph` or its `prev_ph` is one of
    "a", "e", "i", "o", "u". Rows with a missing neighbour are kept.
    The row count is printed before and after filtering, and the
    returned frame has a fresh 0..n-1 index. The input is not modified.
    """
    vowel_labels = ["a", "e", "i", "o", "u"]
    print("Initial length: ", len(df))

    # flag rows that sit next to a vowel on either side
    precedes_vowel = df['next_ph'].isin(vowel_labels)
    follows_vowel = df['prev_ph'].isin(vowel_labels)
    kept = df[~(precedes_vowel | follows_vowel)].reset_index(drop = True)

    print("Final length: ", len(kept))
    return kept

In [116]:
# Drop vowels adjacent to another vowel; rem_diph prints the row count before and after.
data = rem_diph(data)

Initial length:  9289
Final length:  7284


## Removal of outliers

In [117]:
# removes productions with outlier formants
# input df has columns `Participant` and formants listed in format `F1.50`

def rem_outliers(df):
    """Drop productions whose F1, F2 or F3 midpoint is an outlier.

    For every (`Participant`, `Vowel`) group and each of the columns
    `F1.50`, `F2.50` and `F3.50`, the fences are

        lower = Q1 - 1.5 * IQR        upper = Q3 + 1.5 * IQR

    computed once on the unfiltered input. A row is kept only when all
    three formants lie inside their fences. The fences are inclusive, so a
    group with zero spread (for example a participant with a single token
    of a vowel, where lower == upper == the value itself) is kept instead
    of being dropped wholesale. Rows with a missing formant are dropped.

    The returned frame keeps the input's columns followed by the limit
    columns `upper_f1`, `lower_f1`, `upper_f2`, `lower_f2`, `upper` and
    `lower` (the unsuffixed pair belongs to F3). The row count is printed
    before and after filtering. The input frame is not modified.
    """
    group_keys = ["Participant", "Vowel"]
    # (formant column, suffix of its limit columns); the F3 limits carry no suffix
    formants = (("F1.50", "_f1"), ("F2.50", "_f2"), ("F3.50", ""))

    print("Initial length: ", len(df))

    # per-group fences for each formant, all taken from the unfiltered data
    grouped = df.groupby(group_keys)
    limits = {}
    for formant, suffix in formants:
        q1 = grouped[formant].quantile(0.25)
        q3 = grouped[formant].quantile(0.75)
        iqr = q3 - q1
        limits["upper" + suffix] = q3 + (1.5 * iqr)
        limits["lower" + suffix] = q1 - (1.5 * iqr)
    limits = pd.DataFrame(limits)

    # attach each row's group limits
    df = df.merge(limits, left_on = group_keys, right_index = True)

    # keep rows whose three formants all fall within [lower, upper]
    keep = None
    for formant, suffix in formants:
        inside = df[formant].between(df["lower" + suffix], df["upper" + suffix])
        keep = inside if keep is None else (keep & inside)
    df = df[keep]

    print("Final length: ", len(df))
    return df

In [118]:
# Drop tokens whose F1, F2 or F3 midpoint falls outside the per-participant, per-vowel limits.
data = rem_outliers(data)

Initial length:  7284
Final length:  6341


## Speech rate

First we will take the number of vowels a speaker produces to be equal to the number of syllables they utter. Then we will take the unique values from the `t1_wd` and `t2_wd` columns and subtract `t1_wd` from `t2_wd` to obtain an array of the duration of each word uttered. Then we will sum the durations of all words and divide the number of syllables by this value.

In [119]:
def speech_rate(df):
    """Add a per-participant `Speech Rate` column (syllables per unit of word time).

    Each row of `df` is one vowel and is counted as one syllable. A
    participant's speaking time is the sum of `t2_wd - t1_wd` over that
    participant's distinct word intervals, so a word that contributes
    several vowel rows is counted once rather than once per vowel. A word
    interval is identified by `Participant`, `t1_wd` and `t2_wd`, plus
    `fname` when that column is present, so that words from different
    recordings with identical timestamps are not collapsed into one.

    The rate is syllables / speaking time and is merged back onto every
    row of the participant. Returns a new frame with the same rows as `df`
    and the extra `Speech Rate` column; `df` itself is not modified.
    """
    # columns that together identify one word interval
    word_keys = ["Participant", "t1_wd", "t2_wd"]
    if "fname" in df.columns:
        word_keys.insert(1, "fname")

    # one vowel row == one syllable
    syllables = df.groupby("Participant", sort = False).size()

    # total word time per participant, each word interval counted once
    words = df.drop_duplicates(subset = word_keys)
    word_durations = words["t2_wd"] - words["t1_wd"]
    speaking_time = word_durations.groupby(words["Participant"], sort = False).sum()

    rates_df = (syllables / speaking_time).rename("Speech Rate")
    rates_df = rates_df.rename_axis('Participant').reset_index()

    df = pd.merge(left = df, right = rates_df, on = 'Participant', how = 'outer')
    return df

In [120]:
# Add the per-participant `Speech Rate` column, then show the row count.
data = speech_rate(data)
len(data)

6341

## Stress

The syltippy package (https://github.com/nur-ag/syltippy) will be used to generate syllabified (stress-indicated) outputs for each word found in the transcriptions. Then, the corresponding vowels in the TextGrid-formant dataframes will be marked as either stressed or unstressed.

In [121]:
# function takes a formants df with cols `Participant`, `word`, and `t1_wd`
# (rows are assumed to be in phone order within each word) and returns it with a `stress` column

def get_stress(vowels):
    """Label every vowel row as "stressed" or "unstressed".

    Each value of `word` is syllabified with `syltippy.syllabize`, which
    also reports the index of the stressed syllable. That index is stored
    in `stress_vowel` and compared with `vowel_ind`, the 0-based position
    of the row among the rows of the same word. Rows of the same word are
    those sharing `Participant` and `t1_wd`, plus `fname` when that column
    is present, so that words from different files of one participant that
    happen to start at the same time are counted separately.

    Returns a new frame with a fresh 0..n-1 index and the added columns
    `stress_vowel`, `vowel_ind` and `stress`. The input frame is left
    untouched.

    NOTE(review): `vowel_ind` counts the rows present in `vowels`, in row
    order. If some vowels of a word are missing from the frame (for
    example filtered out beforehand), or a syllable contains more than one
    vowel row, the position no longer matches the syllable index and the
    label can be wrong -- confirm against the calling pipeline.
    """
    # import required packages
    import re
    import numpy as np
    from syltippy import syllabize

    # render a word as comma-separated syllables with the stressed one upper-cased
    def stress(word):
        syllables, stress = syllabize(word)
        return ','.join(s if stress != i else s.upper() for (i, s) in enumerate(syllables))

    # return the index of the upper-cased (stressed) entry; 0 when none is found
    def is_stress(word):
        # convert to list
        word = word.split(",")
        stress_vowel = 0
        for syllable in word:
            if syllable.isupper():
                stress_vowel = word.index(syllable)
        return stress_vowel

    # work on a copy so the caller's frame does not gain helper columns
    vowels = vowels.copy()

    # syllabified form of each word
    vowels["stress_syll"] = vowels["word"].apply(lambda x : stress(str(x)))

    # keep only the vowel letters (and the syllable separators) of each word
    vowels["syll_vowels"] = vowels["stress_syll"].apply(lambda x: re.sub(r'[^,aeiouAEIOUáéíóúÁÉÍÓÚ]', '', x))

    # index of the stressed syllable within the word
    vowels["stress_vowel"] = vowels["syll_vowels"].apply(lambda x: is_stress(x))

    # position of each row among the rows of the same word
    word_keys = ["Participant", "t1_wd"]
    if "fname" in vowels.columns:
        word_keys.insert(1, "fname")
    vowels["vowel_ind"] = vowels.groupby(word_keys).cumcount()
    vowels = vowels.reset_index(drop = True)

    # a row is stressed when its position matches the stressed syllable index
    vowels["stress"] = np.where(vowels['stress_vowel'] == vowels['vowel_ind'], "stressed", "unstressed")

    # drop unnecessary helper columns
    vowels = vowels.drop(["syll_vowels", "stress_syll"], axis = 1)

    return vowels

In [122]:
# Label each vowel as "stressed" or "unstressed", then show the row count.
data = get_stress(data)
len(data)

6341

## Normalization of vowel formants

Because both male and female speakers are represented in this data set, the formant frequencies need to be normalized to minimize vocal tract length differences.

Following Johnson (2018), I will use the line-fitting Delta F Normalization method, which makes use of the entire vowel space. To do so, the average vowel space will be calculated for each participant, and then each F1 and F2 measurement will be divided by this value.

In [123]:
import numpy as np

def delta_f(vowels, x): # df as argument, percent of interval (ie 25%, 50% or 75%)
    """Compute one Delta F value per participant at measurement point `x`.

    For each participant, the columns `F1.<x>`, `F2.<x>` and `F3.<x>` are
    divided by 0.5, 1.5 and 2.5 respectively (i - 0.5 for the i-th
    formant) and the mean over all of the resulting values is taken.

    Returns a frame with the columns `Participant` and `Delta F.<x>`, one
    row per participant in order of first appearance in `vowels`.
    """
    # (formant column, divisor) pairs
    divisors = (("F1." + x, 0.5), ("F2." + x, 1.5), ("F3." + x, 2.5))

    per_speaker = {}
    for speaker in vowels.Participant.unique():
        tokens = vowels[vowels['Participant'] == speaker]
        scaled = [np.true_divide(tokens[column], divisor) for column, divisor in divisors]
        # single mean over every scaled formant value of this speaker
        per_speaker[speaker] = np.mean(scaled)

    delta_df = pd.DataFrame.from_dict(per_speaker, orient = "index", columns = ['Delta F.' + x])
    return delta_df.rename_axis('Participant').reset_index()

In [124]:
def normalization(vowels):
    """Append Delta F values and Delta-F-normalized F1/F2 columns to `vowels`.

    `delta_f` is evaluated at the "50", "25" and "75" measurement points,
    the three per-participant tables are joined on `Participant` and merged
    onto the vowel rows. For every point, `F1.<pt>` and `F2.<pt>` are then
    divided by `Delta F.<pt>` and stored as `F1.<pt>_norm` / `F2.<pt>_norm`.

    Returns the merged frame; `vowels` itself is not modified.
    """
    points = ("50", "25", "75")

    # one Delta F table per measurement point, indexed by participant
    per_point = [delta_f(vowels, point).set_index("Participant") for point in points]
    deltas = per_point[0].join(per_point[1:]).reset_index()

    vowels_normalized = pd.merge(left = vowels,
                                 right = deltas,
                                 on = 'Participant',
                                 how = 'outer')

    # divide each measured formant by the participant's Delta F at the same point
    for point in points:
        spacing = vowels_normalized['Delta F.' + point]
        for formant in ("F1", "F2"):
            measured = formant + "." + point
            vowels_normalized[measured + '_norm'] = vowels_normalized[measured]/spacing

    return(vowels_normalized)

In [125]:
# Add the Delta F and normalized formant columns, then print the row count
# and preview ten random rows.
data = normalization(data)
print(len(data))
data.sample(10)

6341


Unnamed: 0,Participant,Vowel,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,...,stress,Delta F.50,Delta F.25,Delta F.75,F1.50_norm,F2.50_norm,F1.25_norm,F2.25_norm,F1.75_norm,F2.75_norm
2832,s002,o,447.940482,1272.18113,2989.702971,419.637711,1260.272561,3056.925793,355.478829,1246.492148,...,stressed,1046.087601,1039.69639,1046.921418,0.428206,1.216132,0.403616,1.212154,0.339547,1.190626
2174,s001,a,601.563732,1416.137676,2340.796275,623.362125,1542.629512,2353.283348,608.287206,1377.674849,...,stressed,966.374551,959.13929,962.433837,0.622495,1.465413,0.649918,1.608348,0.63203,1.431449
726,p113,a,852.671463,1630.390788,1885.871806,847.574545,1617.372394,2192.373338,675.500437,1570.717414,...,unstressed,1132.340556,1147.259947,1138.033212,0.753017,1.439841,0.738782,1.40977,0.593568,1.380203
5693,s056,a,751.628576,1747.479718,3125.544903,718.865249,1781.612696,3159.497377,724.420031,1604.86603,...,unstressed,1176.407441,1156.685138,1131.824542,0.638919,1.485437,0.621487,1.540275,0.640046,1.417946
4480,s053,i,372.182462,2832.14509,3320.053446,375.970385,2817.770515,3266.855853,350.913573,2793.688491,...,stressed,1168.123304,1153.656169,1140.630451,0.318616,2.424526,0.325895,2.44247,0.307649,2.449249
6041,s056,e,468.910477,2178.047226,3006.490687,444.992232,2192.292491,3054.653594,457.243056,2030.078829,...,unstressed,1176.407441,1156.685138,1131.824542,0.398595,1.85144,0.384713,1.895323,0.403988,1.793634
4860,s055,a,702.76606,1848.812458,3025.179988,712.085904,1856.486781,3032.339519,579.117778,1849.393832,...,stressed,1190.538946,1160.746163,1144.075686,0.590292,1.552921,0.613473,1.599391,0.506188,1.616496
400,p119,a,776.644004,1616.641501,2612.846227,723.449745,1640.143469,2594.211617,775.875343,1542.436953,...,unstressed,1084.219365,1073.12413,1116.834981,0.716316,1.491065,0.674153,1.528382,0.694709,1.381079
6243,s056,i,272.181257,2936.245061,3619.824968,318.318336,2862.332935,3574.620284,250.956778,2735.413495,...,unstressed,1176.407441,1156.685138,1131.824542,0.231366,2.495942,0.275199,2.4746,0.221728,2.416818
2904,s002,i,312.390614,2149.839621,2809.221383,345.92331,2095.061101,2796.216502,293.601597,2195.400756,...,unstressed,1046.087601,1039.69639,1046.921418,0.298628,2.055124,0.332716,2.01507,0.280443,2.097006


In [126]:
# Non-null counts per participant/vowel pair after processing.
data.groupby(["Participant", "Vowel"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,F3.75,Gender,...,stress,Delta F.50,Delta F.25,Delta F.75,F1.50_norm,F2.50_norm,F1.25_norm,F2.25_norm,F1.75_norm,F2.75_norm
Participant,Vowel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
p112,a,128,128,128,128,128,128,128,128,128,128,...,128,128,128,128,128,128,128,128,128,128
p112,e,65,65,65,65,65,65,65,65,65,65,...,65,65,65,65,65,65,65,65,65,65
p112,i,39,39,39,39,39,39,39,39,39,39,...,39,39,39,39,39,39,39,39,39,39
p112,o,81,81,81,81,81,81,81,81,81,81,...,81,81,81,81,81,81,81,81,81,81
p112,u,10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10
p113,a,113,113,113,113,113,113,113,113,113,113,...,113,113,113,113,113,113,113,113,113,113
p113,e,60,60,60,60,60,60,60,60,60,60,...,60,60,60,60,60,60,60,60,60,60
p113,i,46,46,46,46,46,46,46,46,46,46,...,46,46,46,46,46,46,46,46,46,46
p113,o,76,76,76,76,76,76,76,76,76,76,...,76,76,76,76,76,76,76,76,76,76
p113,u,9,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9


In [127]:
# Write the processed data to disk without the DataFrame index.
data.to_csv("data/allnormdata.csv", index = False)