# Preparing contour txt files for SS-ANOVA in R

With a Praat script (Mietta Lennes, 2003), the first three formants were extracted at 2ms intervals within the DIMEx100 and CBAS corpora. The resulting .txt files need to be cleaned and turned into .csv files. To clean, the formant data needs to be merged with the textgrids and vowels need to be isolated. The formant measurements can then be normalized, and then they will be ready to upload to R and fitted with SS-ANOVA.

In [1]:
import pandas as pd
import os
from audiolabel import read_label

### Import formant data

First import DIME data and add new columns for gender and corpus.

In [14]:
# import dime female
dime_fem_con = pd.read_csv("data/dime_female_contour.txt", sep = "\t")
dime_fem_con = dime_fem_con.rename(columns = {"Filename": "Participant",
                                              "F3 ": "F3"})
dime_fem_con["Gender"] = "Female"
dime_fem_con["Corpus"] = "DIMEx100"

# import dime male
dime_male_con = pd.read_csv("data/dime_male_contour.txt", sep = "\t")
dime_male_con = dime_male_con.rename(columns = {"Filename": "Participant",
                                                "F3 ": "F3"})
dime_male_con["Gender"] = "Male"
dime_male_con["Corpus"] = "DIMEx100"

# concatenate male and female data
dime = pd.concat([dime_male_con, dime_fem_con], ignore_index = True)
dime.head()

Unnamed: 0,Participant,phone,Time,Interval,F1,F2,F3,Gender,Corpus
0,s00101,e,0.069,0.002,2094.613890102079,2623.550627119573,3466.0182725549303,Male,DIMEx100
1,s00101,e,0.071,0.004,2094.042033082855,2624.0054225415506,3465.6262512556896,Male,DIMEx100
2,s00101,e,0.073,0.006,2093.470176063631,2624.4602179635285,3465.2342299564493,Male,DIMEx100
3,s00101,e,0.075,0.008,2092.8983190444064,2624.915013385506,3464.842208657209,Male,DIMEx100
4,s00101,e,0.077,0.01,2092.326462025182,2625.3698088074834,3464.450187357969,Male,DIMEx100


In [15]:
dime = dime[dime["F1"]!="--undefined--"]
dime = dime[dime["F2"]!="--undefined--"]
dime = dime[dime["F3"]!="--undefined--"]

dime["F1"] = dime["F1"].astype(float)
dime["F2"] = dime["F2"].astype(float)
dime["F3"] = dime["F3"].astype(float)

Now do the same with the CBAS corpus.

In [16]:
# import cbas female
cbas_fem = pd.read_csv("data/cbas_female_contour.txt", sep = "\t")
cbas_fem = cbas_fem[cbas_fem["F1"]!="--undefined--"]
cbas_fem = cbas_fem[cbas_fem["F2"]!="--undefined--"]
cbas_fem = cbas_fem[cbas_fem["F3"]!="--undefined--"]
cbas_fem = cbas_fem.rename(columns = {"Filename": "Participant"})
cbas_fem["Gender"] = "Female"
cbas_fem["Corpus"] = "CBAS"

# import cbas male
cbas_male = pd.read_csv("data/cbas_male_contour.txt",sep = "\t")
cbas_male = cbas_male[cbas_male["F1"]!="--undefined--"]
cbas_male = cbas_male[cbas_male["F2"]!="--undefined--"]
cbas_male = cbas_male[cbas_male["F3"]!="--undefined--"]
cbas_male = cbas_male.rename(columns = {"Filename": "Participant"})
cbas_male["Gender"] = "Male"
cbas_male["Corpus"] = "CBAS"

# combine cbas female and male
cbas = pd.concat([cbas_male, cbas_fem], ignore_index = True)
cbas.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Participant,phone,Time,Interval,F1,F2,F3,Gender,Corpus
0,p112,sil,0.024,0.024,585.4543266155326,1605.5204150247002,2808.0991985643755,Male,CBAS
1,p112,sil,0.026,0.026,585.4543266155326,1605.5204150247002,2808.0991985643755,Male,CBAS
2,p112,sil,0.028,0.028,581.7237336607236,1605.1187739250004,2798.6458648530893,Male,CBAS
3,p112,sil,0.03,0.03,552.605300162352,1601.983840740519,2724.859667231298,Male,CBAS
4,p112,sil,0.032,0.032,523.4868666639803,1598.8489075560378,2651.0734696095064,Male,CBAS


In [17]:
cbas["F1"] = cbas["F1"].astype(float)
cbas["F2"] = cbas["F2"].astype(float)
cbas["F3"] = cbas["F3"].astype(float)

In [18]:
cbas.dtypes

Participant     object
phone           object
Time           float64
Interval       float64
F1             float64
F2             float64
F3             float64
Gender          object
Corpus          object
dtype: object

Now concatenate the cbas and dime dataframes.

In [19]:
# combine cbas and dime
formants = pd.concat([cbas, dime], ignore_index = True)
formants.head()

Unnamed: 0,Participant,phone,Time,Interval,F1,F2,F3,Gender,Corpus
0,p112,sil,0.024,0.024,585.454327,1605.520415,2808.099199,Male,CBAS
1,p112,sil,0.026,0.026,585.454327,1605.520415,2808.099199,Male,CBAS
2,p112,sil,0.028,0.028,581.723734,1605.118774,2798.645865,Male,CBAS
3,p112,sil,0.03,0.03,552.6053,1601.983841,2724.859667,Male,CBAS
4,p112,sil,0.032,0.032,523.486867,1598.848908,2651.07347,Male,CBAS


In order to later merge with the associated TextGrids, create a new column called `t1_ph` that contains the timestamp of the interval start.

In [20]:
formants["t1_ph"] = formants["Time"]-formants["Interval"]
formants.head()

Unnamed: 0,Participant,phone,Time,Interval,F1,F2,F3,Gender,Corpus,t1_ph
0,p112,sil,0.024,0.024,585.454327,1605.520415,2808.099199,Male,CBAS,0.0
1,p112,sil,0.026,0.026,585.454327,1605.520415,2808.099199,Male,CBAS,0.0
2,p112,sil,0.028,0.028,581.723734,1605.118774,2798.645865,Male,CBAS,0.0
3,p112,sil,0.03,0.03,552.6053,1601.983841,2724.859667,Male,CBAS,0.0
4,p112,sil,0.032,0.032,523.486867,1598.848908,2651.07347,Male,CBAS,0.0


### Import TextGrids

In [21]:
cbasdf = pd.DataFrame({
    'relpath': 'textgrids/cbas',
    'fname': ['p112.TextGrid',
              'p119.TextGrid',
              'p113.TextGrid',
              'p115.TextGrid',
              'p120.TextGrid',
              'p124.TextGrid'],
    'subject': ['p112', 'p119', 'p113', 'p115', 'p120', 'p124']
})

dimedf = pd.DataFrame({
    'relpath': 'textgrids/dime',
    'fname' : os.listdir("textgrids/dime")})
dimedf['subject'] = dimedf['fname'].apply(lambda x: x[:6])

tgdf = pd.concat([cbasdf, dimedf], ignore_index = True)

In [22]:
# inputs 

def tg2df(row):
    '''Load 'phone' and 'word' tiers from a textgrid and merge them.
    
    Parameters
    ----------
    
    row: named tuple
    A namedtuple as provided by `itertuples` that can be used to load a Praat
    textgrid from a path identified by row.relpath and row.fname. The textgrid is
    expected to have 'phone' and 'word' tiers.

    Returns
    -------
    
    mergedf: the merged dataframe.
    '''
    [wddf, phdf] = read_label(
        os.path.join(row.relpath, row.fname).replace("\\","/"),
        ftype='praat',
        tiers=['word', 'phone']
    )
    # Throw an error if tiers are not strictly hierarchical.
    # words contain phones
    assert(wddf.t1.isin(phdf.t1).all())
    assert(wddf.t2.isin(phdf.t2).all())
    
    # Add phone duration and speaker
    phdf['dur_ph'] = phdf.t2 - phdf.t1
    phdf['Participant'] = row.subject

    # Merge phone and word tiers.
    phwddf = pd.merge_asof(
        phdf.rename({'t1': 't1_ph', 't2': 't2_ph'}, axis='columns'),
        wddf.drop('fname', axis='columns') \
            .rename({'t1': 't1_wd', 't2': 't2_wd'}, axis='columns'),
        left_on='t1_ph',
        right_on='t1_wd'
    )

    # Add word-init and -final columns
    phwddf['is_wdinit_ph'] = phwddf.t1_ph == phwddf.t1_wd
    phwddf['is_wdfin_ph'] = phwddf.t2_ph == phwddf.t2_wd

    # Merge context tier and return the result.
    return phwddf

In [23]:
dflist = [tg2df(row) for row in tgdf.itertuples()]

In [24]:
alldf = pd.concat(dflist, ignore_index=True)

alldf.sample(10)

Unnamed: 0,t1_ph,t2_ph,phone,fname,dur_ph,Participant,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph
18836,0.967,1.02,e,textgrids/dime/s05533.TextGrid,0.053,s05533,0.879,1.481,gemelos,False,False
10199,1.825,1.894,t,textgrids/dime/s00231.TextGrid,0.069,s00231,1.696,2.361,naturaleza,False,False
9296,2.198,2.234,r(,textgrids/dime/s00214.TextGrid,0.036,s00214,1.564,2.468,beneficiarios,False,False
15971,3.272,3.35,a,textgrids/dime/s05335.TextGrid,0.078,s05335,3.045,3.418,armas,False,False
14379,2.317,2.396,a,textgrids/dime/s05305.TextGrid,0.079,s05305,2.137,2.52,llegada,False,False
4888,158.493,161.92,sp,textgrids/cbas/p120.TextGrid,3.427,p120,158.493,161.92,,True,True
17169,1.694,1.796,p,textgrids/dime/s05505.TextGrid,0.102,s05505,1.694,2.178,premios,True,False
22022,1.488,1.537,l,textgrids/dime/s05635.TextGrid,0.049,s05635,1.369,1.537,del,False,True
18614,2.002,2.11,p,textgrids/dime/s05529.TextGrid,0.108,s05529,2.002,2.221,para,True,False
20611,0.457,0.513,e,textgrids/dime/s05615.TextGrid,0.056,s05615,0.377,0.513,se,False,True


Create cols `prev_ph` and `next_ph` containing previous and following phones.

In [25]:
alldf['prev_ph'] = alldf.phone.shift(1).fillna('')
alldf['next_ph'] = alldf.phone.shift(-1).fillna('')
alldf = alldf[alldf["phone"]!=""]
alldf = alldf.reset_index(drop = True)

In [26]:
alldf

Unnamed: 0,t1_ph,t2_ph,phone,fname,dur_ph,Participant,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph
0,0.000,0.710,sil,textgrids/cbas/p112.TextGrid,0.710,p112,0.000,0.710,,True,True,,b
1,0.710,0.820,b,textgrids/cbas/p112.TextGrid,0.110,p112,0.710,1.140,bajo,True,False,sil,a
2,0.820,0.920,a,textgrids/cbas/p112.TextGrid,0.100,p112,0.710,1.140,bajo,False,False,b,x
3,0.920,1.000,x,textgrids/cbas/p112.TextGrid,0.080,p112,0.710,1.140,bajo,False,False,a,o
4,1.000,1.140,o,textgrids/cbas/p112.TextGrid,0.140,p112,0.710,1.140,bajo,False,True,x,sp
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22107,5.368,5.432,k,textgrids/dime/s05650.TextGrid,0.064,s05650,5.185,5.622,sector,False,False,e,t
22108,5.432,5.525,t,textgrids/dime/s05650.TextGrid,0.093,s05650,5.185,5.622,sector,False,False,k,o
22109,5.525,5.584,o,textgrids/dime/s05650.TextGrid,0.059,s05650,5.185,5.622,sector,False,False,t,r(
22110,5.584,5.622,r(,textgrids/dime/s05650.TextGrid,0.038,s05650,5.185,5.622,sector,False,True,o,.sil


### Merge formant and TextGrid data

In [27]:
data = formants.merge(alldf, how='left', left_on=['Participant','t1_ph'], right_on = ['Participant','t1_ph'])
data = data.drop(["phone_y"], axis = 1)
data = data.rename(columns = {"phone_x": "phone"})
data.head()

Unnamed: 0,Participant,phone,Time,Interval,F1,F2,F3,Gender,Corpus,t1_ph,t2_ph,fname,dur_ph,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph
0,p112,sil,0.024,0.024,585.454327,1605.520415,2808.099199,Male,CBAS,0.0,0.71,textgrids/cbas/p112.TextGrid,0.71,0.0,0.71,,True,True,,b
1,p112,sil,0.026,0.026,585.454327,1605.520415,2808.099199,Male,CBAS,0.0,0.71,textgrids/cbas/p112.TextGrid,0.71,0.0,0.71,,True,True,,b
2,p112,sil,0.028,0.028,581.723734,1605.118774,2798.645865,Male,CBAS,0.0,0.71,textgrids/cbas/p112.TextGrid,0.71,0.0,0.71,,True,True,,b
3,p112,sil,0.03,0.03,552.6053,1601.983841,2724.859667,Male,CBAS,0.0,0.71,textgrids/cbas/p112.TextGrid,0.71,0.0,0.71,,True,True,,b
4,p112,sil,0.032,0.032,523.486867,1598.848908,2651.07347,Male,CBAS,0.0,0.71,textgrids/cbas/p112.TextGrid,0.71,0.0,0.71,,True,True,,b


In [34]:
data = data.dropna()

Now to fix errors/inconsistencies:

In [35]:
import re

# remove phones `sp` and `.sil`
data = data[(data['phone'] != ".sil") & (data['phone'] != "sp")]

# fix phones from txt file, remove + following some vowels
data["phone"] = data["phone"].apply(lambda x: re.sub("([aeiou])\+", "\1", x))

# replace `r(` with `rf` for consistency
data['word'] = data['word'].apply(lambda x: re.sub(r"r\(", r"rf", x))

# fix notation in dimex corpus, where V_7 yields accented V
data['word'] = data['word'].apply(lambda x: re.sub("a_7", "á", x))
data['word'] = data['word'].apply(lambda x: re.sub("i_7", "í", x))
data['word'] = data['word'].apply(lambda x: re.sub("o_7", "ó", x))
data['word'] = data['word'].apply(lambda x: re.sub("u_7", "ú", x))
data['word'] = data['word'].apply(lambda x: re.sub("e_7", "é", x))

# fix tildas
data['word'] = data['word'].apply(lambda x: re.sub("n~", "ñ", x))

data = data.reset_index(drop = True)

Now that we have made use of the Participant naming system in DIMEx to combine the tg and formant data, we can rename the Participant column, dropping the indication of the task number.

In [36]:
# fix naming of participant col
data["Participant"] = data["Participant"].apply(lambda x: x[:4])

Now to isolate the vowels.

In [37]:
# remove rows not containing vowels
vowelsdf = data[(data['phone']=="a") | 
                 (data['phone']=="e") | 
                 (data['phone']=="i") | 
                 (data['phone']=="u") | 
                 (data['phone']=="o")]
vowelsdf = vowelsdf.reset_index(drop = True)
vowelsdf = vowelsdf.rename(columns = {"phone": "Vowel"})
len(vowelsdf)

224049

### Speech rate

First we will take the number of vowels a speaker produces to be equal to the number of syllables they utter. Then we will take the unique values from the `t1_wd` and `t2_wd` columns and subtract t2 from t1 to obtain an array of the duration of each word uttered. Then we will sum the durations of all words and divide the number of syllables by this value.

In [38]:
def speech_rate(df):
    import numpy as np

    Participant = []
    speech_rate = []

    for i in df.Participant.unique():
        data = df[df["Participant"]==i]
        syllables = len(data.Vowel)
        end_times = data["t2_wd"]
        start_times = data["t1_wd"]
        durations = np.subtract(end_times, start_times)
        duration = sum(durations)
        rate = syllables/duration
    
        Participant.append(i)
        speech_rate.append(rate)

    rates = {k:v for k,v in zip(Participant, speech_rate)}
    rates_df = pd.DataFrame.from_dict(rates, orient = "index", columns = ['Speech Rate'])
    rates_df = rates_df.rename_axis('Participant').reset_index()
    
    df = pd.merge(left = df, right = rates_df, on = 'Participant', how = 'outer')
    return df

In [39]:
vowelsdf = speech_rate(vowelsdf)

### Stress

The syltippy package (https://github.com/nur-ag/syltippy) will be used to generate syllabified (stress-indicated) outputs for each word found in the transcriptions. Then, the corresponding vowels in the TextGrid-formant dataframes will be marked as either stressed or unstressed.

In [40]:
# function takes into dictionary.txt file with cols `word` and `ipa`
# input formants df with cols `Participant`, `word`, `t1_wd`, and `t1_ph`

def get_stress(vowels):
    # import required packages
    import csv
    import numpy as np
    from syltippy import syllabize
    
    # def fxn to create stress column in dictionary
    def stress(word):
        syllables, stress = syllabize(word)
        return ','.join(s if stress != i else s.upper() for (i, s) in enumerate(syllables))
    
    # add column to dictionary
    vowels["stress_syll"] = vowels["word"].apply(lambda x : stress(str(x)))
    
    # create separate column to hold only the vowels in each word
    vowels["syll_vowels"] = vowels["stress_syll"].apply(lambda x: re.sub(r'[^,aeiouAEIOUáéíóúÁÉÍÓÚ]', '', x))
    
    # define function to return index of 'vowels' column with stress
    def is_stress(word):
        # convert to list
        word = word.split(",")
        stress_vowel = 0
        for syllable in word:
            if syllable.isupper():
                stress_vowel = word.index(syllable)
        return stress_vowel
    
    # create new column which gives vowel number in given word that has stress
    vowels["stress_vowel"] = vowels["syll_vowels"].apply(lambda x: is_stress(x))
    
    # determine index of vowel in df
    vowels["vowel_ind"] = vowels.groupby(["fname", "t1_wd"])["t1_ph"].apply(lambda x:x.astype('category').cat.codes).astype(int)

    #vowels = vowels.reset_index(drop = True)
    
    # add column to formants to indicate stress
    vowels["stress"] = np.where(vowels['stress_vowel'] == vowels['vowel_ind'], "stressed", "unstressed")
    
    # drop unnecesary columns
    vowels = vowels.drop(["syll_vowels", "stress_syll"], axis = 1)
       
    return vowels

In [41]:
vowelsdf = get_stress(vowelsdf)

In [42]:
vowelsdf.sample(50)

Unnamed: 0,Participant,Vowel,Time,Interval,F1,F2,F3,Gender,Corpus,t1_ph,...,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph,Speech Rate,stress_vowel,vowel_ind,stress
153657,s053,o,1.851,0.026,501.252067,1198.312395,3133.405276,Female,DIMEx100,1.825,...,1.956,clínicos,False,False,k,s,2.138111,0,2,unstressed
108781,s002,u,1.399,0.014,290.296435,1428.210259,2600.364259,Male,DIMEx100,1.385,...,1.794,estudiantes,False,False,t,d,2.239212,2,1,unstressed
211035,s056,e,3.812,0.01,370.905286,1353.537658,2825.621626,Female,DIMEx100,3.802,...,4.078,merece,False,False,m,r(,1.925044,1,0,unstressed
22295,p119,a,278.92,0.038,736.444395,1509.960628,2703.594453,Male,CBAS,278.882,...,279.132,abandonar,False,False,n,rf,1.44297,3,3,stressed
91097,s001,a,0.684,0.08,636.481677,1564.496592,2400.072799,Male,DIMEx100,0.604,...,1.088,humanidades,False,False,d,d,2.189077,3,3,stressed
220297,s056,a,1.058,0.018,741.623569,2039.023994,3034.003339,Female,DIMEx100,1.04,...,1.543,elaborada,False,False,l,b,1.925044,3,1,unstressed
97526,s002,e,1.986,0.042,407.21993,1809.5361,2775.821331,Male,DIMEx100,1.944,...,2.315,esto,True,False,o,s,2.239212,0,0,stressed
184457,s055,e,2.761,0.008,490.944208,1905.138598,2954.85915,Female,DIMEx100,2.753,...,2.871,establecer,False,False,s,r(,1.914898,3,3,stressed
128942,s051,o,2.29,0.036,593.93722,1118.743297,2878.245902,Female,DIMEx100,2.254,...,2.402,oro,True,False,e,r(,2.137838,0,0,stressed
157581,s053,i,0.763,0.018,370.545218,2414.324547,3029.670652,Female,DIMEx100,0.745,...,1.26,principales,False,False,r(,n,2.138111,2,0,unstressed


### Remove outliers

In [45]:
# removes productions with outlier formants
# input df has columns `Participant` and formants listed in format `F1.50`

def rem_outliers(df):
    print("Initial length: ", len(df))
    
    # establish 25% and 75% for each formant
    f1_qrts = df.groupby(['Participant', 'Vowel'])["F1"].describe()[['25%', '75%']]
    f2_qrts = df.groupby(['Participant', 'Vowel'])["F2"].describe()[['25%', '75%']]
    f3_qrts = df.groupby(['Participant', 'Vowel'])["F3"].describe()[['25%', '75%']]
    
    # find interquartile range for each formant
    f1_qrts['IQR'] = f1_qrts['75%'] - f1_qrts['25%']
    f2_qrts['IQR'] = f2_qrts['75%'] - f2_qrts['25%']
    f3_qrts['IQR'] = f3_qrts['75%'] - f3_qrts['25%']
    
    # determine upper limit for each formant
    f1_qrts['upper'] = f1_qrts['75%'] + (1.5 * f1_qrts['IQR'])
    f2_qrts['upper'] = f2_qrts['75%'] + (1.5 * f2_qrts['IQR'])
    f3_qrts['upper'] = f3_qrts['75%'] + (1.5 * f3_qrts['IQR'])
    
    # determine lower limit for each formant
    f1_qrts['lower'] = f1_qrts['25%'] - (1.5 * f1_qrts['IQR'])
    f2_qrts['lower'] = f2_qrts['25%'] - (1.5 * f2_qrts['IQR'])
    f3_qrts['lower'] = f3_qrts['25%'] - (1.5 * f3_qrts['IQR'])
    
    # create smaller df with only limits for each formant
    f1_limits = f1_qrts[['upper','lower']]
    f2_limits = f2_qrts[['upper','lower']]
    f3_limits = f3_qrts[['upper','lower']]
    
    # merge limits into original df
    df = df.merge(f1_limits, left_on = ["Participant", "Vowel"], right_index = True)
    df = df.merge(f2_limits, left_on = ["Participant", "Vowel"], right_index = True, suffixes = ("_f1", "_f2"))
    df = df.merge(f3_limits, left_on = ["Participant", "Vowel"], right_index = True)
    
    # drop rows with outlier formants
    df = df[(df["F1"] > df["lower_f1"]) & (df["F1"] < df["upper_f1"])]
    df = df[(df["F2"] > df["lower_f2"]) & (df["F2"] < df["upper_f2"])]
    df = df[(df["F3"] > df["lower"]) & (df["F3"] < df["upper"])]

    print("Final length: ", len(df))
    return df  

In [46]:
vowelsdf = rem_outliers(vowelsdf)

Initial length:  224049
Final length:  196637


### Normalization of vowel formants

Because both male and female speakers are represented in this data set, the formant frequencies need to be normalized to minimized vocal tract length differences.

Following Johnson (2018), I will use the line-fitting Delta F Normalization method, which makes use of the entire vowel space. To do so, the average vowel space will be calculated for each participant, and then each F1 and F2 measurement will be divided by this value.

First we will calculate the average formant measurements over each vowel production, to get an estimate of the 'midpoint'.

In [43]:
import numpy as np

def delta_f(vowels): # df as argument
    
    Participant = []
    ll = []
    
    for i in vowels.Participant.unique():
        data = vowels[vowels['Participant']==i]
        
        delta = np.mean([np.true_divide(data["F1"], 0.5), 
                        np.true_divide(data["F2"], 1.5), 
                        np.true_divide(data["F3"], 2.5)
                       ])
        
        Participant.append(i)
        ll.append(delta)
    
    deltas = {k:v for k,v in zip(Participant, ll)}
    delta_df = pd.DataFrame.from_dict(deltas, orient = "index", columns = ['Delta F'])
    delta_df = delta_df.rename_axis('Participant').reset_index()
        
    return(delta_df)

In [44]:
def normalization(vowels):
    delta_df = delta_f(vowels)
    
    deltas = delta_df.set_index("Participant")
    deltas = deltas.reset_index()
    
    vowels_normalized = pd.merge(left = vowels,
                                 right = deltas,
                                 on = 'Participant',
                                 how = 'outer')
    vowels_normalized['F1_norm'] = vowels_normalized['F1']/vowels_normalized['Delta F']
    vowels_normalized['F2_norm'] = vowels_normalized['F2']/vowels_normalized['Delta F']
    
    return(vowels_normalized)

In [47]:
vowels_norm = normalization(vowelsdf)
vowels_norm.sample(10)

Unnamed: 0,Participant,Vowel,Time,Interval,F1,F2,F3,Gender,Corpus,t1_ph,...,stress,upper_f1,lower_f1,upper_f2,lower_f2,upper,lower,Delta F,F1_norm,F2_norm
99452,s002,e,2.031,0.028,373.744683,1729.38254,2459.190147,Male,DIMEx100,2.003,...,unstressed,563.040694,277.456654,2329.66688,1453.145894,3102.295189,2441.824271,1037.247738,0.360323,1.66728
151665,s055,a,0.45,0.054,714.927318,1778.019019,3068.798472,Female,DIMEx100,0.396,...,unstressed,889.015787,390.791446,2351.677305,1272.946289,3486.485457,2393.2302,1172.972349,0.609501,1.515823
32731,p115,a,66.51,0.158,803.858897,1607.647003,2525.193368,Female,CBAS,66.352,...,unstressed,1020.310451,419.557917,2150.615839,1151.107602,3935.815306,1243.836517,1100.37173,0.730534,1.461004
72398,s001,a,1.212,0.044,518.282833,1485.902028,2357.044046,Male,DIMEx100,1.168,...,unstressed,767.248154,297.414174,1871.150175,1227.180647,2816.471796,2068.653858,969.869749,0.534384,1.532063
72520,s001,a,1.454,0.01,556.86296,1505.901374,2199.353018,Male,DIMEx100,1.444,...,unstressed,767.248154,297.414174,1871.150175,1227.180647,2816.471796,2068.653858,969.869749,0.574163,1.552684
52919,p120,e,151.366,0.002,561.362451,1819.385362,2617.657245,Female,CBAS,151.364,...,unstressed,822.064359,346.976127,3371.114925,778.203275,4526.119578,1596.498461,1200.250605,0.467704,1.515838
176084,s056,a,2.11,0.004,509.22944,2372.405487,3208.70173,Female,DIMEx100,2.106,...,unstressed,1075.916925,278.742886,2391.976743,1394.870408,3618.56403,2044.477974,1165.696748,0.436846,2.035182
51546,p120,i,162.394,0.014,430.852744,1432.750576,3393.614464,Female,CBAS,162.38,...,stressed,601.869636,173.416478,5367.4859,-2047.200892,4221.339362,1989.947059,1200.250605,0.358969,1.19371
169521,s055,e,2.965,0.07,455.909836,2300.349537,2899.835241,Female,DIMEx100,2.895,...,unstressed,726.385377,334.094784,2614.155822,1363.644818,3542.090061,2299.281938,1172.972349,0.388679,1.961129
76930,s001,o,2.36,0.068,388.243491,1483.704892,2313.305393,Male,DIMEx100,2.292,...,unstressed,578.038288,272.964599,1811.910888,585.584578,2996.286738,2021.881491,969.869749,0.400305,1.529798


For SS-ANOVA, duration of each vowel needs to be scaled from 0 to 1. To do this, create a new column `RTime` that is the result of `Interval` divided by `dur_ph`.

In [48]:
vowels_norm["RTime"] = vowels_norm["Interval"]/vowels_norm["dur_ph"]
vowels_norm.head()

Unnamed: 0,Participant,Vowel,Time,Interval,F1,F2,F3,Gender,Corpus,t1_ph,...,upper_f1,lower_f1,upper_f2,lower_f2,upper,lower,Delta F,F1_norm,F2_norm,RTime
0,p112,a,0.826,0.006,557.121005,912.986166,2698.598084,Male,CBAS,0.82,...,821.296486,290.964028,1934.279776,620.442955,3102.150919,1860.367543,921.075539,0.604859,0.991217,0.06
1,p112,a,0.828,0.008,578.827805,928.680747,2713.743317,Male,CBAS,0.82,...,821.296486,290.964028,1934.279776,620.442955,3102.150919,1860.367543,921.075539,0.628426,1.008257,0.08
2,p112,a,0.83,0.01,595.006045,925.034463,2727.140941,Male,CBAS,0.82,...,821.296486,290.964028,1934.279776,620.442955,3102.150919,1860.367543,921.075539,0.64599,1.004298,0.1
3,p112,a,0.836,0.016,643.540763,914.095612,2767.333814,Male,CBAS,0.82,...,821.296486,290.964028,1934.279776,620.442955,3102.150919,1860.367543,921.075539,0.698684,0.992422,0.16
4,p112,a,0.838,0.018,658.764921,908.418713,2777.59082,Male,CBAS,0.82,...,821.296486,290.964028,1934.279776,620.442955,3102.150919,1860.367543,921.075539,0.715213,0.986259,0.18


Create a grouping factor that will uniquely identify each vowel produced by each speaker.

In [49]:
vowels_norm["unique"] = vowels_norm["fname"] + vowels_norm["t1_ph"].astype(str)

In [50]:
vowels_norm.to_csv("data/contour_norm.csv", index = False)

In [56]:
vowels_norm.groupby(["Participant", "Vowel"])["unique"].nunique()

Participant  Vowel
p112         a        145
             e         75
             i         52
             o         92
             u         12
p113         a        145
             e         74
             i         51
             o         92
             u         11
p115         a        147
             e         76
             i         53
             o         90
             u         13
p119         a        144
             e         77
             i         54
             o         87
             u         15
p120         a        147
             e         78
             i         52
             o         90
             u         14
p124         a        139
             e         76
             i         53
             o         89
             u         14
s001         a        238
             e        279
             i        169
             o        184
             u         61
s002         a        272
             e        326
             i     

In [59]:
# make slimmed and balanced data set for plotting and modeling in R
slim_a = vowels_norm[vowels_norm["Vowel"]=="a"].copy()
slim_e = vowels_norm[vowels_norm["Vowel"]=="e"].copy()
slim_i = vowels_norm[vowels_norm["Vowel"]=="i"].copy()
slim_o = vowels_norm[vowels_norm["Vowel"]=="o"].copy()
slim_u = vowels_norm[vowels_norm["Vowel"]=="u"].copy()

slim_a = slim_a.groupby(["Participant"])["unique"].apply(lambda x: x.sample(n=50))
slim_e = slim_e.groupby(["Participant"])["unique"].apply(lambda x: x.sample(n=50))
slim_i = slim_i.groupby(["Participant"])["unique"].apply(lambda x: x.sample(n=50))
slim_o = slim_o.groupby(["Participant"])["unique"].apply(lambda x: x.sample(n=50))
slim_u = slim_u.groupby(["Participant"])["unique"].apply(lambda x: x.sample(n=9))

slim = pd.concat([slim_a, slim_e, slim_i, slim_o, slim_u], ignore_index = True)
len(slim)

2508

In [69]:
slim_a = vowels_norm[vowels_norm["Vowel"]=="a"].copy()
len(slim_a)

63853

In [68]:
slim_a.groupby(["Participant"])["unique"].nunique()

Participant
p112    145
p113    145
p115    147
p119    144
p120    147
p124    139
s001    238
s002    272
s051    301
s053    306
s055    349
s056    296
Name: unique, dtype: int64

In [71]:
df = slim_a.groupby('Participant')["unique"].unique()
# Sort the over-represented class to the head.
df = df[df.apply(len).sort_values(ascending=False).index]
excess = len(df.iloc[0]) - len(df.iloc[1])
remove = np.random.choice(df.iloc[0], excess, replace=False)
df2 = slim_a[~slim_a["Participant"].isin(remove)]
df2

Unnamed: 0,Participant,Vowel,Time,Interval,F1,F2,F3,Gender,Corpus,t1_ph,...,lower_f1,upper_f2,lower_f2,upper,lower,Delta F,F1_norm,F2_norm,RTime,unique
0,p112,a,0.826,0.006,557.121005,912.986166,2698.598084,Male,CBAS,0.820,...,290.964028,1934.279776,620.442955,3102.150919,1860.367543,921.075539,0.604859,0.991217,0.060000,textgrids/cbas/p112.TextGrid0.82
1,p112,a,0.828,0.008,578.827805,928.680747,2713.743317,Male,CBAS,0.820,...,290.964028,1934.279776,620.442955,3102.150919,1860.367543,921.075539,0.628426,1.008257,0.080000,textgrids/cbas/p112.TextGrid0.82
2,p112,a,0.830,0.010,595.006045,925.034463,2727.140941,Male,CBAS,0.820,...,290.964028,1934.279776,620.442955,3102.150919,1860.367543,921.075539,0.645990,1.004298,0.100000,textgrids/cbas/p112.TextGrid0.82
3,p112,a,0.836,0.016,643.540763,914.095612,2767.333814,Male,CBAS,0.820,...,290.964028,1934.279776,620.442955,3102.150919,1860.367543,921.075539,0.698684,0.992422,0.160000,textgrids/cbas/p112.TextGrid0.82
4,p112,a,0.838,0.018,658.764921,908.418713,2777.590820,Male,CBAS,0.820,...,290.964028,1934.279776,620.442955,3102.150919,1860.367543,921.075539,0.715213,0.986259,0.180000,textgrids/cbas/p112.TextGrid0.82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180006,s056,a,4.865,0.074,425.885702,1961.739397,2969.186083,Female,DIMEx100,4.791,...,278.742886,2391.976743,1394.870408,3618.564030,2044.477974,1165.696748,0.365349,1.682890,0.870588,textgrids/dime/s05650.TextGrid4.791
180007,s056,a,4.867,0.076,410.485017,1959.946847,2965.663706,Female,DIMEx100,4.791,...,278.742886,2391.976743,1394.870408,3618.564030,2044.477974,1165.696748,0.352137,1.681352,0.894118,textgrids/dime/s05650.TextGrid4.791
180008,s056,a,4.871,0.080,387.268006,1968.693835,2963.790632,Female,DIMEx100,4.791,...,278.742886,2391.976743,1394.870408,3618.564030,2044.477974,1165.696748,0.332220,1.688856,0.941176,textgrids/dime/s05650.TextGrid4.791
180009,s056,a,4.873,0.082,375.659501,1973.067329,2962.854094,Female,DIMEx100,4.791,...,278.742886,2391.976743,1394.870408,3618.564030,2044.477974,1165.696748,0.322262,1.692608,0.964706,textgrids/dime/s05650.TextGrid4.791
