# Processing and Normalization

Diphthongs and outlier productions will be excluded from analysis. Relative duration of vowels will be coded in addition to stress. Lastly, all formant data will be normalized following Delta F normalization (Johnson, 2018).

In [2]:
import pandas as pd

In [3]:
# Load the vowel measurements from CSV into `data`, print the number of rows,
# and preview 10 randomly sampled rows.

data = pd.read_csv("data/allvowels.csv")
print(len(data))
data.sample(10)

2006

## Removal of diphthongs

In [10]:
# takes df input with columns `next_ph` and `prev_ph` and returns df without diphthongs

def rem_diph(df):
    """Return `df` without rows whose neighbouring phone is a plain vowel.

    A row is dropped when its `next_ph` or its `prev_ph` value is one of
    "a", "e", "i", "o", "u". The surviving rows are re-indexed from 0.
    The row count before and after filtering is printed.
    """
    vowel_labels = ["a", "e", "i", "o", "u"]

    print("Initial length: ", len(df))

    # Filter on the following phone first, then on the preceding phone.
    for neighbour in ("next_ph", "prev_ph"):
        has_vowel_neighbour = df[neighbour].isin(vowel_labels)
        df = df[~has_vowel_neighbour]

    df = df.reset_index(drop=True)

    print("Final length: ", len(df))
    return df

In [None]:
# Replace `data` with the frame returned by rem_diph (rows next to a vowel removed).
data = rem_diph(data)

## Removal of outliers

In [13]:
# removes productions with outlier formants
# input df has columns `Participant` and `Vowel`, and formants listed in format `F1 (Hz)`

def rem_outliers(df):
    """Drop productions whose F1, F2 or F3 is an outlier for its speaker and vowel.

    For every (`Participant`, `Vowel`) group and each of the columns
    `F1 (Hz)`, `F2 (Hz)` and `F3 (Hz)`, the fences
    ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` are computed from the input
    frame. A row is kept only if all three formants lie within their fences.

    The fences are inclusive: a value equal to a fence is not an outlier.
    This matters for groups with an IQR of 0 (for example a speaker/vowel
    pair with a single token), where both fences equal the value itself and
    a strict comparison would discard the whole group.

    The returned frame keeps the merged limit columns `upper_f1`,
    `lower_f1`, `upper_f2`, `lower_f2`, `upper` and `lower` (the last two
    belong to F3). The row count before and after filtering is printed.
    """
    print("Initial length: ", len(df))

    group_keys = ["Participant", "Vowel"]
    # (formant column, suffix of its limit columns). F3 carries no suffix so
    # the names of the returned columns stay as they have always been.
    formants = [("F1 (Hz)", "_f1"), ("F2 (Hz)", "_f2"), ("F3 (Hz)", "")]

    # Compute every set of fences from the unfiltered input so that dropping
    # an F1 outlier cannot shift the F2/F3 quartiles.
    limit_frames = []
    for column, suffix in formants:
        grouped = df.groupby(group_keys)[column]
        q1 = grouped.quantile(0.25)
        q3 = grouped.quantile(0.75)
        iqr = q3 - q1
        limit_frames.append(pd.DataFrame({
            "upper" + suffix: q3 + 1.5 * iqr,
            "lower" + suffix: q1 - 1.5 * iqr,
        }))

    # Attach the per-group fences to every row.
    for limits in limit_frames:
        df = df.merge(limits, left_on=group_keys, right_index=True)

    # Keep rows inside the (inclusive) fences of each formant in turn.
    for column, suffix in formants:
        within = df[column].between(df["lower" + suffix], df["upper" + suffix])
        df = df[within]

    print("Final length: ", len(df))
    return df

In [None]:
# Replace `data` with the frame returned by rem_outliers (outlier formant rows removed).
data = rem_outliers(data)

## Speech rate

First we will take the number of vowels a speaker produces to be equal to the number of syllables they utter. Then we will take the unique values from the `t1_wd` (word start) and `t2_wd` (word end) columns and subtract t1 from t2 to obtain an array of the duration of each word uttered. Then we will sum the durations of all words and divide the number of syllables by this value, giving each speaker's rate in syllables per unit of time.

In [None]:
def speech_rate(df):
    """Add a per-speaker `Speech Rate` column to `df`.

    For each value of `Participant`, the rate is the number of rows for that
    speaker (one row per vowel, taken as one syllable) divided by the summed
    duration of the speaker's words. A word is identified by its
    (`t1_wd`, `t2_wd`) pair and its duration is ``t2_wd - t1_wd``.

    Returns a new frame: `df` merged with the per-speaker rates on
    `Participant`.
    """
    rates = {}
    for participant in df["Participant"].unique():
        # Use the frame passed in, not a notebook-level global.
        tokens = df[df["Participant"] == participant]
        syllables = len(tokens)

        # De-duplicate start and end times together so every start stays
        # paired with its own end; de-duplicating each column separately can
        # misalign them or yield arrays of different lengths.
        words = tokens[["t1_wd", "t2_wd"]].drop_duplicates()
        total_duration = (words["t2_wd"] - words["t1_wd"]).sum()

        rates[participant] = syllables / total_duration

    rates_df = pd.DataFrame.from_dict(rates, orient="index", columns=["Speech Rate"])
    rates_df = rates_df.rename_axis("Participant").reset_index()

    return pd.merge(left=df, right=rates_df, on="Participant", how="outer")

In [None]:
# Replace `data` with the frame returned by speech_rate (adds the `Speech Rate` column).
data = speech_rate(data)

## Stress

The syltippy package (https://github.com/nur-ag/syltippy) will be used to generate syllabified (stress-indicated) outputs for each word found in the transcriptions. Then, the corresponding vowels in the TextGrid-formant dataframes will be marked as either stressed or unstressed.

In [17]:
# function takes the formants df as its only input (no dictionary.txt file is read here)
# input formants df needs cols `Participant`, `word`, `t1_wd`, and `t1_ph`

def get_stress(df):
    """Label every vowel row as "stressed" or "unstressed".

    Expects the columns `Participant`, `word`, `t1_wd` and `t1_ph`.
    Returns a new frame sorted by those three keys and re-indexed from 0,
    with three added columns:

    * `stress_index` - index of the stressed syllable of `word`, as returned
      by ``syltippy.syllabize`` (missing for rows without a word);
    * `vowel_ind`    - 0-based position of the row among the rows sharing
      the same `Participant` and `t1_wd`, in `t1_ph` order;
    * `stress`       - "stressed" where the two indices match, otherwise
      "unstressed".

    The input frame is not modified.
    """
    # import required packages
    import numpy as np
    from syltippy import syllabize

    df = df.copy()

    # syllabize returns (syllables, stress), where `stress` is the position
    # of the stressed syllable in `syllables`. Use that index directly
    # instead of upper-casing the syllable and searching for it again, which
    # failed for capitalised words and for syllables without a listed vowel.
    # Each distinct word is syllabified once.
    stress_by_word = {
        word: syllabize(word)[1] for word in df["word"].dropna().unique()
    }

    # Map per row rather than merging on `word`: a merge against a table with
    # one entry per token is many-to-many and multiplies the rows.
    df["stress_index"] = df["word"].map(stress_by_word)

    df = df.sort_values(["Participant", "t1_wd", "t1_ph"])

    # add column to formants indicating index of vowel in each row
    # NOTE(review): this equals the syllable index only if every syllable of
    # the word still has exactly one row here; rows removed earlier (e.g. as
    # diphthongs or outliers) would shift it - confirm the pipeline order.
    df["vowel_ind"] = df.groupby(["Participant", "t1_wd"]).cumcount()
    df = df.reset_index(drop=True)

    # add column to formants to indicate stress
    df["stress"] = np.where(df["stress_index"] == df["vowel_ind"], "stressed", "unstressed")

    return df

In [None]:
# Replace `data` with the frame returned by get_stress (adds the `stress` labels).
data = get_stress(data)

## Normalization of vowel formants

Because both male and female speakers are represented in this data set, the formant frequencies need to be normalized to minimize the effect of vocal tract length differences.

Following Johnson (2018), I will use the line-fitting Delta F Normalization method, which makes use of the entire vowel space. To do so, the average formant spacing (ΔF) will be calculated for each participant as the mean of $F_i / (i - 0.5)$ over F1, F2 and F3 of all of that participant's vowels, and then each F1 and F2 measurement will be divided by this value.

In [15]:
import numpy as np

def delta_f(vowels): # df as argument
    """Compute one Delta F value per participant.

    For every distinct value of `Participant`, `F1 (Hz)`, `F2 (Hz)` and
    `F3 (Hz)` are divided by 0.5, 1.5 and 2.5 respectively, and the mean of
    all the resulting values for that participant is taken.

    Returns a frame with the columns `Participant` and `Delta F`, one row
    per participant in order of first appearance.
    """
    # Formant column paired with the divisor applied to it.
    divisors = (('F1 (Hz)', 0.5), ('F2 (Hz)', 1.5), ('F3 (Hz)', 2.5))

    def speaker_delta(speaker):
        tokens = vowels[vowels['Participant'] == speaker]
        scaled = [np.true_divide(tokens[column], divisor)
                  for column, divisor in divisors]
        return np.mean(scaled)

    per_speaker = {speaker: speaker_delta(speaker)
                   for speaker in vowels.Participant.unique()}

    table = pd.DataFrame.from_dict(per_speaker, orient="index", columns=['Delta F'])
    return table.rename_axis('Participant').reset_index()

In [16]:
def normalization(vowels):
    """Add Delta-F-normalized F1 and F2 columns to `vowels`.

    The per-participant `Delta F` values from `delta_f` are merged onto the
    input on `Participant`; `F1_norm` and `F2_norm` are then `F1 (Hz)` and
    `F2 (Hz)` divided by that row's `Delta F`. Returns the merged frame.
    """
    per_speaker = delta_f(vowels)

    merged = pd.merge(left=vowels, right=per_speaker, on='Participant', how='outer')

    # Raw formant column paired with the normalized column derived from it.
    for raw, normalized in (('F1 (Hz)', 'F1_norm'), ('F2 (Hz)', 'F2_norm')):
        merged[normalized] = merged[raw] / merged['Delta F']

    return merged

In [None]:
# Replace `data` with the frame returned by normalization (adds `Delta F`,
# `F1_norm` and `F2_norm`), then preview 10 randomly sampled rows.
data = normalization(data)
data.sample(10)

In [None]:
# Write the processed frame to CSV, leaving out the row index.
data.to_csv("data/allnormdata.csv", index = False)