# Processing and Normalization

Diphthongs and outlier productions will be excluded from analysis. Relative duration of vowels will be coded in addition to stress. Lastly, all formant data will be normalized following Delta F normalization (Johnson, 2018).

In [1]:
import pandas as pd

In [2]:
# read the vowel-token table from disk, print its row count and preview 10 randomly chosen rows
data = pd.read_csv("data/allvowels.csv")
print(len(data))
data.sample(10)

9289


Unnamed: 0,Participant,Vowel,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,...,t2_ph,fname,dur_ph,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph
6619,s053,i,281.109497,2105.162508,2967.629985,264.950842,2147.438552,2864.123697,291.282202,1876.344586,...,3.259,textgrids/dime/s05348.TextGrid,0.04,2.666,3.383,información,False,False,s,o
6797,s055,a,711.402287,2090.841224,2991.325739,664.297643,2213.797213,2977.231943,761.857457,2020.831234,...,4.86,textgrids/dime/s05503.TextGrid,0.123,3.997,4.86,antropología,False,True,i,.sil
4284,s051,a,632.269985,1863.568276,2318.311524,665.760207,1852.331522,2300.763613,598.865086,1891.760775,...,0.935,textgrids/dime/s05101.TextGrid,0.038,0.897,1.656,actualización,True,False,a,k
5107,s051,i,399.675252,2625.803804,3804.305455,392.264616,2605.764791,3712.611858,414.678888,2689.505482,...,3.643,textgrids/dime/s05133.TextGrid,0.07,3.204,4.011,disciplinas,False,False,l,n
349,p112,o,423.798096,972.213997,2575.875246,453.826103,995.70195,2502.845686,419.357463,911.173214,...,281.591,textgrids/cbas/p112.TextGrid,0.08,281.411,282.101,lotería,False,False,l,t
2052,p124,o,562.495601,1283.868075,3267.686911,575.470635,1537.26647,3260.161955,546.53343,1220.239713,...,213.342,textgrids/cbas/p124.TextGrid,0.05,213.292,213.972,obreros,True,False,s,b
6871,s055,e,561.306285,2059.432844,2787.800043,580.355618,1965.180215,2801.904344,494.327266,2101.535388,...,0.423,textgrids/dime/s05506.TextGrid,0.04,0.072,0.756,reflexión,False,False,l,k
109,p112,a,455.632918,1204.177353,2630.569215,463.562478,1059.134693,2567.86136,302.554819,1308.086226,...,87.915,textgrids/cbas/p112.TextGrid,0.07,87.395,87.915,basura,False,True,rf,sp
1178,p115,a,899.614601,1717.854278,2606.819257,825.03567,1684.338696,1895.649611,666.735576,1577.687147,...,38.523,textgrids/cbas/p115.TextGrid,0.17,37.933,38.523,lógica,False,True,k,sp
1644,p120,a,781.491875,1622.150245,2088.446474,922.267149,1079.318193,2272.030647,756.082753,1580.527165,...,122.943,textgrids/cbas/p120.TextGrid,0.1,122.843,123.273,apio,True,False,sp,p


In [3]:
# first some basic checks: number of non-missing values in every column,
# broken down by participant and vowel
data.groupby(["Participant", "Vowel"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,F3.75,Gender,...,t2_ph,fname,dur_ph,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph
Participant,Vowel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
p112,a,145,145,145,145,145,145,145,145,145,145,...,145,145,145,145,145,145,145,145,145,145
p112,e,76,76,76,76,76,76,76,76,76,76,...,76,76,76,76,76,76,76,76,76,76
p112,i,52,52,52,52,52,52,52,52,52,52,...,52,52,52,52,52,52,52,52,52,52
p112,o,92,92,92,92,92,92,92,92,92,92,...,92,92,92,92,92,92,92,92,92,92
p112,u,12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,12
p113,a,145,145,145,145,145,145,145,145,145,145,...,145,145,145,145,145,145,145,145,143,144
p113,e,75,75,75,75,75,75,75,75,75,75,...,75,75,75,75,75,75,75,75,75,75
p113,i,51,51,51,51,51,51,51,51,51,51,...,51,51,51,51,51,51,51,51,51,51
p113,o,92,92,92,92,92,92,92,92,92,92,...,92,92,92,92,92,92,92,92,92,88
p113,u,12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,12


## Removal of diphthongs

In [4]:
def rem_diph(df):
    """Drop every token that is directly adjacent to another vowel.

    A row is kept only when neither its `next_ph` nor its `prev_ph` label is
    one of "a", "e", "i", "o", "u"; rows whose neighbour label is missing are
    kept. The row count is printed before and after filtering, and the
    returned DataFrame has its index renumbered from 0.
    """
    print("Initial length: ", len(df))

    vowel_labels = ["a", "e", "i", "o", "u"]
    # filter on the following phone first, then on the preceding one
    for neighbour in ('next_ph', 'prev_ph'):
        df = df[~df[neighbour].isin(vowel_labels)]
    df = df.reset_index(drop = True)

    print("Final length: ", len(df))
    return df

In [5]:
# overwrite `data` with the filtered frame returned by rem_diph (the function prints the row counts)
data = rem_diph(data)

Initial length:  9289
Final length:  7284


## Removal of outliers

In [6]:
def rem_outliers(df):
    """Remove tokens whose midpoint formants are outliers for their speaker and vowel.

    For each of `F1.50`, `F2.50` and `F3.50`, Tukey fences
    (Q1 - 1.5 * IQR and Q3 + 1.5 * IQR) are computed inside every
    `Participant` x `Vowel` group of the input. A row is kept only when all
    three formants lie on or inside their fences. The fences are inclusive so
    that a group with zero spread (a single token, or identical values) is
    kept rather than deleted wholesale.

    Parameters
    ----------
    df : DataFrame with columns `Participant`, `Vowel`, `F1.50`, `F2.50`
        and `F3.50`.

    Returns
    -------
    DataFrame
        The surviving rows. The fence columns `upper_f1`, `lower_f1`,
        `upper_f2`, `lower_f2`, `upper` and `lower` (the last two belong to
        F3) are appended, and the index is not renumbered. Rows with a
        missing formant are dropped because they cannot satisfy the fence
        test. The row count is printed before and after filtering.
    """
    print("Initial length: ", len(df))

    group_keys = ["Participant", "Vowel"]
    # formant column -> suffix used for its fence columns in the returned frame
    fence_suffix = {"F1.50": "_f1", "F2.50": "_f2", "F3.50": ""}

    # drop fence columns left behind by an earlier call so that re-running the
    # cell does not collide with them during the merges below
    stale = ["upper" + sfx for sfx in fence_suffix.values()]
    stale += ["lower" + sfx for sfx in fence_suffix.values()]
    df = df.drop(columns = [col for col in stale if col in df.columns])

    # attach the fences of all three formants before any row is removed, so
    # every fence is based on the same, unfiltered groups
    for formant, sfx in fence_suffix.items():
        grouped = df.groupby(group_keys)[formant]
        q1 = grouped.quantile(0.25)
        q3 = grouped.quantile(0.75)
        iqr = q3 - q1
        limits = pd.DataFrame({
            "upper" + sfx: q3 + (1.5 * iqr),
            "lower" + sfx: q1 - (1.5 * iqr),
        })
        df = df.merge(limits, left_on = group_keys, right_index = True)

    # drop rows with outlier formants (Series.between is inclusive on both ends)
    for formant, sfx in fence_suffix.items():
        df = df[df[formant].between(df["lower" + sfx], df["upper" + sfx])]

    print("Final length: ", len(df))
    return df

In [7]:
# overwrite `data` with the frame returned by rem_outliers (the function prints the row counts)
data = rem_outliers(data)

Initial length:  7284
Final length:  6341


## Speech rate

First we will take the number of vowels a speaker produces to be equal to the number of syllables they utter. Then we will take the unique values from the `t1_wd` and `t2_wd` columns and subtract t1 (word onset) from t2 (word offset) to obtain an array of the duration of each word uttered. Then we will sum the durations of all words and divide the number of syllables by this value, giving a speech rate in syllables per second.

In [8]:
def speech_rate(df):
    """Add a per-speaker `Speech Rate` column (vowel tokens per second of word time).

    For every participant the number of rows (one per vowel, taken as one
    syllable each) is divided by the summed duration (`t2_wd` - `t1_wd`) of
    that participant's distinct words. Each word contributes its duration
    once, however many of its vowels are present in `df`.

    Parameters
    ----------
    df : DataFrame with columns `Participant`, `t1_wd` and `t2_wd`, one row
        per vowel token. When a `fname` column is present it is used together
        with the word times to tell words apart.

    Returns
    -------
    DataFrame
        `df` merged with the new `Speech Rate` column (outer merge on
        `Participant`). The rate is NaN for a speaker whose summed word
        duration is not positive.

    Only the rows present in `df` are counted: tokens and words removed
    before this function is called do not contribute to the rate.
    """
    # a word is identified by its time interval; word times are presumably
    # relative to each TextGrid, so the file name is included when available
    # to avoid merging different words that share the same interval
    word_keys = [col for col in ("fname", "t1_wd", "t2_wd") if col in df.columns]

    rates = {}
    for speaker in df["Participant"].dropna().unique():
        tokens = df[df["Participant"] == speaker]
        syllables = len(tokens)
        # one row per distinct word, so each word duration is summed once
        words = tokens.drop_duplicates(subset = word_keys)
        spoken_time = (words["t2_wd"] - words["t1_wd"]).sum()
        rates[speaker] = syllables / spoken_time if spoken_time > 0 else float("nan")

    rates_df = pd.DataFrame(list(rates.items()), columns = ['Participant', 'Speech Rate'])

    return pd.merge(left = df, right = rates_df, on = 'Participant', how = 'outer')

In [9]:
# overwrite `data` with the frame returned by speech_rate, then show its row count
data = speech_rate(data)
len(data)

6341

## Stress

The syltippy package (https://github.com/nur-ag/syltippy) will be used to generate syllabified (stress-indicated) outputs for each word found in the transcriptions. Then, the corresponding vowels in the TextGrid-formant dataframes will be marked as either stressed or unstressed.

In [10]:
def get_stress(vowels):
    """Label every vowel token as "stressed" or "unstressed".

    Each `word` is syllabified with syltippy, which also reports which
    syllable carries stress. The position of the stressed syllable among the
    word's syllables is stored in `stress_vowel`; the running position of each
    row inside its word is stored in `vowel_ind`; a row is "stressed" when
    the two agree.

    Parameters
    ----------
    vowels : DataFrame with columns `Participant`, `word` and `t1_wd`, one
        row per vowel token, with the rows of a word in order of production.
        When a `fname` column is present it is used together with
        `Participant` and `t1_wd` to identify a word, so that words starting
        at the same time in different files of one speaker are kept apart.

    Returns
    -------
    DataFrame
        A new frame (the input is not modified) with the index renumbered
        from 0 and the columns `stress_vowel`, `vowel_ind` and `stress` added.
    """
    import re
    import numpy as np
    from syltippy import syllabize

    def stressed_position(word):
        # upper-case the stressed syllable, then reduce every syllable to its vowel letters
        syllables, stress = syllabize(word)
        marked = ','.join(s if stress != i else s.upper() for (i, s) in enumerate(syllables))
        nuclei = re.sub(r'[^,aeiouAEIOUáéíóúÁÉÍÓÚ]', '', marked).split(",")
        # position of the (upper-cased) stressed syllable; stays 0 when none is found
        position = 0
        for i, nucleus in enumerate(nuclei):
            if nucleus.isupper():
                position = i
        return position

    # work on a renumbered copy so the caller's frame is left untouched
    vowels = vowels.reset_index(drop = True)

    # syllabify each distinct word once instead of once per row
    words = vowels["word"].map(str)
    position_of = {word: stressed_position(word) for word in words.unique()}
    vowels["stress_vowel"] = words.map(position_of)

    # determine index of vowel within its word
    # NOTE(review): this counts the rows present for a word, so it only equals
    # the syllable position if no earlier vowel of that word was removed before
    # this call (e.g. by diphthong or outlier filtering) - confirm with the pipeline order
    word_keys = [col for col in ("Participant", "fname", "t1_wd") if col in vowels.columns]
    vowels["vowel_ind"] = vowels.groupby(word_keys).cumcount()

    # add column to indicate stress
    vowels["stress"] = np.where(vowels['stress_vowel'] == vowels['vowel_ind'], "stressed", "unstressed")

    return vowels

In [11]:
# overwrite `data` with the frame returned by get_stress, then show its row count
data = get_stress(data)
len(data)

6341

## Normalization of vowel formants

Because both male and female speakers are represented in this data set, the formant frequencies need to be normalized to minimize vocal tract length differences.

Following Johnson (2018), I will use the line-fitting Delta F Normalization method, which makes use of the entire vowel space. To do so, $\Delta F$ will be calculated for each participant as the mean of $F_i / (i - 0.5)$ over the first three formants ($i = 1, 2, 3$) of all of that participant's vowel tokens, and then each F1 and F2 measurement will be divided by this value.

In [12]:
import numpy as np

def delta_f(vowels, x):
    """Compute one Delta F value per participant for one measurement point.

    `x` is the measurement-point suffix given as a string ("25", "50" or
    "75"); it selects the columns `F1.<x>`, `F2.<x>` and `F3.<x>`. For each
    participant the three formant columns are divided by 0.5, 1.5 and 2.5
    respectively, and the mean of all resulting values is that participant's
    Delta F.

    Returns a DataFrame with the columns `Participant` and `Delta F.<x>`,
    one row per participant.
    """
    divisors = (("F1." + x, 0.5), ("F2." + x, 1.5), ("F3." + x, 2.5))

    per_speaker = {}
    for speaker in vowels.Participant.unique():
        tokens = vowels[vowels['Participant'] == speaker]
        # mean over every token and every formant at once
        per_speaker[speaker] = np.mean([np.true_divide(tokens[column], divisor)
                                        for column, divisor in divisors])

    delta_df = pd.DataFrame.from_dict(per_speaker, orient = "index", columns = ['Delta F.' + x])
    return delta_df.rename_axis('Participant').reset_index()

In [13]:
def normalization(vowels):
    """Append per-participant Delta F values and Delta-F-normalized F1/F2 columns.

    Delta F is obtained from `delta_f` for the 50%, 25% and 75% measurement
    points and merged onto `vowels` by `Participant` (columns `Delta F.50`,
    `Delta F.25`, `Delta F.75`). For each point, F1 and F2 are then divided by
    the matching Delta F and stored as `F1.<point>_norm` and
    `F2.<point>_norm`. F3 is not normalized.

    Returns the merged DataFrame with the new columns.
    """
    points = ("50", "25", "75")

    # one Delta F table per measurement point, combined side by side on Participant
    per_point = [delta_f(vowels, point).set_index("Participant") for point in points]
    deltas = per_point[0].join(per_point[1:]).reset_index()

    vowels_normalized = pd.merge(left = vowels, right = deltas, on = 'Participant', how = 'outer')

    for point in points:
        scale = vowels_normalized['Delta F.' + point]
        for formant in ("F1", "F2"):
            measured = formant + "." + point
            vowels_normalized[measured + '_norm'] = vowels_normalized[measured] / scale

    return vowels_normalized

In [14]:
# overwrite `data` with the frame returned by normalization, print its row count
# and preview 10 randomly chosen rows
data = normalization(data)
print(len(data))
data.sample(10)

6341


Unnamed: 0,Participant,Vowel,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,...,stress,Delta F.50,Delta F.25,Delta F.75,F1.50_norm,F2.50_norm,F1.25_norm,F2.25_norm,F1.75_norm,F2.75_norm
5786,s056,o,599.567488,1270.639465,2356.605558,564.606677,1269.504991,2796.205824,598.208607,1309.613917,...,stressed,1176.407441,1156.685138,1131.824542,0.50966,1.080102,0.488125,1.097537,0.528535,1.157082
2657,s002,a,599.375176,1639.652327,2409.049325,577.832433,1592.469832,1788.129143,638.777631,1742.424924,...,stressed,1046.087601,1039.69639,1046.921418,0.572968,1.567414,0.55577,1.531668,0.610149,1.664332
1041,p115,a,721.623163,1776.368769,2179.343332,664.818203,1431.324928,1814.857374,765.8714,1863.713504,...,unstressed,1091.54986,1112.367074,1087.322616,0.6611,1.627382,0.597661,1.286738,0.704364,1.714039
1231,p115,e,589.67422,2390.403524,3381.265461,566.596866,2488.408428,3340.691379,619.254934,2193.579684,...,stressed,1091.54986,1112.367074,1087.322616,0.540217,2.189917,0.509361,2.237039,0.569523,2.017414
6316,s056,i,328.528591,2255.121387,3270.542286,325.395195,2144.817154,2575.788703,351.042268,2279.279244,...,stressed,1176.407441,1156.685138,1131.824542,0.279264,1.916956,0.281317,1.854279,0.310156,2.01381
3866,s051,e,548.959167,2126.228261,2870.502474,549.425038,2003.673253,2741.086986,517.954408,2098.839205,...,stressed,1164.954452,1151.989991,1135.543586,0.471228,1.82516,0.476936,1.739315,0.456129,1.848312
473,p119,o,506.69778,1170.601838,2154.417153,484.495976,1188.316847,1903.036135,515.433074,1146.887225,...,unstressed,1084.219365,1073.12413,1116.834981,0.467339,1.079673,0.451482,1.107343,0.461512,1.026908
2288,s001,o,413.457059,1336.144504,2573.464253,400.144599,1321.378386,2642.647718,410.652303,1355.385863,...,stressed,966.374551,959.13929,962.433837,0.427843,1.382636,0.417191,1.377671,0.426681,1.40829
2541,s002,a,709.425097,1479.688769,2512.450441,762.411727,1483.244437,2423.277084,768.590348,1490.951067,...,stressed,1046.087601,1039.69639,1046.921418,0.67817,1.414498,0.733302,1.426613,0.734143,1.424129
3032,s002,e,390.597345,1951.359605,2813.374387,395.851346,1957.21195,2889.70134,381.216371,1905.53692,...,stressed,1046.087601,1039.69639,1046.921418,0.373389,1.865388,0.380737,1.882484,0.364131,1.820134


In [15]:
# number of non-missing values in every column per participant and vowel, now on the processed data
data.groupby(["Participant", "Vowel"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,F3.75,Gender,...,stress,Delta F.50,Delta F.25,Delta F.75,F1.50_norm,F2.50_norm,F1.25_norm,F2.25_norm,F1.75_norm,F2.75_norm
Participant,Vowel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
p112,a,128,128,128,128,128,128,128,128,128,128,...,128,128,128,128,128,128,128,128,128,128
p112,e,65,65,65,65,65,65,65,65,65,65,...,65,65,65,65,65,65,65,65,65,65
p112,i,39,39,39,39,39,39,39,39,39,39,...,39,39,39,39,39,39,39,39,39,39
p112,o,81,81,81,81,81,81,81,81,81,81,...,81,81,81,81,81,81,81,81,81,81
p112,u,10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10
p113,a,113,113,113,113,113,113,113,113,113,113,...,113,113,113,113,113,113,113,113,113,113
p113,e,60,60,60,60,60,60,60,60,60,60,...,60,60,60,60,60,60,60,60,60,60
p113,i,46,46,46,46,46,46,46,46,46,46,...,46,46,46,46,46,46,46,46,46,46
p113,o,76,76,76,76,76,76,76,76,76,76,...,76,76,76,76,76,76,76,76,76,76
p113,u,9,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9


## Balance the data set

At this point, the data is fairly unbalanced, where speakers in the Corpus DIMEx100 have about twice as many productions as speakers in CBAS. DIMEx100 productions will be isolated and 50% of tokens across `Participant` and `Vowel` will be randomly removed. The smaller dataset will then be merged back with the total CBAS set, yielding a more balanced distribution across corpora.

In [16]:
# pull the DIMEx100 rows out as an independent copy and show how many there are
dimex = data.loc[data['Corpus'].eq("DIMEx100")].copy()
len(dimex)

4435

In [19]:
# keep a random half of the DIMEx100 tokens within every participant x vowel group;
# one seeded generator is shared by all groups so the subsample (and the balanced
# file saved later) is the same on every run
rng = np.random.RandomState(2018)
slim = dimex.groupby(["Participant", "Vowel"]).apply(lambda x: x.sample(frac = 0.5, random_state = rng))
len(slim)

2217


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Participant,Vowel,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,...,stress,Delta F.50,Delta F.25,Delta F.75,F1.50_norm,F2.50_norm,F1.25_norm,F2.25_norm,F1.75_norm,F2.75_norm
Participant,Vowel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
s001,a,2130,s001,a,443.830788,1699.951630,2181.039985,441.384966,1687.636197,2279.974465,407.258385,1705.342153,...,stressed,966.374551,959.139290,962.433837,0.459274,1.759102,0.460189,1.759532,0.423155,1.771906
s001,a,2117,s001,a,553.122140,1581.385279,2291.761244,520.906667,1604.158829,2254.908964,562.914989,1584.571712,...,stressed,966.374551,959.139290,962.433837,0.572368,1.636410,0.543098,1.672498,0.584887,1.646421
s001,a,2139,s001,a,542.516832,1627.672560,2531.690912,470.328820,1686.431817,2831.913046,567.279097,1672.295550,...,unstressed,966.374551,959.139290,962.433837,0.561394,1.684308,0.490366,1.758276,0.589421,1.737569
s001,a,2226,s001,a,444.759787,1531.881874,2367.052005,394.492702,1433.924213,2377.422723,390.030772,1512.731077,...,unstressed,966.374551,959.139290,962.433837,0.460235,1.585184,0.411299,1.495011,0.405255,1.571777
s001,a,2206,s001,a,584.976337,1640.711142,2406.463223,575.240645,1638.547956,2503.839464,557.723345,1643.365136,...,stressed,966.374551,959.139290,962.433837,0.605331,1.697800,0.599747,1.708352,0.579493,1.707510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
s056,u,5946,s056,u,309.175875,805.933644,2879.178825,306.947758,874.888802,2856.620860,331.088961,973.255085,...,stressed,1176.407441,1156.685138,1131.824542,0.262814,0.685080,0.265368,0.756376,0.292527,0.859899
s056,u,5989,s056,u,330.936398,1076.758534,2948.559518,376.157651,1646.428468,2966.840161,341.270395,1034.932002,...,stressed,1176.407441,1156.685138,1131.824542,0.281311,0.915294,0.325203,1.423402,0.301522,0.914393
s056,u,5938,s056,u,419.249913,1176.122576,3108.839059,400.330998,1146.111220,3000.408975,394.315840,1166.746485,...,unstressed,1176.407441,1156.685138,1131.824542,0.356382,0.999758,0.346102,0.990858,0.348390,1.030855
s056,u,5944,s056,u,344.196169,1320.456714,3007.453716,344.425149,1333.120584,2995.000185,323.634642,1415.879043,...,stressed,1176.407441,1156.685138,1131.824542,0.292582,1.122448,0.297769,1.152535,0.285941,1.250970


In [21]:
# discard the index left by the grouped sampling step and renumber the rows from 0
slim = slim.reset_index(drop = True)
slim

Unnamed: 0,Participant,Vowel,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,...,stress,Delta F.50,Delta F.25,Delta F.75,F1.50_norm,F2.50_norm,F1.25_norm,F2.25_norm,F1.75_norm,F2.75_norm
0,s001,a,443.830788,1699.951630,2181.039985,441.384966,1687.636197,2279.974465,407.258385,1705.342153,...,stressed,966.374551,959.139290,962.433837,0.459274,1.759102,0.460189,1.759532,0.423155,1.771906
1,s001,a,553.122140,1581.385279,2291.761244,520.906667,1604.158829,2254.908964,562.914989,1584.571712,...,stressed,966.374551,959.139290,962.433837,0.572368,1.636410,0.543098,1.672498,0.584887,1.646421
2,s001,a,542.516832,1627.672560,2531.690912,470.328820,1686.431817,2831.913046,567.279097,1672.295550,...,unstressed,966.374551,959.139290,962.433837,0.561394,1.684308,0.490366,1.758276,0.589421,1.737569
3,s001,a,444.759787,1531.881874,2367.052005,394.492702,1433.924213,2377.422723,390.030772,1512.731077,...,unstressed,966.374551,959.139290,962.433837,0.460235,1.585184,0.411299,1.495011,0.405255,1.571777
4,s001,a,584.976337,1640.711142,2406.463223,575.240645,1638.547956,2503.839464,557.723345,1643.365136,...,stressed,966.374551,959.139290,962.433837,0.605331,1.697800,0.599747,1.708352,0.579493,1.707510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2212,s056,u,309.175875,805.933644,2879.178825,306.947758,874.888802,2856.620860,331.088961,973.255085,...,stressed,1176.407441,1156.685138,1131.824542,0.262814,0.685080,0.265368,0.756376,0.292527,0.859899
2213,s056,u,330.936398,1076.758534,2948.559518,376.157651,1646.428468,2966.840161,341.270395,1034.932002,...,stressed,1176.407441,1156.685138,1131.824542,0.281311,0.915294,0.325203,1.423402,0.301522,0.914393
2214,s056,u,419.249913,1176.122576,3108.839059,400.330998,1146.111220,3000.408975,394.315840,1166.746485,...,unstressed,1176.407441,1156.685138,1131.824542,0.356382,0.999758,0.346102,0.990858,0.348390,1.030855
2215,s056,u,344.196169,1320.456714,3007.453716,344.425149,1333.120584,2995.000185,323.634642,1415.879043,...,stressed,1176.407441,1156.685138,1131.824542,0.292582,1.122448,0.297769,1.152535,0.285941,1.250970


In [22]:
# take every CBAS token, append the down-sampled DIMEx100 tokens after them,
# renumber the rows, then report the size and show the result
cbas = data.loc[data["Corpus"].eq("CBAS")].copy()
balanced = pd.concat((cbas, slim), ignore_index = True)
print(len(balanced))
balanced

4123


Unnamed: 0,Participant,Vowel,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,...,stress,Delta F.50,Delta F.25,Delta F.75,F1.50_norm,F2.50_norm,F1.25_norm,F2.25_norm,F1.75_norm,F2.75_norm
0,p112,a,695.790976,1139.953282,2667.258621,689.324612,840.183174,2738.685252,672.665361,1108.588353,...,stressed,942.492654,919.248109,938.576788,0.738246,1.209509,0.749879,0.913990,0.716687,1.181138
1,p112,a,443.062885,1309.168397,2011.333640,520.926899,1104.669400,2083.767349,419.261712,1494.078973,...,unstressed,942.492654,919.248109,938.576788,0.470097,1.389049,0.566688,1.201710,0.446699,1.591856
2,p112,a,478.555024,1666.197690,2364.429452,448.329355,1780.151671,2379.829776,479.318232,1623.235615,...,stressed,942.492654,919.248109,938.576788,0.507755,1.767863,0.487713,1.936530,0.510686,1.729465
3,p112,a,585.348400,1228.037395,2661.984488,568.024702,1270.564615,2630.829785,583.179941,1281.414924,...,stressed,942.492654,919.248109,938.576788,0.621064,1.302968,0.617923,1.382178,0.621345,1.365274
4,p112,a,670.202795,1274.963776,2512.335538,466.960249,881.838158,2259.366631,673.582029,1268.173803,...,unstressed,942.492654,919.248109,938.576788,0.711096,1.352757,0.507981,0.959304,0.717663,1.351167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4118,s056,u,309.175875,805.933644,2879.178825,306.947758,874.888802,2856.620860,331.088961,973.255085,...,stressed,1176.407441,1156.685138,1131.824542,0.262814,0.685080,0.265368,0.756376,0.292527,0.859899
4119,s056,u,330.936398,1076.758534,2948.559518,376.157651,1646.428468,2966.840161,341.270395,1034.932002,...,stressed,1176.407441,1156.685138,1131.824542,0.281311,0.915294,0.325203,1.423402,0.301522,0.914393
4120,s056,u,419.249913,1176.122576,3108.839059,400.330998,1146.111220,3000.408975,394.315840,1166.746485,...,unstressed,1176.407441,1156.685138,1131.824542,0.356382,0.999758,0.346102,0.990858,0.348390,1.030855
4121,s056,u,344.196169,1320.456714,3007.453716,344.425149,1333.120584,2995.000185,323.634642,1415.879043,...,stressed,1176.407441,1156.685138,1131.824542,0.292582,1.122448,0.297769,1.152535,0.285941,1.250970


In [127]:
# write the full processed table (`data`, not the balanced subset) to disk without the row index
data.to_csv("data/allnormdata.csv", index = False)

In [23]:
# write the corpus-balanced table to disk without the row index
balanced.to_csv("data/balancednormdata.csv", index = False)