# Exploratory analyses

In [1]:
import math

import pandas as pd

In [2]:
# Load the dataset used by every later cell.
df = pd.read_csv(filepath_or_buffer="data/balancednormdata.csv")

First, calculate the mean and standard deviation of `F1.50_norm` and `F2.50_norm` by `Vowel` within each `Corpus` and `Gender` group.

In [3]:
# Table for the paper: mean and standard deviation of F1.50_norm / F2.50_norm
# for each Corpus x Gender x Vowel group.
(
    df
    .groupby(["Corpus", "Gender", "Vowel"])[["F1.50_norm", "F2.50_norm"]]
    .agg(["mean", "std"])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,F1.50_norm,F1.50_norm,F2.50_norm,F2.50_norm
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std
Corpus,Gender,Vowel,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
CBAS,Female,a,0.690153,0.078677,1.4404,0.160557
CBAS,Female,e,0.522196,0.044102,1.639447,0.360626
CBAS,Female,i,0.363788,0.073881,1.509845,0.712785
CBAS,Female,o,0.513432,0.099156,1.00499,0.2714
CBAS,Female,u,0.353459,0.075293,1.000293,0.273739
CBAS,Male,a,0.642233,0.073389,1.350621,0.167825
CBAS,Male,e,0.495278,0.064839,1.718878,0.17666
CBAS,Male,i,0.420471,0.056701,2.118412,0.197966
CBAS,Male,o,0.446486,0.057099,0.948819,0.134466
CBAS,Male,u,0.384613,0.034018,0.914845,0.156361


In [4]:
# Group means of F1.50_norm / F2.50_norm per Corpus x Gender x Vowel,
# flattened to ordinary columns so the table can be plotted in R.
vowel_plot = (
    df.groupby(["Corpus", "Gender", "Vowel"])[["F1.50_norm", "F2.50_norm"]]
    .mean()
    .reset_index()
)
vowel_plot

Unnamed: 0,Corpus,Gender,Vowel,F1.50_norm,F2.50_norm
0,CBAS,Female,a,0.690153,1.4404
1,CBAS,Female,e,0.522196,1.639447
2,CBAS,Female,i,0.363788,1.509845
3,CBAS,Female,o,0.513432,1.00499
4,CBAS,Female,u,0.353459,1.000293
5,CBAS,Male,a,0.642233,1.350621
6,CBAS,Male,e,0.495278,1.718878
7,CBAS,Male,i,0.420471,2.118412
8,CBAS,Male,o,0.446486,0.948819
9,CBAS,Male,u,0.384613,0.914845


In [5]:
# Write the group means to disk without the row index.
vowel_plot.to_csv(path_or_buf="plot_model/area_plot.csv", index=False)

In [50]:
# Add the duration columns used for modelling: Dur_ms, Dur_norm and Dur_log.
# This cell needs `math`, which is imported in the notebook's first cell so the
# notebook runs top to bottom on a fresh kernel.
vowel_model = df.copy()  # work on a copy so `df` keeps its original columns
# dur_ph scaled by 1000 (seconds -> milliseconds, going by the column name)
vowel_model["Dur_ms"] = vowel_model["dur_ph"] * 1000
# duration weighted by the token's speech rate
vowel_model["Dur_norm"] = vowel_model["Dur_ms"] * vowel_model["Speech Rate"]
# natural log of the millisecond duration; math.log raises on values <= 0
vowel_model["Dur_log"] = vowel_model["Dur_ms"].apply(math.log)

In [51]:
# Write the modelling table (with the duration columns) to disk, no row index.
vowel_model.to_csv(path_or_buf="plot_model/vowel_model.csv", index=False)

## Vowel space area by speaker

In [12]:
# Mean F1.50_norm / F2.50_norm per Participant x Vowel (MultiIndex rows).
parts = (
    df.groupby(["Participant", "Vowel"])[["F1.50_norm", "F2.50_norm"]]
    .mean()
)
# Nested mapping {participant: {formant column: {vowel: mean}}}: for each
# participant, drop the participant level and convert the remainder to a dict.
parts_dict = {
    speaker: speaker_rows.xs(speaker).to_dict()
    for speaker, speaker_rows in parts.groupby(level=0)
}
parts_dict

{'p112': {'F1.50_norm': {'a': 0.6277432295294227,
   'e': 0.45249439162084304,
   'i': 0.38787887295781504,
   'o': 0.43235958909509203,
   'u': 0.38076868657335117},
  'F2.50_norm': {'a': 1.3466508265490336,
   'e': 1.8272374625426173,
   'i': 2.2767362369181785,
   'o': 0.8970828050414119,
   'u': 0.8138116231867631}},
 'p113': {'F1.50_norm': {'a': 0.6925982470276436,
   'e': 0.5576053462214523,
   'i': 0.41129617314830963,
   'o': 0.5329422143127972,
   'u': 0.37350956957989706},
  'F2.50_norm': {'a': 1.460948020705278,
   'e': 1.750439550135898,
   'i': 1.5338558344829991,
   'o': 0.901101866423492,
   'u': 0.8281231993962865}},
 'p115': {'F1.50_norm': {'a': 0.682382598990521,
   'e': 0.5335102099954813,
   'i': 0.4134488754970261,
   'o': 0.5263231140564589,
   'u': 0.42835687831170666},
  'F2.50_norm': {'a': 1.5377428666535964,
   'e': 1.4542072611000627,
   'i': 1.0293834575205194,
   'o': 1.102312170977519,
   'u': 1.1693400928477862}},
 'p119': {'F1.50_norm': {'a': 0.656074928

In [13]:
# takes dicts in form {participant: {F1: {a: formant, e: formant, etc.}, F2: {a: etc.}}}
import math

def PolygonArea(d):
    """Compute a vowel-space area for every participant.

    The five vowels are treated as points ``(F2, F1)`` and the space is split
    into three triangles -- (a, e, o), (e, o, u) and (i, e, u) -- whose areas
    (Heron's formula) are summed.

    Parameters
    ----------
    d : dict
        ``{participant: {"F1.50_norm": {vowel: value},
        "F2.50_norm": {vowel: value}}}`` with entries for the vowels
        ``a``, ``e``, ``i``, ``o`` and ``u``.

    Returns
    -------
    dict
        ``{participant: area}``; empty when ``d`` is empty.

    Raises
    ------
    KeyError
        If a participant lacks one of the two formant keys or one of the
        five vowels.

    Notes
    -----
    Triangle areas are unsigned, so the sum equals the pentagon's area only
    when the three triangles do not overlap.
    NOTE(review): confirm this holds for speakers whose /i/ lies behind /e/.
    """
    areas = {}

    def area(a, b, c):
        """Return the area of triangle ``a``-``b``-``c`` via Heron's formula."""
        def distance(p1, p2):
            return math.hypot(p1[0]-p2[0], p1[1]-p2[1])

        side_a = distance(a, b)
        side_b = distance(b, c)
        side_c = distance(c, a)
        s = 0.5 * ( side_a + side_b + side_c)
        radicand = s * (s - side_a) * (s - side_b) * (s - side_c)
        # For (near-)collinear points rounding can push the radicand just
        # below zero, which would make math.sqrt raise ValueError; such a
        # triangle has zero area, so clamp. NaN input still propagates as NaN.
        return math.sqrt(max(radicand, 0.0))

    for p in d.keys():
        # coordinates in form (F2, F1); the sign/orientation of the axes does
        # not affect the area
        a = (d[p]["F2.50_norm"]["a"], d[p]["F1.50_norm"]["a"])
        e = (d[p]["F2.50_norm"]["e"], d[p]["F1.50_norm"]["e"])
        i = (d[p]["F2.50_norm"]["i"], d[p]["F1.50_norm"]["i"])
        o = (d[p]["F2.50_norm"]["o"], d[p]["F1.50_norm"]["o"])
        u = (d[p]["F2.50_norm"]["u"], d[p]["F1.50_norm"]["u"])

        a1 = area(a, e, o)
        a2 = area(e, o, u)
        a3 = area(i, e, u)

        areas[p] = a1 + a2 + a3
    return areas

In [14]:
# Vowel-space area per participant, laid out as a two-column table.
parts_area = PolygonArea(parts_dict)
areas = pd.DataFrame(
    list(parts_area.items()),
    columns=["Participant", "Area"],
)
areas

Unnamed: 0,Participant,Area
0,p112,0.15836
1,p113,0.175239
2,p115,0.048606
3,p119,0.12063
4,p120,0.193921
5,p124,0.206452
6,s001,0.119608
7,s002,0.153924
8,s051,0.183478
9,s053,0.179042


In [24]:
# Attach each participant's area to every token row of the dataset.
area_model = pd.merge(df, areas, how="outer", on="Participant")
area_model.head(1)

Unnamed: 0,Participant,Vowel,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,...,Delta F.50,Delta F.25,Delta F.75,F1.50_norm,F2.50_norm,F1.25_norm,F2.25_norm,F1.75_norm,F2.75_norm,Area
0,p112,a,695.790976,1139.953282,2667.258621,689.324612,840.183174,2738.685252,672.665361,1108.588353,...,942.492654,919.248109,938.576788,0.738246,1.209509,0.749879,0.91399,0.716687,1.181138,0.15836


In [25]:
# Write the dataset with the Area column to disk, no row index.
area_model.to_csv(path_or_buf="plot_model/area_model.csv", index=False)

## Add vowel-space (VS) centroid and vowel-category centroids by speaker

In [17]:
# Per-participant centroid: mean of F1.50_norm / F2.50_norm over all tokens.
cent = (
    df.groupby("Participant")[["F1.50_norm", "F2.50_norm"]]
    .mean()
    .reset_index()
    .rename(columns={"F1.50_norm": "F1_mean", "F2.50_norm": "F2_mean"})
)
cent

Unnamed: 0,Participant,F1_mean,F2_mean
0,p112,0.506871,1.426428
1,p113,0.574029,1.37042
2,p115,0.568055,1.340613
3,p119,0.555661,1.418055
4,p120,0.55256,1.339259
5,p124,0.522945,1.453591
6,s001,0.444156,1.637488
7,s002,0.450072,1.601071
8,s051,0.489975,1.640119
9,s053,0.432863,1.638171


In [18]:
# Per-participant, per-vowel means of F1.50_norm / F2.50_norm.
vowel_means = (
    df.groupby(["Participant", "Vowel"])[["F1.50_norm", "F2.50_norm"]]
    .mean()
    .reset_index()
    .rename(columns={"F1.50_norm": "F1_v", "F2.50_norm": "F2_v"})
)
vowel_means.head()

Unnamed: 0,Participant,Vowel,F1_v,F2_v
0,p112,a,0.627743,1.346651
1,p112,e,0.452494,1.827237
2,p112,i,0.387879,2.276736
3,p112,o,0.43236,0.897083
4,p112,u,0.380769,0.813812


In [19]:
# Put each vowel mean next to its participant's centroid.
dispersion = pd.merge(vowel_means, cent, how="outer", on="Participant")
dispersion.sample(5)

Unnamed: 0,Participant,Vowel,F1_v,F2_v,F1_mean,F2_mean
6,p113,e,0.557605,1.75044,0.574029,1.37042
54,s055,u,0.368131,1.113534,0.47494,1.585498
55,s056,a,0.650558,1.62631,0.448972,1.631898
10,p115,a,0.682383,1.537743,0.568055,1.340613
39,s002,u,0.331139,1.143181,0.450072,1.601071


In [21]:
import numpy as np

# Euclidean distance from each vowel mean to the participant's centroid
# (squaring makes the sign of the difference irrelevant).
dispersion["Dispersion_v"] = np.sqrt(
    (dispersion["F1_mean"] - dispersion["F1_v"]) ** 2
    + (dispersion["F2_mean"] - dispersion["F2_v"]) ** 2
)
# Participant-level dispersion: the mean of those per-vowel distances.
disperse = (
    dispersion.groupby("Participant")["Dispersion_v"]
    .mean()
    .reset_index()
    .rename(columns={"Dispersion_v": "Dispersion"})
)
disperse

Unnamed: 0,Participant,Dispersion
0,p112,0.513585
1,p113,0.361897
2,p115,0.231415
3,p119,0.348556
4,p120,0.335895
5,p124,0.376462
6,s001,0.319485
7,s002,0.415879
8,s051,0.364521
9,s053,0.426683


In [26]:
# Attach each participant's dispersion to every token row of the dataset.
disp_model = pd.merge(df, disperse, how="outer", on="Participant")
disp_model.sample(5)

Unnamed: 0,Participant,Vowel,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,...,Delta F.50,Delta F.25,Delta F.75,F1.50_norm,F2.50_norm,F1.25_norm,F2.25_norm,F1.75_norm,F2.75_norm,Dispersion
436,p119,a,767.796459,1593.865521,2267.413095,906.944069,1785.014436,2977.308279,737.792737,1595.886732,...,1084.219365,1073.12413,1116.834981,0.708156,1.470058,0.845144,1.663381,0.66061,1.428937,0.348556
2155,s001,o,451.374176,1384.861925,2519.489421,455.245027,1390.676055,2582.521616,430.935219,1427.82889,...,966.374551,959.13929,962.433837,0.46708,1.433049,0.474639,1.449921,0.447756,1.483561,0.319485
2271,s002,a,624.324819,1668.643788,2845.700972,713.182396,1695.916039,2910.936903,1318.587822,2368.662874,...,1046.087601,1039.69639,1046.921418,0.596819,1.595128,0.685953,1.631165,1.259491,2.262503,0.415879
2108,s001,i,325.732626,2151.224239,2734.843928,377.134311,2130.958011,2753.253901,330.399432,2060.386435,...,966.374551,959.13929,962.433837,0.337067,2.226077,0.393201,2.22174,0.343296,2.140808,0.319485
2181,s001,o,455.058337,1078.262643,2314.317285,445.005419,1175.187545,2422.890899,472.050256,1110.584194,...,966.374551,959.13929,962.433837,0.470892,1.115781,0.463963,1.225252,0.490476,1.153933,0.319485


In [27]:
# Write the dataset with the Dispersion column to disk, no row index.
disp_model.to_csv(path_or_buf="plot_model/disp_model.csv", index=False)

In [28]:
# Plotting table, part 1: vowel means per Corpus x Gender x Vowel.
disp_plot1 = (
    df.groupby(["Corpus", "Gender", "Vowel"])[["F1.50_norm", "F2.50_norm"]]
    .mean()
    .reset_index()
    .rename(columns={"F1.50_norm": "F1_v", "F2.50_norm": "F2_v"})
)
disp_plot1

Unnamed: 0,Corpus,Gender,Vowel,F1_v,F2_v
0,CBAS,Female,a,0.690153,1.4404
1,CBAS,Female,e,0.522196,1.639447
2,CBAS,Female,i,0.363788,1.509845
3,CBAS,Female,o,0.513432,1.00499
4,CBAS,Female,u,0.353459,1.000293
5,CBAS,Male,a,0.642233,1.350621
6,CBAS,Male,e,0.495278,1.718878
7,CBAS,Male,i,0.420471,2.118412
8,CBAS,Male,o,0.446486,0.948819
9,CBAS,Male,u,0.384613,0.914845


In [29]:
# Plotting table, part 2: centroid (overall mean) per Corpus x Gender.
disp_plot2 = (
    df.groupby(["Corpus", "Gender"])[["F1.50_norm", "F2.50_norm"]]
    .mean()
    .reset_index()
    .rename(columns={"F1.50_norm": "F1_cent", "F2.50_norm": "F2_cent"})
)
disp_plot2

Unnamed: 0,Corpus,Gender,F1_cent,F2_cent
0,CBAS,Female,0.554748,1.373808
1,CBAS,Male,0.531152,1.422261
2,DIMEx100,Female,0.461986,1.623247
3,DIMEx100,Male,0.447281,1.618256


In [30]:
# Pair every vowel mean with the centroid of its Corpus x Gender group.
disp_plot = pd.merge(
    disp_plot1,
    disp_plot2,
    how="outer",
    on=["Corpus", "Gender"],
)
disp_plot

Unnamed: 0,Corpus,Gender,Vowel,F1_v,F2_v,F1_cent,F2_cent
0,CBAS,Female,a,0.690153,1.4404,0.554748,1.373808
1,CBAS,Female,e,0.522196,1.639447,0.554748,1.373808
2,CBAS,Female,i,0.363788,1.509845,0.554748,1.373808
3,CBAS,Female,o,0.513432,1.00499,0.554748,1.373808
4,CBAS,Female,u,0.353459,1.000293,0.554748,1.373808
5,CBAS,Male,a,0.642233,1.350621,0.531152,1.422261
6,CBAS,Male,e,0.495278,1.718878,0.531152,1.422261
7,CBAS,Male,i,0.420471,2.118412,0.531152,1.422261
8,CBAS,Male,o,0.446486,0.948819,0.531152,1.422261
9,CBAS,Male,u,0.384613,0.914845,0.531152,1.422261


In [31]:
# Write the dispersion plotting table to disk, no row index.
disp_plot.to_csv(path_or_buf="plot_model/disp_plot.csv", index=False)

## /a/ reduction across stress

In [34]:
# Keep only /a/ tokens (as an independent copy), then average the normalised
# formants per Corpus x Gender x stress for plotting.
a = df.loc[df["Vowel"] == "a"].copy()
a_stress = (
    a.groupby(["Corpus", "Gender", "stress"])[["F1.50_norm", "F2.50_norm"]]
    .mean()
    .reset_index()
)
a_stress

Unnamed: 0,Corpus,Gender,stress,F1.50_norm,F2.50_norm
0,CBAS,Female,stressed,0.704012,1.445211
1,CBAS,Female,unstressed,0.683834,1.438206
2,CBAS,Male,stressed,0.659799,1.363305
3,CBAS,Male,unstressed,0.633798,1.344531
4,DIMEx100,Female,stressed,0.646446,1.536986
5,DIMEx100,Female,unstressed,0.620249,1.558152
6,DIMEx100,Male,stressed,0.595249,1.528071
7,DIMEx100,Male,unstressed,0.584057,1.533586


In [36]:
# Write the /a/ stress means (plotting table) to disk, no row index.
a_stress.to_csv(path_or_buf="plot_model/a_stress_plot.csv", index=False)

In [37]:
# Write the token-level /a/ rows (modelling table) to disk, no row index.
a.to_csv(path_or_buf="plot_model/a_stress_model.csv", index=False)

## Consonantal context

In [38]:
# Tokens whose preceding phone is t, p or s, taken as an independent copy so
# later column additions do not touch `df`.
cons_context = df[df["prev_ph"].isin(["t", "p", "s"])].copy()
len(cons_context)

1039

In [39]:
# Non-null counts of every column per Corpus x preceding phone x Vowel.
(
    cons_context
    .groupby(by=["Corpus", "prev_ph", "Vowel"])
    .count()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Participant,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,F3.75,...,stress,Delta F.50,Delta F.25,Delta F.75,F1.50_norm,F2.50_norm,F1.25_norm,F2.25_norm,F1.75_norm,F2.75_norm
Corpus,prev_ph,Vowel,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
CBAS,p,a,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
CBAS,p,e,13,13,13,13,13,13,13,13,13,13,...,13,13,13,13,13,13,13,13,13,13
CBAS,p,i,9,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
CBAS,p,o,18,18,18,18,18,18,18,18,18,18,...,18,18,18,18,18,18,18,18,18,18
CBAS,p,u,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5
CBAS,s,a,66,66,66,66,66,66,66,66,66,66,...,66,66,66,66,66,66,66,66,66,66
CBAS,s,e,80,80,80,80,80,80,80,80,80,80,...,80,80,80,80,80,80,80,80,80,80
CBAS,s,i,31,31,31,31,31,31,31,31,31,31,...,31,31,31,31,31,31,31,31,31,31
CBAS,s,o,33,33,33,33,33,33,33,33,33,33,...,33,33,33,33,33,33,33,33,33,33
CBAS,s,u,10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10


In [48]:
# Duration columns for the consonantal-context subset.
cons_context["Dur_ms"] = cons_context["dur_ph"] * 1000  # dur_ph scaled by 1000
cons_context["Dur_norm"] = cons_context["Speech Rate"] * cons_context["Dur_ms"]
cons_context["Dur_log"] = cons_context["Dur_ms"].apply(math.log)  # natural log

In [49]:
# Write the token-level consonantal-context table to disk, no row index.
cons_context.to_csv(path_or_buf="plot_model/cons_cont.csv", index=False)

In [45]:
# Plotting table: mean normalised formants and mean duration per
# Corpus x Gender x Vowel x preceding phone.
cons_plot = (
    cons_context
    .groupby(["Corpus", "Gender", "Vowel", "prev_ph"])[["F1.50_norm", "F2.50_norm", "dur_ph"]]
    .mean()
    .reset_index()
)
cons_plot["Dur_ms"] = cons_plot["dur_ph"] * 1000  # mean dur_ph scaled by 1000
cons_plot

Unnamed: 0,Corpus,Gender,Vowel,prev_ph,F1.50_norm,F2.50_norm,dur_ph,Dur_ms
0,CBAS,Female,a,p,0.636944,1.574071,0.065,65.0
1,CBAS,Female,a,s,0.655539,1.403049,0.099048,99.047619
2,CBAS,Female,a,t,0.680445,1.489619,0.141667,141.666667
3,CBAS,Female,e,p,0.501689,1.678223,0.068889,68.888889
4,CBAS,Female,e,s,0.525875,1.555358,0.088269,88.269231
5,CBAS,Female,e,t,0.514525,1.615135,0.099184,99.183673
6,CBAS,Female,i,p,0.324221,1.393843,0.097143,97.142857
7,CBAS,Female,i,s,0.362757,1.859344,0.072857,72.857143
8,CBAS,Female,i,t,0.380081,2.131091,0.09,90.0
9,CBAS,Female,o,p,0.478317,0.764494,0.1175,117.5


In [46]:
# Write the consonantal-context plotting table to disk, no row index.
cons_plot.to_csv(path_or_buf="plot_model/cons_cont_plot.csv", index=False)