# Get Formant Trajectories

This notebook is adapted from a script by [Isaac Bleaman](https://www.isaacbleaman.com/) and utilizes `parselmouth` to extract formant measures from a .wav file with corresponding TextGrid. 

For English data, with transcriptions generated from the CMU pronunciation dictionary (where vowels are labels with numbers to indicate stress), we can substitute the regular expression <pre>r'\d+$'</pre>

In [1]:
import parselmouth
from parselmouth.praat import call
import os
import re
import csv
import pandas as pd
from audiolabel import read_label

## First get formants for female speakers, setting max formant to 5500 Hz

In [15]:
# directory of MFA'ed TextGrids
tg_path = 'data/cbas_english/female/'
# directory of wav files
wav_path = 'data/cbas_english/female/'
# directory to save vowel measurement data
data_path = 'data/'


# names of tiers with phonemes of interest
phones_tiers = ['default - phones']

# formant analysis parameters
time_step = 0.1
maximum_number_of_formants = 5
maximum_formant = 5500 # Hz for adult female
window_length = 0.025 # seconds
preemphasis_from = 50

# take formant measurements every X seconds?
segment_window = 0.005 # 5 milliseconds

In [16]:
# list (of dictionaries) to save results to
data = []

# counter to keep track of how long this script takes to run
file_counter = 1

for file in os.listdir(wav_path):
    if file.endswith('.wav'):
        
        # Check if corresponding TextGrid exists
        if os.path.exists(tg_path + file[:-4] + '.TextGrid'):
            
            # Print out which file we're currently working on
            print(file_counter, 'Processing file', file)
            file_counter += 1
            
            # Create sound object
            wav = parselmouth.Sound(wav_path + file)
            
            # Create formant object
            formant = wav.to_formant_burg(time_step, maximum_number_of_formants, maximum_formant, window_length, preemphasis_from)
            
            # Open textgrid
            tg = parselmouth.Data.read(tg_path + file[:-4] + '.TextGrid')
            
            # Iterate over the tiers and find the ones w/ phonemes
            numTiers = call(tg, 'Get number of tiers')
            for tierIdx in range(1, numTiers + 1): # Praat counts from 1, not 0
                tierName = call(tg, 'Get tier name', tierIdx)
                
                if tierName in phones_tiers:
                    numIntervals = call(tg, 'Get number of intervals', tierIdx)
                    
                    for intervalIdx in range(1, numIntervals + 1):
                        label = call(tg, 'Get label of interval', tierIdx, intervalIdx)
                        
                        # check if interval label is a vowel,
                        # i.e., it is not blank, and it ends in a digit
                        if label and re.search(r'\d+$', label):
                            start = call(tg, 'Get starting point', tierIdx, intervalIdx)
                            end = call(tg, 'Get end point', tierIdx, intervalIdx)
                            dur = end - start
                            
                            # number of X millisecond segments to take measurements from
                            numSegments = int(dur / segment_window) # note: this must be an integer
                            
                            # loop over segments and calculate F1/F2
                            for segmentIdx in range(1, numSegments + 1):
                                spot = start + (segmentIdx * segment_window)
                                tim_int = spot - start
                                
                                f1 = call(formant, 'Get value at time', 1, spot, 'Hertz', 'Linear')
                                f2 = call(formant, 'Get value at time', 2, spot, 'Hertz', 'Linear')
                                f3 = call(formant, 'Get value at time', 3, spot, 'Hertz', 'Linear')
                                
                                # add to our data set
                                data.append({'Video ID': file[:-4],
                                             'Vowel': label,
                                             't1_ph': start,
                                             'Time_of_formant_measurements': spot,
                                             'Time_from_vowel_onset': tim_int,
                                             'F1': f1,
                                             'F2': f2,
                                             'F3': f3,
                                             'Tier': tierName, # necessary so we can name the speaker
                                            })
        
print('Done')

1 Processing file p111_english.wav
2 Processing file p113_english.wav
3 Processing file p114_english.wav
4 Processing file p117_english.wav
5 Processing file p118_english.wav
6 Processing file p120_english.wav
7 Processing file p121_english.wav
8 Processing file p122_english.wav
9 Processing file p123_english.wav
10 Processing file p124_english.wav
11 Processing file p126_english.wav
Done


In [17]:
data[0]

{'Video ID': 'p111_english',
 'Vowel': 'AO1',
 't1_ph': 20.936,
 'Time_of_formant_measurements': 20.941,
 'Time_from_vowel_onset': 0.004999999999999005,
 'F1': 730.3533283028938,
 'F2': 1256.4168723083967,
 'F3': 3199.721207777982,
 'Tier': 'default - phones'}

Now to add in male data.

In [19]:
# directory of MFA'ed TextGrids
tg_path = 'data/cbas_english/male/'
# directory of wav files
wav_path = 'data/cbas_english/male/'
# directory to save vowel measurement data
data_path = 'data/'


# names of tiers with phonemes of interest
phones_tiers = ['default - phones']

# formant analysis parameters
time_step = 0.1
maximum_number_of_formants = 5
maximum_formant = 5000 # Hz for adult male
# counter to keep track of how long this script takes to run
file_counter = 1

for file in os.listdir(wav_path):
    if file.endswith('.wav'):
        
        # Check if corresponding TextGrid exists
        if os.path.exists(tg_path + file[:-4] + '.TextGrid'):
            
            # Print out which file we're currently working on
            print(file_counter, 'Processing file', file)
            file_counter += 1
            
            # Create sound object
            wav = parselmouth.Sound(wav_path + file)
            
            # Create formant object
            formant = wav.to_formant_burg(time_step, maximum_number_of_formants, maximum_formant, window_length, preemphasis_from)
            
            # Open textgrid
            tg = parselmouth.Data.read(tg_path + file[:-4] + '.TextGrid')
            
            # Iterate over the tiers and find the ones w/ phonemes
            numTiers = call(tg, 'Get number of tiers')
            for tierIdx in range(1, numTiers + 1): # Praat counts from 1, not 0
                tierName = call(tg, 'Get tier name', tierIdx)
                
                if tierName in phones_tiers:
                    numIntervals = call(tg, 'Get number of intervals', tierIdx)
                    
                    for intervalIdx in range(1, numIntervals + 1):
                        label = call(tg, 'Get label of interval', tierIdx, intervalIdx)
                        
                        # check if interval label is a vowel,
                        # i.e., it is not blank, and it ends in a digit
                        if label and re.search(r'\d+$', label):
                            start = call(tg, 'Get starting point', tierIdx, intervalIdx)
                            end = call(tg, 'Get end point', tierIdx, intervalIdx)
                            dur = end - start
                            
                            # number of X millisecond segments to take measurements from
                            numSegments = int(dur / segment_window) # note: this must be an integer
                            
                            # loop over segments and calculate F1/F2
                            for segmentIdx in range(1, numSegments + 1):
                                spot = start + (segmentIdx * segment_window)
                                tim_int = spot - start
                                
                                f1 = call(formant, 'Get value at time', 1, spot, 'Hertz', 'Linear')
                                f2 = call(formant, 'Get value at time', 2, spot, 'Hertz', 'Linear')
                                f3 = call(formant, 'Get value at time', 3, spot, 'Hertz', 'Linear')
                                
                                # add to our data set
                                data.append({'Video ID': file[:-4],
                                             'Vowel': label,
                                             't1_ph': start,
                                             'Time_of_formant_measurements': spot,
                                             'Time_from_vowel_onset': tim_int,
                                             'F1': f1,
                                             'F2': f2,
                                             'F3': f3,
                                             'Tier': tierName, # necessary so we can name the speaker
                                            })
        
print('Done')

1 Processing file p119_english.wav
Done


In [20]:
# to df
eng = pd.DataFrame(data)
eng["Participant"] = eng['Video ID'].apply(lambda x: x[:4])
eng["phone"] = eng["Vowel"]
eng.head()

Unnamed: 0,Video ID,Vowel,t1_ph,Time_of_formant_measurements,Time_from_vowel_onset,F1,F2,F3,Tier,Participant,phone
0,p111_english,AO1,20.936,20.941,0.005,730.353328,1256.416872,3199.721208,default - phones,p111,AO1
1,p111_english,AO1,20.936,20.946,0.01,702.264207,1198.106035,3187.823037,default - phones,p111,AO1
2,p111_english,AO1,20.936,20.951,0.015,685.222735,1159.214113,3177.774615,default - phones,p111,AO1
3,p111_english,AO1,20.936,20.956,0.02,686.09921,1151.817316,3170.726259,default - phones,p111,AO1
4,p111_english,AO1,20.936,20.961,0.025,686.975685,1144.42052,3163.677904,default - phones,p111,AO1


In [21]:
# get info from textgrids

fem_tg = pd.DataFrame({
    'relpath': 'data/cbas_english/female/',
    'fname': ['p111_english.TextGrid',
              'p113_english.TextGrid',
              'p114_english.TextGrid',
              'p117_english.TextGrid',
              'p118_english.TextGrid',
              'p120_english.TextGrid',
              'p121_english.TextGrid',
              'p122_english.TextGrid',
              'p123_english.TextGrid',
              'p124_english.TextGrid',
              'p126_english.TextGrid'],
    'Participant': ['p111', 'p113', 'p114', 'p117','p118', 'p120','p121', 'p122', 'p123', 'p124', 'p126']
})

male_tg = pd.DataFrame({
    'relpath': 'data/cbas_english/male/',
    'fname': ['p119_english.TextGrid'],
    'Participant': ['p119']
})

tgdf = pd.concat([fem_tg, male_tg], ignore_index = True)

In [22]:
# inputs 

def tg2df(row):
    '''Load 'phone' and 'word' tiers from a textgrid and merge them.
    
    Parameters
    ----------
    
    row: named tuple
    A namedtuple as provided by `itertuples` that can be used to load a Praat
    textgrid from a path identified by row.relpath and row.fname. The textgrid is
    expected to have 'phone' and 'word' tiers.

    Returns
    -------
    
    mergedf: the merged dataframe.
    '''
    [wddf, phdf] = read_label(
        os.path.join(row.relpath, row.fname).replace("\\","/"),
        ftype='praat',
        tiers=['default - words', 'default - phones']
    )
    # Throw an error if tiers are not strictly hierarchical.
    # words contain phones
    assert(wddf.t1.isin(phdf.t1).all())
    assert(wddf.t2.isin(phdf.t2).all())
    
    # Add phone duration and speaker
    phdf['dur_ph'] = phdf.t2 - phdf.t1
    phdf['Participant'] = row.Participant

    # Merge phone and word tiers.
    phwddf = pd.merge_asof(
        phdf.rename({'t1': 't1_ph', 't2': 't2_ph'}, axis='columns'),
        wddf.drop('fname', axis='columns') \
            .rename({'t1': 't1_wd', 't2': 't2_wd'}, axis='columns'),
        left_on='t1_ph',
        right_on='t1_wd'
    )

    # Add word-init and -final columns
    phwddf['is_wdinit_ph'] = phwddf.t1_ph == phwddf.t1_wd
    phwddf['is_wdfin_ph'] = phwddf.t2_ph == phwddf.t2_wd

    # Merge context tier and return the result.
    return phwddf

In [23]:
dflist = [tg2df(row) for row in tgdf.itertuples()]

In [24]:
english_df = pd.concat(dflist, ignore_index=True)
english_df = english_df.rename(columns={"label_x": "phone",
                                       "label_y": "word"})
english_df.sample(10)

Unnamed: 0,t1_ph,t2_ph,phone,fname,dur_ph,Participant,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph
2384,34.27,34.4,V,data/cbas_english/female/p126_english.TextGrid,0.13,p126,34.27,34.92,vexing,True,False
1965,48.63,48.66,R,data/cbas_english/female/p123_english.TextGrid,0.03,p123,48.55,48.73,for,False,False
2,20.716,20.866,S,data/cbas_english/female/p111_english.TextGrid,0.15,p111,20.716,21.246,stall,True,False
2632,50.683,50.713,F,data/cbas_english/male/p119_english.TextGrid,0.03,p119,50.423,50.713,leaf,False,True
1893,30.188,30.348,IY1,data/cbas_english/female/p123_english.TextGrid,0.16,p123,29.968,30.468,steel,False,False
2469,56.959,57.069,N,data/cbas_english/female/p126_english.TextGrid,0.11,p126,56.189,57.069,zip-line,False,True
2644,53.86,53.91,F,data/cbas_english/male/p119_english.TextGrid,0.05,p119,53.86,53.99,for,True,False
2685,64.457,64.547,P,data/cbas_english/male/p119_english.TextGrid,0.09,p119,64.237,64.877,zip-line,False,False
837,63.97,64.03,ER0,data/cbas_english/female/p117_english.TextGrid,0.06,p117,63.89,64.03,for,False,True
58,47.671,47.751,IY1,data/cbas_english/female/p111_english.TextGrid,0.08,p111,47.581,47.881,least,False,False


In [25]:
english_df['prev_ph'] = english_df.phone.shift(1).fillna('')
english_df['next_ph'] = english_df.phone.shift(-1).fillna('')
english_df = english_df[english_df["phone"]!=""]
english_df = english_df.reset_index(drop = True)

In [26]:
english_df.sample(20)

Unnamed: 0,t1_ph,t2_ph,phone,fname,dur_ph,Participant,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph
314,33.24,33.49,S,data/cbas_english/female/p113_english.TextGrid,0.25,p113,33.24,33.66,sue,True,False,sp,UW1
508,49.8,50.0,ER0,data/cbas_english/female/p114_english.TextGrid,0.2,p114,49.55,50.0,dinner,False,True,N,sp
728,36.734,36.804,V,data/cbas_english/female/p117_english.TextGrid,0.07,p117,36.734,37.174,very,True,False,sp,EH1
480,41.26,41.4,L,data/cbas_english/female/p114_english.TextGrid,0.14,p114,40.86,41.4,call,False,True,AO1,sp
1941,59.458,60.725,sp,data/cbas_english/female/p123_english.TextGrid,1.267,p123,59.458,60.725,,True,True,L,Z
2233,22.433,22.483,sil,data/cbas_english/female/p126_english.TextGrid,0.05,p126,21.34,22.483,,False,True,,L
85,54.793,54.903,HH,data/cbas_english/female/p111_english.TextGrid,0.11,p111,54.793,55.213,hardened,True,False,sp,AA1
2280,30.998,31.328,sil,data/cbas_english/female/p126_english.TextGrid,0.33,p126,30.39,31.328,,False,True,,Z
1573,17.17,17.26,S,data/cbas_english/female/p122_english.TextGrid,0.09,p122,16.96,17.32,least,False,False,IY1,T
960,28.999,29.269,S,data/cbas_english/female/p118_english.TextGrid,0.27,p118,28.999,29.689,small,True,False,sp,M


In [27]:
# merge tg and formant data

english_vowels = eng.merge(english_df, on = ['phone', 't1_ph', "Participant"])

In [28]:
langdom = pd.DataFrame({"Participant": ['p111', 'p113', 'p114', 'p117','p118', 'p119', 'p120', 'p121', 'p122', 'p123', 'p124', 'p126'],
                       "Dom": ["L2", "biling", "biling", "L2", "biling", "biling", "biling", "L2", "biling", "L2", "biling", "L2"]})
english_vowels = english_vowels.merge(langdom, on = ["Participant"])
english_vowels.sample(10)

Unnamed: 0,Video ID,Vowel,t1_ph,Time_of_formant_measurements,Time_from_vowel_onset,F1,F2,F3,Tier,Participant,...,fname,dur_ph,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph,Dom
9858,p122_english,AO1,3.516,3.621,0.105,661.282068,1003.028915,3257.893247,default - phones,p122,...,data/cbas_english/female/p122_english.TextGrid,0.21,3.206,3.866,stall,False,False,T,L,biling
5124,p117_english,UW1,44.21,44.33,0.12,380.525164,963.19441,2582.310827,default - phones,p117,...,data/cbas_english/female/p117_english.TextGrid,0.24,43.96,44.45,sue,False,True,S,sp,L2
15763,p119_english,ER0,25.24,25.25,0.01,442.048578,1220.251045,1743.40621,default - phones,p119,...,data/cbas_english/male/p119_english.TextGrid,0.07,25.16,25.31,for,False,True,F,D,biling
13106,p124_english,IY0,25.479,25.554,0.075,456.141697,2904.713527,2933.258797,default - phones,p124,...,data/cbas_english/female/p124_english.TextGrid,0.23,25.159,25.709,very,False,True,R,sp,biling
7912,p120_english,UW1,56.84,56.88,0.04,484.876455,1180.860974,2540.894577,default - phones,p120,...,data/cbas_english/female/p120_english.TextGrid,0.1,56.72,57.26,tool,False,False,T,L,biling
9871,p122_english,AO1,3.516,3.686,0.17,667.943556,1004.657741,3641.833786,default - phones,p122,...,data/cbas_english/female/p122_english.TextGrid,0.21,3.206,3.866,stall,False,False,T,L,biling
13173,p124_english,IY1,27.645,27.74,0.095,542.040214,1859.949384,3051.520675,default - phones,p124,...,data/cbas_english/female/p124_english.TextGrid,0.12,27.405,27.915,steel,False,False,T,L,biling
11778,p123_english,IY0,28.224,28.234,0.01,561.399976,2274.318153,2438.347131,default - phones,p123,...,data/cbas_english/female/p123_english.TextGrid,0.19,27.864,28.414,very,False,True,R,sp,L2
15207,p126_english,AY2,56.829,56.844,0.015,780.314723,1322.444707,2825.008065,default - phones,p126,...,data/cbas_english/female/p126_english.TextGrid,0.13,56.189,57.069,zip-line,False,False,L,N,L2
9666,p121_english,IY1,73.41,73.585,0.175,561.18697,1575.577437,3254.401987,default - phones,p121,...,data/cbas_english/female/p121_english.TextGrid,0.18,73.33,73.82,meal,False,False,M,L,L2


In [29]:
import numpy as np

In [30]:
english_vowels['is_stress'] = english_vowels['Vowel'].apply(lambda x: 1 if x.endswith("1") else 0)
english_vowels.sample(15)

Unnamed: 0,Video ID,Vowel,t1_ph,Time_of_formant_measurements,Time_from_vowel_onset,F1,F2,F3,Tier,Participant,...,dur_ph,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph,Dom,is_stress
7729,p120_english,UW1,47.14,47.39,0.25,446.804278,1407.656466,2826.615177,default - phones,p120,...,0.31,46.86,47.45,sue,False,True,S,sp,biling,1
14750,p126_english,IY0,35.92,36.015,0.095,319.101883,2565.682086,3009.134653,default - phones,p126,...,0.12,35.68,36.04,heavy,False,True,V,T,L2,0
12783,p124_english,UW1,10.15,10.215,0.065,252.543132,1233.1054,2878.5808,default - phones,p124,...,0.07,9.99,10.5,looming,False,False,L,M,biling,1
15040,p126_english,UH1,50.661,50.706,0.045,927.429048,1508.891535,3358.95204,default - phones,p126,...,0.13,50.621,50.921,bull,False,False,B,L,L2,1
3425,p114_english,IY0,51.977,52.142,0.165,472.891055,2846.600759,2937.464179,default - phones,p114,...,0.35,51.627,52.327,very,False,True,R,sp,biling,0
16059,p119_english,EH1,45.07,45.09,0.02,663.503189,1923.04231,2721.585576,default - phones,p119,...,0.08,44.95,45.51,vexing,False,False,V,K,biling,1
3291,p114_english,ER0,49.46,49.515,0.055,399.979931,1881.221815,2579.027584,default - phones,p114,...,0.09,49.36,49.55,for,False,True,R,D,biling,0
4453,p114_english,UW1,93.248,93.418,0.17,614.500746,1073.945737,3481.73117,default - phones,p114,...,0.21,93.008,93.558,mule,False,False,Y,L,biling,1
1080,p111_english,IY0,90.842,90.867,0.025,497.122316,2005.094319,3000.061617,default - phones,p111,...,0.18,90.382,91.022,vastly,False,True,L,sp,L2,0
15505,p119_english,ER0,14.513,14.673,0.16,761.091858,1876.001372,2507.152027,default - phones,p119,...,0.25,14.223,14.763,figure,False,True,Y,sp,biling,0


In [31]:
# create new col called `Vowel_bare` which is vowel identity minus stress
english_vowels["Vowel_bare"] = english_vowels["Vowel"].apply(lambda x: x[:-1])
english_vowels.sample(5)

Unnamed: 0,Video ID,Vowel,t1_ph,Time_of_formant_measurements,Time_from_vowel_onset,F1,F2,F3,Tier,Participant,...,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph,Dom,is_stress,Vowel_bare
15371,p119_english,ER0,10.68,10.69,0.01,438.783182,1730.812865,2600.995066,default - phones,p119,...,10.34,10.88,treasure,False,True,ZH,sp,biling,0,ER
10919,p122_english,ER0,53.62,53.625,0.005,643.244169,1617.67031,2377.304165,default - phones,p122,...,53.5,53.68,for,False,True,R,D,biling,0,ER
10934,p122_english,AO1,53.76,53.78,0.02,692.124585,1199.757109,2912.670682,default - phones,p122,...,53.68,54.27,dogs,False,False,D,G,biling,1,AO
14962,p126_english,AA1,42.278,42.343,0.065,826.765379,1299.580221,2556.191568,default - phones,p126,...,42.218,42.698,dogs,False,False,D,G,L2,1,AA
11651,p123_english,EH1,24.437,24.472,0.035,738.810307,1596.98946,2242.064824,default - phones,p123,...,24.267,24.857,zest,False,False,Z,S,L2,1,EH


In [32]:
avg_dur = pd.DataFrame(english_vowels.groupby(["Participant"])["dur_ph"].mean())
avg_dur = avg_dur.rename(columns={"dur_ph": "avg_dur"})
avg_dur = avg_dur.reset_index(drop = False)
avg_dur.head()

Unnamed: 0,Participant,avg_dur
0,p111,0.121433
1,p113,0.148272
2,p114,0.176165
3,p117,0.130144
4,p118,0.133981


In [33]:
english_vowels = english_vowels.merge(avg_dur, on = ["Participant"])
english_vowels.sample(10)

Unnamed: 0,Video ID,Vowel,t1_ph,Time_of_formant_measurements,Time_from_vowel_onset,F1,F2,F3,Tier,Participant,...,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph,Dom,is_stress,Vowel_bare,avg_dur
4540,p117_english,UW1,13.99,14.055,0.065,397.079834,1412.719458,2356.78044,default - phones,p117,...,14.28,drew,False,True,R,sp,L2,1,UW,0.130144
8801,p121_english,AO1,34.916,35.156,0.24,630.52529,936.753718,3422.298677,default - phones,p121,...,35.456,call,False,False,K,L,L2,1,AO,0.15769
16236,p119_english,IY0,52.333,52.418,0.085,785.849234,1856.131508,2704.129384,default - phones,p119,...,52.503,zucchini,False,True,N,sp,biling,0,IY,0.126013
1699,p113_english,IY1,24.23,24.325,0.095,443.504217,2370.568534,2867.226208,default - phones,p113,...,24.53,leeks,False,False,L,K,biling,1,IY,0.148272
7679,p120_english,AO1,43.446,43.506,0.06,550.299235,971.089463,2626.73615,default - phones,p120,...,43.776,small,False,False,M,L,biling,1,AO,0.140266
5870,p118_english,IH0,10.97,10.99,0.02,423.203408,1265.788013,2941.764757,default - phones,p118,...,11.19,looming,False,False,M,NG,biling,0,IH,0.133981
8155,p120_english,AE1,67.395,67.48,0.085,720.857023,1801.911087,2721.631226,default - phones,p120,...,67.975,vapid,False,False,V,P,biling,1,AE,0.140266
2089,p113_english,EH1,41.901,41.951,0.05,748.071277,1586.780987,2200.186195,default - phones,p113,...,42.471,vexing,False,False,V,K,biling,1,EH,0.148272
7045,p118_english,AY2,58.454,58.574,0.12,582.9589,2082.261536,3134.265977,default - phones,p118,...,58.724,zip-line,False,False,L,N,biling,0,AY,0.133981
3164,p114_english,IY1,44.21,44.315,0.105,428.305203,1519.810522,2773.629633,default - phones,p114,...,44.51,least,False,False,L,S,biling,1,IY,0.176165


In [34]:
english_vowels.to_csv("data/english_vowels.csv", index = False)

In [15]:
df = pd.read_csv("data/english_vowels.csv")
df_no126 = df[df["Participant"]!="p126"].copy()
df_no126.groupby(["t1_ph","Participant"])["Vowel"].count().shape

(711,)