# Create .csv for CBAS word list data

Dataframes for female and male data are concatenated to yield a single dataframe of word list elicitations from CBAS.

In [2]:
import pandas as pd
import os
from audiolabel import read_label

## Import textgrids

In [15]:
cbasdf = pd.DataFrame({
    'relpath': 'textgrids/cbas',
    'fname': ['p112.TextGrid',
              'p119.TextGrid',
              'p113.TextGrid',
              'p115.TextGrid',
              'p120.TextGrid',
              'p124.TextGrid'],
    'subject': ['p112', 'p119', 'p113', 'p115', 'p120', 'p124']
})

dimedf = pd.DataFrame({
    'relpath': 'textgrids/dime',
    'fname' : os.listdir("textgrids/dime")})
dimedf['subject'] = dimedf['fname'].apply(lambda x: x[:4])

tgdf = pd.concat([cbasdf, dimedf], ignore_index = True)

In [16]:
# inputs 

def tg2df(row):
    '''Load 'phone' and 'word' tiers from a textgrid and merge them.
    
    Parameters
    ----------
    
    row: named tuple
    A namedtuple as provided by `itertuples` that can be used to load a Praat
    textgrid from a path identified by row.relpath and row.fname. The textgrid is
    expected to have 'phone' and 'word' tiers.

    Returns
    -------
    
    mergedf: the merged dataframe.
    '''
    [wddf, phdf] = read_label(
        os.path.join(row.relpath, row.fname).replace("\\","/"),
        ftype='praat',
        tiers=['word', 'phone']
    )
    # Throw an error if tiers are not strictly hierarchical.
    # words contain phones
    assert(wddf.t1.isin(phdf.t1).all())
    assert(wddf.t2.isin(phdf.t2).all())
    
    # Add phone duration and speaker
    phdf['dur_ph'] = phdf.t2 - phdf.t1
    phdf['Participant'] = row.subject

    # Merge phone and word tiers.
    phwddf = pd.merge_asof(
        phdf.rename({'t1': 't1_ph', 't2': 't2_ph'}, axis='columns'),
        wddf.drop('fname', axis='columns') \
            .rename({'t1': 't1_wd', 't2': 't2_wd'}, axis='columns'),
        left_on='t1_ph',
        right_on='t1_wd'
    )

    # Add word-init and -final columns
    phwddf['is_wdinit_ph'] = phwddf.t1_ph == phwddf.t1_wd
    phwddf['is_wdfin_ph'] = phwddf.t2_ph == phwddf.t2_wd

    # Merge context tier and return the result.
    return phwddf

In [17]:
dflist = [tg2df(row) for row in tgdf.itertuples()]

In [33]:
alldf = pd.concat(dflist, ignore_index=True)

alldf.sample(10)

Unnamed: 0,t1_ph,t2_ph,phone,fname,dur_ph,Participant,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph
15693,0.846,0.933,a,textgrids/dime/s05331.TextGrid,0.087,s053,0.584,1.149,prosaica,False,False
18341,1.118,1.207,k,textgrids/dime/s05524.TextGrid,0.089,s055,1.118,1.444,cada,True,False
12627,2.776,2.832,i,textgrids/dime/s05124.TextGrid,0.056,s051,2.529,2.885,brasil,False,False
5139,238.143,238.273,a,textgrids/cbas/p120.TextGrid,0.13,p120,237.783,238.473,general,False,False
22448,4.358,4.454,n,textgrids/dime/s05644.TextGrid,0.096,s056,4.292,4.454,en,False,True
7735,0.255,0.371,a,textgrids/dime/s00131.TextGrid,0.116,s001,0.074,0.723,citatorios,False,False
17317,1.584,1.671,n,textgrids/dime/s05507.TextGrid,0.087,s055,1.584,1.758,no,True,False
4724,109.751,111.439,sp,textgrids/cbas/p120.TextGrid,1.688,p120,109.751,111.439,,True,True
2277,37.984,38.044,a,textgrids/cbas/p113.TextGrid,0.06,p113,37.834,38.534,amarillas,False,False
9173,1.695,1.781,p,textgrids/dime/s00211.TextGrid,0.086,s002,1.613,2.098,repu_7blica,False,False


Create cols `prev_ph` and `next_ph` containing previous and following phones.

In [34]:
alldf['prev_ph'] = alldf.phone.shift(1).fillna('')
alldf['next_ph'] = alldf.phone.shift(-1).fillna('')
alldf = alldf[alldf["phone"]!=""]
alldf = alldf.reset_index(drop = True)

In [35]:
alldf

Unnamed: 0,t1_ph,t2_ph,phone,fname,dur_ph,Participant,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph
0,0.000,0.710,sil,textgrids/cbas/p112.TextGrid,0.710,p112,0.000,0.710,,True,True,,b
1,0.710,0.820,b,textgrids/cbas/p112.TextGrid,0.110,p112,0.710,1.140,bajo,True,False,sil,a
2,0.820,0.920,a,textgrids/cbas/p112.TextGrid,0.100,p112,0.710,1.140,bajo,False,False,b,x
3,0.920,1.000,x,textgrids/cbas/p112.TextGrid,0.080,p112,0.710,1.140,bajo,False,False,a,o
4,1.000,1.140,o,textgrids/cbas/p112.TextGrid,0.140,p112,0.710,1.140,bajo,False,True,x,sp
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22107,5.368,5.432,k,textgrids/dime/s05650.TextGrid,0.064,s056,5.185,5.622,sector,False,False,e,t
22108,5.432,5.525,t,textgrids/dime/s05650.TextGrid,0.093,s056,5.185,5.622,sector,False,False,k,o
22109,5.525,5.584,o,textgrids/dime/s05650.TextGrid,0.059,s056,5.185,5.622,sector,False,False,t,r(
22110,5.584,5.622,r(,textgrids/dime/s05650.TextGrid,0.038,s056,5.185,5.622,sector,False,True,o,.sil


In [36]:
# save df

alldf.to_csv("data/all_phones.csv", index = False)

## Import formant data

In [37]:
# import cbas female
cbas_fem = pd.read_csv("data/cbas_female_formants.txt", sep = "\t")
cbas_fem = cbas_fem.rename(columns = {"Filename": "Participant",
                           "Segment label":"phone"})
cbas_fem["Gender"] = "Female"
cbas_fem["Corpus"] = "CBAS"

# import cbas male
cbas_male = pd.read_csv("data/cbas_male_formants.txt", sep = "\t")
cbas_male = cbas_male.rename(columns = {"Filename": "Participant",
                           "Segment label":"phone"})
cbas_male["Gender"] = "Male"
cbas_male["Corpus"] = "CBAS"

# combine cbas female and male
cbas = pd.concat([cbas_male, cbas_fem], ignore_index = True)
cbas.head()

Unnamed: 0,Participant,phone,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,F3.75,Gender,Corpus
0,p112,sil,1349.549295,2591.478429,3249.782912,1640.24642,2418.644376,3493.133471,703.933103,2157.879849,2892.455533,Male,CBAS
1,p112,b,538.241047,1256.248187,2799.236009,884.928365,2273.697186,3419.016247,203.380184,972.14597,2675.148493,Male,CBAS
2,p112,a,695.790976,1139.953282,2667.258621,689.324612,840.183174,2738.685252,672.665361,1108.588353,2626.355793,Male,CBAS
3,p112,x,751.966861,1524.518652,2973.220585,702.809393,1206.953717,2902.09406,765.207925,1004.810302,2809.626524,Male,CBAS
4,p112,o,423.620505,737.502557,2374.974921,438.92995,803.823366,2369.001274,442.314537,785.356913,2490.71866,Male,CBAS


In [38]:
# import dime female
dime_fem = pd.read_csv("data/dime_female_formants.txt", sep = "\t")
dime_fem = dime_fem.rename(columns = {"Filename": "Participant",
                           "Segment label":"phone"})
dime_fem["Gender"] = "Female"
dime_fem["Corpus"] = "DIMEx100"

# import dime male
dime_male = pd.read_csv("data/dime_male_formants.txt", sep = "\t")
dime_male = dime_male.rename(columns = {"Filename": "Participant",
                           "Segment label":"phone"})
dime_male["Gender"] = "Male"
dime_male["Corpus"] = "DIMEx100"

dime = pd.concat([dime_male, dime_fem], ignore_index = True)
dime.head()

Unnamed: 0,Participant,phone,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,F3.75,Gender,Corpus
0,s00101,e,355.2633477885047,2082.628727748988,2606.7122225295493,432.2855807947169,2111.016626172527,2564.703533205836,327.22694999240707,1921.137337459745,2745.610286846611,Male,DIMEx100
1,s00101,n,264.26283791989255,1352.960148040204,2493.050542796756,256.35403799952866,1351.2672023829266,2497.6198015702744,277.3963325271278,1513.0240015190643,2515.159279522293,Male,DIMEx100
2,s00101,e,408.2120379191299,1773.8873587391736,2705.202878234465,345.6710062207199,1783.636308558772,2586.463559233574,422.0806104713125,1734.7389659592388,2716.0998083458703,Male,DIMEx100
3,s00101,l,297.3418407194621,1793.8137717606337,2414.063185907122,303.8681945988882,2068.7748774752845,2693.6862198043245,297.0997307147349,1994.629593841888,2278.6034707710114,Male,DIMEx100
4,s00101,k,1724.5663340780563,2133.3977605820296,3206.997014599973,453.9522175759093,1868.9915807418784,2532.047138366204,1257.87458143167,1749.567600959356,2656.2880070602605,Male,DIMEx100


In [39]:
# combine cbas and dime, female and male
formants = pd.concat([cbas, dime], ignore_index = True)
formants

Unnamed: 0,Participant,phone,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,F3.75,Gender,Corpus
0,p112,sil,1349.55,2591.48,3249.78,1640.25,2418.64,3493.13,703.933,2157.88,2892.46,Male,CBAS
1,p112,b,538.241,1256.25,2799.24,884.928,2273.7,3419.02,203.38,972.146,2675.15,Male,CBAS
2,p112,a,695.791,1139.95,2667.26,689.325,840.183,2738.69,672.665,1108.59,2626.36,Male,CBAS
3,p112,x,751.967,1524.52,2973.22,702.809,1206.95,2902.09,765.208,1004.81,2809.63,Male,CBAS
4,p112,o,423.621,737.503,2374.97,438.93,803.823,2369,442.315,785.357,2490.72,Male,CBAS
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22107,s05650,k,1165.100807346732,2426.3920041797787,3470.2768848317864,1423.63,2836.48,4490.62,1024.578525310766,1736.3660434388864,3247.1437918433794,Female,DIMEx100
22108,s05650,t,1126.8852770868596,2027.6264468532945,3328.2844082332117,860.457,1951.17,3195.8,1142.4562588933588,2322.173941161411,3251.5501199831447,Female,DIMEx100
22109,s05650,o,435.91077915763856,1352.6603942285858,3158.138847527358,418.333,1522.29,3062.46,410.0946932152609,1370.264611031101,3089.980191544936,Female,DIMEx100
22110,s05650,r(,187.95601391927022,1466.034875716236,2744.761849555006,312.541,1379.25,2685.89,300.7082730988796,1511.2612699396043,2839.9733575529513,Female,DIMEx100


## Merge formant and textgrid data

In [45]:
data = formants.merge(alldf, left_index = True, right_index = True, how = "outer")
data = data.drop(["Participant_y", "phone_y"], axis = 1)
data = data.rename(columns = {"Participant_x": "Participant",
                             "phone_x": "phone"})
data.sample(10)

Unnamed: 0,Participant,phone,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,...,t2_ph,fname,dur_ph,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph
9547,s00224,k,481.98244744429394,1767.103486358395,2721.280626437371,326.3743013702274,1748.7101579487044,2543.0180297688003,862.3625452694424,1743.075853539593,...,2.979,textgrids/dime/s00224.TextGrid,0.09,2.889,3.033,que,True,False,s,e
19650,s05606,o,421.419753072062,1101.849822891012,2784.462756730797,405.62,1049.49,2881.33,378.4998442461492,1218.842856039796,...,3.286,textgrids/dime/s05606.TextGrid,0.074,2.447,3.381,identificamos,False,False,m,s
301,p112,l,353.783,1292.53,2457.88,336.3,1044.69,2554.79,341.003,1180.07,...,84.917,textgrids/cbas/p112.TextGrid,0.14,84.377,84.917,nivel,False,True,e,sp
14887,s05324,k,1189.4006769871194,1534.9625414935242,3407.4125412714,1325.32,2348.76,3756.53,523.2828503531581,1907.6230072201668,...,1.478,textgrids/dime/s05324.TextGrid,0.077,1.401,1.956,cli_7nicos,True,False,s,l
17237,s05516,a,672.2371843317133,1897.8426051845056,2985.7865954224903,658.839,1925.25,2938.23,524.8781816606086,1665.004900406051,...,3.768,textgrids/dime/s05516.TextGrid,0.085,3.529,4.383,ecatepec,False,False,k,t
8314,s00147,k,394.7344391487064,1158.4236249937792,2189.334160840481,389.27900131931335,1723.6981304438757,2380.361985924618,697.4221221333216,1855.2181904198208,...,2.648,textgrids/dime/s00147.TextGrid,0.04,2.608,3.161,cuaresma,True,False,e,u
5428,p124,a,671.957,1382.66,2623.39,710.715,1790.12,3096.76,777.852,1865.68,...,56.912,textgrids/cbas/p124.TextGrid,0.06,56.852,57.512,amigas,True,False,s,m
15841,s05342,u,265.755884796343,946.7870562767508,2608.844040169354,264.987,906.816,2517.64,271.187057058045,1091.0008183693938,...,1.397,textgrids/dime/s05342.TextGrid,0.057,1.125,1.786,preguntamos,False,False,g,n
8368,s00148,o,457.60400433453054,1454.7049580962798,2250.649836832399,417.4161264403868,1512.823385548078,2314.94715162769,375.8736792368944,1348.1145376485756,...,3.46,textgrids/dime/s00148.TextGrid,0.075,3.095,3.46,adriano,False,True,n,n
21190,s05633,b,478.3019661286474,1724.0807282621645,2886.862009623848,506.33,1716.04,2880.0,520.5699350140468,1771.2360641283633,...,0.239,textgrids/dime/s05633.TextGrid,0.036,0.06,0.379,deben,False,False,e,e


In [64]:
import re

# fix phones from txt file, remove + following some vowels
data["phone"] = data["phone"].apply(lambda x: re.sub("([aeiou])\+", "\1", x))

# replace `r(` with `rf` for consistency
data['word'] = data['word'].apply(lambda x: re.sub("r\(", "rf", x))

# fix notation in dimex corpus, where V_7 yields accented V
data['word'] = data['word'].apply(lambda x: re.sub("a_7", "á", x))
data['word'] = data['word'].apply(lambda x: re.sub("i_7", "í", x))
data['word'] = data['word'].apply(lambda x: re.sub("o_7", "ó", x))
data['word'] = data['word'].apply(lambda x: re.sub("u_7", "ú", x))
data['word'] = data['word'].apply(lambda x: re.sub("e_7", "é", x))

# fix tildas
data['word'] = data['word'].apply(lambda x: re.sub("n~", "ñ", x))

# remove phones `sp` and `.sil`
data = data[(data['phone'] != ".sil") & (data['phone'] != "sp")]

data = data.reset_index(drop = True)

In [65]:
data.sample(25)

Unnamed: 0,Participant,phone,F1.50,F2.50,F3.50,F1.25,F2.25,F3.25,F1.75,F2.75,...,t2_ph,fname,dur_ph,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph
8990,s00230,e,423.3782512334989,1851.8583001258996,2620.058628632856,417.0612015299634,1653.050064897118,2623.177836250595,361.5890784324881,2074.557914244336,...,3.925,textgrids/dime/s00230.TextGrid,0.072,3.823,3.925,de,False,True,d,k
19050,s05618,a,740.6840866857959,1954.7858764317907,2998.5703040604258,681.677,1896.27,3001.27,700.6724979027721,1928.9630412384456,...,3.048,textgrids/dime/s05618.TextGrid,0.039,2.906,3.633,calificación,False,False,k,l
19483,s05624,rf,274.2701707127544,1898.851618891144,2818.8939015008955,294.055,2070.17,2655.28,289.26431114385724,1673.944452287678,...,2.669,textgrids/dime/s05624.TextGrid,0.026,2.461,2.894,periodo,False,False,e,i
14032,s05329,t,523.1763256918192,1430.007400707782,2922.9616460352195,525.836,1378.03,2704.47,833.4660664457192,1949.5625827978056,...,0.282,textgrids/dime/s05329.TextGrid,0.071,0.06,0.67,postgrado,False,False,o,s
7224,s00141,n,280.6582818103457,1860.6233681844224,2423.1494665106798,280.23067291376884,1896.1609887444456,2137.5617184659063,280.1631367584402,1856.9874353495104,...,2.236,textgrids/dime/s00141.TextGrid,0.05,2.078,2.545,principio,False,False,i,s
363,p112,e,440.467,1597.53,2337.95,418.256,1650.51,2501.15,458.886,1617.29,...,118.683,textgrids/cbas/p112.TextGrid,0.05,118.383,119.293,intervención,False,False,t,rf
13831,s05323,d,278.7631574469382,1469.5596465157728,3138.358934056901,314.704,1337.99,3197.55,395.28950020482506,1575.2458180809929,...,1.853,textgrids/dime/s05323.TextGrid,0.05,1.803,1.853,de,True,True,o,a
19722,s05628,e,285.5531262567632,2677.662581301646,3162.295525404391,315.333,2733.75,3228.78,279.9960968426973,2769.891567239376,...,2.064,textgrids/dime/s05628.TextGrid,0.07,1.631,2.145,jóvenes,False,False,n,s
15635,s05506,s,1074.2674831100717,2153.5311146813724,3219.5667696434475,872.173,1983.17,3173.9,1401.9037049954804,2027.764311740703,...,1.517,textgrids/dime/s05506.TextGrid,0.081,1.436,1.745,sobre,True,False,a,o
1193,p119,e,520.773,1622.32,2514.33,529.169,1589.14,2483.27,571.789,1646.01,...,86.252,textgrids/cbas/p119.TextGrid,0.08,85.812,86.552,balsero,False,False,s,rf


In [69]:
# fix naming of participant col
data["Participant"] = data["Participant"].apply(lambda x: x[:4])

In [70]:
data.to_csv("data/alldata.csv", index = False)

In [71]:
# remove rows not containing vowels
vowelsdf = data[(data['phone']=="a") | 
                 (data['phone']=="e") | 
                 (data['phone']=="i") | 
                 (data['phone']=="u") | 
                 (data['phone']=="o")]
vowelsdf = vowelsdf.reset_index(drop = True)
vowelsdf = vowelsdf.rename(columns = {"phone": "Vowel"})
len(vowelsdf)

9289

In [72]:
vowelsdf.to_csv("data/allvowels.csv", index = False)