# Create .csv for CBAS word list data

Dataframes for female and male data are concatenated to yield a single dataframe of word list elicitations from CBAS.

In [5]:
import pandas as pd
import os
from audiolabel import read_label

## Import textgrids

In [None]:
# One row per TextGrid file to load. `relpath` is a scalar and is broadcast to
# every row; `fname` and `subject` are parallel lists of the same length.
cbasdf = pd.DataFrame({
    'relpath': 'data/textgrids/cbas',
    'fname': ['p112.TextGrid',
              'p119.TextGrid',
              'p113.TextGrid',
              'p115.TextGrid',
              'p120.TextGrid',
              'p124.TextGrid'],
    'subject': ['p112', 'p119', 'p113', 'p115', 'p120', 'p124']
})

# Same file names and subjects, read from the 'dime' directory instead.
dimedf = pd.DataFrame({
    'relpath': 'data/textgrids/dime',
    'fname': ['p112.TextGrid',
              'p119.TextGrid',
              'p113.TextGrid',
              'p115.TextGrid',
              'p120.TextGrid',
              'p124.TextGrid'],
    'subject': ['p112', 'p119', 'p113', 'p115', 'p120', 'p124']
})

# Stack both tables; ignore_index gives the result a fresh 0..n-1 index.
tgdf = pd.concat([cbasdf, dimedf], ignore_index=True)

In [None]:
# inputs 

def tg2df(row):
    '''Read one textgrid and return its phones annotated with their words.

    Parameters
    ----------

    row: named tuple
    A namedtuple as provided by `itertuples`. `row.relpath` and `row.fname`
    are joined to locate the Praat textgrid, and `row.subject` is copied into
    the 'Participant' column. The textgrid is expected to have 'phone' and
    'word' tiers.

    Returns
    -------

    mergedf: the phone table (times renamed to `t1_ph`/`t2_ph`, plus `dur_ph`
    and `Participant`) left-joined to the word table (times renamed to
    `t1_wd`/`t2_wd`), with boolean `is_wdinit_ph` and `is_wdfin_ph` columns.

    Raises
    ------

    AssertionError if a word start/end time does not coincide with a phone
    start/end time.
    '''
    # Build the path with forward slashes regardless of the OS separator.
    tg_path = os.path.join(row.relpath, row.fname).replace("\\","/")
    words, phones = read_label(tg_path, ftype='praat', tiers=['word', 'phone'])

    # The tiers must be strictly hierarchical (words contain phones): every
    # word boundary has to be a phone boundary as well.
    starts_aligned = words.t1.isin(phones.t1).all()
    assert starts_aligned
    ends_aligned = words.t2.isin(phones.t2).all()
    assert ends_aligned

    # Phone duration and speaker.
    phones['dur_ph'] = phones.t2 - phones.t1
    phones['Participant'] = row.subject

    # Give each tier's time columns distinct names before merging; the word
    # tier's 'fname' column is dropped so it is not duplicated.
    left = phones.rename(columns={'t1': 't1_ph', 't2': 't2_ph'})
    right = words.drop(columns='fname') \
        .rename(columns={'t1': 't1_wd', 't2': 't2_wd'})

    # merge_asof pairs each phone with the last word whose start time is at
    # or before the phone's start time.
    merged = pd.merge_asof(left, right, left_on='t1_ph', right_on='t1_wd')

    # A phone is word-initial/-final when it shares the word's start/end time.
    merged['is_wdinit_ph'] = merged.t1_ph == merged.t1_wd
    merged['is_wdfin_ph'] = merged.t2_ph == merged.t2_wd

    return merged

In [None]:
# Load every textgrid listed in tgdf; one merged phone/word table per file.
dflist = list(map(tg2df, tgdf.itertuples()))

In [None]:
# Stack the per-file tables into one frame with a fresh 0..n-1 index.
alldf = pd.concat(dflist, ignore_index=True)

# Some vowel labels are followed by a `+` symbol; map each one to the bare
# vowel. `replace` matches whole cell values (not substrings) in every column.
alldf = alldf.replace({
    "a+": "a",
    "e+": "e",
    "i+": "i",
    "o+": "o",
    "u+": "u",
})
alldf

Create the columns `prev_ph` and `next_ph`, which hold the preceding and following phone for each row.

In [None]:
# Neighbouring phone labels by row position; the first row has no previous
# phone and the last row has no next phone, so those cells become ''.
# NOTE(review): the shift runs over the whole concatenated table, so the rows
# at a file boundary take their neighbour from the adjacent file - confirm
# this is acceptable.
alldf['prev_ph'] = alldf['phone'].shift(periods=1).fillna(value='')
alldf['next_ph'] = alldf['phone'].shift(periods=-1).fillna(value='')
alldf

In [None]:
# Write the phone table to disk, leaving out the index column.
alldf.to_csv(path_or_buf="data/all_phones.csv", index=False)

## Import formant data

In [2]:
# CBAS formant tables (tab-separated), one file per gender. Each table is
# tagged with constant Gender and Corpus columns.
#
# NOTE(review): `rename` returns a new frame and its result is not assigned
# below, so the columns keep the names "Filename" and "Segment label".
# Assigning the result would introduce 'Participant'/'phone' columns that
# share names with columns of the textgrid table - confirm the intent and
# check the later merge before changing this.

# female
cbas_fem = pd.read_csv("data/cbas_female.txt", sep="\t")
cbas_fem.rename(columns={"Filename": "Participant", "Segment label": "phone"})
cbas_fem = cbas_fem.assign(Gender="Female", Corpus="CBAS")

# male
cbas_male = pd.read_csv("data/cbas_male.txt", sep="\t")
cbas_male.rename(columns={"Filename": "Participant", "Segment label": "phone"})
cbas_male = cbas_male.assign(Gender="Male", Corpus="CBAS")

# female rows first, then male, with a fresh 0..n-1 index
cbas = pd.concat([cbas_fem, cbas_male], ignore_index=True)

In [None]:
# DIMEx100 formant tables (tab-separated), one file per gender. Each table is
# tagged with constant Gender and Corpus columns.
#
# NOTE(review): `rename` returns a new frame and its result is not assigned
# below, so the columns keep the names "Filename" and "Segment label".
# Assigning the result would introduce 'Participant'/'phone' columns that
# share names with columns of the textgrid table - confirm the intent and
# check the later merge before changing this.

# female
dime_fem = pd.read_csv("data/dime_female.txt", sep="\t")
dime_fem.rename(columns={"Filename": "Participant", "Segment label": "phone"})
dime_fem = dime_fem.assign(Gender="Female", Corpus="DIMEx100")

# male
dime_male = pd.read_csv("data/dime_male.txt", sep="\t")
dime_male.rename(columns={"Filename": "Participant", "Segment label": "phone"})
dime_male = dime_male.assign(Gender="Male", Corpus="DIMEx100")

# All four tables in one frame: CBAS female, CBAS male, DIMEx100 female,
# DIMEx100 male, with a fresh 0..n-1 index.
formants = pd.concat(
    [cbas_fem, cbas_male, dime_fem, dime_male],
    ignore_index=True,
)

## Merge formant and textgrid data

In [None]:
# Pair formant rows with phone rows on the row index. The outer join keeps
# index values present in only one of the two tables.
# NOTE(review): this assumes both tables list the same segments in the same
# order - confirm, since nothing here checks the alignment.
data = pd.merge(
    formants,
    alldf,
    how="outer",
    left_index=True,
    right_index=True,
)
# Show a random sample of rows as a quick visual check.
data.sample(n=10)

In [None]:
# Write the merged formant/textgrid table to disk, leaving out the index.
data.to_csv(path_or_buf="data/alldata.csv", index=False)

In [None]:
# Remove the `+` that follows some vowel labels (e.g. "a+" -> "a").
import re

# Both the pattern and the replacement are raw strings: in a normal string
# literal "\1" is the control character U+0001, not a reference to group 1.
# The vectorised `.str.replace` leaves missing values (possible after the
# outer merge) untouched instead of raising on them.
data["phone"] = data["phone"].str.replace(
    re.compile(r"([aeiou])\+"), r"\1", regex=True
)

# Keep only the rows whose phone is one of the five vowels; rows with a
# missing phone are dropped too. Renumber the remaining rows from 0.
vowelsdf = data[data["phone"].isin(["a", "e", "i", "u", "o"])]
vowelsdf = vowelsdf.reset_index(drop=True)
len(vowelsdf)

In [None]:
# Write the vowel-only table to disk, leaving out the index.
vowelsdf.to_csv(path_or_buf="data/allvowels.csv", index=False)