# Stress Assignment to Words in Corpora

To begin data analysis, each word transcribed in the [Corpus of Bay Area Spanish (CBAS)](https://spanish-portuguese.berkeley.edu/people/justin-davidson/) and the [Corpus DIMEx100](https://turing.iimas.unam.mx/~luis/DIME/CORPUS-DIMEX.html) will be analyzed according to stress, where each syllable (and corresponding vowels) will be labeled as `stressed` or `unstressed`.

The syltippy package (https://github.com/nur-ag/syltippy) will be used to generate syllabified (stress-indicated) outputs for each word found in the transcriptions. Then, the corresponding vowels in the TextGrid-formant dataframes will be marked as either stressed or unstressed.

In [1]:
#!pip install syltippy

In [1]:
import pandas as pd
import csv
import re
import numpy as np
from syltippy import syllabize

In [2]:
# syllabize from syltippy
guerra_syll, guerra_stress = syllabize("guerra")
print(guerra_syll)
print(guerra_stress)

['gue', 'rra']
0


Now go through each textgrid and generate dataframe with list of words and corresponding syllabification and stress index. The follow code is adapted from Ronald Sprouse's [`audiolabel` repository](https://github.com/rsprouse/audiolabel).

In [3]:
#!pip install git+https://github.com/rsprouse/audiolabel

In [4]:
import os
from audiolabel import read_label

In [15]:
tg_folder = "data/tgs/"
corpora = os.listdir(tg_folder)
syll_dict = {}

for f in corpora:
    # tier names different across corpora, analyze separately
    # first cbas
    if f.startswith("p"):
        filepath = tg_folder + f
        [wddf, phdf] = read_label(filepath,
            ftype='praat',
            tiers=['default - words', 'default - phones']
        )
        # add participant to dataframe
        phdf['Participant'] = f[:4]
    
        # Merge phone and word tiers.
        newdf = pd.merge_asof(
            phdf.rename({'t1': 't1_ph', 't2': 't2_ph', "label": "phone"}, axis='columns'),
            wddf.drop(["fname"], axis = 1) \
            .rename({'t1': 't1_wd', 't2': 't2_wd', "label": "word"}, axis='columns'),
            left_on='t1_ph',
            right_on='t1_wd'
        )
        
        # drop rows where word=="" or ".sil"
        newdf = newdf[(newdf["word"]!="") &
                     (newdf["word"]!=".sil")]
        
        # obtain syllables for each word
        newdf["syllable"] = newdf["word"].apply(lambda x: syllabize(x)[0])        
        newdf["stress_ind"] = newdf['word'].apply(lambda x: syllabize(x)[1])
        
        # new df with two cols: word and each syllable separately
        syll_cols = pd.DataFrame({'word':np.repeat(newdf.word.values, newdf.syllable.str.len()),
                          'stress_ind': np.repeat(newdf.stress_ind.values, newdf.syllable.str.len()),
                        'each_syllable':np.concatenate(newdf.syllable.values)})
        syll_cols = syll_cols.drop_duplicates()
        # remove 'h' at beginning of syllable
        syll_cols['each_syllable'] = syll_cols["each_syllable"].apply(lambda x: x[1:] if x.startswith("h") else x)
        
        dict_tgs = syll_cols[['each_syllable']].copy()
        
        fname = f[:-9]
        
        dict_tgs.to_csv("data/dict_tgs/" + fname + ".txt", index = False, header = False)
    
    # now dimex100
    if f.startswith("s"):
        filepath = tg_folder + f
        [wddf, phdf] = read_label(filepath,
                ftype = "praat",
                tiers = ["word", "phone"]
        )
        # add participant to dataframe
        phdf['Participant'] = f[:4]
    
        # Merge phone and word tiers.
        newdf = pd.merge_asof(
            phdf.rename({'t1': 't1_ph', 't2': 't2_ph', "label": "phone"}, axis='columns'),
            wddf.drop(["fname"], axis = 1) \
            .rename({'t1': 't1_wd', 't2': 't2_wd', "label": "word"}, axis='columns'),
            left_on='t1_ph',
            right_on='t1_wd'
        )
        
        # drop rows where word=="0", "", or ".sil"
        newdf = newdf[(newdf["word"]!="") &
                     (newdf["word"]!=".sil") &
             (newdf["word"]!=".bn")]

        # convert v_7 to accent over vowel
        # fix notation in dimex corpus, where V_7 yields accented V
        newdf['word'] = newdf['word'].apply(lambda x: re.sub("a_7", "á", x))
        newdf['word'] = newdf['word'].apply(lambda x: re.sub("i_7", "í", x))
        newdf['word'] = newdf['word'].apply(lambda x: re.sub("o_7", "ó", x))
        newdf['word'] = newdf['word'].apply(lambda x: re.sub("u_7", "ú", x))
        newdf['word'] = newdf['word'].apply(lambda x: re.sub("e_7", "é", x))

        # fix tildas
        newdf['word'] = newdf['word'].apply(lambda x: re.sub("n~", "ñ", x))
                     
        
        # obtain syllables and stress index for each word
        newdf["syllable"] = newdf["word"].apply(lambda x: syllabize(x)[0])
        newdf["stress_ind"] = newdf['word'].apply(lambda x: syllabize(x)[1])
        
        # new df with two cols: word and each syllable separately
        syll_cols = pd.DataFrame({'word':np.repeat(newdf.word.values, newdf.syllable.str.len()),
                          'stress_ind': np.repeat(newdf.stress_ind.values, newdf.syllable.str.len()),
                        'each_syllable':np.concatenate(newdf.syllable.values)})
        syll_cols = syll_cols.drop_duplicates()
        # remove 'h' at beginning of syllable
        syll_cols['each_syllable'] = syll_cols["each_syllable"].apply(lambda x: x[1:] if x.startswith("h") else x)
         
        dict_tgs = syll_cols[['each_syllable']].copy()
        
        fname = f[:-9]
        
        dict_tgs.to_csv("data/dict_tgs/" + fname + ".txt", index = False, header = False)

Now in the folder `dict_tgs` each file has a corresponding .txt file that lists out each syllable produced. This file will be run through the MFA again in order to produce a dictionary with the GlobalPhones associated with each syllable.

In [19]:
syllable_dictionary = pd.read_csv("data/syllable_dictionary.txt", sep = "\t", header = None)
syllable_dictionary = syllable_dictionary.rename(columns = {0: "each_syllable", 1: "GP_syll"})
syllable_dictionary.head()

Unnamed: 0,each_syllable,GP_syll
0,vos,b o s
1,nos,n o s
2,bar,b a rf
3,gua,G w a
4,ja,x a


In [181]:
filepath = "data/tgs/s00101.TextGrid"
[wddf, phdf] = read_label(filepath,
    ftype='praat',
    tiers=['word', 'phone']
)
# add participant to dataframe
phdf['Participant'] = f[:4]
    
# Merge phone and word tiers.
newdf = pd.merge_asof(
    phdf.rename({'t1': 't1_ph', 't2': 't2_ph', "label": "phone"}, axis='columns'),
    wddf.drop(["fname"], axis = 1) \
    .rename({'t1': 't1_wd', 't2': 't2_wd', "label": "word"}, axis='columns'),
    left_on='t1_ph',
    right_on='t1_wd'
)
        
# drop rows where word=="" or ".sil"
newdf = newdf[(newdf["word"]!="") &
                (newdf["word"]!=".sil") &
             (newdf["word"]!=".bn")]

# convert v_7 to accent over vowel
# fix notation in dimex corpus, where V_7 yields accented V
newdf['word'] = newdf['word'].apply(lambda x: re.sub("a_7", "á", x))
newdf['word'] = newdf['word'].apply(lambda x: re.sub("i_7", "í", x))
newdf['word'] = newdf['word'].apply(lambda x: re.sub("o_7", "ó", x))
newdf['word'] = newdf['word'].apply(lambda x: re.sub("u_7", "ú", x))
newdf['word'] = newdf['word'].apply(lambda x: re.sub("e_7", "é", x))

# fix tildas
newdf['word'] = newdf['word'].apply(lambda x: re.sub("n~", "ñ", x))
        
# obtain syllables for each word
newdf["syllable"] = newdf["word"].apply(lambda x: syllabize(x)[0])
newdf["stress_ind"] = newdf['word'].apply(lambda x: syllabize(x)[1])

newdf.head(60)

Unnamed: 0,t1_ph,t2_ph,phone,fname,Participant,t1_wd,t2_wd,word,syllable,stress_ind
1,0.067,0.15,e,data/tgs/s00101.TextGrid,s056,0.067,0.215,en,[en],0
2,0.15,0.215,n,data/tgs/s00101.TextGrid,s056,0.067,0.215,en,[en],0
3,0.215,0.258,e,data/tgs/s00101.TextGrid,s056,0.215,0.324,el,[el],0
4,0.258,0.324,l,data/tgs/s00101.TextGrid,s056,0.215,0.324,el,[el],0
5,0.324,0.385,k,data/tgs/s00101.TextGrid,s056,0.324,0.608,caso,"[ca, so]",0
6,0.385,0.462,a,data/tgs/s00101.TextGrid,s056,0.324,0.608,caso,"[ca, so]",0
7,0.462,0.535,s,data/tgs/s00101.TextGrid,s056,0.324,0.608,caso,"[ca, so]",0
8,0.535,0.608,o,data/tgs/s00101.TextGrid,s056,0.324,0.608,caso,"[ca, so]",0
9,0.608,0.665,e,data/tgs/s00101.TextGrid,s056,0.608,0.665,de,[de],0
10,0.665,0.704,l,data/tgs/s00101.TextGrid,s056,0.665,0.76,la,[la],0


In [182]:
words_sylls = newdf[['word', 't1_wd', 'stress_ind', 'syllable']].copy()
words_sylls = words_sylls.drop_duplicates(subset = ["word"])
words_sylls.head()

Unnamed: 0,word,t1_wd,stress_ind,syllable
1,en,0.067,0,[en]
3,el,0.215,0,[el]
5,caso,0.324,0,"[ca, so]"
9,de,0.608,0,[de]
10,la,0.665,0,[la]


In [183]:
syll_cols = pd.DataFrame({'word':np.repeat(words_sylls.word.values, words_sylls.syllable.str.len()),
                          'stress_ind': np.repeat(words_sylls.stress_ind.values, words_sylls.syllable.str.len()),
                          't1_wd': np.repeat(words_sylls.t1_wd.values, words_sylls.syllable.str.len()),
                        'each_syllable':np.concatenate(words_sylls.syllable.values)})
#syll_cols["syll_ind"] = syll_cols.groupby(['t1_wd']).cumcount()
#syll_cols = syll_cols.drop_duplicates()
syll_cols['each_syllable'] = syll_cols["each_syllable"].apply(lambda x: x[1:] if x.startswith("h") else x)
syll_cols = syll_cols.reset_index(drop = True)
syll_cols.head(60)

Unnamed: 0,word,stress_ind,t1_wd,each_syllable
0,en,0,0.067,en
1,el,0,0.215,el
2,caso,0,0.324,ca
3,caso,0,0.324,so
4,de,0,0.608,de
5,la,0,0.665,la
6,psicología,3,0.76,psi
7,psicología,3,0.76,co
8,psicología,3,0.76,lo
9,psicología,3,0.76,gí


In [184]:
stress_finder = syll_cols.merge(syllable_dictionary, how = "left", on = "each_syllable")
stress_finder.head(10)

Unnamed: 0,word,stress_ind,t1_wd,each_syllable,GP_syll
0,en,0,0.067,en,e ng
1,el,0,0.215,el,e l
2,caso,0,0.324,ca,k a
3,caso,0,0.324,so,s o
4,de,0,0.608,de,d e
5,la,0,0.665,la,l a
6,psicología,3,0.76,psi,s i
7,psicología,3,0.76,co,k o
8,psicología,3,0.76,lo,l o
9,psicología,3,0.76,gí,x i+


In [185]:
stress_finder["syll_ind"] = stress_finder.groupby(["word", "t1_wd"]).cumcount()
stress_finder.head(10)

Unnamed: 0,word,stress_ind,t1_wd,each_syllable,GP_syll,syll_ind
0,en,0,0.067,en,e ng,0
1,el,0,0.215,el,e l,0
2,caso,0,0.324,ca,k a,0
3,caso,0,0.324,so,s o,1
4,de,0,0.608,de,d e,0
5,la,0,0.665,la,l a,0
6,psicología,3,0.76,psi,s i,0
7,psicología,3,0.76,co,k o,1
8,psicología,3,0.76,lo,l o,2
9,psicología,3,0.76,gí,x i+,3


In [186]:
stress_finder['is_stress'] = np.where(stress_finder.stress_ind == stress_finder.syll_ind, 1, 0)
stress_finder.head()

Unnamed: 0,word,stress_ind,t1_wd,each_syllable,GP_syll,syll_ind,is_stress
0,en,0,0.067,en,e ng,0,1
1,el,0,0.215,el,e l,0,1
2,caso,0,0.324,ca,k a,0,1
3,caso,0,0.324,so,s o,1,0
4,de,0,0.608,de,d e,0,1


In [187]:
stress_finder['syll_list'] = stress_finder["GP_syll"].apply(lambda x: x.split(" "))
stress_finder.head()

Unnamed: 0,word,stress_ind,t1_wd,each_syllable,GP_syll,syll_ind,is_stress,syll_list
0,en,0,0.067,en,e ng,0,1,"[e, ng]"
1,el,0,0.215,el,e l,0,1,"[e, l]"
2,caso,0,0.324,ca,k a,0,1,"[k, a]"
3,caso,0,0.324,so,s o,1,0,"[s, o]"
4,de,0,0.608,de,d e,0,1,"[d, e]"


Now expand column `GP_syll` and concatenate with column `is_stress` so that each GlobalPhone is marked with a 0 (not stressed) or a 1 (stressed)

In [188]:
phones_stressed = pd.DataFrame({'word':np.repeat(stress_finder.word.values, stress_finder.syll_list.str.len()),
                                't1_wd': np.repeat(stress_finder.t1_wd.values, stress_finder.syll_list.str.len()),
                                'is_stress': np.repeat(stress_finder.is_stress.values, stress_finder.syll_list.str.len()),
                        'GP_ph':np.concatenate(stress_finder.syll_list.values)})
phones_stressed = phones_stressed.reset_index(drop = True)
counts = phones_stressed.groupby(['word', 't1_wd'])['GP_ph'].size().reset_index(name = 'num_ph')
counts = counts.sort_values(by = ["t1_wd"], ignore_index = True)
phones_stressed = phones_stressed.merge(counts, on = ["word", 't1_wd'])
phones_stressed

Unnamed: 0,word,t1_wd,is_stress,GP_ph,num_ph
0,en,0.067,1,e,2
1,en,0.067,1,ng,2
2,el,0.215,1,e,2
3,el,0.215,1,l,2
4,caso,0.324,1,k,4
5,caso,0.324,1,a,4
6,caso,0.324,0,s,4
7,caso,0.324,0,o,4
8,de,0.608,1,d,2
9,de,0.608,1,e,2


In [194]:
testdf = phones_stressed.drop(['t1_wd', 'is_stress', 'GP_ph'], axis = 1)
testdf = testdf.drop_duplicates()
test_3 = newdf.merge(testdf, on = "word")
word_counts = test_3.groupby(['word', 't1_wd']).size().reset_index(name = "count_word")
word_counts = word_counts.sort_values(by = ["t1_wd"], ignore_index = True)
test_3 = test_3.merge(word_counts, on = ["word", 't1_wd'])
test_3.head(10)

Unnamed: 0,t1_ph,t2_ph,phone,fname,Participant,t1_wd,t2_wd,word,syllable,stress_ind,num_ph,count_word
0,0.067,0.15,e,data/tgs/s00101.TextGrid,s056,0.067,0.215,en,[en],0,2,2
1,0.15,0.215,n,data/tgs/s00101.TextGrid,s056,0.067,0.215,en,[en],0,2,2
2,0.215,0.258,e,data/tgs/s00101.TextGrid,s056,0.215,0.324,el,[el],0,2,2
3,0.258,0.324,l,data/tgs/s00101.TextGrid,s056,0.215,0.324,el,[el],0,2,2
4,0.324,0.385,k,data/tgs/s00101.TextGrid,s056,0.324,0.608,caso,"[ca, so]",0,4,4
5,0.385,0.462,a,data/tgs/s00101.TextGrid,s056,0.324,0.608,caso,"[ca, so]",0,4,4
6,0.462,0.535,s,data/tgs/s00101.TextGrid,s056,0.324,0.608,caso,"[ca, so]",0,4,4
7,0.535,0.608,o,data/tgs/s00101.TextGrid,s056,0.324,0.608,caso,"[ca, so]",0,4,4
8,0.665,0.704,l,data/tgs/s00101.TextGrid,s056,0.665,0.76,la,[la],0,2,2
9,0.704,0.76,a,data/tgs/s00101.TextGrid,s056,0.665,0.76,la,[la],0,2,2


In [195]:
removed = test_3[test_3['num_ph']!=test_3['count_word']]
test_3 = test_3[test_3['num_ph']==test_3['count_word']]
test_3 = test_3.reset_index(drop = True)
test_3.head(10)

Unnamed: 0,t1_ph,t2_ph,phone,fname,Participant,t1_wd,t2_wd,word,syllable,stress_ind,num_ph,count_word
0,0.067,0.15,e,data/tgs/s00101.TextGrid,s056,0.067,0.215,en,[en],0,2,2
1,0.15,0.215,n,data/tgs/s00101.TextGrid,s056,0.067,0.215,en,[en],0,2,2
2,0.215,0.258,e,data/tgs/s00101.TextGrid,s056,0.215,0.324,el,[el],0,2,2
3,0.258,0.324,l,data/tgs/s00101.TextGrid,s056,0.215,0.324,el,[el],0,2,2
4,0.324,0.385,k,data/tgs/s00101.TextGrid,s056,0.324,0.608,caso,"[ca, so]",0,4,4
5,0.385,0.462,a,data/tgs/s00101.TextGrid,s056,0.324,0.608,caso,"[ca, so]",0,4,4
6,0.462,0.535,s,data/tgs/s00101.TextGrid,s056,0.324,0.608,caso,"[ca, so]",0,4,4
7,0.535,0.608,o,data/tgs/s00101.TextGrid,s056,0.324,0.608,caso,"[ca, so]",0,4,4
8,0.665,0.704,l,data/tgs/s00101.TextGrid,s056,0.665,0.76,la,[la],0,2,2
9,0.704,0.76,a,data/tgs/s00101.TextGrid,s056,0.665,0.76,la,[la],0,2,2


In [196]:
rem_words = list(removed['word'].unique())
rem_words

[]

In [197]:
test_3 = test_3.drop(['phone'], axis = 1)

In [198]:
phones_stressed = phones_stressed[~phones_stressed['word'].isin(rem_words)]
phones_stressed = phones_stressed.reset_index(drop = True)
phones_stressed

Unnamed: 0,word,t1_wd,is_stress,GP_ph,num_ph
0,en,0.067,1,e,2
1,en,0.067,1,ng,2
2,el,0.215,1,e,2
3,el,0.215,1,l,2
4,caso,0.324,1,k,4
5,caso,0.324,1,a,4
6,caso,0.324,0,s,4
7,caso,0.324,0,o,4
8,la,0.665,1,l,2
9,la,0.665,1,a,2


In [199]:
final_df = test_3.join(phones_stressed, how = "inner", rsuffix = "_dict")
final_df = final_df.drop(['stress_ind', "syllable", "count_word", "num_ph", "word_dict", "t1_wd_dict", "num_ph_dict"], axis = 1)
final_df.head(20)

Unnamed: 0,t1_ph,t2_ph,fname,Participant,t1_wd,t2_wd,word,is_stress,GP_ph
0,0.067,0.15,data/tgs/s00101.TextGrid,s056,0.067,0.215,en,1,e
1,0.15,0.215,data/tgs/s00101.TextGrid,s056,0.067,0.215,en,1,ng
2,0.215,0.258,data/tgs/s00101.TextGrid,s056,0.215,0.324,el,1,e
3,0.258,0.324,data/tgs/s00101.TextGrid,s056,0.215,0.324,el,1,l
4,0.324,0.385,data/tgs/s00101.TextGrid,s056,0.324,0.608,caso,1,k
5,0.385,0.462,data/tgs/s00101.TextGrid,s056,0.324,0.608,caso,1,a
6,0.462,0.535,data/tgs/s00101.TextGrid,s056,0.324,0.608,caso,0,s
7,0.535,0.608,data/tgs/s00101.TextGrid,s056,0.324,0.608,caso,0,o
8,0.665,0.704,data/tgs/s00101.TextGrid,s056,0.665,0.76,la,1,l
9,0.704,0.76,data/tgs/s00101.TextGrid,s056,0.665,0.76,la,1,a
