# Stress Assignment to Words in Corpora

To begin data analysis, each word transcribed in the [Corpus of Bay Area Spanish (CBAS)](https://spanish-portuguese.berkeley.edu/people/justin-davidson/) and the [Corpus DIMEx100](https://turing.iimas.unam.mx/~luis/DIME/CORPUS-DIMEX.html) will be analyzed according to stress, where each syllable (and corresponding vowels) will be labeled as `stressed` or `unstressed`.

The syltippy package (https://github.com/nur-ag/syltippy) will be used to generate syllabified (stress-indicated) outputs for each word found in the transcriptions. Then, the corresponding vowels in the TextGrid-formant dataframes will be marked as either stressed or unstressed.

In [1]:
#!pip install syltippy

In [1]:
import pandas as pd
import csv
import re
import numpy as np
from syltippy import syllabize

In [2]:
# syllabize from syltippy
guerra_syll, guerra_stress = syllabize("guerra")
print(guerra_syll)
print(guerra_stress)

['gue', 'rra']
0


Now go through each textgrid and generate dataframe with list of words and corresponding syllabification and stress index. The follow code is adapted from Ronald Sprouse's [`audiolabel` repository](https://github.com/rsprouse/audiolabel).

In [3]:
#!pip install git+https://github.com/rsprouse/audiolabel

In [4]:
import os
from audiolabel import read_label

In [48]:
tg_folder = "data/tgs/"
corpora = os.listdir(tg_folder)
syll_dict = {}

for f in corpora:
    # tier names different across corpora, analyze separately
    # first cbas
    if f.startswith("p"):
        filepath = tg_folder + f
        [wddf, phdf] = read_label(filepath,
            ftype='praat',
            tiers=['default - words', 'default - phones']
        )
        # add participant to dataframe
        phdf['Participant'] = f[:4]
    
        # Merge phone and word tiers.
        newdf = pd.merge_asof(
            phdf.rename({'t1': 't1_ph', 't2': 't2_ph', "label": "phone"}, axis='columns'),
            wddf.drop(["fname"], axis = 1) \
            .rename({'t1': 't1_wd', 't2': 't2_wd', "label": "word"}, axis='columns'),
            left_on='t1_ph',
            right_on='t1_wd'
        )
        
        # drop rows where word=="" or ".sil"
        newdf = newdf[(newdf["word"]!="") &
                     (newdf["word"]!=".sil")]
        
        # obtain syllables for each word
        newdf["syllable"] = newdf["word"].apply(lambda x: syllabize(x)[0])        
        newdf["stress_ind"] = newdf['word'].apply(lambda x: syllabize(x)[1])
        
        # new df with two cols: word and each syllable separately
        syll_cols = pd.DataFrame({'word':np.repeat(newdf.word.values, newdf.syllable.str.len()),
                          'stress_ind': np.repeat(newdf.stress_ind.values, newdf.syllable.str.len()),
                        'each_syllable':np.concatenate(newdf.syllable.values)})
        syll_cols = syll_cols.drop_duplicates()
        # remove 'h' at beginning of syllable
        syll_cols['each_syllable'] = syll_cols["each_syllable"].apply(lambda x: x[1:] if x.startswith("h") else x)
        
        dict_tgs = syll_cols[['each_syllable']].copy()
        
        fname = f[:-9]
        
        dict_tgs.to_csv("data/dict_tgs/" + fname + ".txt", index = False, header = False)
    
    # now dimex100
    if f.startswith("s"):
        filepath = tg_folder + f
        [wddf, phdf] = read_label(filepath,
                ftype = "praat",
                tiers = ["word", "phone"]
        )
        # add participant to dataframe
        phdf['Participant'] = f[:4]
    
        # Merge phone and word tiers.
        newdf = pd.merge_asof(
            phdf.rename({'t1': 't1_ph', 't2': 't2_ph', "label": "phone"}, axis='columns'),
            wddf.drop(["fname"], axis = 1) \
            .rename({'t1': 't1_wd', 't2': 't2_wd', "label": "word"}, axis='columns'),
            left_on='t1_ph',
            right_on='t1_wd'
        )
        
        # drop rows where word=="0", "", or ".sil"
        newdf = newdf[(newdf["word"]!="") &
                     (newdf["word"]!=".sil") &
             (newdf["word"]!=".bn")]

        # convert v_7 to accent over vowel
        # fix notation in dimex corpus, where V_7 yields accented V
        newdf['word'] = newdf['word'].apply(lambda x: re.sub("a_7", "á", x))
        newdf['word'] = newdf['word'].apply(lambda x: re.sub("i_7", "í", x))
        newdf['word'] = newdf['word'].apply(lambda x: re.sub("o_7", "ó", x))
        newdf['word'] = newdf['word'].apply(lambda x: re.sub("u_7", "ú", x))
        newdf['word'] = newdf['word'].apply(lambda x: re.sub("e_7", "é", x))

        # fix tildas
        newdf['word'] = newdf['word'].apply(lambda x: re.sub("n~", "ñ", x))
                     
        
        # obtain syllables and stress index for each word
        newdf["syllable"] = newdf["word"].apply(lambda x: syllabize(x)[0])
        newdf["stress_ind"] = newdf['word'].apply(lambda x: syllabize(x)[1])
        
        # new df with two cols: word and each syllable separately
        syll_cols = pd.DataFrame({'word':np.repeat(newdf.word.values, newdf.syllable.str.len()),
                          'stress_ind': np.repeat(newdf.stress_ind.values, newdf.syllable.str.len()),
                        'each_syllable':np.concatenate(newdf.syllable.values)})
        syll_cols = syll_cols.drop_duplicates()
        # remove 'h' at beginning of syllable
        syll_cols['each_syllable'] = syll_cols["each_syllable"].apply(lambda x: x[1:] if x.startswith("h") else x)
         
        dict_tgs = syll_cols[['each_syllable']].copy()
        
        fname = f[:-9]
        
        dict_tgs.to_csv("data/dict_tgs/" + fname + ".txt", index = False, header = False)

Now in the folder `dict_tgs` each file has a corresponding .txt file that lists out each syllable produced. This file will be run through the MFA again in order to produce a dictionary with the GlobalPhones associated with each syllable.

In [50]:
syllable_dictionary = pd.read_csv("data/syllable_dictionary.txt", sep = "\t", header = None)
syllable_dictionary = syllable_dictionary.rename(columns = {0: "each_syllable", 1: "GP_syll"})
syllable_dictionary.head()

Unnamed: 0,each_syllable,GP_syll
0,vos,b o s
1,nos,n o s
2,bar,b a rf
3,gua,G w a
4,ja,x a


In [53]:
pd.set_option('display.max_rows', 500)
filepath = "data/tgs/p111_spanish3.TextGrid"
[wddf, phdf] = read_label(filepath,
    ftype='praat',
    tiers=['default - words', 'default - phones']
)
# add participant to dataframe
phdf['Participant'] = f[:4]
    
# Merge phone and word tiers.
newdf = pd.merge_asof(
    phdf.rename({'t1': 't1_ph', 't2': 't2_ph', "label": "phone"}, axis='columns'),
    wddf.drop(["fname"], axis = 1) \
    .rename({'t1': 't1_wd', 't2': 't2_wd', "label": "word"}, axis='columns'),
    left_on='t1_ph',
    right_on='t1_wd'
)
        
# drop rows where word=="" or ".sil"
newdf = newdf[(newdf["word"]!="") &
                (newdf["word"]!=".sil") &
             (newdf["word"]!=".bn")]

# convert v_7 to accent over vowel
# fix notation in dimex corpus, where V_7 yields accented V
newdf['word'] = newdf['word'].apply(lambda x: re.sub("a_7", "á", x))
newdf['word'] = newdf['word'].apply(lambda x: re.sub("i_7", "í", x))
newdf['word'] = newdf['word'].apply(lambda x: re.sub("o_7", "ó", x))
newdf['word'] = newdf['word'].apply(lambda x: re.sub("u_7", "ú", x))
newdf['word'] = newdf['word'].apply(lambda x: re.sub("e_7", "é", x))

# fix tildas
newdf['word'] = newdf['word'].apply(lambda x: re.sub("n~", "ñ", x))
        
# obtain syllables for each word
newdf["syllable"] = newdf["word"].apply(lambda x: syllabize(x)[0])
newdf["stress_ind"] = newdf['word'].apply(lambda x: syllabize(x)[1])
newdf = newdf.reset_index(drop = True)
newdf.head(150)

299


Unnamed: 0,t1_ph,t2_ph,phone,fname,Participant,t1_wd,t2_wd,word,syllable,stress_ind
0,11.394,11.424,i,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,"[in, ver, nal]",2
1,11.424,11.504,ng,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,"[in, ver, nal]",2
2,11.504,11.554,V,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,"[in, ver, nal]",2
3,11.554,11.604,e,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,"[in, ver, nal]",2
4,11.604,11.664,rf,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,"[in, ver, nal]",2
5,11.664,11.754,n,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,"[in, ver, nal]",2
6,11.754,11.884,a,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,"[in, ver, nal]",2
7,11.884,11.974,l,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,"[in, ver, nal]",2
8,14.14,14.21,n,data/tgs/p111_spanish3.TextGrid,s056,14.14,14.63,nuestros,"[nues, tros]",0
9,14.21,14.25,w,data/tgs/p111_spanish3.TextGrid,s056,14.14,14.63,nuestros,"[nues, tros]",0


In [54]:
words_sylls = newdf[['word', 't1_wd', 'stress_ind', 'syllable']].copy()
words_sylls = words_sylls.drop_duplicates(subset = ['t1_wd',"word"])
words_sylls.head(60)

Unnamed: 0,word,t1_wd,stress_ind,syllable
0,invernal,11.394,2,"[in, ver, nal]"
8,nuestros,14.14,0,"[nues, tros]"
16,útiles,14.66,0,"[ú, ti, les]"
22,embuste,16.424,1,"[em, bus, te]"
29,lupa,19.01,0,"[lu, pa]"
33,francesa,19.42,1,"[fran, ce, sa]"
41,envejecimiento,21.85,4,"[en, ve, je, ci, mien, to]"
55,ladrón,23.719,1,"[la, drón]"
61,convento,25.408,1,"[con, ven, to]"
69,órbita,27.067,0,"[ór, bi, ta]"


In [65]:
syll_cols = pd.DataFrame({'word':np.repeat(words_sylls.word.values, words_sylls.syllable.str.len()),
                          'stress_ind': np.repeat(words_sylls.stress_ind.values, words_sylls.syllable.str.len()),
                          't1_wd': np.repeat(words_sylls.t1_wd.values, words_sylls.syllable.str.len()),
                        'each_syllable':np.concatenate(words_sylls.syllable.values)})
#syll_cols["syll_ind"] = syll_cols.groupby(['t1_wd']).cumcount()
#syll_cols = syll_cols.drop_duplicates()
syll_cols['each_syllable'] = syll_cols["each_syllable"].apply(lambda x: x[1:] if x.startswith("h") else x)
syll_cols = syll_cols.reset_index(drop = True)
syll_cols.head(150)

Unnamed: 0,word,stress_ind,t1_wd,each_syllable
0,invernal,2,11.394,in
1,invernal,2,11.394,ver
2,invernal,2,11.394,nal
3,nuestros,0,14.14,nues
4,nuestros,0,14.14,tros
5,útiles,0,14.66,ú
6,útiles,0,14.66,ti
7,útiles,0,14.66,les
8,embuste,1,16.424,em
9,embuste,1,16.424,bus


In [66]:
stress_finder = syll_cols.merge(syllable_dictionary, how = "left", on = "each_syllable")
stress_finder.head(150)

Unnamed: 0,word,stress_ind,t1_wd,each_syllable,GP_syll
0,invernal,2,11.394,in,i ng
1,invernal,2,11.394,ver,b e rf
2,invernal,2,11.394,nal,n a l
3,nuestros,0,14.14,nues,n w e s
4,nuestros,0,14.14,tros,t rf o s
5,útiles,0,14.66,ú,u+
6,útiles,0,14.66,ti,t i
7,útiles,0,14.66,les,l e s
8,embuste,1,16.424,em,e m
9,embuste,1,16.424,bus,b u s


In [67]:
stress_finder["syll_ind"] = stress_finder.groupby(["word", "t1_wd"]).cumcount()
stress_finder.head(150)

Unnamed: 0,word,stress_ind,t1_wd,each_syllable,GP_syll,syll_ind
0,invernal,2,11.394,in,i ng,0
1,invernal,2,11.394,ver,b e rf,1
2,invernal,2,11.394,nal,n a l,2
3,nuestros,0,14.14,nues,n w e s,0
4,nuestros,0,14.14,tros,t rf o s,1
5,útiles,0,14.66,ú,u+,0
6,útiles,0,14.66,ti,t i,1
7,útiles,0,14.66,les,l e s,2
8,embuste,1,16.424,em,e m,0
9,embuste,1,16.424,bus,b u s,1


In [68]:
stress_finder['is_stress'] = np.where(stress_finder.stress_ind == stress_finder.syll_ind, 1, 0)
stress_finder.head()

Unnamed: 0,word,stress_ind,t1_wd,each_syllable,GP_syll,syll_ind,is_stress
0,invernal,2,11.394,in,i ng,0,0
1,invernal,2,11.394,ver,b e rf,1,0
2,invernal,2,11.394,nal,n a l,2,1
3,nuestros,0,14.14,nues,n w e s,0,1
4,nuestros,0,14.14,tros,t rf o s,1,0


In [69]:
stress_finder['syll_list'] = stress_finder["GP_syll"].apply(lambda x: x.split(" "))
stress_finder.head()

Unnamed: 0,word,stress_ind,t1_wd,each_syllable,GP_syll,syll_ind,is_stress,syll_list
0,invernal,2,11.394,in,i ng,0,0,"[i, ng]"
1,invernal,2,11.394,ver,b e rf,1,0,"[b, e, rf]"
2,invernal,2,11.394,nal,n a l,2,1,"[n, a, l]"
3,nuestros,0,14.14,nues,n w e s,0,1,"[n, w, e, s]"
4,nuestros,0,14.14,tros,t rf o s,1,0,"[t, rf, o, s]"


Now expand column `GP_syll` and concatenate with column `is_stress` so that each GlobalPhone is marked with a 0 (not stressed) or a 1 (stressed)

In [72]:
phones_stressed = pd.DataFrame({'word':np.repeat(stress_finder.word.values, stress_finder.syll_list.str.len()),
                                't1_wd': np.repeat(stress_finder.t1_wd.values, stress_finder.syll_list.str.len()),
                                'is_stress': np.repeat(stress_finder.is_stress.values, stress_finder.syll_list.str.len()),
                        'GP_ph':np.concatenate(stress_finder.syll_list.values)})
phones_stressed = phones_stressed.reset_index(drop = True)
counts = phones_stressed.groupby(['word', 't1_wd'])['GP_ph'].size().reset_index(name = 'num_ph')
counts = counts.sort_values(by = ["t1_wd"], ignore_index = True)
phones_stressed = phones_stressed.merge(counts, on = ["word", 't1_wd'])
phones_stressed.head(150)

Unnamed: 0,word,t1_wd,is_stress,GP_ph,num_ph
0,invernal,11.394,0,i,8
1,invernal,11.394,0,ng,8
2,invernal,11.394,0,b,8
3,invernal,11.394,0,e,8
4,invernal,11.394,0,rf,8
5,invernal,11.394,1,n,8
6,invernal,11.394,1,a,8
7,invernal,11.394,1,l,8
8,nuestros,14.14,1,n,8
9,nuestros,14.14,1,w,8


In [73]:
testdf = phones_stressed.drop(['is_stress', 'GP_ph'], axis = 1)
testdf = testdf.drop_duplicates()
test_3 = newdf.merge(testdf, on = ["word", 't1_wd'])

In [74]:
word_counts = test_3.groupby(['word', 't1_wd']).size().reset_index(name = "count_word")
word_counts = word_counts.sort_values(by = ["t1_wd"], ignore_index = True)
test_3 = test_3.merge(word_counts, on = ["word", 't1_wd'])
test_3.head(150)

Unnamed: 0,t1_ph,t2_ph,phone,fname,Participant,t1_wd,t2_wd,word,syllable,stress_ind,num_ph,count_word
0,11.394,11.424,i,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,"[in, ver, nal]",2,8,8
1,11.424,11.504,ng,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,"[in, ver, nal]",2,8,8
2,11.504,11.554,V,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,"[in, ver, nal]",2,8,8
3,11.554,11.604,e,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,"[in, ver, nal]",2,8,8
4,11.604,11.664,rf,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,"[in, ver, nal]",2,8,8
5,11.664,11.754,n,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,"[in, ver, nal]",2,8,8
6,11.754,11.884,a,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,"[in, ver, nal]",2,8,8
7,11.884,11.974,l,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,"[in, ver, nal]",2,8,8
8,14.14,14.21,n,data/tgs/p111_spanish3.TextGrid,s056,14.14,14.63,nuestros,"[nues, tros]",0,8,8
9,14.21,14.25,w,data/tgs/p111_spanish3.TextGrid,s056,14.14,14.63,nuestros,"[nues, tros]",0,8,8


In [41]:
removed = test_3[test_3['num_ph']!=test_3['count_word']]
test_3 = test_3[test_3['num_ph']==test_3['count_word']]
test_3 = test_3.reset_index(drop = True)

In [42]:
rem_words = list(removed['word'].unique())
rem_words

[]

In [43]:
test_3 = test_3.drop(['phone'], axis = 1)

In [44]:
phones_stressed = phones_stressed[~phones_stressed['word'].isin(rem_words)]
phones_stressed = phones_stressed.reset_index(drop = True)
phones_stressed

Unnamed: 0,word,t1_wd,is_stress,GP_ph,num_ph
0,invernal,11.394,0,i,8
1,invernal,11.394,0,ng,8
2,invernal,11.394,0,b,8
3,invernal,11.394,0,e,8
4,invernal,11.394,0,rf,8
...,...,...,...,...,...
288,zurdo,103.414,1,s,5
289,zurdo,103.414,1,u,5
290,zurdo,103.414,1,rf,5
291,zurdo,103.414,0,d,5


In [45]:
final_df = test_3.join(phones_stressed, how = "inner", rsuffix = "_dict")
final_df = final_df.drop(['stress_ind', "syllable", "count_word", "num_ph", "word_dict", "t1_wd_dict", "num_ph_dict"], axis = 1)
final_df.head(20)

Unnamed: 0,t1_ph,t2_ph,fname,Participant,t1_wd,t2_wd,word,is_stress,GP_ph
0,11.394,11.424,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,0,i
1,11.424,11.504,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,0,ng
2,11.504,11.554,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,0,b
3,11.554,11.604,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,0,e
4,11.604,11.664,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,0,rf
5,11.664,11.754,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,1,n
6,11.754,11.884,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,1,a
7,11.884,11.974,data/tgs/p111_spanish3.TextGrid,s056,11.394,11.974,invernal,1,l
8,14.14,14.21,data/tgs/p111_spanish3.TextGrid,s056,14.14,14.63,nuestros,1,n
9,14.21,14.25,data/tgs/p111_spanish3.TextGrid,s056,14.14,14.63,nuestros,1,w
