# Generating Spanish phonetic correspondence with prosody

The syltippy package (https://github.com/nur-ag/syltippy) will be used to generate syllabified (stress-indicated) outputs for each word found in the transcriptions. Then, the corresponding vowels in the TextGrid-formant dataframes will be marked as either stressed or unstressed.

In [2]:
import pandas as pd
import csv
import re
import numpy as np
from syltippy import syllabize

In [68]:
# create function to take in word input and generate uppercase for stressed syllable

def stress(word):
    syllables, stress = syllabize(word)
    return ','.join(s if stress != i else s.upper() for (i, s) in enumerate(syllables))

# enter nonce word
stress("guerra")

'GUE,rra'

In [52]:
# import dataframe of female wordlist productions

female = pd.read_csv("dicts/cbas_female_dictionary.txt", sep = "\t", header = None)
female.columns = ["word", "ipa"]
female.sample()

Unnamed: 0,word,ipa
55,zepelín,s e p e l i+ ng


In [53]:
# apply lambda function of 'stress' on the 'word' column

female["stress"] = female["word"].apply(lambda x : stress(x))
female.sample(5)

Unnamed: 0,word,ipa,stress
101,excavador,e k s k a b a D o rf,"ex,ca,va,DOR"
89,vaca,b a k a,"VA,ca"
35,hotel,o t e l,"ho,TEL"
83,abanicos,a b a n i k o s,"a,ba,NI,cos"
41,baile,b aI l e,"BAI,le"


In [55]:
# create separate column to hold only the vowels in each syllable

female["vowels"] = female["stress"].apply(lambda x: re.sub(r'[^,aeiouAEIOUáéíóúÁÉÍÓÚ]', '', x))
female.sample(5)

Unnamed: 0,word,ipa,stress,vowels
0,carbohidratos,k a rf b o i D rf a t o s,"car,bohi,DRA,tos","a,oi,A,o"
7,hervir,e rf b i rf,"her,VIR","e,I"
37,valle,b a L e,"VA,lle","A,e"
107,límite,l i+ m i t e,"LÍ,mi,te","Í,i,e"
96,bien,b j e ng,BIEN,IE


In [57]:
# define function to return index of 'vowels' column with stress

def is_stress(word):
    # convert to list
    word = word.split(",")
    for syllable in word:
        if syllable.isupper():
            stress = word.index(syllable)
    return stress

is_stress("e,I")

1

In [59]:
# create new column which gives vowel number in given word that has stress

female["stress_vowel"] = female["vowels"].apply(lambda x: is_stress(x))
female.sample(5)

Unnamed: 0,word,ipa,stress,vowels,stress_vowel
98,lunes,l u n e s,"LU,nes","U,e",0
28,base,b a s e,"BA,se","A,e",0
136,zigoto,s i G o t o,"zi,GO,to","i,O,o",1
167,señal,s e n~ a l,"se,ÑAL","e,A",1
38,payaso,p aI a s o,"pa,YA,so","a,A,o",1


In [60]:
# save as new variable

stress_indices = female[['word', 'stress_vowel']].copy()
stress_indices.sample(5)

Unnamed: 0,word,stress_vowel
110,ailas,0
28,base,0
42,asado,1
102,balcón,1
26,carbono,1


Now all that's left to do is mark each vowel in the textgrid dataframe according to its index in the word it is produced in. Then, for each vowel index that matches the index given in `stress_vowel`, the vowel will be marked as `stressed`.

In [29]:
cbas_fem = pd.read_csv("data/cbas_fem.csv")
cbas_fem.head(5)

Unnamed: 0,Participant,Vowel,F1 (Hz),F2 (Hz),F3 (Hz),t1_ph,t2_ph,dur_ph,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph,Gender
0,p113,a,847.425365,1467.475563,2678.720233,0.746,0.856,0.11,0.566,1.086,bajo,False,False,b,x,Female
1,p113,a,837.040453,1541.140221,2632.325338,6.234,6.334,0.1,6.124,6.804,básico,False,False,b,s,Female
2,p113,a,801.977941,1878.577333,2766.545398,11.101,11.251,0.15,10.641,11.321,bacterias,False,False,j,s,Female
3,p113,a,742.800861,1851.096178,2700.292082,15.721,15.771,0.05,15.691,15.841,las,False,False,l,s,Female
4,p113,a,705.3883,1739.019946,2699.164275,15.841,15.911,0.07,15.841,16.361,amigas,True,False,s,m,Female


In [30]:
# add column indicating stressed vowel index

cbas_fem = cbas_fem.merge(stress_indices, on = "word", how = "outer")
cbas_fem.sample(5)

Unnamed: 0,Participant,Vowel,F1 (Hz),F2 (Hz),F3 (Hz),t1_ph,t2_ph,dur_ph,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph,Gender,stress_vowel
826,p115,o,582.253573,1233.561418,2627.066978,91.741,91.831,0.09,91.401,92.001,control,False,False,rf,l,Female,1
1073,p120,o,772.010678,1043.483596,2558.603806,162.66,163.05,0.39,161.92,163.05,vandalismo,False,True,m,,Female,2
832,p115,a,757.562556,1511.396516,2913.63297,105.371,105.471,0.1,105.271,105.831,vapor,False,False,b,p,Female,1
382,p115,a,665.888539,1841.774484,3029.993352,152.436,152.596,0.16,151.986,152.596,bajeza,False,True,T,sp,Female,1
31,p115,a,812.500462,1771.432952,3033.577554,14.754,14.834,0.08,14.644,14.934,las,False,False,l,s,Female,0


In [31]:
cbas_fem["vowel_ind"] = cbas_fem.groupby(["Participant", "t1_wd"]).cumcount()
cbas_fem = cbas_fem.reset_index(drop = True)
cbas_fem.head(5)

Unnamed: 0,Participant,Vowel,F1 (Hz),F2 (Hz),F3 (Hz),t1_ph,t2_ph,dur_ph,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph,Gender,stress_vowel,vowel_ind
0,p113,a,847.425365,1467.475563,2678.720233,0.746,0.856,0.11,0.566,1.086,bajo,False,False,b,x,Female,0,0
1,p113,o,541.894715,877.100715,2625.49635,0.986,1.086,0.1,0.566,1.086,bajo,False,True,x,,Female,0,1
2,p115,a,836.580222,1572.356797,2544.417205,1.681,1.761,0.08,1.521,2.011,bajo,False,False,b,x,Female,0,0
3,p115,o,531.962741,1018.437658,2728.306199,1.841,2.011,0.17,1.521,2.011,bajo,False,True,x,sp,Female,0,1
4,p120,a,902.622756,1031.298779,3047.54853,0.34,0.41,0.07,0.21,0.91,bajo,False,False,b,x,Female,0,0


In [32]:
cbas_fem["stress"] = np.where(cbas_fem['stress_vowel'] == cbas_fem['vowel_ind'], "stressed", "unstressed")
cbas_fem.head(5)

Unnamed: 0,Participant,Vowel,F1 (Hz),F2 (Hz),F3 (Hz),t1_ph,t2_ph,dur_ph,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph,Gender,stress_vowel,vowel_ind,stress
0,p113,a,847.425365,1467.475563,2678.720233,0.746,0.856,0.11,0.566,1.086,bajo,False,False,b,x,Female,0,0,stressed
1,p113,o,541.894715,877.100715,2625.49635,0.986,1.086,0.1,0.566,1.086,bajo,False,True,x,,Female,0,1,unstressed
2,p115,a,836.580222,1572.356797,2544.417205,1.681,1.761,0.08,1.521,2.011,bajo,False,False,b,x,Female,0,0,stressed
3,p115,o,531.962741,1018.437658,2728.306199,1.841,2.011,0.17,1.521,2.011,bajo,False,True,x,sp,Female,0,1,unstressed
4,p120,a,902.622756,1031.298779,3047.54853,0.34,0.41,0.07,0.21,0.91,bajo,False,False,b,x,Female,0,0,stressed


Now, create a function that will take as an input a dictionary dataframe with columns `word` and `ipa` and will also take as input a dataframe of vowel productions (from formants and TextGrid), also with a column `word` and columns `Participant`, `t1_wd`, and `t1_ph`. The below function requires that the formant dataframe only include vowel formants (i.e. non vowels are exluded).

In [66]:
def get_stress(dictionary, formants):
    # import required packages
    import pandas as pd
    import csv
    import re
    import numpy as np
    from syltippy import syllabize
    
    # def fxn to create stress column in dictionary
    def stress(word):
        syllables, stress = syllabize(word)
        return ','.join(s if stress != i else s.upper() for (i, s) in enumerate(syllables))
    
    # add column to dictionary
    dictionary["stress"] = dictionary["word"].apply(lambda x : stress(x))
    
    # create separate column to hold only the vowels in each word
    dictionary["vowels"] = dictionary["stress"].apply(lambda x: re.sub(r'[^,aeiouAEIOUáéíóúÁÉÍÓÚ]', '', x))
    
    # define function to return index of 'vowels' column with stress
    def is_stress(word):
        # convert to list
        word = word.split(",")
        for syllable in word:
            if syllable.isupper():
                stress = word.index(syllable)
        return stress
    
    # create new column which gives vowel number in given word that has stress
    dictionary["stress_vowel"] = dictionary["vowels"].apply(lambda x: is_stress(x))
    
    # save as new variable
    stress_indices = dictionary[['word', 'stress_vowel']].copy()
    
    # add column to formants indicating stressed vowel index
    formants = formants.merge(stress_indices, on = "word", how = "outer")
    formants = formants.sort_values(["Participant", "t1_wd", "t1_ph"])
    
    # add column to formants indicating index of vowel in each row
    formants["vowel_ind"] = formants.groupby(["Participant", "t1_wd"]).cumcount()
    formants = formants.reset_index(drop = True)
    
    # add column to formants to indicate stress
    formants["stress"] = np.where(formants['stress_vowel'] == formants['vowel_ind'], "stressed", "unstressed")
       
    return formants

In [67]:
# test above function

male_dictionary = pd.read_csv("dicts/cbas_male_dictionary.txt", sep = "\t", header = None)
male_dictionary.columns = ['word', 'ipa']

cbas_male = pd.read_csv("data/cbas_male.csv")

cbas_male = get_stress(male_dictionary, cbas_male)
cbas_male.sample(5)

Unnamed: 0,Participant,Vowel,F1 (Hz),F2 (Hz),F3 (Hz),t1_ph,t2_ph,dur_ph,t1_wd,t2_wd,word,is_wdinit_ph,is_wdfin_ph,prev_ph,next_ph,Gender,stress_vowel,vowel_ind,stress
427,p119,a,807.038058,1499.712767,2563.030801,75.806,75.956,0.15,75.806,76.436,águilas,True,False,sp,g,Male,0,0,stressed
384,p119,o,534.718078,1267.931985,2405.329347,41.123,41.173,0.05,40.983,41.643,lógica,False,False,l,x,Male,0,0,stressed
539,p119,a,743.931378,1530.412182,2609.161841,181.16,181.22,0.06,181.16,181.85,amargas,True,False,s,m,Male,1,0,unstressed
541,p119,a,623.961868,1546.736861,2503.944786,181.52,181.62,0.1,181.16,181.85,amargas,False,False,G,s,Male,1,2,unstressed
65,p112,a,522.609425,1446.564943,2099.13011,54.812,54.992,0.18,54.452,54.992,lucha,False,True,tS,sp,Male,0,1,unstressed
