# Perform leave-one-out LDA on acoustic measures

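**Input:** a large DataFrame with acoustic measures for each vowel in each word. **Output:** the cross-validated accuracy of an LDA stress classifier for each feature subset, so that the accuracy added by each acoustic measure of interest can be compared.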

In [148]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold as kfold

In [198]:
# Load the acoustic-measures table (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv("data/data.csv")
df.head()

Unnamed: 0,Filename,Label,H1c_mean,H2c_mean,H4c_mean,A1c_mean,A2c_mean,A3c_mean,H2Kc_mean,H1H2c_mean,...,t1_wd,t2_wd,word,stress,syl_dur,v_dur,sF1_norm,sF2_norm,pF1_norm,pF2_norm
0,lrv099_catala_exp2_a.mat,i,20.927,4.12,-12.382,-21.605,-13.612,-8.504,-14.792,16.808,...,3.056936,3.501372,desanimar,0,0.084149,0.037912,0.646745,2.078281,0.263155,2.079375
1,lrv099_catala_exp2_a.mat,a,19.418,4.686,-14.122,-19.125,-21.252,-9.591,-11.833,14.732,...,3.056936,3.501372,desanimar,1,0.141372,0.080206,0.764094,1.375944,0.564352,1.266652
2,lrv099_catala_exp2_a.mat,i,11.935,15.855,17.809,10.503,4.465,6.305,2.997,-3.92,...,7.674408,8.083331,elimina,1,0.11036,0.048069,0.19817,2.156112,0.304463,1.997006
3,lrv099_catala_exp2_a.mat,a,9.754,13.376,22.242,8.334,2.36,13.707,3.894,-3.623,...,7.674408,8.083331,elimina,0,0.103463,0.066019,0.216104,1.612011,0.418792,1.533985
4,lrv099_catala_exp2_a.mat,i,22.174,6.194,-8.876,-17.493,-13.925,-14.555,-15.125,15.979,...,12.2078,12.815098,discrimina,1,0.119827,0.059827,0.619651,2.169299,0.267623,2.054012


Check `Label` values:

In [199]:
# List the distinct vowel labels present in the data.
df['Label'].unique()

array(['i', 'a', 'pen', 'ult', 'o'], dtype=object)

In [200]:
# Split the rows by vowel label: "i" and "pen" go to df_1, every other label to df_2.
df_1 = df[df['Label'].isin(['i', 'pen'])].copy()
df_2 = df[~df['Label'].isin(['i', 'pen'])].copy()

In [201]:
# Join each df_1 row with the df_2 row(s) that share the same file, word, word
# boundaries, participant and language. Columns present in both frames receive
# the `_vowel1` / `_vowel2` suffixes.
df_wd = df_1.merge(
    df_2,
    on=['Filename', 'word', 't1_wd', 't2_wd', 'partID', 'Language'],
    suffixes=('_vowel1', '_vowel2'),
)

Label each row as `Paroxytone` if vowel 1 is stressed and as `Oxytone` otherwise (i.e. vowel 2 is stressed).

In [202]:
# Word-level stress class: 'Paroxytone' where stress_vowel1 equals 1, 'Oxytone' otherwise.
df_wd['stress_wd'] = np.where(
    df_wd['stress_vowel1'].eq(1),
    'Paroxytone',
    'Oxytone',
)

Columns to drop before the LDA: `Filename`, `stress_vowel1`, `stress_vowel2`, `Label_vowel1`, `Label_vowel2`, `t1_wd`, `t2_wd`

In [203]:
# Remove the file name, the per-vowel stress and label columns, and the word
# boundary times from df_wd in place, then show five random rows.
df_wd.drop(
    columns=[
        'Filename',
        'stress_vowel1', 'stress_vowel2',
        'Label_vowel1', 'Label_vowel2',
        't1_wd', 't2_wd',
    ],
    inplace=True,
)
df_wd.sample(5)

Unnamed: 0,H1c_mean_vowel1,H2c_mean_vowel1,H4c_mean_vowel1,A1c_mean_vowel1,A2c_mean_vowel1,A3c_mean_vowel1,H2Kc_mean_vowel1,H1H2c_mean_vowel1,H2H4c_mean_vowel1,H1A1c_mean_vowel1,...,pF0_mean_vowel2,shrF0_mean_vowel2,epoch_mean_vowel2,syl_dur_vowel2,v_dur_vowel2,sF1_norm_vowel2,sF2_norm_vowel2,pF1_norm_vowel2,pF2_norm_vowel2,stress_wd
89,11.87,-2.28,-1.756,-5.44,-16.559,-7.019,-17.947,14.15,-0.524,17.311,...,219.476,217.922,0.206,0.324093,0.061651,0.491066,1.297863,0.606663,1.60566,Paroxytone
93,8.287,-0.055,-4.275,-13.221,-11.741,-14.051,-11.644,8.341,4.22,21.508,...,217.179,219.021,0.221,0.251639,0.075995,0.249857,1.852478,0.285118,1.676987,Paroxytone
76,6.941,-1.35,-2.422,-7.537,-12.495,0.665,-6.275,8.291,1.072,14.478,...,215.2,214.568,0.216,0.3461,0.114469,0.398205,1.874521,0.354325,2.058149,Oxytone
235,13.362,18.399,17.008,11.79,0.36,2.689,-0.021,-5.037,1.391,1.572,...,199.159,201.649,0.194,0.110109,0.071348,0.229096,1.34938,0.429277,1.14721,Oxytone
4,22.248,2.999,-12.037,-20.002,-16.433,-11.913,-16.076,19.249,15.037,42.25,...,193.06,192.361,0.192,0.116782,0.07191,0.72708,1.511164,0.638038,1.335687,Paroxytone


In [None]:
# Module-level StandardScaler instance for scaling feature matrices.
sc = StandardScaler()

In [218]:
def lda_leave_one_out(df: pd.DataFrame) -> pd.DataFrame:
    """Cross-validate an LDA stress classifier per participant and language,
    leaving one group of acoustic features out at a time.

    For every (participant, language) pair that has rows in ``df``, five
    feature subsets are evaluated with repeated stratified 10-fold
    cross-validation (3 repeats, ``random_state=1``), scored by accuracy:

    * ``noformant`` - all features except the vowel-1 formant columns
    * ``formant``   - all features
    * ``nodur``     - all features except syllable and vowel durations
    * ``vowel_dur`` - all features except syllable durations
    * ``syl_dur``   - all features except vowel durations

    Non-numeric predictors are one-hot encoded and all predictors are
    standardised before fitting.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``partID``, ``Language``, the target ``stress_wd``, the
        duration columns ``syl_dur_vowel{1,2}`` / ``v_dur_vowel{1,2}`` and the
        formant columns ``sF1_norm_vowel1``, ``sF2_norm_vowel1``,
        ``pF1_norm_vowel1``, ``pF2_norm_vowel1``. Every other column is used
        as a predictor.

    Returns
    -------
    pandas.DataFrame
        One row per (partID, Language) pair with the mean and standard
        deviation of the fold accuracies for each feature subset. Rows are
        concatenated per participant, so the index restarts at 0 for each
        participant. An empty frame with the same columns is returned when
        there is nothing to evaluate.
    """
    # Column names of the result frame. 'Std_Formant' (singular) is kept as-is
    # so that existing consumers of the output keep working.
    result_columns = [
        'partID', 'Language',
        'Accuracy_noFormants', 'Std_noFormants',
        'Accuracy_Formants', 'Std_Formant',
        'Accuracy_noDur', 'Std_noDur',
        'Accuracy_vowelDur', 'Std_vowelDur',
        'Accuracy_sylDur', 'Std_sylDur',
    ]

    syl_dur_cols = ['syl_dur_vowel1', 'syl_dur_vowel2']
    vowel_dur_cols = ['v_dur_vowel1', 'v_dur_vowel2']
    # NOTE(review): only the vowel-1 formants are removed for the "noformant"
    # subset, whereas the duration subsets remove both vowels. Confirm whether
    # the vowel-2 formants should be dropped as well.
    formant_cols = ['sF1_norm_vowel1', 'sF2_norm_vowel1',
                    'pF1_norm_vowel1', 'pF2_norm_vowel1']

    dfs = []

    for p in df.partID.unique():
        part_dicts = []

        for l in df.Language.unique():
            # Isolate the participant AND language of interest. Without the
            # participant filter every participant would report the same
            # pooled per-language scores.
            data = df[(df['partID'] == p) & (df['Language'] == l)]
            if data.empty:
                # This participant has no recordings in this language.
                continue
            # NOTE(review): very small subsets may not support stratified
            # 10-fold CV; scikit-learn will raise in that case - confirm the
            # per-participant sample sizes.
            data = data.drop(['Language', 'partID'], axis=1).reset_index(drop=True)

            # Target and full predictor set.
            y = data['stress_wd']
            X = data.drop(['stress_wd'], axis=1)

            # Feature subsets, in the order the result columns expect.
            subset_X = [
                X.drop(formant_cols, axis=1),                    # noformant
                X,                                               # formant
                X.drop(syl_dur_cols + vowel_dur_cols, axis=1),   # nodur
                X.drop(syl_dur_cols, axis=1),                    # vowel_dur
                X.drop(vowel_dur_cols, axis=1),                  # syl_dur
            ]

            accuracy_list = []
            std_list = []

            for X_df in subset_X:
                # One-hot encode any non-numeric predictors.
                X_encoded = pd.get_dummies(X_df)

                # Use a fresh scaler per fit so the function does not rely on
                # a notebook-level global having been created beforehand.
                X_scaled = StandardScaler().fit_transform(X_encoded)

                # Define model and evaluation scheme.
                model = LDA()
                cv = kfold(n_splits=10, n_repeats=3, random_state=1)

                scores = cross_val_score(model, X_scaled, y, scoring='accuracy',
                                         cv=cv, n_jobs=-1)
                accuracy_list.append(np.mean(scores))
                std_list.append(np.std(scores))

            part_dicts.append({
                'partID': p, 'Language': l,
                'Accuracy_noFormants': accuracy_list[0],
                'Std_noFormants': std_list[0],
                'Accuracy_Formants': accuracy_list[1],
                'Std_Formant': std_list[1],
                'Accuracy_noDur': accuracy_list[2],
                'Std_noDur': std_list[2],
                'Accuracy_vowelDur': accuracy_list[3],
                'Std_vowelDur': std_list[3],
                'Accuracy_sylDur': accuracy_list[4],
                'Std_sylDur': std_list[4],
            })

        # One frame per participant, as long as at least one language was evaluated.
        if part_dicts:
            dfs.append(pd.DataFrame(part_dicts, columns=result_columns))

    if not dfs:
        # pd.concat raises on an empty list; return an empty, correctly shaped frame.
        return pd.DataFrame(columns=result_columns)

    return pd.concat(dfs)

In [219]:
# Run the cross-validated LDA comparison on the word-level table and preview the results.
lda_df = lda_leave_one_out(df_wd)
lda_df.head(18)

Unnamed: 0,partID,Language,Accuracy_noFormants,Std_noFormants,Accuracy_Formants,Std_Formant,Accuracy_noDur,Std_noDur,Accuracy_vowelDur,Std_vowelDur,Accuracy_sylDur,Std_sylDur
0,lrv099,cat,0.689286,0.188058,0.658333,0.17213,0.647619,0.195013,0.67381,0.177792,0.654167,0.187655
1,lrv099,eng,0.637037,0.183009,0.684444,0.164294,0.58963,0.139338,0.62037,0.164732,0.650741,0.161467
2,lrv099,spa,0.752778,0.137325,0.753241,0.146446,0.662963,0.17078,0.722222,0.16927,0.769907,0.148669
