# Perform leave-one-out LDA on acoustic measures

Input large df with acoustic measures for each vowel in each word and output added accuracy for each acoustic measure of interest.

In [1]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold as kfold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [103]:
df = pd.read_csv("data/data.csv")
df.head()

Unnamed: 0,spectilt,Filename,Label,CPP_mean,Energy_mean,HNR05_mean,HNR15_mean,HNR25_mean,HNR35_mean,SHR_mean,...,Language,t1_wd,t2_wd,stress,syl_dur,v_dur,sF1_norm,sF2_norm,pF1_norm,pF2_norm
0,2.532693,lrv099_catala_exp2_a.mat,i,26.053,2.144,30.506,40.68,44.487,44.465,0.0,...,cat,3.056936,3.501372,0,0.084149,0.037912,0.646745,2.078281,0.263155,2.079375
1,2.564851,lrv099_catala_exp2_a.mat,a,23.106,1.849,31.482,32.598,37.773,39.482,0.0,...,cat,3.056936,3.501372,1,0.141372,0.080206,0.764094,1.375944,0.564352,1.266652
2,-2.196108,lrv099_catala_exp2_a.mat,i,25.742,1.544,30.491,41.154,46.163,46.424,0.0,...,cat,7.674408,8.083331,1,0.11036,0.048069,0.19817,2.156112,0.304463,1.997006
3,-2.889204,lrv099_catala_exp2_a.mat,a,23.114,1.239,31.431,37.203,39.094,39.701,0.0,...,cat,7.674408,8.083331,0,0.103463,0.066019,0.216104,1.612011,0.418792,1.533985
4,2.614449,lrv099_catala_exp2_a.mat,i,26.77,2.945,35.317,46.545,50.721,48.58,0.0,...,cat,12.2078,12.815098,1,0.119827,0.059827,0.619651,2.169299,0.267623,2.054012


Check `Label` values:

In [104]:
df.Label.unique()

array(['i', 'a', 'pen', 'ult', 'o'], dtype=object)

Check `stress` values:

In [105]:
df.stress.unique()

array([0, 1], dtype=int64)

In [106]:
df_1 = df[(df['Label']=="i") | (df['Label']=="pen")].copy()
df_2 = df[(df['Label']!="i") & (df['Label']!="pen")].copy()

In [107]:
df_wd = pd.merge(df_1, df_2, on=['Filename', 't1_wd', 't2_wd', 'partID', 'Language'],
                 suffixes=('_vowel1', '_vowel2'))

Label row as `Paroxytone` if vowel 1 is stressed and `Oxytone` if vowel 2 is stressed.

In [108]:
df_wd['stress_wd'] = np.where(df_wd['stress_vowel1']==1, 'Paroxytone', 'Oxytone')

New column for vowel duration ratio, new column for syllable duration ratio, new column for Delta F distance from center calculation, new column for energy ratio.

In [109]:
df_wd['vdur_ratio'] = df_wd['v_dur_vowel1'] / df_wd['v_dur_vowel2']
df_wd['syldur_ratio'] = df_wd['syl_dur_vowel1'] / df_wd['syl_dur_vowel2']
df_wd['s_deltaF_dist_vowel1'] = np.sqrt(np.square(df_wd['sF1_norm_vowel1']) + 
                                           np.sqrt(df_wd['sF2_norm_vowel1']))
df_wd['s_deltaF_dist_vowel2'] = np.sqrt(np.square(df_wd['sF1_norm_vowel2']) + 
                                           np.sqrt(df_wd['sF2_norm_vowel2']))
df_wd['p_deltaF_dist_vowel1'] = np.sqrt(np.square(df_wd['pF1_norm_vowel1']) + 
                                           np.sqrt(df_wd['pF2_norm_vowel1']))
df_wd['p_deltaF_dist_vowel2'] = np.sqrt(np.square(df_wd['pF1_norm_vowel2']) + 
                                           np.sqrt(df_wd['pF2_norm_vowel2']))
df_wd['energy_ratio'] = df_wd['Energy_mean_vowel1'] / df_wd['Energy_mean_vowel2']

Columns to drop from LDA: `Filename`, `stress_vowel1`, `stress_vowel2`, `v_dur_vowel1`, `v_dur_vowel2`, `sF1_norm_vowel1`, `sF1_norm_vowel2`, `sF2_norm_vowel1`, `sF2_norm_vowel2`, `pF1_norm_vowel1`, `pF2_norm_vowel1`, `pF1_norm_vowel2`, `pF2_norm_vowel2`

In [110]:
df_wd.drop(['Filename', 'stress_vowel1', 'stress_vowel2', 'Label_vowel1', 'Label_vowel2',
           't1_wd', 't2_wd', 'v_dur_vowel1','v_dur_vowel2', 'syl_dur_vowel1', 'syl_dur_vowel2',
            'sF1_norm_vowel1', 'sF1_norm_vowel2', 'sF2_norm_vowel1', 'sF2_norm_vowel2',
            'pF1_norm_vowel1', 'pF2_norm_vowel1', 'pF1_norm_vowel2', 'pF2_norm_vowel2',
            'Energy_mean_vowel1', 'Energy_mean_vowel2'],
           axis = 1, inplace = True)
df_wd.sample(5)

Unnamed: 0,spectilt_vowel1,CPP_mean_vowel1,HNR05_mean_vowel1,HNR15_mean_vowel1,HNR25_mean_vowel1,HNR35_mean_vowel1,SHR_mean_vowel1,partID,Language,spectilt_vowel2,...,HNR35_mean_vowel2,SHR_mean_vowel2,stress_wd,vdur_ratio,syldur_ratio,s_deltaF_dist_vowel1,s_deltaF_dist_vowel2,p_deltaF_dist_vowel1,p_deltaF_dist_vowel2,energy_ratio
224,-1.852174,21.263,26.044,36.182,41.827,43.536,0.0,lrv099,spa,-2.427186,...,48.329,0.0,Paroxytone,0.68651,1.026836,1.178616,1.10771,1.17216,1.201265,1.417065
13,3.112122,25.544,35.512,45.631,48.828,49.323,0.0,lrv099,cat,1.459192,...,45.661,0.0,Paroxytone,0.609595,0.832329,1.362015,1.281485,1.178865,1.226882,1.323772
10,2.810038,26.733,29.978,40.542,45.886,47.273,0.0,lrv099,cat,1.569637,...,43.151,0.0,Oxytone,0.717398,0.970329,1.343772,1.297056,1.214243,1.229058,2.664286
118,-3.767598,20.371,21.443,29.33,38.681,42.926,0.0,lrv099,eng,-3.255402,...,51.808,0.001,Paroxytone,0.711538,0.657399,0.987374,1.128669,1.238458,1.125686,0.930982
234,2.840482,26.007,31.984,42.117,47.032,47.869,0.0,lrv099,spa,1.62333,...,42.58,0.382,Paroxytone,1.204533,1.379769,1.361221,1.26291,1.206737,1.143557,1.574713


In [111]:
df_wd.columns

Index(['spectilt_vowel1', 'CPP_mean_vowel1', 'HNR05_mean_vowel1',
       'HNR15_mean_vowel1', 'HNR25_mean_vowel1', 'HNR35_mean_vowel1',
       'SHR_mean_vowel1', 'partID', 'Language', 'spectilt_vowel2',
       'CPP_mean_vowel2', 'HNR05_mean_vowel2', 'HNR15_mean_vowel2',
       'HNR25_mean_vowel2', 'HNR35_mean_vowel2', 'SHR_mean_vowel2',
       'stress_wd', 'vdur_ratio', 'syldur_ratio', 's_deltaF_dist_vowel1',
       's_deltaF_dist_vowel2', 'p_deltaF_dist_vowel1', 'p_deltaF_dist_vowel2',
       'energy_ratio'],
      dtype='object')

In [112]:
sc = StandardScaler()

In [65]:
def lda_leave_one_out(df):
    lang = []
    partid = []
    dfs = []
    
    for p in df.partID.unique():
        part_dicts = []

        for l in df.Language.unique():
            # isolate language of interest
            data = df[df['Language']==l].copy()
            data.drop(['Language', 'partID'], axis = 1, inplace = True)
            data.reset_index(inplace = True, drop = True)
            
            # define predictor label
            y = data['stress_wd']

            # define feature set
            X = data.drop(['stress_wd'], axis = 1)
            
            # subset data
            X_nodur = X.drop(['syldur_ratio', 'vdur_ratio'], axis = 1)
#             X_dur_vowel = X.drop(['syldur_ratio'], axis = 1)
#             X_dur_syl = X.drop(['vdur_ratio'], axis = 1)
            X_noformant = X.drop(['s_deltaF_dist_vowel1', 's_deltaF_dist_vowel2',
                                'p_deltaF_dist_vowel1', 'p_deltaF_dist_vowel2'], axis = 1)
            X_nospectilt = X.drop(['spectilt_vowel1', 'spectilt_vowel2'], axis = 1)
            X_noCPP = X.drop(['CPP_mean_vowel1', 'CPP_mean_vowel2'], axis = 1)
            X_noEnergy = X.drop(['Energy_mean_vowel1', 'Energy_mean_vowel2'], axis = 1)
            X_noHNR = X.drop(['HNR15_mean_vowel1', 'HNR25_mean_vowel1', 'HNR35_mean_vowel1',
                             'HNR15_mean_vowel2', 'HNR25_mean_vowel2', 'HNR35_mean_vowel2'],
                            axis = 1)
            X_noSHR = X.drop(['SHR_mean_vowel1', 'SHR_mean_vowel2'], axis = 1)
            X_noEpoch = X.drop(['epoch_mean_vowel1', 'epoch_mean_vowel2'], axis = 1)
            
            subset_X = [X, X_noformant, X_nodur, X_nospectilt, X_noCPP, X_noEnergy,
                       X_noHNR, X_noSHR, X_noEpoch]
            
            accuracy_list = []
            std_list = []
            
            for X_df in subset_X:
                # one-hot encoding
                #X_df = pd.get_dummies(X_df)

                # scale features
                X_df = sc.fit_transform(X_df)

                # define model and evaluation
                model = LDA()
                cv = kfold(n_splits=10, n_repeats=3, random_state=1)

                # evaluate baseline model
                scores = cross_val_score(model, X_df, y, scoring='accuracy', cv=cv, n_jobs=-1)
                accuracy_list.append(mean(scores))
                std_list.append(std(scores))
        
            lda_dict = {'partID': p, 'Language': l, 'Accuracy_total': accuracy_list[0],
                        'Std_total': std_list[0], 'Accuracy_noFormants': accuracy_list[1],
                'Std_noFormants': std_list[1],
#                         'Accuracy_Formants': accuracy_list[1], 'Std_Formant': std_list[1],
                        'Accuracy_noDur': accuracy_list[2], 'Std_noDur': std_list[2],
#                         'Accuracy_vowelDur': accuracy_list[3], 'Std_vowelDur': std_list[3],
#                         'Accuracy_sylDur': accuracy_list[4],  'Std_sylDur': std_list[4],
                        'Accuracy_nospectilt': accuracy_list[3], 'Std_nospectilt': std_list[3],
                       'Accuracy_noCPP': accuracy_list[4], 'Std_noCPP': std_list[4],
                       'Accuracy_noEnergy': accuracy_list[5], 'Std_noEnergy': std_list[5],
                       'Accuracy_noHNR': accuracy_list[6], 'Std_noEnergy': std_list[6],
                'Accuracy_noSHR': accuracy_list[7], 'Std_noSHR': std_list[7],
                        'Accuracy_noEpoch': accuracy_list[8], 'Std_noEpoch': std_list[8]}
            part_dicts.append(lda_dict)
        
        # make df for this part and append to df list
        part_df = pd.DataFrame.from_dict(part_dicts)
        dfs.append(part_df)
        
    lda_df = pd.concat(dfs)
        
    return(lda_df)

In [66]:
lda_df = lda_leave_one_out(df_wd)
lda_df.head()

Unnamed: 0,partID,Language,Accuracy_total,Std_total,Accuracy_noFormants,Std_noFormants,Accuracy_noDur,Std_noDur,Accuracy_nospectilt,Std_nospectilt,Accuracy_noCPP,Std_noCPP,Accuracy_noEnergy,Std_noEnergy,Accuracy_noHNR,Accuracy_noSHR,Std_noSHR,Accuracy_noEpoch,Std_noEpoch
0,lrv099,cat,0.751786,0.140634,0.779167,0.132551,0.74881,0.148399,0.760119,0.12196,0.760119,0.158495,0.743452,0.144676,0.685714,0.752381,0.125825,0.74881,0.150815
1,lrv099,eng,0.812222,0.106084,0.822222,0.106767,0.703333,0.146317,0.815926,0.096126,0.834074,0.12881,0.822963,0.119867,0.79037,0.797407,0.102024,0.783333,0.118139
2,lrv099,spa,0.78287,0.145601,0.79213,0.128774,0.794907,0.161521,0.774537,0.164636,0.783333,0.11924,0.730093,0.1536,0.755556,0.794907,0.151539,0.767593,0.127492
