In [1]:
%matplotlib inline 

import os
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [2]:
eng_levels = [-2, -1, 0, 1, 2, 3]

## DisVoice Features

In [3]:
### !!!! Also test with kmeans featurizer !!! ###

In [4]:
base_dir = "../sound/features/"

In [5]:
# Load data
df = pd.read_csv(os.path.join(base_dir, 'all.csv'))
labels = df['label']
df.head()

Unnamed: 0,F0avg,F0std,F0max,F0min,F0skew,F0kurt,F0tiltavg,F0mseavg,F0tiltstd,F0msestd,...,maxdurpause,mindurpause,PVU,PU,UVU,VVU,VP,UP,path,label
0,107.251472,4.754879,112.14225,97.89109,-1.044098,-0.510219,38.107999,21.846451,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,C:\Users\ASABUNCUOGLU13\Documents\data\vol02\e...,0
1,107.2491,4.753147,112.142258,97.891144,-1.044382,-0.509233,38.172404,21.827407,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,C:\Users\ASABUNCUOGLU13\Documents\data\vol02\e...,0
2,114.07309,35.353394,238.759155,69.948997,1.999682,2.876741,-39.423164,100.290732,227.348111,195.716531,...,1.19,0.17,1.298283,3.626374,0.358011,0.641989,0.49449,0.275758,C:\Users\ASABUNCUOGLU13\Documents\data\vol02\e...,-1
3,131.886368,30.292049,241.64798,66.003014,0.802149,0.373328,-31.940573,135.09657,237.915601,177.521635,...,0.98,0.15,1.087145,4.711485,0.230744,0.769256,0.707593,0.212247,C:\Users\ASABUNCUOGLU13\Documents\data\vol02\e...,0
4,118.834885,20.374716,223.293121,66.754921,1.245751,3.671583,-79.806733,89.552508,227.226774,94.320115,...,0.93,0.16,1.166132,5.765267,0.202269,0.797731,0.684083,0.173452,C:\Users\ASABUNCUOGLU13\Documents\data\vol02\e...,1


In [6]:
# Define Feature Series Ranges
r_f0 = range(1,7)
r_dur_voiced = range(80, 86)
r_dur_unvoiced = range(86, 92)

df_f0 = df.iloc[:, r_f0]
df_dur_voiced = df.iloc[:, r_dur_voiced]
df_dur_unvoiced = df.iloc[:, r_dur_unvoiced]
#df_f0.to_csv("../sound/reduced/f0.tsv", sep="\t", index=False)

In [7]:
df_f0['label'] = labels.values
df_dur_voiced['label'] = labels.values
df_dur_unvoiced['label'] = labels.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_f0['label'] = labels.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dur_voiced['label'] = labels.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dur_unvoiced['label'] = labels.values


In [8]:
df_all = pd.concat([df_f0.iloc[:, :-1], 
df_dur_voiced.iloc[:, :-1],
df_dur_unvoiced],axis=1)


In [9]:
feature_sets = {
    "F0": df_f0,
    "Duration of Voiced": df_dur_voiced,
    "Duration of UnVoiced": df_dur_unvoiced,
    "All Selected Features": df_all,
    "All Features": df
}

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

In [13]:
classifier_names = ['LR', 'knn', 'rbf svm', 'random forest', 'boosted trees']
classifiers = [LogisticRegression(random_state=42, solver="liblinear"),
                KNeighborsClassifier(n_neighbors=6),
                SVC(gamma=2, C=1),
                RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
                GradientBoostingClassifier(n_estimators=10, learning_rate=1, max_depth=5)]

results = pd.DataFrame(columns= ['LR', 'knn', 'rbf svm', 'random forest', 'boosted trees', 'title'])

for title in feature_sets:
    s = [0, 0, 0, 0, 0, "title"]
    dfc = feature_sets[title]
    not_zero_ind = ~(dfc == 0).all(axis=1)

    dfc = dfc.loc[not_zero_ind]
    labels = dfc['label'].loc[not_zero_ind]

    not_nan_index = ~dfc.isna().any(axis=1)
    dfc = dfc[not_nan_index]
    labels = labels[not_nan_index]

    scaler = StandardScaler()
    scaled_samples = scaler.fit_transform(dfc.iloc[:,:-2])

    X_train, X_test, y_train, y_test = train_test_split(scaled_samples, labels, test_size=0.2, random_state=42, stratify=labels)

    i = 0
    for model in classifiers:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        res = f1_score(y_test, y_pred, average='weighted')
        print(title, model, res)
        s[i] = res
        i +=1
    s[i] = title
    results.loc[len(results.index)] = s
    #results.head()

F0 LogisticRegression(random_state=42, solver='liblinear') 0.326872930990578
F0 KNeighborsClassifier(n_neighbors=6) 0.3368326118326119
F0 SVC(C=1, gamma=2) 0.3495254745254745
F0 RandomForestClassifier(max_depth=5, max_features=1, n_estimators=10) 0.34880382775119617
F0 GradientBoostingClassifier(learning_rate=1, max_depth=5, n_estimators=10) 0.360340417158599
Duration of Voiced LogisticRegression(random_state=42, solver='liblinear') 0.4655110982714622
Duration of Voiced KNeighborsClassifier(n_neighbors=6) 0.42656158545130785
Duration of Voiced SVC(C=1, gamma=2) 0.4008859357696567
Duration of Voiced RandomForestClassifier(max_depth=5, max_features=1, n_estimators=10) 0.32597268364710225
Duration of Voiced GradientBoostingClassifier(learning_rate=1, max_depth=5, n_estimators=10) 0.3992808497092462
Duration of UnVoiced LogisticRegression(random_state=42, solver='liblinear') 0.3809958322030136
Duration of UnVoiced KNeighborsClassifier(n_neighbors=6) 0.4633537331701346
Duration of UnVoiced 

In [14]:
results.to_csv('reports/f1_scores_disvoice.csv')

In [None]:
from sklearn.model_selection import cross_val_score

i = 0
for model in classifiers: 
    scores = cross_val_score(model, scaled_samples, labels, cv=5)
    print("cross val scores of scaled %s:" % classifier_names[i], scores)
    scores = cross_val_score(model, dfc.iloc[:,:-2], labels, cv=5)
    print("cross val scores of %s:" % classifier_names[i], scores)

    i +=1

In [None]:
plt.figure()

for c in classifier_names:
    fpr, tpr = test_roc(y_test.values, y_pred)
    plt.plot(fpr, tpr, label=c)

plt.plot([0, 1], [0, 1], 'k--')
plt.legend()