In [2]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.model_selection
import sklearn.metrics
import sklearn.inspection
import sklearn.linear_model
import sklearn.svm
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv("../wavData.csv")
#drop columns unimportant to modeling
df.drop(['filename', 'frameID'], axis=1, inplace=True)
df['intelligence'] = df['intelligence'].map({"Human": 0, "AI": 1})
X = df.drop(['intelligence'], axis=1, inplace=False)
Y = df['intelligence']
#split to training and test data
# 80/20 split
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X, Y, test_size=0.2, random_state=10)

In [None]:
#find the best params for svm

svc = sklearn.svm.SVC()

cVals = [1, 10, 100, 1000]
gammaVals = np.arange(0.1, 1, 0.05)


paramGrid1 = {'C': cVals, 'kernel': ['linear']}
paramGrid2 = {'C': cVals, 'kernel': ['rbf'], 'gamma': gammaVals}
paramGrid4 = {'C': cVals, 'kernel': ['sigmoid'], 'gamma': gammaVals}

In [10]:
gridSearchLinear = sklearn.model_selection.GridSearchCV(svc, paramGrid1, cv=5)
gridSearchLinear.fit(X_train, Y_train)
bestParamsLinear = gridSearchLinear.best_params_
print(bestParamsLinear)

{'C': 100, 'kernel': 'linear'}


In [11]:
gridSearchRBF = sklearn.model_selection.GridSearchCV(svc, paramGrid2, cv=5)
gridSearchRBF.fit(X_train, Y_train)
bestParamsRBF = gridSearchRBF.best_params_
print(bestParamsRBF)

{'C': 10, 'gamma': np.float64(0.9500000000000003), 'kernel': 'rbf'}


In [8]:
gridSearchSigmoid = sklearn.model_selection.GridSearchCV(svc, paramGrid4, cv=5)
gridSearchSigmoid.fit(X_train, Y_train)
bestParamsSigmoid = gridSearchSigmoid.best_params_
print(bestParamsSigmoid)

{'C': 1, 'gamma': np.float64(0.1), 'kernel': 'sigmoid'}


In [9]:
#compare best params. Hard code values in case jupyter kernel reset
paramGridAll = [{'C': [100], 'kernel': ['linear']},
                {'C': [10], 'kernel': ['rbf'], 'gamma': [0.95]},
                {'C': [1], 'kernel': ['sigmoid'], 'gamma': [0.1]}]
gridSearchAll = sklearn.model_selection.GridSearchCV(svc, paramGridAll, cv=5)
gridSearchAll.fit(X_train, Y_train)
bestParamsAll = gridSearchAll.best_params_
print(bestParamsAll)

{'C': 10, 'gamma': 0.95, 'kernel': 'rbf'}


In [10]:
model = sklearn.svm.SVC(C=10, gamma=0.95, kernel='rbf')
model.fit(X_train, Y_train)

In [13]:
y_predictions = model.predict(X_test)

In [14]:
#examine accuracy of model
print("accuracy: {0:0.4f}".format(sklearn.metrics.accuracy_score(Y_test, y_predictions)) + "\n")
print("training score: {0:0.4f}".format(model.score(X_train, Y_train)))
print("test score: {0:0.4f}".format(model.score(X_test, Y_test)) + "\n")

#check confusion matrix

confusionMatrix = sklearn.metrics.confusion_matrix(Y_test, y_predictions)
print("Confusion Matrix:")
print("true pos: ", confusionMatrix[0, 0])
print("true neg: ", confusionMatrix[1, 1])
print("false pos: ", confusionMatrix[0, 1])
print("false neg: ", confusionMatrix[1, 0])

accuracy: 0.9857

training score: 0.9987
test score: 0.9857

Confusion Matrix:
true pos:  505
true neg:  600
false pos:  7
false neg:  9


In [15]:
#find features that model is most dependent on
permutationImport = sklearn.inspection.permutation_importance(model, X_test, Y_test, n_repeats=5, random_state=1)

sortImportance = list(zip(X_test.columns, permutationImport.importances_mean))
sortImportance = sorted(sortImportance, key=lambda tup: tup[1], reverse=True)

for tup in sortImportance:
    print(tup[0] + ": " + str(tup[1]))

MFCC1: 0.12346119536128464
spectralRolloff: 0.1086529884032115
MFCC2: 0.07421944692239077
MFCC14: 0.07386262265834083
MFCC4: 0.06529884032114189
spectralCentroid: 0.056556645851918
spectralBandwidth: 0.052274754683318526
MFCC3: 0.05049063336306876
ZCR: 0.049063336306868946
MFCC6: 0.046922390722569186
MFCC5: 0.041926851025869814
MFCC11: 0.03675289919714546
MFCC7: 0.03247100802854599
MFCC8: 0.031043710972346195
amplitudeEnvelope: 0.030865298840321187
MFCC15: 0.029259589652096386
MFCC9: 0.02872435325602145
MFCC19: 0.02872435325602145
MFCC20: 0.027653880463871582
MFCC17: 0.02676181980374668
RMSE: 0.024977698483496912
MFCC16: 0.02479928635147195
MFCC10: 0.02319357716324715
MFCC12: 0.021231043710972397
MFCC13: 0.016592328278323
MFCC18: 0.01623550401427303


In [16]:
#cross validation scoring
crossVal = sklearn.model_selection.cross_val_score(model, X_train, Y_train, cv=10, scoring="accuracy")
print("Average cross validation: " + str(crossVal.mean()))
print("Cross Validations:")
print(crossVal)

Average cross validation: 0.9732202513522112
Cross Validations:
[0.98886414 0.9844098  0.97767857 0.95535714 0.97767857 0.97098214
 0.97544643 0.95982143 0.97767857 0.96428571]
