In [1]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.model_selection
import sklearn.metrics
import sklearn.inspection

In [2]:
df = pd.read_csv("../wavData.csv")
#drop columns unimportant to modeling
df.drop(['filename', 'frameID'], axis=1, inplace=True)
df['intelligence'] = df['intelligence'].map({"Human": 0, "AI": 1})
X = df.drop(['intelligence'], axis=1, inplace=False)
Y = df['intelligence']
#split to training and test data
# 80/20 split
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X, Y, test_size=0.2, random_state=10)

In [None]:
model = sklearn.neighbors.KNeighborsClassifier()

#find the best params value for knn
paramGrid = {'n_neighbors': range(1, 26)}
gridSearch = sklearn.model_selection.GridSearchCV(model, paramGrid, cv=5)
gridSearch.fit(X_train, Y_train)
nNeighbors = gridSearch.best_params_['n_neighbors']
print(nNeighbors)

1


In [4]:
model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=nNeighbors)
model.fit(X_train, Y_train)

In [5]:
#test out model
y_predictions = model.predict(X_test)

In [6]:
#examine accuracy of model
print("accuracy: {0:0.4f}".format(sklearn.metrics.accuracy_score(Y_test, y_predictions)) + "\n")
print("training score: {0:0.4f}".format(model.score(X_train, Y_train)))
print("test score: {0:0.4f}".format(model.score(X_test, Y_test)) + "\n")

#check confusion matrix

confusionMatrix = sklearn.metrics.confusion_matrix(Y_test, y_predictions)
print("Confusion Matrix:")
print("true pos: ", confusionMatrix[0, 0])
print("true neg: ", confusionMatrix[1, 1])
print("false pos: ", confusionMatrix[0, 1])
print("false neg: ", confusionMatrix[1, 0])

accuracy: 0.9741

training score: 1.0000
test score: 0.9741

Confusion Matrix:
true pos:  500
true neg:  592
false pos:  12
false neg:  17


In [7]:
#find features that model is most dependent on
permutationImport = sklearn.inspection.permutation_importance(model, X_test, Y_test, n_repeats=5, random_state=1)

sortImportance = list(zip(X_test.columns, permutationImport.importances_mean))
sortImportance = sorted(sortImportance, key=lambda tup: tup[1], reverse=True)

for tup in sortImportance:
    print(tup[0] + ": " + str(tup[1]))

MFCC7: 0.028724353256021407
MFCC1: 0.026404995539696684
MFCC5: 0.020874219446922405
MFCC6: 0.01891168599464761
MFCC14: 0.01801962533452277
MFCC11: 0.016057091882247975
MFCC3: 0.015343443354148079
MFCC8: 0.014272970561998211
MFCC12: 0.01123996431757357
MFCC15: 0.011061552185548584
amplitudeEnvelope: 0.010526315789473672
MFCC10: 0.010169491525423725
MFCC19: 0.00963425512934879
MFCC9: 0.008920606601248871
MFCC20: 0.007850133809099024
RMSE: 0.007493309545049054
MFCC2: 0.007136485280999127
MFCC17: 0.006958073148974142
MFCC18: 0.00695807314897412
MFCC4: 0.006779661016949157
MFCC13: 0.0066012488849241935
ZCR: 0.004995539696699369
MFCC16: 0.00428189116859945
spectralBandwidth: 0.003568242640499553
spectralRolloff: 0.00267618198037467
spectralCentroid: -0.000535236396074934


In [8]:
#cross validation scoring
crossVal = sklearn.model_selection.cross_val_score(model, X_train, Y_train, cv=10, scoring="accuracy")
print("Average cross validation: " + str(crossVal.mean()))
print("Cross Validations:")
print(crossVal)

Average cross validation: 0.9700997255806554
Cross Validations:
[0.97995546 0.97327394 0.97544643 0.95089286 0.97098214 0.97767857
 0.96205357 0.96428571 0.97991071 0.96651786]
