In [19]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.model_selection
import sklearn.metrics
import sklearn.ensemble
import math

In [2]:
df = pd.read_csv("../wavData.csv")
#drop columns unimportant to modeling
df.drop(['filename', 'frameID'], axis=1, inplace=True)
df['intelligence'] = df['intelligence'].map({"Human": 0, "AI": 1})
X = df.drop(['intelligence'], axis=1, inplace=False)
Y = df['intelligence']
#split to training and test data
# 80/20 split
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X, Y, test_size=0.2, random_state=10)

In [None]:
model = sklearn.ensemble.RandomForestClassifier()

#find the best n neighbors value for gbc
paramGrid = {'n_estimators': [100, 200, 300], 'max_features': ['sqrt', 'log2', 5],
             'max_depth': [math.floor(math.sqrt(len(X.columns))), math.floor(math.sqrt(len(X.columns)))*2]}
gridSearch = sklearn.model_selection.GridSearchCV(model, paramGrid, cv=5)
gridSearch.fit(X_train, Y_train)
bestParams = gridSearch.best_params_
print(bestParams)

# n_estimators=500 and max_features=5 had greater accuracy
# when conducting our first grid search, but exhibit signs of overfitting.
# We removed this option and added a max_depth.
# This model still exhibits signs of overfitting, but was modified
# until training accuracy < 1.0000

{'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 300}


In [34]:
model = sklearn.ensemble.RandomForestClassifier(n_estimators=bestParams['n_estimators'],
                                                max_features=bestParams['max_features'],
                                                max_depth=bestParams['max_depth'])
model.fit(X_train, Y_train)

In [35]:
#test out model
y_predictions = model.predict(X_test)

In [36]:
#examine accuracy of model
print("accuracy: {0:0.4f}".format(sklearn.metrics.accuracy_score(Y_test, y_predictions)) + "\n")
print("training score: {0:0.4f}".format(model.score(X_train, Y_train)))
print("test score: {0:0.4f}".format(model.score(X_test, Y_test)) + "\n")

#check confusion matrix

confusionMatrix = sklearn.metrics.confusion_matrix(Y_test, y_predictions)
print("Confusion Matrix:")
print("true pos: ", confusionMatrix[0, 0])
print("true neg: ", confusionMatrix[1, 1])
print("false pos: ", confusionMatrix[0, 1])
print("false neg: ", confusionMatrix[1, 0])

accuracy: 0.9063

training score: 0.9946
test score: 0.9063

Confusion Matrix:
true pos:  432
true neg:  584
false pos:  80
false neg:  25


In [37]:
#find features that model is most dependent on
permutationImport = sklearn.inspection.permutation_importance(model, X_test, Y_test, n_repeats=5, random_state=1)

sortImportance = list(zip(X_test.columns, permutationImport.importances_mean))
sortImportance = sorted(sortImportance, key=lambda tup: tup[1], reverse=True)

for tup in sortImportance:
    print(tup[0] + ": " + str(tup[1]))

MFCC14: 0.0979482604817127
MFCC1: 0.034076717216770705
ZCR: 0.03050847457627113
MFCC5: 0.01980374665477249
MFCC6: 0.01926851025869756
MFCC3: 0.016770740410347828
spectralBandwidth: 0.015878679750222946
MFCC4: 0.014272970561998166
MFCC7: 0.011953612845673445
MFCC2: 0.011061552185548584
MFCC11: 0.010704727921498614
MFCC15: 0.009991079393398694
spectralRolloff: 0.009991079393398671
MFCC19: 0.009634255129348745
MFCC18: 0.00909901873327379
MFCC9: 0.008920606601248804
MFCC20: 0.007671721677073973
RMSE: 0.00767172167707395
MFCC16: 0.00677966101694909
MFCC17: 0.006422836752899119
MFCC10: 0.006066012488849193
amplitudeEnvelope: 0.004817127564674362
MFCC12: 0.003925066904549457
MFCC8: 0.003033006244424552
spectralCentroid: 0.00249776984834964
MFCC13: 0.001248884924174809


In [38]:
#cross validation scoring
crossVal = sklearn.model_selection.cross_val_score(model, X_train, Y_train, cv=10, scoring="accuracy")
print("Average cross validation: " + str(crossVal.mean()))
print("Cross Validations:")
print(crossVal)

Average cross validation: 0.9051687281259942
Cross Validations:
[0.93318486 0.91091314 0.90178571 0.89285714 0.90178571 0.90625
 0.92410714 0.88839286 0.90625    0.88616071]
