In [2]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.model_selection
import sklearn.metrics
import sklearn.inspection
import sklearn.linear_model
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("../wavData.csv")
#drop columns unimportant to modeling
df.drop(['filename', 'frameID'], axis=1, inplace=True)
df['intelligence'] = df['intelligence'].map({"Human": 0, "AI": 1})
X = df.drop(['intelligence'], axis=1, inplace=False)
Y = df['intelligence']
#split to training and test data
# 80/20 split
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X, Y, test_size=0.2, random_state=10)

In [6]:
model = sklearn.linear_model.LogisticRegression()

#find best params for logReg
paramGrid = {'C': [0.1, 0.25, 0.5, 0.75, 0.9], 'solver': ['liblinear'], 'penalty': ['l1', 'l2'] }
gridSearch = sklearn.model_selection.GridSearchCV(model, paramGrid, cv=5)
gridSearch.fit(X_train, Y_train)
bestParams = gridSearch.best_params_
print(bestParams)

{'C': 0.9, 'penalty': 'l2', 'solver': 'liblinear'}


In [7]:
model = sklearn.linear_model.LogisticRegression(solver=bestParams['solver'],
                                                C=bestParams['C'],
                                                penalty=bestParams['penalty'])
model.fit(X_train, Y_train)

In [8]:
y_predictions = model.predict(X_test)

In [9]:
#examine accuracy of model
print("accuracy: {0:0.4f}".format(sklearn.metrics.accuracy_score(Y_test, y_predictions)) + "\n")
print("training score: {0:0.4f}".format(model.score(X_train, Y_train)))
print("test score: {0:0.4f}".format(model.score(X_test, Y_test)) + "\n")

#check confusion matrix

confusionMatrix = sklearn.metrics.confusion_matrix(Y_test, y_predictions)
print("Confusion Matrix:")
print("true pos: ", confusionMatrix[0, 0])
print("true neg: ", confusionMatrix[1, 1])
print("false pos: ", confusionMatrix[0, 1])
print("false neg: ", confusionMatrix[1, 0])

accuracy: 0.6717

training score: 0.6720
test score: 0.6717

Confusion Matrix:
true pos:  295
true neg:  458
false pos:  217
false neg:  151


In [10]:
#find features that model is most dependent on
permutationImport = sklearn.inspection.permutation_importance(model, X_test, Y_test, n_repeats=5, random_state=1)

sortImportance = list(zip(X_test.columns, permutationImport.importances_mean))
sortImportance = sorted(sortImportance, key=lambda tup: tup[1], reverse=True)

for tup in sortImportance:
    print(tup[0] + ": " + str(tup[1]))

MFCC14: 0.06779661016949154
MFCC5: 0.06066012488849244
MFCC1: 0.05673505798394296
spectralRolloff: 0.04906333630686892
ZCR: 0.037644959857270344
MFCC3: 0.01873327386262269
MFCC13: 0.01819803746654778
MFCC4: 0.009634255129348812
MFCC2: 0.008563782337198966
MFCC19: 0.006779661016949157
MFCC17: 0.004817127564674428
MFCC9: 0.0037466547725245603
MFCC15: 0.0023193577163247658
MFCC10: 0.001784121320249832
MFCC8: 0.0012488849241748755
MFCC20: 0.0010704727921498902
spectralBandwidth: 0.000892060660124927
spectralCentroid: -0.0005352363960748896
amplitudeEnvelope: -0.0008920606601248604
MFCC12: -0.0008920606601248604
MFCC6: -0.0021409455842997136
RMSE: -0.0023193577163246547
MFCC18: -0.0024977698483496626
MFCC7: -0.0032114183764495817
MFCC16: -0.0035682426404995303
MFCC11: -0.0062444246208742


In [11]:
#cross validation scoring
crossVal = sklearn.model_selection.cross_val_score(model, X_train, Y_train, cv=10, scoring="accuracy")
print("Average cross validation: " + str(crossVal.mean()))
print("Cross Validations:")
print(crossVal)

Average cross validation: 0.6680047923958002
Cross Validations:
[0.68151448 0.65701559 0.66071429 0.640625   0.62723214 0.67410714
 0.69642857 0.70758929 0.68973214 0.64508929]
