In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import sys
import time
import warnings
warnings.filterwarnings('ignore')

In [66]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
    


In [67]:
# Loading in the cleaned dataset
df_clean = pd.read_csv('data/clean/features_30_sec_pca.csv')
df_clean.head()

# Splitting the predictor value from the remainder of the dataset

X = df_clean.drop(['label'], axis=1)
y = df_clean['label']

In [68]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, mean_absolute_error, mean_squared_error, r2_score
from statistics import mean


# Creating a reusable function for churning through all five binary classification algorithms
def generate_binary_classification_model(X, y, model_algorithm, hyperparameters, needs_scaled = False):
    """
    Generating everything required for training and validation of a binary classification model

    Args:
        - X (Pandas DataFrame): A DataFrame containing the cleaned training data
        - y (Pandas DataFrame): A DataFrame containing the target values correlated to the X training data
        - model_algorithm (object): A model algorithm that will be trained against the X and y data
        - hyperparameters (dict): A dictionary containing all the hyperparameters to test the model with
        - needs_scaled (Boolean): A boolean value that indicates whether or not the input dataset
    """

    # Performing a scaling on the data if required
    if needs_scaled:

        # Instantiating the StandardScaler
        scaler = StandardScaler()

        # Performing a fit_transform on the dataset
        scaled_features = scaler.fit_transform(X)

        # Transforming the StandardScaler output back into a Pandas DataFrame
        X = pd.DataFrame(scaled_features, index = X.index, columns = X.columns)

    # Instantiating a GridSearch object with the inputted model algorithm and hyperparameters
    gridsearchcv = GridSearchCV(estimator = model_algorithm,
                                param_grid = hyperparameters)

    # Fitting the training data to the GridSearch object
    gridsearchcv.fit(X, y)

    # Printing out the best hyperparameters
    print(f'Best hyperparameters: {gridsearchcv.best_params_}')

    # Instantiating a new model object with the ideal hyperparameters from the GridSearch job
    model_algorithm.set_params(**gridsearchcv.best_params_)

    # Creating a container to hold each set of validation metrics
    accuracy_scores, roc_auc_scores, f1_scores = [], [], []

    # Instantiating the K-Fold cross validation object
    k_fold = KFold(n_splits = 5)

    # Iterating through each of the folds in K-Fold
    for train_index, val_index in k_fold.split(X):

        # Splitting the training set from the validation set for this specific fold
        X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Fitting the X_train and y_train datasets to the model algorithm
        model_algorithm.fit(X_train, y_train)

        # Getting inferential predictions for the validation dataset
        val_preds = model_algorithm.predict(X_val)

        # Generating validation metrics by comparing the inferential predictions (val_preds) to the actuals (y_val)
        val_accuracy = accuracy_score(y_val, val_preds)
        val_roc_auc_score = roc_auc_score(y_val, val_preds, average = 'macro')
        val_f1_score = f1_score(y_val, val_preds, average='macro')

        # Appending the validation scores to the respective validation metric container
        accuracy_scores.append(val_accuracy)
        roc_auc_scores.append(val_roc_auc_score)
        f1_scores.append(val_f1_score)

    # Getting the average (mean) of each validation score
    average_accuracy = int(mean(accuracy_scores) * 100)
    average_roc_auc_score = int(mean(roc_auc_scores) * 100)
    average_f1_score = int(mean(f1_scores) * 100)

    # Printing out the average validation metrics
    print(f'Average accuracy score: {average_accuracy}%')
    print(f'Average ROC AUC score: {average_roc_auc_score}%')
    print(f'Average F1 score: {average_f1_score}%')

In [69]:
# MODEL1: Logistic Regression
# Instantiating a LogisticRegression object
logreg = LogisticRegression()

# Setting the hyperparameter grid for the Logistic Regression algorithm
logistic_reg_params = {
    'penalty': ['l1', 'l2'],
    'C': np.logspace(-4, 4, 20),
    'solver': ['lbfgs', 'liblinear']
}


X = df_clean.drop(['label'], axis=1)
y = df_clean['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# train the model
logreg.fit(X_train, y_train)

# predict the test data
y_pred = logreg.predict(X_test)


# print the accuracy score
print(f'Accuracy score: {accuracy_score(y_test, y_pred)}')

# F1 score
print(f'F1 score: {f1_score(y_test, y_pred, average="macro")}')

Accuracy score: 0.68
F1 score: 0.6756053709976788


In [70]:
# Model 2: Support Vector Machine
# Instantiating a Support Vector Machine object
svm = SVC()

# Setting the hyperparameter grid for the Support Vector Machine algorithm
svm_params = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

svm.fit(X_train, y_train)

# predict the test data
y_pred = svm.predict(X_test)


# print the accuracy score
print(f'Accuracy score: {accuracy_score(y_test, y_pred)}')

# F1 score
print(f'F1 score: {f1_score(y_test, y_pred, average="macro")}')

Accuracy score: 0.675
F1 score: 0.6702480229608713


In [71]:
# Model 3: XGBoost Classifier

from xgboost import XGBClassifier

# Create a classifier
xgb = XGBClassifier(booster='gbtree', objective='multi:softprob', random_state=42, eval_metric="auc", num_class=10)

# Fit the classifier with the training data
xgb.fit(X_train,y_train)


from sklearn.metrics import roc_auc_score
from sklearn import preprocessing

# Use trained model to predict output of test dataset
val = xgb.predict(X_test)

lb = preprocessing.LabelBinarizer()
lb.fit(y_test)

y_test_lb = lb.transform(y_test)
val_lb = lb.transform(val)

roc_auc_score(y_test_lb, val_lb, average='macro')

0.8049262544042122

In [72]:

# MODEL4: Random Forest
# Instantiating a RandomForestClassifier object
rfc = RandomForestClassifier()

# Creating a hyperparameter grid for RandomForestClassifier
rfc_hyperparameters = {'n_estimators': [100, 200, 300], 
                          'max_depth': [None, 5, 10, 20, 30],
                            'min_samples_split': [2, 5, 10],
                            'min_samples_leaf': [1, 2, 4]}

rfc.fit(X_train, y_train)

# predict the test data
y_pred = rfc.predict(X_test)


# print the accuracy score
print(f'Accuracy score: {accuracy_score(y_test, y_pred)}')

# F1 score
print(f'F1 score: {f1_score(y_test, y_pred, average="macro")}')


Accuracy score: 0.67
F1 score: 0.6547855209640339


In [73]:
# MODEL5: KNN
# Instantiating a KNeighborsClassifier object
knn = KNeighborsClassifier()

# Creating a hyperparameter grid for KNeighborsClassifier
knn_hyperparameters = {'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
                          'weights': ['uniform', 'distance'],   
                            'algorithm': ['auto', 'ball_tree', 'kd_tree'],
                            'p': [1, 2]}
knn.fit(X_train, y_train)

# predict the test data
y_pred = knn.predict(X_test)


# print the accuracy score
print(f'Accuracy score: {accuracy_score(y_test, y_pred)}')

# F1 score
print(f'F1 score: {f1_score(y_test, y_pred, average="macro")}')

Accuracy score: 0.63
F1 score: 0.6232271905274428


In [74]:
# MODEL6: Convolutional Neural Network
# Importing the required libraries

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

In [75]:
# Instantiating a Sequential object
model = Sequential()

# Adding the first convolutional layer
model.add(layers.Conv2D(32, (3, 3), activation='relu'))

# Adding the second convolutional layer
model.add(layers.Conv2D(64, (3, 3), activation='relu'))

# Adding the third convolutional layer
model.add(layers.Conv2D(128, (3, 3), activation='relu'))

# Adding the fourth convolutional layer
model.add(layers.Conv2D(128, (3, 3), activation='relu'))

# Adding the max pooling layer
model.add(layers.MaxPooling2D((2, 2)))

# Adding the flatten layer
model.add(layers.Flatten())

# Adding the first dense layer
model.add(layers.Dense(512, activation='relu'))

# Adding the output layer
model.add(layers.Dense(10, activation='softmax'))