# Training/testing within a single subject (student) for each fold:

In [1]:
# Import the toolboxes
import numpy as np 
import pandas as pd
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.linear_model import BayesianRidge
from sklearn.feature_selection import SelectFromModel
from sklearn.kernel_approximation import RBFSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Silence scikit-learn ConvergenceWarning messages.
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [3]:
# Folder and file name of the feature CSV (machine-specific absolute path).
directory = '/Users/amandabreton/Documents/Duke 2022-2023/ECE 590/Final Project/'
filename = 'meanfeatures.csv'

# Load the table, using the first CSV column as the row index.
df = pd.read_csv(os.path.join(directory, filename), index_col=0)

In [4]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,SubjectID,VideoID,predefinedlabel,user-definedlabel,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2
0,0.0,0.0,0.0,0.0,55.256944,53.826389,46.986111,544315.097222,124965.590278,36693.701389,25875.298611,20108.791667,40268.763889,40729.284722,16817.0625
1,0.0,1.0,0.0,1.0,43.621429,48.621429,28.8,739737.292857,161064.228571,34918.028571,25078.935714,22157.307143,37410.728571,36758.7,14519.407143
2,0.0,2.0,0.0,1.0,43.978873,47.316901,13.15493,694078.084507,149816.873239,30493.873239,21667.591549,21888.338028,36446.43662,33908.873239,14545.84507
3,0.0,3.0,0.0,0.0,51.057377,51.844262,34.713115,600823.688525,162653.360656,33367.278689,26281.5,17224.278689,43706.52459,41438.213115,16558.631148
4,0.0,4.0,0.0,0.0,55.224138,47.474138,30.008621,546628.017241,126893.948276,23113.844828,17017.051724,15955.87931,36427.836207,36024.818966,14752.655172


In [5]:
# Count how many rows each SubjectID contributes.
df["SubjectID"].value_counts()

0.0    10
1.0    10
2.0    10
3.0    10
4.0    10
5.0    10
6.0    10
7.0    10
8.0    10
9.0    10
Name: SubjectID, dtype: int64

In [16]:
def pick_model(model_type, C=1,verb=False):
    """Return an unfitted estimator and the hyper-parameter grid to search.

    Parameters
    ----------
    model_type : str
        One of 'logreg', 'rvm', 'knn', 'svm', 'random forest',
        'decision tree' or 'naivebayes'. Any other value falls back to
        Gaussian naive Bayes (kept for backward compatibility) but now
        emits a warning so a misspelled name cannot go unnoticed.
    C : float, default 1
        Initial regularisation strength; only used by the 'svm' model.
    verb : bool or int, default False
        Verbosity flag; only used by the 'svm' model.

    Returns
    -------
    clf : estimator
        A freshly constructed, unfitted scikit-learn estimator.
    param_grid : dict
        Parameter grid suitable for ``GridSearchCV`` (empty for naive Bayes).
    """
    if model_type == 'logreg':
        clf = LogisticRegression()
        # liblinear is selected because the grid mixes l1 and l2 penalties.
        param_grid = {"C": [0.1, 1, 10, 100], "penalty": ["l1", "l2"], "solver": ["liblinear"]}
    elif model_type == 'rvm':
        # NOTE(review): BayesianRidge is a regression estimator; scoring it
        # with classification accuracy presumably does not work as-is --
        # confirm before enabling the 'rvm' experiments.
        clf = BayesianRidge(compute_score=True)
        param_grid = {"alpha_1": [1e-6, 1e-5, 1e-4, 1e-3], "alpha_2": [1e-6, 1e-5, 1e-4, 1e-3],
                      "lambda_1": [1e-6, 1e-5, 1e-4, 1e-3], "lambda_2": [1e-6, 1e-5, 1e-4, 1e-3]}
    elif model_type == 'knn':
        clf = KNeighborsClassifier(n_neighbors=5)
        param_grid = {"n_neighbors": [1, 2, 3, 4]}
    elif model_type == 'svm':
        clf = LinearSVC(C=C,verbose=verb, max_iter=10000)
        # 18 log-spaced values from 1e-11 to 1e6.
        param_grid = {"C": np.logspace(-11,6,18), "penalty": ["l2"], "loss": ['hinge', 'squared_hinge']}
    elif model_type == 'random forest':
        clf = RandomForestClassifier()
        param_grid = {"n_estimators": [10, 50, 100, 200, 500, 1000],
                      "max_depth": [None, 5, 10, 20, 50, 100]}
    elif model_type == 'decision tree':
        clf = DecisionTreeClassifier()
        param_grid = {'criterion': ['gini', 'entropy', 'log_loss'], 'splitter': ['best', 'random']}
    else:
        if model_type != 'naivebayes':
            # Previously any unknown name silently produced naive Bayes,
            # which made typos indistinguishable from a real NB run.
            import warnings
            warnings.warn(
                f"Unknown model_type {model_type!r}; falling back to Gaussian naive Bayes.",
                stacklevel=2,
            )
        clf = GaussianNB()
        param_grid = {}
    return clf, param_grid

In [7]:
def studentspecificCV(df,verb=False, model_type = 'svm', target = 'user-definedlabel',
                      n_splits=5, random_state=None):
    """Run within-subject cross-validation and report the mean test accuracy.

    For every SubjectID present in ``df`` the subject's rows are split with a
    shuffled K-fold. On each training split a 2-fold grid search selects the
    hyper-parameters of the model returned by ``pick_model``; the refitted
    best model is then scored on the held-out rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'SubjectID', 'VideoID', 'user-definedlabel'
        and 'predefinedlabel'; every other column is used as a feature.
    verb : bool or int, default False
        Verbosity flag forwarded to ``pick_model``.
    model_type : str, default 'svm'
        Model name understood by ``pick_model``.
    target : str, default 'user-definedlabel'
        Name of the label column to predict.
    n_splits : int, default 5
        Number of outer folds per subject.
    random_state : int or None, default None
        Seed for the outer fold shuffling; None keeps the previous
        non-deterministic behaviour.

    Returns
    -------
    float
        Mean test accuracy over all subjects and folds (also printed).
    """
    non_feature_cols = ['SubjectID', 'VideoID', 'user-definedlabel', 'predefinedlabel']
    # Collect scores in a list so the average never includes unused
    # pre-allocated zeros when the subject or fold count differs from 10 x 5.
    accs = []
    for _, curr_student in df.groupby('SubjectID'):
        X = curr_student.drop(non_feature_cols, axis=1).values
        y = curr_student[target].values
        folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        for train_index, test_index in tqdm(folds.split(X)):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Fresh estimator and grid for every fold.
            model, param_grid = pick_model(model_type, verb=verb)
            grid_search = GridSearchCV(model, param_grid, cv=2, scoring="accuracy")
            # GridSearchCV refits the best configuration on the whole training
            # split by default, so no additional fit call is required.
            grid_search.fit(X_train, y_train)
            y_pred = grid_search.predict(X_test)
            accs.append(accuracy_score(y_test, y_pred))
    mean_acc = np.mean(accs)
    print('Average Test Accuracy: ', mean_acc)
    return mean_acc

# Log reg 

In [8]:
# Within-subject CV: logistic regression, target 'predefinedlabel'.
studentspecificCV(df,verb=False, model_type = 'logreg', target = 'predefinedlabel')

5it [00:00, 22.74it/s]
5it [00:00, 27.08it/s]
5it [00:00, 27.51it/s]
5it [00:00, 27.12it/s]
5it [00:00, 28.11it/s]
5it [00:00, 20.97it/s]
5it [00:00, 31.33it/s]
5it [00:00, 23.28it/s]
5it [00:00, 27.05it/s]
5it [00:00, 31.88it/s]

Average Test Accuracy:  0.46





In [9]:
# Within-subject CV: logistic regression, target 'user-definedlabel'.
studentspecificCV(df,verb=False, model_type = 'logreg', target = 'user-definedlabel')

5it [00:00, 27.12it/s]
5it [00:00, 27.80it/s]
5it [00:00, 31.61it/s]
5it [00:00, 28.51it/s]
5it [00:00, 29.91it/s]
5it [00:00, 29.96it/s]
5it [00:00, 21.69it/s]
5it [00:00, 27.18it/s]
5it [00:00, 30.71it/s]
5it [00:00, 29.12it/s]

Average Test Accuracy:  0.65





# RVM 

In [11]:
#studentspecificCV(df,verb=False, model_type = 'rvm', target = 'user-definedlabel')

In [12]:
#studentspecificCV(df,verb=False, model_type = 'rvm', target = 'predefinedlabel')

# SVM 

In [13]:
# Within-subject CV: linear SVM, target 'predefinedlabel'.
studentspecificCV(df,verb=False, model_type = 'svm', target = 'predefinedlabel')

5it [00:01,  4.76it/s]
5it [00:00,  5.13it/s]
5it [00:00,  7.56it/s]
5it [00:01,  4.83it/s]
5it [00:01,  4.30it/s]
5it [00:01,  4.48it/s]
5it [00:00,  7.14it/s]
5it [00:01,  4.05it/s]
5it [00:01,  4.07it/s]
5it [00:01,  4.85it/s]

Average Test Accuracy:  0.33





In [14]:
# Within-subject CV: linear SVM, target 'user-definedlabel'.
studentspecificCV(df,verb=False, model_type = 'svm', target = 'user-definedlabel')

5it [00:01,  3.98it/s]
5it [00:01,  4.94it/s]
5it [00:00,  5.98it/s]
5it [00:01,  4.50it/s]
5it [00:01,  4.69it/s]
5it [00:01,  4.40it/s]
5it [00:00,  6.21it/s]
5it [00:01,  4.79it/s]
5it [00:01,  4.80it/s]
5it [00:01,  4.33it/s]

Average Test Accuracy:  0.53





# KNN

In [17]:
# Within-subject CV: k-nearest neighbours, target 'user-definedlabel'.
studentspecificCV(df,verb=False, model_type = 'knn', target = 'user-definedlabel')

5it [00:00, 31.22it/s]
5it [00:00, 40.77it/s]
5it [00:00, 41.48it/s]
5it [00:00, 38.85it/s]
5it [00:00, 38.31it/s]
5it [00:00, 40.93it/s]
5it [00:00, 38.27it/s]
5it [00:00, 39.06it/s]
5it [00:00, 42.58it/s]
5it [00:00, 41.54it/s]

Average Test Accuracy:  0.79





In [18]:
# Within-subject CV: k-nearest neighbours, target 'predefinedlabel'.
studentspecificCV(df,verb=False, model_type = 'knn', target = 'predefinedlabel')

5it [00:00, 35.81it/s]
5it [00:00, 38.04it/s]
5it [00:00, 37.33it/s]
5it [00:00, 37.93it/s]
5it [00:00, 40.91it/s]
5it [00:00, 42.73it/s]
5it [00:00, 43.29it/s]
5it [00:00, 38.58it/s]
5it [00:00, 34.16it/s]
5it [00:00, 36.62it/s]

Average Test Accuracy:  0.5





# Random Forest

In [19]:
# Within-subject CV: random forest, target 'user-definedlabel'.
studentspecificCV(df,verb=False, model_type = 'random forest', target = 'user-definedlabel')

5it [02:32, 30.40s/it]
5it [02:33, 30.62s/it]
5it [02:34, 30.93s/it]
5it [02:50, 34.03s/it]
5it [02:38, 31.79s/it]
5it [02:29, 29.87s/it]
5it [02:42, 32.50s/it]
5it [02:30, 30.14s/it]
5it [02:27, 29.57s/it]
5it [02:30, 30.19s/it]

Average Test Accuracy:  0.8





In [20]:
# Within-subject CV: random forest, target 'predefinedlabel'.
studentspecificCV(df,verb=False, model_type = 'random forest', target = 'predefinedlabel')

5it [02:27, 29.52s/it]
5it [02:44, 33.00s/it]
5it [02:46, 33.36s/it]
5it [02:44, 32.89s/it]
5it [03:22, 40.49s/it]
5it [03:26, 41.30s/it]
5it [03:32, 42.58s/it]
5it [03:34, 42.88s/it]
5it [03:33, 42.64s/it]
5it [03:18, 39.77s/it]

Average Test Accuracy:  0.57





# Decision Tree

In [21]:
# Within-subject CV: decision tree, target 'user-definedlabel'.
studentspecificCV(df,verb=False, model_type = 'decision tree', target = 'user-definedlabel')

5it [00:00, 33.31it/s]
5it [00:00, 35.39it/s]
5it [00:00, 35.23it/s]
5it [00:00, 35.22it/s]
5it [00:00, 37.04it/s]
5it [00:00, 34.32it/s]
5it [00:00, 36.78it/s]
5it [00:00, 36.53it/s]
5it [00:00, 37.22it/s]
5it [00:00, 37.37it/s]

Average Test Accuracy:  0.76





In [22]:
# Within-subject CV: decision tree, target 'predefinedlabel'.
studentspecificCV(df,verb=False, model_type = 'decision tree', target = 'predefinedlabel')

5it [00:00, 36.83it/s]
5it [00:00, 37.04it/s]
5it [00:00, 36.22it/s]
5it [00:00, 36.35it/s]
5it [00:00, 36.99it/s]
5it [00:00, 37.39it/s]
5it [00:00, 37.39it/s]
5it [00:00, 37.06it/s]
5it [00:00, 37.02it/s]
5it [00:00, 37.55it/s]

Average Test Accuracy:  0.5





# Naive Bayes

In [40]:
# Within-subject CV: Gaussian naive Bayes, target 'user-definedlabel'.
studentspecificCV(df,verb=False, model_type = 'naivebayes', target = 'user-definedlabel')

5it [00:00, 101.59it/s]
5it [00:00, 148.62it/s]
5it [00:00, 141.81it/s]
5it [00:00, 155.90it/s]
5it [00:00, 161.19it/s]
5it [00:00, 163.60it/s]
5it [00:00, 141.76it/s]
5it [00:00, 151.46it/s]
5it [00:00, 162.58it/s]
5it [00:00, 162.65it/s]

Average Test Accuracy:  0.73





In [41]:
# Within-subject CV: Gaussian naive Bayes, target 'predefinedlabel'.
studentspecificCV(df,verb=False, model_type = 'naivebayes', target = 'predefinedlabel')

5it [00:00, 119.66it/s]
5it [00:00, 145.22it/s]
5it [00:00, 156.80it/s]
5it [00:00, 155.94it/s]
5it [00:00, 170.82it/s]
5it [00:00, 171.52it/s]
5it [00:00, 153.02it/s]
5it [00:00, 163.89it/s]
5it [00:00, 141.88it/s]
5it [00:00, 157.16it/s]

Average Test Accuracy:  0.48



