# Training/testing within a single subject (student) for each fold:

In [1]:
# import the toolboxes
import numpy as np 
import pandas as pd
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.linear_model import BayesianRidge
from sklearn.feature_selection import SelectFromModel
from sklearn.kernel_approximation import RBFSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


In [2]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
# Ignore scikit-learn ConvergenceWarning messages from here on, so solvers
# that hit their iteration limit do not flood the cell output.
simplefilter("ignore", category=ConvergenceWarning)

In [3]:
# Location of the feature table. The absolute path below is machine-specific;
# a repository-relative alternative is directory = 'cleaned_data/'.
directory = '/Users/amandabreton/Documents/Duke 2022-2023/ECE 590/Final Project/'
filename = 'meanfeatures.csv'

# `directory` already ends with a separator, so the two parts are simply
# joined. The first CSV column becomes the row index.
df = pd.read_csv(f"{directory}{filename}", index_col=0)

In [4]:
# Z-score every feature column in place: subtract the column mean and divide
# by the column standard deviation. The statistics are computed over the whole
# table (all subjects together), before any train/test split is made.
feature_columns = [
    "Beta1", "Alpha1", "Gamma2", "Theta", "Attention", "Mediation",
    "Raw", "Delta", "Beta2", "Gamma1", "Alpha2",
]
for column in feature_columns:
    values = df[column]
    df[column] = (values - values.mean()) / values.std()



In [5]:
# Preview the first rows of the table after standardization.
df.head()

Unnamed: 0,SubjectID,VideoID,predefinedlabel,user-definedlabel,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2
0,0.0,0.0,0.0,0.0,0.799238,0.370443,-0.189427,-0.221254,-0.487078,-0.197569,-0.310132,-0.276612,0.048944,0.23606,0.121001
1,0.0,1.0,0.0,1.0,0.134575,0.080433,-0.367022,0.464321,-0.080545,-0.271106,-0.343943,-0.142894,-0.007846,0.153857,0.00621
2,0.0,2.0,0.0,1.0,0.154993,0.007747,-0.519804,0.30414,-0.20721,-0.454326,-0.488778,-0.160451,-0.027006,0.094858,0.007531
3,0.0,3.0,0.0,0.0,0.559343,0.260003,-0.309278,-0.023012,-0.062649,-0.335328,-0.292885,-0.464901,0.117254,0.250737,0.10809
4,0.0,4.0,0.0,0.0,0.797364,0.016508,-0.35522,-0.21314,-0.465362,-0.759959,-0.686226,-0.547697,-0.027376,0.138664,0.017863


In [6]:
# Count how many rows each subject contributes.
df["SubjectID"].value_counts()

SubjectID
0.0    10
1.0    10
2.0    10
3.0    10
4.0    10
5.0    10
6.0    10
7.0    10
8.0    10
9.0    10
Name: count, dtype: int64

In [7]:
def pick_model(model_type, C=1,verb=False):
    """Return an unfitted estimator and the hyperparameter grid to search for it.

    Parameters
    ----------
    model_type : str
        One of 'logreg', 'rvm', 'knn', 'svm', 'random forest',
        'decision tree' or 'naivebayes'. Any other value falls back to
        Gaussian naive Bayes (kept for backward compatibility) and emits a
        warning so that a misspelled name does not go unnoticed.
    C : float, default 1
        Initial regularization strength; only used by the 'svm' estimator.
        A grid search over the returned grid overrides it.
    verb : bool, default False
        Verbosity flag; only used by the 'svm' estimator.

    Returns
    -------
    clf : estimator
        A freshly constructed, unfitted scikit-learn estimator.
    param_grid : dict
        Mapping of parameter name to candidate values (empty for naive Bayes).
    """
    if model_type == 'logreg':
        clf = LogisticRegression()
        # liblinear is pinned because the grid mixes l1 and l2 penalties.
        param_grid = {"C": [0.1, 1, 10, 100], "penalty": ["l1", "l2"], "solver": ["liblinear"]}
    elif model_type == 'rvm':
        # NOTE(review): BayesianRidge is a regression estimator, not a
        # classifier, so it is unlikely to work with accuracy-based scoring
        # -- confirm before enabling this option.
        clf = BayesianRidge(compute_score=True)
        param_grid = {"alpha_1": [1e-6, 1e-5, 1e-4, 1e-3], "alpha_2": [1e-6, 1e-5, 1e-4, 1e-3],
                      "lambda_1": [1e-6, 1e-5, 1e-4, 1e-3], "lambda_2": [1e-6, 1e-5, 1e-4, 1e-3]}
    elif model_type == 'knn':
        # n_neighbors=5 is only the starting value; the grid below replaces it.
        clf = KNeighborsClassifier(n_neighbors=5)
        Ns = [1,2,3,4]
        param_grid = {"n_neighbors": Ns}
    elif model_type == 'svm':
        clf = LinearSVC(C=C,verbose=verb, max_iter=10000)
        # 18 log-spaced values from 1e-11 to 1e6.
        Cs = np.logspace(-11,6,18)
        param_grid = {"C": Cs, "penalty": ["l2"], "loss": ['hinge', 'squared_hinge']}
    elif model_type == 'random forest':
        clf = RandomForestClassifier()
        param_grid = {"n_estimators": [10, 50, 100, 200, 500, 1000], "max_depth":
                      [None, 5, 10, 20, 50, 100]}
    elif model_type == 'decision tree':
        clf = DecisionTreeClassifier()
        param_grid = {'criterion': ['gini', 'entropy', 'log_loss'], 'splitter': ['best', 'random']}
    else:
        if model_type != 'naivebayes':
            # Keep the historical fallback, but surface typos such as
            # 'randomforest' instead of silently reporting naive Bayes results.
            import warnings
            warnings.warn(
                "Unrecognized model_type %r; falling back to GaussianNB." % (model_type,),
                stacklevel=2,
            )
        clf = GaussianNB()
        param_grid = {}
    return clf, param_grid

In [23]:
def studentspecificCV(df,verb=False, model_type = 'svm', target = 'user-definedlabel',
                      n_splits=5, random_state=None):
    """Run K-fold cross-validation separately inside each subject's rows.

    For every distinct ``SubjectID`` the subject's rows are split into
    ``n_splits`` shuffled folds. On each fold a 2-fold grid search over the
    grid returned by ``pick_model`` is fitted on the training rows, and the
    refitted best estimator is scored on the held-out rows. The best
    hyperparameters of every fold and the mean test accuracy over all
    subjects and folds are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'SubjectID', 'VideoID', 'user-definedlabel' and
        'predefinedlabel'; every remaining column is used as a feature.
    verb : bool, default False
        Forwarded to ``pick_model`` as its verbosity flag.
    model_type : str, default 'svm'
        Model name understood by ``pick_model``.
    target : str, default 'user-definedlabel'
        Name of the label column to predict.
    n_splits : int, default 5
        Number of outer folds per subject.
    random_state : int or None, default None
        Seed for the outer fold shuffling; None keeps the shuffling unseeded.

    Raises
    ------
    ValueError
        If ``df`` contains no subject to evaluate.
    """
    non_feature_cols = ['SubjectID', 'VideoID', 'user-definedlabel', 'predefinedlabel']
    # Collect one accuracy per (subject, fold). A list avoids the zero-padded
    # mean that a fixed-size buffer would produce when the counts differ.
    accs = []
    subject_ids = np.sort(df['SubjectID'].dropna().unique())
    for subject_id in subject_ids:
        curr_student = df.loc[df['SubjectID'] == subject_id]
        X = curr_student.drop(non_feature_cols, axis=1).values
        y = curr_student[target].values
        folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        for train_index, test_index in tqdm(folds.split(X)):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Fresh estimator and grid for every fold.
            model, param_grid = pick_model(model_type, verb=verb)
            grid_search = GridSearchCV(model, param_grid, cv=2, scoring="accuracy")
            grid_search.fit(X_train, y_train)
            print("Training: Best Hyperparameters:", grid_search.best_params_)

            # With the default refit=True, best_estimator_ has already been
            # refitted on all of X_train, so no second fit is needed.
            y_pred = grid_search.best_estimator_.predict(X_test)
            accs.append(accuracy_score(y_test, y_pred))
    if not accs:
        raise ValueError("df contains no subjects to cross-validate")
    print('Average Test Accuracy: ', np.mean(accs))

# Log reg 

In [9]:
# Logistic regression scored against the predefined labels.
studentspecificCV(df, model_type='logreg', target='predefinedlabel')

5it [00:00, 17.04it/s]
5it [00:00, 20.49it/s]
5it [00:00, 17.22it/s]
5it [00:00, 20.77it/s]
5it [00:00, 19.07it/s]
5it [00:00, 21.02it/s]
5it [00:00, 20.61it/s]
5it [00:00, 20.40it/s]
5it [00:00, 14.38it/s]
5it [00:00, 18.84it/s]

Average Test Accuracy:  0.49





In [10]:
# Logistic regression scored against the user-defined labels.
studentspecificCV(df, model_type='logreg', target='user-definedlabel')

5it [00:00, 18.03it/s]
5it [00:00, 19.43it/s]
5it [00:00, 14.92it/s]
5it [00:00, 19.30it/s]
5it [00:00, 19.08it/s]
5it [00:00, 19.58it/s]
5it [00:00, 18.53it/s]
5it [00:00, 19.57it/s]
5it [00:00, 20.23it/s]
5it [00:00, 17.16it/s]

Average Test Accuracy:  0.65





# RVM 

In [11]:
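# Disabled: the 'rvm' option builds a BayesianRidge regressor, whose continuous
# predictions presumably cannot be scored with classification accuracy.
#studentspecificCV(df,verb=False, model_type = 'rvm', target = 'user-definedlabel')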

In [12]:
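# Disabled: the 'rvm' option builds a BayesianRidge regressor, whose continuous
# predictions presumably cannot be scored with classification accuracy.
#studentspecificCV(df,verb=False, model_type = 'rvm', target = 'predefinedlabel')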

# SVM 

In [13]:
# Linear SVM scored against the predefined labels.
studentspecificCV(df, model_type='svm', target='predefinedlabel')

5it [00:00,  5.47it/s]
5it [00:00,  5.31it/s]
5it [00:01,  4.37it/s]
5it [00:00,  5.37it/s]
5it [00:01,  4.50it/s]
5it [00:01,  4.90it/s]
5it [00:01,  4.25it/s]
5it [00:00,  5.29it/s]
5it [00:01,  4.85it/s]
5it [00:00,  5.56it/s]

Average Test Accuracy:  0.41





In [14]:
# Linear SVM scored against the user-defined labels.
studentspecificCV(df, model_type='svm', target='user-definedlabel')

5it [00:00,  5.61it/s]
5it [00:00,  5.40it/s]
5it [00:01,  4.20it/s]
5it [00:00,  5.02it/s]
5it [00:01,  4.62it/s]
5it [00:01,  4.97it/s]
5it [00:01,  4.92it/s]
5it [00:01,  4.72it/s]
5it [00:01,  4.53it/s]
5it [00:00,  5.44it/s]

Average Test Accuracy:  0.71





# KNN

In [15]:
# k-nearest neighbours scored against the user-defined labels.
studentspecificCV(df, model_type='knn', target='user-definedlabel')

5it [00:00,  5.76it/s]
5it [00:00,  7.35it/s]
5it [00:00,  7.55it/s]
5it [00:00,  9.72it/s]
5it [00:00,  6.79it/s]
5it [00:00,  6.90it/s]
5it [00:00,  6.32it/s]
5it [00:00,  9.84it/s]
5it [00:00,  6.49it/s]
5it [00:00,  8.58it/s]

Average Test Accuracy:  0.75





In [16]:
# k-nearest neighbours scored against the predefined labels.
studentspecificCV(df, model_type='knn', target='predefinedlabel')

5it [00:00,  6.71it/s]
5it [00:00,  7.19it/s]
5it [00:00,  6.85it/s]
5it [00:00,  7.51it/s]
5it [00:00,  7.95it/s]
5it [00:00,  6.85it/s]
5it [00:00,  7.46it/s]
5it [00:00,  7.87it/s]
5it [00:00,  7.46it/s]
5it [00:00,  7.81it/s]

Average Test Accuracy:  0.51





# Random Forest

In [17]:
# Random forest scored against the user-defined labels.
studentspecificCV(df, model_type='random forest', target='user-definedlabel')

5it [03:31, 42.22s/it]
5it [02:36, 31.27s/it]
5it [02:30, 30.15s/it]
0it [00:00, ?it/s]Bad pipe message: %s [b'\x17\xb8\xf9\x83\xd6\xef\xbbs\xed\xe9\x88\x17\xd6]\x87Q\xb3J \xfc\xcew,\xe4\xd2\xd7H.N\xecln\x84\x9f\xf0G?\x1e)\x82\xdaW}Y:\xca\x93K\x9b\xed2\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00', b'#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03']
Bad pipe message: %s [b'\x08\x08\x08\t\x08\n\x08', b'\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06']
Bad pipe message: %s [b'']
Bad pipe message: %s [b'\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 za\x82\x0eS\x1d\x9fi1\x08\xa7\xba\xc2\xd5\xe6\xc1\xb7E\xf6I+\xa2']
Bad pipe message: %s [b'b\xe8\xe2\x120u`\xcaJ!\xe8 \xcd\x0fM\x8e\x9ff \xdb$\xd9\rdmlQ\xdc\x16\xb4\x07\x00/5\xca\xc1\x1d_U}\xf6N\x0cJ\xe3(yH\xb2\xc5\x95\x00\x08\x13\x02', b'\x13\x01\x00\xf

Average Test Accuracy:  0.79





In [18]:
# Random forest scored against the predefined labels.
studentspecificCV(df, model_type='random forest', target='predefinedlabel')

5it [02:30, 30.12s/it]
5it [02:29, 29.93s/it]
5it [02:28, 29.78s/it]
5it [02:29, 29.93s/it]
5it [02:31, 30.34s/it]
5it [02:27, 29.51s/it]
5it [02:29, 29.88s/it]
5it [02:35, 31.13s/it]
5it [02:34, 30.97s/it]
5it [02:45, 33.08s/it]

Average Test Accuracy:  0.5





# Decision Tree

In [24]:
# Decision tree scored against the user-defined labels.
studentspecificCV(df, model_type='decision tree', target='user-definedlabel')

3it [00:00, 16.05it/s]

Training: Best Hyperparameters: {'criterion': 'entropy', 'splitter': 'random'}
Training: Best Accuracy Score: 0.875
Training: Best Hyperparameters: {'criterion': 'log_loss', 'splitter': 'random'}
Training: Best Accuracy Score: 0.875
Training: Best Hyperparameters: {'criterion': 'entropy', 'splitter': 'random'}
Training: Best Accuracy Score: 0.75


5it [00:00, 15.53it/s]


Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'random'}
Training: Best Accuracy Score: 0.875
Training: Best Hyperparameters: {'criterion': 'entropy', 'splitter': 'random'}
Training: Best Accuracy Score: 0.75


5it [00:00, 43.97it/s]


Training: Best Hyperparameters: {'criterion': 'log_loss', 'splitter': 'random'}
Training: Best Accuracy Score: 1.0
Training: Best Hyperparameters: {'criterion': 'entropy', 'splitter': 'best'}
Training: Best Accuracy Score: 1.0
Training: Best Hyperparameters: {'criterion': 'log_loss', 'splitter': 'best'}
Training: Best Accuracy Score: 0.875
Training: Best Hyperparameters: {'criterion': 'log_loss', 'splitter': 'random'}
Training: Best Accuracy Score: 0.875
Training: Best Hyperparameters: {'criterion': 'log_loss', 'splitter': 'best'}
Training: Best Accuracy Score: 1.0


5it [00:00, 45.65it/s]

Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'random'}
Training: Best Accuracy Score: 0.875
Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'random'}
Training: Best Accuracy Score: 0.875
Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 1.0
Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 0.75
Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 0.75



5it [00:00, 47.67it/s]

Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 0.875
Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 0.875
Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 0.875
Training: Best Hyperparameters: {'criterion': 'entropy', 'splitter': 'random'}
Training: Best Accuracy Score: 1.0
Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'random'}
Training: Best Accuracy Score: 0.75



0it [00:00, ?it/s]

Training: Best Hyperparameters: {'criterion': 'log_loss', 'splitter': 'best'}
Training: Best Accuracy Score: 0.75
Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'random'}
Training: Best Accuracy Score: 0.625
Training: Best Hyperparameters: {'criterion': 'entropy', 'splitter': 'best'}
Training: Best Accuracy Score: 0.75
Training: Best Hyperparameters: {'criterion': 'entropy', 'splitter': 'best'}
Training: Best Accuracy Score: 0.75


5it [00:00, 40.66it/s]


Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'random'}
Training: Best Accuracy Score: 0.875


0it [00:00, ?it/s]

Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 1.0
Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 1.0
Training: Best Hyperparameters: {'criterion': 'entropy', 'splitter': 'best'}
Training: Best Accuracy Score: 1.0


5it [00:00, 38.95it/s]


Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'random'}
Training: Best Accuracy Score: 1.0
Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 1.0


0it [00:00, ?it/s]

Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'random'}
Training: Best Accuracy Score: 0.875
Training: Best Hyperparameters: {'criterion': 'entropy', 'splitter': 'random'}
Training: Best Accuracy Score: 0.75
Training: Best Hyperparameters: {'criterion': 'entropy', 'splitter': 'best'}
Training: Best Accuracy Score: 0.75
Training: Best Hyperparameters: {'criterion': 'log_loss', 'splitter': 'best'}
Training: Best Accuracy Score: 0.75


5it [00:00, 43.56it/s]


Training: Best Hyperparameters: {'criterion': 'entropy', 'splitter': 'best'}
Training: Best Accuracy Score: 0.75


0it [00:00, ?it/s]

Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 1.0
Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 0.75
Training: Best Hyperparameters: {'criterion': 'entropy', 'splitter': 'random'}
Training: Best Accuracy Score: 1.0
Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 0.75


5it [00:00, 42.37it/s]


Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 0.75


0it [00:00, ?it/s]

Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 1.0
Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 1.0
Training: Best Hyperparameters: {'criterion': 'entropy', 'splitter': 'best'}
Training: Best Accuracy Score: 1.0
Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'random'}
Training: Best Accuracy Score: 1.0


5it [00:00, 44.65it/s]


Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 1.0


0it [00:00, ?it/s]

Training: Best Hyperparameters: {'criterion': 'entropy', 'splitter': 'random'}
Training: Best Accuracy Score: 0.625
Training: Best Hyperparameters: {'criterion': 'log_loss', 'splitter': 'best'}
Training: Best Accuracy Score: 1.0
Training: Best Hyperparameters: {'criterion': 'log_loss', 'splitter': 'best'}
Training: Best Accuracy Score: 0.75


4it [00:00, 34.33it/s]

Training: Best Hyperparameters: {'criterion': 'gini', 'splitter': 'best'}
Training: Best Accuracy Score: 0.875


5it [00:00, 35.64it/s]

Training: Best Hyperparameters: {'criterion': 'log_loss', 'splitter': 'random'}
Training: Best Accuracy Score: 0.875
Average Test Accuracy:  0.77





In [20]:
# Decision tree scored against the predefined labels.
studentspecificCV(df, model_type='decision tree', target='predefinedlabel')

5it [00:00, 36.57it/s]
5it [00:00, 44.99it/s]
5it [00:00, 43.28it/s]
5it [00:00, 45.09it/s]
5it [00:00, 44.08it/s]
5it [00:00, 45.37it/s]
5it [00:00, 46.22it/s]
5it [00:00, 45.55it/s]
5it [00:00, 44.06it/s]
5it [00:00, 44.12it/s]

Average Test Accuracy:  0.47





# Naive Bayes

In [21]:
# Gaussian naive Bayes scored against the user-defined labels.
studentspecificCV(df, model_type='naivebayes', target='user-definedlabel')

5it [00:00, 83.08it/s]
5it [00:00, 125.39it/s]
5it [00:00, 114.69it/s]
5it [00:00, 153.32it/s]
5it [00:00, 115.49it/s]
5it [00:00, 150.15it/s]
5it [00:00, 141.41it/s]
5it [00:00, 127.23it/s]
5it [00:00, 124.20it/s]
5it [00:00, 103.77it/s]

Average Test Accuracy:  0.66





In [22]:
# Gaussian naive Bayes scored against the predefined labels.
studentspecificCV(df, model_type='naivebayes', target='predefinedlabel')

5it [00:00, 124.88it/s]
5it [00:00, 98.43it/s]
5it [00:00, 131.17it/s]
5it [00:00, 126.63it/s]
5it [00:00, 106.28it/s]
5it [00:00, 154.86it/s]
5it [00:00, 113.45it/s]
5it [00:00, 130.81it/s]
5it [00:00, 113.28it/s]
5it [00:00, 142.18it/s]

Average Test Accuracy:  0.46



