# Using KNN, SVM, Naive Bayes on the Means of the Channels as Features 
Like from the paper: http://www.cs.cmu.edu/~kkchang/paper/WangEtAl.2013.AIED.EEG-MOOC.pdf


In [1]:
# import the toolboxes 
import numpy as np 
import pandas as pd
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from tqdm import tqdm

In [2]:
directory = '/Users/amandabreton/Documents/Duke 2022-2023/ECE 590/Final Project/'
filename = 'meanfeatures.csv'

df = pd.read_csv(directory+filename, index_col=0)

In [3]:
df.head()

Unnamed: 0,SubjectID,VideoID,user-definedlabel,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2
0,0.0,0.0,0.0,55.256944,53.826389,46.986111,544315.097222,124965.590278,36693.701389,25875.298611,20108.791667,40268.763889,40729.284722,16817.0625
1,0.0,1.0,1.0,43.621429,48.621429,28.8,739737.292857,161064.228571,34918.028571,25078.935714,22157.307143,37410.728571,36758.7,14519.407143
2,0.0,2.0,1.0,43.978873,47.316901,13.15493,694078.084507,149816.873239,30493.873239,21667.591549,21888.338028,36446.43662,33908.873239,14545.84507
3,0.0,3.0,0.0,51.057377,51.844262,34.713115,600823.688525,162653.360656,33367.278689,26281.5,17224.278689,43706.52459,41438.213115,16558.631148
4,0.0,4.0,0.0,55.224138,47.474138,30.008621,546628.017241,126893.948276,23113.844828,17017.051724,15955.87931,36427.836207,36024.818966,14752.655172


In [4]:
df.shape

(100, 14)

In [5]:
X = df.drop(['SubjectID', 'VideoID', 'user-definedlabel'], axis=1)
X.head()

Unnamed: 0,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2
0,55.256944,53.826389,46.986111,544315.097222,124965.590278,36693.701389,25875.298611,20108.791667,40268.763889,40729.284722,16817.0625
1,43.621429,48.621429,28.8,739737.292857,161064.228571,34918.028571,25078.935714,22157.307143,37410.728571,36758.7,14519.407143
2,43.978873,47.316901,13.15493,694078.084507,149816.873239,30493.873239,21667.591549,21888.338028,36446.43662,33908.873239,14545.84507
3,51.057377,51.844262,34.713115,600823.688525,162653.360656,33367.278689,26281.5,17224.278689,43706.52459,41438.213115,16558.631148
4,55.224138,47.474138,30.008621,546628.017241,126893.948276,23113.844828,17017.051724,15955.87931,36427.836207,36024.818966,14752.655172


In [6]:
y = df[['user-definedlabel']]
y.head()

Unnamed: 0,user-definedlabel
0,0.0
1,1.0
2,1.0
3,0.0
4,0.0


In [7]:
y = y['user-definedlabel'].values
y.shape


(100,)

In [51]:
np.logspace(-11,6,18)

array([1.e-11, 1.e-10, 1.e-09, 1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04,
       1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04,
       1.e+05, 1.e+06])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [59]:
def pick_model(model_type, C=1,verb=False):
    if model_type == 'knn':
        clf = KNeighborsClassifier(n_neighbors=5)
        Ns = [1,5,10,15,20,25,30,35,40,45,50]
        param_grid = {"n_neighbors": Ns}
    elif model_type == 'svm':
        clf = LinearSVC(C=C,verbose=verb, max_iter=10000)
        Cs = np.logspace(-11,6,18)
        param_grid = {"C": Cs, "penalty": ["l2"], "loss": ['hinge', 'squared_hinge']}
    else: # model_type = naivebayes
        clf = GaussianNB()
        param_grid = {}
    return clf, param_grid

In [32]:
# need to do cross fold validation --> grid search for best parameters
#num_splits = KFold(n_splits=10,shuffle = True)
#for train_index, test_index in tqdm(num_splits.split(X)): # to actually split into training/testing
#    X_train, X_test = X[train_index], X[test_index]
#    y_train, y_test = y[train_index], y[test_index]
    

# questions for alex: 
* we should probably do some sort of optimization of parameters? --. yes use grid search
* i only skimmed the paper but it seems like they only used means of each channel as the features so i guess we can just stick w/ that for this week, and then work on using better features next week? 
* do you want to plan to work on the decision tree classifier on like thursday/friday? 
* thinking we should probably work on the paper and presentation Monday/Tuesday next week, finish up by

grid search better --> 2 levels 
grid search CV 
target = user defined labels
check if they standardized/normalized the data --> we should probably do it in case it adds to computation time for grid search cv 
reconnect tomorrow 

In [33]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [34]:
# Define the hyperparameters you want to tune and their respective values
Cs = np.logspace(-11,6,18)
param_grid = {"C": Cs, "penalty": ["l1", "l2"]}

# Create an instance of the  model
model, param_grid = pick_model(model_type = 'svm')

# Create an instance of the GridSearchCV object
grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy")

# Fit the GridSearchCV object to the data
grid_search.fit(X, y)

# Print the best hyperparameters and their accuracy score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

# Use the best hyperparameters to train and test the model
best_log_reg = grid_search.best_estimator_
best_log_reg.fit(X_train, y_train)
y_pred = best_log_reg.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('test accuracy:', acc)

svm
Best Hyperparameters: {'C': 1e-08, 'loss': 'hinge', 'penalty': 'l2'}
Best Accuracy Score: 0.5700000000000001
test accuracy: 0.55


In [35]:
# need to do 2 levels of cross validation?
# first level of cross validation being used to optimize accuracy
# and the second level being used to find the optimal parameters --> grid search through the training set

In [38]:
# first level: for optimizing accuracy 

num_splits = KFold(n_splits=5,shuffle = True)
for train_index, test_index in tqdm(num_splits.split(X)): # to actually split into training/testing
    #X.iloc[0].values
    X_train, X_test = X.iloc[train_index].values, X.iloc[test_index].values
    y_train, y_test = y[train_index], y[test_index]
    num_splits_2 = KFold(n_splits=5,shuffle = True) 
    # Create an instance of the  model
    model, param_grid = pick_model(model_type = 'svm')
    # Create an instance of the GridSearchCV object
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy")
    # Fit the GridSearchCV object to the data
    grid_search.fit(X_train, y_train)
    # Print the best hyperparameters and their accuracy score
    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best Accuracy Score:", grid_search.best_score_)     

1it [00:03,  3.12s/it]

Best Hyperparameters: {'C': 1e-08, 'loss': 'hinge', 'penalty': 'l2'}
Best Accuracy Score: 0.6125


2it [00:06,  3.15s/it]

Best Hyperparameters: {'C': 1e-08, 'loss': 'hinge', 'penalty': 'l2'}
Best Accuracy Score: 0.575


3it [00:09,  3.27s/it]

Best Hyperparameters: {'C': 1e-08, 'loss': 'squared_hinge', 'penalty': 'l2'}
Best Accuracy Score: 0.6


4it [00:13,  3.34s/it]

Best Hyperparameters: {'C': 1e-08, 'loss': 'hinge', 'penalty': 'l2'}
Best Accuracy Score: 0.575


5it [00:16,  3.28s/it]

Best Hyperparameters: {'C': 1e-07, 'loss': 'hinge', 'penalty': 'l2'}
Best Accuracy Score: 0.5875





In [40]:
# making a function for 2 level cross validation: 
# first level: for optimizing accuracy 
def twolevelCV(X,y,verb=False, model_type = 'svm'):
    num_splits = KFold(n_splits=5,shuffle = True)
    for train_index, test_index in tqdm(num_splits.split(X)): # to actually split into training/testing
        #X.iloc[0].values
        X_train, X_test = X.iloc[train_index].values, X.iloc[test_index].values
        y_train, y_test = y[train_index], y[test_index]
        num_splits_2 = KFold(n_splits=5,shuffle = True) 
        # Create an instance of the  model
        model, param_grid = pick_model(model_type)
        # Create an instance of the GridSearchCV object
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy")
        # Fit the GridSearchCV object to the data
        grid_search.fit(X_train, y_train)
        # Print the best hyperparameters and their accuracy score
        print("Training: Best Hyperparameters:", grid_search.best_params_)
        print("Training: Best Accuracy Score:", grid_search.best_score_)     

In [41]:
twolevelCV(X,y,model_type = 'svm')

1it [00:03,  3.40s/it]

Best Hyperparameters: {'C': 1e-08, 'loss': 'squared_hinge', 'penalty': 'l2'}
Best Accuracy Score: 0.5875


2it [00:06,  3.41s/it]

Best Hyperparameters: {'C': 1e-09, 'loss': 'squared_hinge', 'penalty': 'l2'}
Best Accuracy Score: 0.6375


3it [00:09,  3.28s/it]

Best Hyperparameters: {'C': 1e-07, 'loss': 'hinge', 'penalty': 'l2'}
Best Accuracy Score: 0.625


4it [00:13,  3.26s/it]

Best Hyperparameters: {'C': 1e-11, 'loss': 'squared_hinge', 'penalty': 'l2'}
Best Accuracy Score: 0.5625


5it [00:16,  3.22s/it]

Best Hyperparameters: {'C': 1e-09, 'loss': 'squared_hinge', 'penalty': 'l2'}
Best Accuracy Score: 0.575





In [60]:
twolevelCV(X,y,model_type = 'knn')

2it [00:00,  7.93it/s]

Best Hyperparameters: {'n_neighbors': 15}
Best Accuracy Score: 0.6625
Best Hyperparameters: {'n_neighbors': 10}
Best Accuracy Score: 0.7375


4it [00:00,  8.27it/s]

Best Hyperparameters: {'n_neighbors': 25}
Best Accuracy Score: 0.7125
Best Hyperparameters: {'n_neighbors': 15}
Best Accuracy Score: 0.7375


5it [00:00,  8.13it/s]

Best Hyperparameters: {'n_neighbors': 20}
Best Accuracy Score: 0.7





In [61]:
twolevelCV(X,y,model_type = 'bayes')

5it [00:00, 73.31it/s]

Best Hyperparameters: {}
Best Accuracy Score: 0.7125
Best Hyperparameters: {}
Best Accuracy Score: 0.725
Best Hyperparameters: {}
Best Accuracy Score: 0.625
Best Hyperparameters: {}
Best Accuracy Score: 0.6
Best Hyperparameters: {}
Best Accuracy Score: 0.6375





In [None]:
# next should probably move onto testing 