# Leave-one-subject-out cross-validation testing

In [1]:
# Import the toolboxes
import numpy as np 
import pandas as pd
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.linear_model import BayesianRidge
from sklearn.feature_selection import SelectFromModel
from sklearn.kernel_approximation import RBFSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Install an "ignore" filter for scikit-learn's ConvergenceWarning so that
# warnings of that category are not shown while the models below are fitted.
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [3]:
# Location of the feature table. NOTE: this is a machine-specific absolute
# path; change `directory` to wherever the CSV lives on your system.
directory = '/Users/amandabreton/Documents/Duke 2022-2023/ECE 590/Final Project/'
filename = 'meanfeatures.csv'

# Join with os.path.join rather than `+` so the path is still correct if
# `directory` is later edited to omit its trailing separator.
# The first CSV column is used as the DataFrame index.
df = pd.read_csv(os.path.join(directory, filename), index_col=0)

In [15]:
def pick_model(model_type, C=1, verb=False):
    """Build an unfitted estimator and the hyper-parameter grid to search.

    Parameters
    ----------
    model_type : str
        One of 'logreg', 'rvm', 'knn', 'svm', 'random forest',
        'decision tree' or 'naivebayes'.
    C : float, default 1
        Initial regularisation strength. Only used by the 'svm' model.
    verb : bool, default False
        Verbosity flag. Only forwarded to the 'svm' model.

    Returns
    -------
    tuple
        ``(clf, param_grid)``: the estimator instance and a dict mapping
        parameter names to the candidate values for a grid search. The grid
        is empty for 'naivebayes'.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names. Previously any
        unknown name silently fell back to Gaussian Naive Bayes, so a typo
        would report Naive Bayes results under another model's name.
    """
    if model_type == 'logreg':
        clf = LogisticRegression()
        # liblinear is pinned in the grid because both "l1" and "l2"
        # penalties are searched.
        param_grid = {"C": [0.1, 1, 10, 100], "penalty": ["l1", "l2"], "solver": ["liblinear"]}
    elif model_type == 'rvm':
        # NOTE(review): BayesianRidge is a regression model, so this branch
        # likely does not work with an accuracy-scored grid search on class
        # labels -- confirm before enabling the 'rvm' experiments.
        clf = BayesianRidge(compute_score=True)
        param_grid = {"alpha_1": [1e-6, 1e-5, 1e-4, 1e-3], "alpha_2": [1e-6, 1e-5, 1e-4, 1e-3],
                      "lambda_1": [1e-6, 1e-5, 1e-4, 1e-3], "lambda_2": [1e-6, 1e-5, 1e-4, 1e-3]}
    elif model_type == 'knn':
        clf = KNeighborsClassifier(n_neighbors=5)
        Ns = [1, 5, 10, 15, 20, 25, 30, 35, 40]
        param_grid = {"n_neighbors": Ns}
    elif model_type == 'svm':
        clf = LinearSVC(C=C, verbose=verb, max_iter=10000)
        # 18 log-spaced values from 1e-11 to 1e6.
        Cs = np.logspace(-11, 6, 18)
        param_grid = {"C": Cs, "penalty": ["l2"], "loss": ['hinge', 'squared_hinge']}
    elif model_type == 'random forest':
        clf = RandomForestClassifier()
        param_grid = {"n_estimators": [10, 50, 100, 200, 500, 1000], "max_depth":
                      [None, 5, 10, 20, 50, 100]}
    elif model_type == 'decision tree':
        clf = DecisionTreeClassifier()
        param_grid = {'criterion': ['gini', 'entropy', 'log_loss'], 'splitter': ['best', 'random']}
    elif model_type == 'naivebayes':
        clf = GaussianNB()
        # No hyper-parameters are searched for Naive Bayes.
        param_grid = {}
    else:
        raise ValueError(
            "Unknown model_type {!r}; expected one of 'logreg', 'rvm', 'knn', "
            "'svm', 'random forest', 'decision tree', 'naivebayes'".format(model_type)
        )
    return clf, param_grid

In [5]:
def studentIndependentCV(df, verb=False, model_type='svm', target='user-definedlabel'):
    """Run leave-one-subject-out cross-validation and print the mean accuracy.

    For every distinct ``SubjectID`` in ``df`` the rows of that subject form
    the test set and the rows of all other subjects form the training set.
    On each training set the hyper-parameters returned by ``pick_model`` are
    tuned with a 2-fold, accuracy-scored ``GridSearchCV``; the refitted best
    estimator is then scored on the held-out subject.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'SubjectID', 'VideoID', 'user-definedlabel'
        and 'predefinedlabel'. All remaining columns are used as features.
    verb : bool, default False
        Verbosity flag forwarded to ``pick_model``.
    model_type : str, default 'svm'
        Model name understood by ``pick_model``.
    target : str, default 'user-definedlabel'
        Name of the label column to predict.

    Returns
    -------
    None
        The average held-out accuracy is printed, not returned.

    Raises
    ------
    ValueError
        If ``df`` contains no subject to hold out.
    """
    # Identifier and label columns are never used as features.
    non_feature_cols = ['SubjectID', 'VideoID', 'user-definedlabel', 'predefinedlabel']

    # Derive the folds from the data instead of assuming subjects 0..9.
    subject_ids = df['SubjectID'].dropna().unique()
    if len(subject_ids) == 0:
        raise ValueError("df contains no SubjectID values to hold out")

    accs = np.zeros(len(subject_ids))
    for fold, subject in enumerate(subject_ids):
        held_out = df['SubjectID'] == subject

        test_rows = df.loc[held_out]
        y_test = test_rows[target].values
        X_test = test_rows.drop(non_feature_cols, axis=1)

        train_rows = df.loc[~held_out]
        y_train = train_rows[target].values
        X_train = train_rows.drop(non_feature_cols, axis=1)

        # Fresh estimator and search grid for this fold.
        model, param_grid = pick_model(model_type, verb=verb)
        grid_search = GridSearchCV(model, param_grid, cv=2, scoring="accuracy")
        grid_search.fit(X_train, y_train)

        # GridSearchCV refits the best estimator on the whole training set by
        # default (refit=True), so no additional fit is needed here.
        best_est = grid_search.best_estimator_
        y_pred = best_est.predict(X_test)
        accs[fold] = accuracy_score(y_test, y_pred)

    print('Average Test Accuracy: ', np.mean(accs))

# Logistic Regression

In [6]:
# Logistic regression evaluated against the 'user-definedlabel' column.
studentIndependentCV(
    df,
    verb=False,
    model_type="logreg",
    target="user-definedlabel",
)

Average Test Accuracy:  0.6699999999999999


In [7]:
# Logistic regression evaluated against the 'predefinedlabel' column.
studentIndependentCV(
    df,
    verb=False,
    model_type="logreg",
    target="predefinedlabel",
)

Average Test Accuracy:  0.5


# RVM

In [9]:
#studentIndependentCV(df,verb=False, model_type = 'rvm', target = 'user-definedlabel')

In [10]:
#studentIndependentCV(df,verb=False, model_type = 'rvm', target = 'predefinedlabel')

# SVM

In [11]:
# Linear SVM evaluated against the 'user-definedlabel' column.
studentIndependentCV(
    df,
    verb=False,
    model_type="svm",
    target="user-definedlabel",
)

Average Test Accuracy:  0.51


In [12]:
# Linear SVM evaluated against the 'predefinedlabel' column.
studentIndependentCV(
    df,
    verb=False,
    model_type="svm",
    target="predefinedlabel",
)

Average Test Accuracy:  0.47999999999999987


# KNN

In [16]:
# k-nearest neighbours evaluated against the 'user-definedlabel' column.
studentIndependentCV(
    df,
    verb=False,
    model_type="knn",
    target="user-definedlabel",
)

Average Test Accuracy:  0.67


In [17]:
# k-nearest neighbours evaluated against the 'predefinedlabel' column.
studentIndependentCV(
    df,
    verb=False,
    model_type="knn",
    target="predefinedlabel",
)

Average Test Accuracy:  0.53


# Random Forest

In [20]:
# Random forest evaluated against the 'user-definedlabel' column.
studentIndependentCV(
    df,
    verb=False,
    model_type="random forest",
    target="user-definedlabel",
)

Average Test Accuracy:  0.73


In [21]:
# Random forest evaluated against the 'predefinedlabel' column.
studentIndependentCV(
    df,
    verb=False,
    model_type="random forest",
    target="predefinedlabel",
)

Average Test Accuracy:  0.61


# Decision Tree

In [22]:
# Decision tree evaluated against the 'user-definedlabel' column.
studentIndependentCV(
    df,
    verb=False,
    model_type="decision tree",
    target="user-definedlabel",
)

Average Test Accuracy:  0.72


In [23]:
# Decision tree evaluated against the 'predefinedlabel' column.
studentIndependentCV(
    df,
    verb=False,
    model_type="decision tree",
    target="predefinedlabel",
)

Average Test Accuracy:  0.59


# Naive Bayes

In [18]:
# Gaussian Naive Bayes evaluated against the 'user-definedlabel' column.
studentIndependentCV(
    df,
    verb=False,
    model_type="naivebayes",
    target="user-definedlabel",
)

Average Test Accuracy:  0.65


In [19]:
# Gaussian Naive Bayes evaluated against the 'predefinedlabel' column.
studentIndependentCV(
    df,
    verb=False,
    model_type="naivebayes",
    target="predefinedlabel",
)

Average Test Accuracy:  0.45
