# Training/Testing within a single subject (student) for each fold, using selected features

In [2]:
import numpy as np
import pandas as pd
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.linear_model import BayesianRidge
from sklearn.feature_selection import SelectFromModel
from sklearn.kernel_approximation import RBFSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning

simplefilter("ignore", category=ConvergenceWarning)

In [4]:
# Input CSV holding the mean features; its first column becomes the row index.
directory = "cleaned_data/"
filename = "meanfeatures.csv"

df = pd.read_csv(os.path.join(directory, filename), index_col=0)

In [5]:
# Peek at the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,SubjectID,VideoID,predefinedlabel,user-definedlabel,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2
0,0.0,0.0,0.0,0.0,55.256944,53.826389,46.986111,544315.097222,124965.590278,36693.701389,25875.298611,20108.791667,40268.763889,40729.284722,16817.0625
1,0.0,1.0,0.0,1.0,43.621429,48.621429,28.8,739737.292857,161064.228571,34918.028571,25078.935714,22157.307143,37410.728571,36758.7,14519.407143
2,0.0,2.0,0.0,1.0,43.978873,47.316901,13.15493,694078.084507,149816.873239,30493.873239,21667.591549,21888.338028,36446.43662,33908.873239,14545.84507
3,0.0,3.0,0.0,0.0,51.057377,51.844262,34.713115,600823.688525,162653.360656,33367.278689,26281.5,17224.278689,43706.52459,41438.213115,16558.631148
4,0.0,4.0,0.0,0.0,55.224138,47.474138,30.008621,546628.017241,126893.948276,23113.844828,17017.051724,15955.87931,36427.836207,36024.818966,14752.655172


In [6]:
# Count the rows available for each subject.
df["SubjectID"].value_counts()

SubjectID
0.0    10
1.0    10
2.0    10
3.0    10
4.0    10
5.0    10
6.0    10
7.0    10
8.0    10
9.0    10
Name: count, dtype: int64

In [7]:
# Keep the identifier and label columns plus the four selected features
# (Beta1, Alpha1, Gamma2, Theta); every other column is dropped.
df = df.loc[
    :,
    [
        "SubjectID",
        "VideoID",
        "user-definedlabel",
        "predefinedlabel",
        "Beta1",
        "Alpha1",
        "Gamma2",
        "Theta",
    ],
]

In [8]:
def pick_model(model_type, C=1, verb=False):
    """Return an unfitted estimator and its hyperparameter grid.

    Parameters
    ----------
    model_type : str
        One of "logreg", "rvm", "knn", "svm", "random forest",
        "decision tree" or "naivebayes". Any other value still falls back
        to Gaussian naive Bayes (kept for backward compatibility) but now
        emits a ``UserWarning`` so that typos do not go unnoticed.
    C : float, default=1
        Initial regularisation strength. Only used by the "svm" estimator;
        a grid search over ``param_grid["C"]`` will override it.
    verb : bool, default=False
        Verbosity flag. Only forwarded to the "svm" estimator.

    Returns
    -------
    clf : estimator
        A freshly constructed, unfitted scikit-learn estimator.
    param_grid : dict
        Parameter grid suitable for ``GridSearchCV`` (empty for naive Bayes).
    """
    if model_type == "logreg":
        clf = LogisticRegression()
        param_grid = {
            "C": [0.1, 1, 10, 100],
            "penalty": ["l1", "l2"],
            # liblinear is listed explicitly because the grid includes "l1".
            "solver": ["liblinear"],
        }
    elif model_type == "rvm":
        # NOTE(review): BayesianRidge is a regression estimator; confirm it
        # can be combined with accuracy-scored grid search before using it.
        clf = BayesianRidge(compute_score=True)
        prior_values = [1e-6, 1e-5, 1e-4, 1e-3]
        param_grid = {
            "alpha_1": list(prior_values),
            "alpha_2": list(prior_values),
            "lambda_1": list(prior_values),
            "lambda_2": list(prior_values),
        }
    elif model_type == "knn":
        clf = KNeighborsClassifier(n_neighbors=5)
        param_grid = {"n_neighbors": [1, 2, 3, 4]}
    elif model_type == "svm":
        clf = LinearSVC(C=C, verbose=verb, max_iter=10000)
        param_grid = {
            "C": np.logspace(-11, 6, 18),
            "penalty": ["l2"],
            "loss": ["hinge", "squared_hinge"],
        }
    elif model_type == "random forest":
        clf = RandomForestClassifier()
        param_grid = {
            "n_estimators": [10, 50, 100, 200, 500, 1000],
            "max_depth": [None, 5, 10, 20, 50, 100],
        }
    elif model_type == "decision tree":
        clf = DecisionTreeClassifier()
        param_grid = {
            "criterion": ["gini", "entropy", "log_loss"],
            "splitter": ["best", "random"],
        }
    else:
        if model_type != "naivebayes":
            # Keep the historical fallback, but make it visible: a misspelled
            # name would otherwise silently report naive Bayes results.
            import warnings

            warnings.warn(
                f"Unknown model_type {model_type!r}; falling back to GaussianNB.",
                UserWarning,
                stacklevel=2,
            )
        clf = GaussianNB()
        param_grid = {}
    return clf, param_grid

In [9]:
def studentspecificCV(
    df, verb=False, model_type="svm", target="user-definedlabel", random_state=None
):
    """Run per-subject 5-fold cross-validation and print the mean test accuracy.

    For every distinct ``SubjectID`` in ``df`` the subject's rows are split
    with a shuffled 5-fold ``KFold`` (4/5 of the rows train, 1/5 test). On
    each training split a ``GridSearchCV`` (2 inner folds, accuracy scoring)
    tunes the estimator returned by ``pick_model``; the best estimator, which
    ``GridSearchCV`` refits on the whole training split, is then scored on
    the held-out rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "SubjectID", "VideoID", "user-definedlabel" and
        "predefinedlabel"; all remaining columns are used as features.
    verb : bool, default=False
        Forwarded to ``pick_model`` as its verbosity flag.
    model_type : str, default="svm"
        Model name understood by ``pick_model``.
    target : str, default="user-definedlabel"
        Label column to predict.
    random_state : int or None, default=None
        Seed for the outer ``KFold`` shuffle only. ``None`` keeps the
        original unseeded behaviour.

    Returns
    -------
    None
        The average accuracy over all subject/fold pairs is printed.

    Raises
    ------
    ValueError
        If no fold could be evaluated (e.g. ``df`` has no rows), or, from
        ``KFold``, if a subject has fewer than 5 rows.
    """
    non_feature_cols = ["SubjectID", "VideoID", "user-definedlabel", "predefinedlabel"]
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=random_state)

    # Collect scores in a list so the mean only covers folds that actually ran,
    # whatever the number of subjects in df.
    accs = []
    for _, curr_student in df.groupby("SubjectID"):
        X = curr_student.drop(columns=non_feature_cols).values
        y = curr_student[target].values
        for train_index, test_index in tqdm(outer_cv.split(X)):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            model, param_grid = pick_model(model_type, verb=verb)
            grid_search = GridSearchCV(model, param_grid, cv=2, scoring="accuracy")
            # With the default refit=True the best estimator is already
            # retrained on all of X_train, so no extra fit is needed.
            grid_search.fit(X_train, y_train)
            y_pred = grid_search.predict(X_test)
            accs.append(accuracy_score(y_test, y_pred))

    if not accs:
        raise ValueError("No folds were evaluated: df has no rows with a SubjectID.")
    print("Average Test Accuracy: ", np.mean(accs))

# Logistic Regression

In [10]:
# Logistic regression, per-subject CV, predicting the predefined labels.
studentspecificCV(df, verb=False, model_type="logreg", target="predefinedlabel")

5it [00:00, 25.10it/s]
5it [00:00, 29.13it/s]
5it [00:00, 24.67it/s]
5it [00:00, 27.38it/s]
5it [00:00, 28.14it/s]
5it [00:00, 30.96it/s]
5it [00:00, 24.96it/s]
5it [00:00, 28.03it/s]
5it [00:00, 24.65it/s]
5it [00:00, 27.92it/s]

Average Test Accuracy:  0.46





In [11]:
# Logistic regression, per-subject CV, predicting the user-defined labels.
studentspecificCV(df, verb=False, model_type="logreg", target="user-definedlabel")

5it [00:00, 29.98it/s]
5it [00:00, 29.35it/s]
5it [00:00, 27.10it/s]
5it [00:00, 29.88it/s]
5it [00:00, 28.32it/s]
5it [00:00, 25.56it/s]
5it [00:00, 28.34it/s]
5it [00:00, 32.42it/s]
5it [00:00, 27.06it/s]
5it [00:00, 18.68it/s]

Average Test Accuracy:  0.64





# RVM 

In [12]:
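# RVM run disabled. NOTE(review): pick_model maps "rvm" to BayesianRidge, a regressor;
# confirm it works with accuracy scoring before re-enabling.
# studentspecificCV(df,verb=False, model_type = 'rvm', target = 'user-definedlabel')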

In [13]:
# studentspecificCV(df,verb=False, model_type = 'rvm', target = 'predefinedlabel')

# SVM 

In [14]:
# Linear SVM, per-subject CV, predicting the predefined labels.
studentspecificCV(df, verb=False, model_type="svm", target="predefinedlabel")

5it [00:00,  5.36it/s]
5it [00:00,  5.88it/s]
0it [00:00, ?it/s]


KeyboardInterrupt: 

In [None]:
# Linear SVM, per-subject CV, predicting the user-defined labels.
studentspecificCV(df, verb=False, model_type="svm", target="user-definedlabel")

5it [00:00,  5.11it/s]
5it [00:00,  5.35it/s]
5it [00:00,  6.09it/s]
5it [00:00,  5.20it/s]
5it [00:00,  5.20it/s]
5it [00:01,  4.57it/s]
5it [00:00,  5.75it/s]
5it [00:00,  5.73it/s]
5it [00:00,  6.12it/s]
5it [00:00,  6.04it/s]

Average Test Accuracy:  0.55





# KNN

In [None]:
# k-nearest neighbours, per-subject CV, predicting the user-defined labels.
studentspecificCV(df, verb=False, model_type="knn", target="user-definedlabel")

5it [00:00, 32.68it/s]
5it [00:00, 40.70it/s]
5it [00:00, 20.27it/s]
5it [00:00, 38.24it/s]
5it [00:00, 40.67it/s]
5it [00:00, 42.65it/s]
5it [00:00, 36.17it/s]
5it [00:00, 42.84it/s]
5it [00:00, 42.72it/s]
5it [00:00, 40.99it/s]

Average Test Accuracy:  0.8





In [None]:
# k-nearest neighbours, per-subject CV, predicting the predefined labels.
studentspecificCV(df, verb=False, model_type="knn", target="predefinedlabel")

5it [00:00, 38.51it/s]
5it [00:00, 42.56it/s]
5it [00:00, 42.09it/s]
5it [00:00, 35.79it/s]
5it [00:00, 43.05it/s]
5it [00:00, 42.73it/s]
5it [00:00, 41.42it/s]
5it [00:00, 42.45it/s]
5it [00:00, 40.07it/s]
5it [00:00, 27.61it/s]

Average Test Accuracy:  0.51





# Random Forest

In [16]:
# Random forest, per-subject CV, predicting the user-defined labels.
studentspecificCV(
    df, verb=False, model_type="random forest", target="user-definedlabel"
)

5it [02:35, 31.12s/it]
5it [02:30, 30.01s/it]
5it [02:29, 29.97s/it]
5it [02:28, 29.80s/it]
5it [02:28, 29.61s/it]
5it [02:28, 29.62s/it]
5it [02:27, 29.58s/it]
5it [02:28, 29.71s/it]
5it [02:29, 29.82s/it]
5it [02:28, 29.78s/it]

Average Test Accuracy:  0.81





In [17]:
# Random forest, per-subject CV, predicting the predefined labels.
studentspecificCV(df, verb=False, model_type="random forest", target="predefinedlabel")

5it [02:28, 29.64s/it]
1it [00:31, 31.30s/it]


KeyboardInterrupt: 

# Decision Tree

In [18]:
# Decision tree, per-subject CV, predicting the user-defined labels.
studentspecificCV(
    df, verb=False, model_type="decision tree", target="user-definedlabel"
)

5it [00:00, 42.86it/s]
5it [00:00, 45.86it/s]
5it [00:00, 46.95it/s]
5it [00:00, 48.50it/s]
5it [00:00, 50.73it/s]
5it [00:00, 47.77it/s]
5it [00:00, 47.69it/s]
5it [00:00, 40.18it/s]
5it [00:00, 45.03it/s]
5it [00:00, 43.86it/s]

Average Test Accuracy:  0.78





In [19]:
# Decision tree, per-subject CV, predicting the predefined labels.
studentspecificCV(df, verb=False, model_type="decision tree", target="predefinedlabel")

5it [00:00, 41.52it/s]
5it [00:00, 48.45it/s]
5it [00:00, 42.26it/s]
5it [00:00, 45.38it/s]
5it [00:00, 43.68it/s]
5it [00:00, 43.95it/s]
5it [00:00, 44.43it/s]
5it [00:00, 44.06it/s]
5it [00:00, 33.09it/s]
5it [00:00, 45.03it/s]

Average Test Accuracy:  0.58





# Naive Bayes

In [20]:
# Gaussian naive Bayes, per-subject CV, predicting the user-defined labels.
studentspecificCV(df, verb=False, model_type="naivebayes", target="user-definedlabel")

5it [00:00, 132.24it/s]
5it [00:00, 143.33it/s]
5it [00:00, 61.51it/s]
5it [00:00, 120.10it/s]
5it [00:00, 76.75it/s]
5it [00:00, 117.53it/s]
5it [00:00, 115.30it/s]
5it [00:00, 143.29it/s]
5it [00:00, 135.85it/s]
5it [00:00, 114.91it/s]

Average Test Accuracy:  0.72





In [21]:
# Gaussian naive Bayes, per-subject CV, predicting the predefined labels.
studentspecificCV(df, verb=False, model_type="naivebayes", target="predefinedlabel")

5it [00:00, 130.68it/s]
5it [00:00, 97.47it/s]
5it [00:00, 142.92it/s]
5it [00:00, 151.81it/s]
5it [00:00, 133.03it/s]
5it [00:00, 126.98it/s]
5it [00:00, 125.86it/s]
5it [00:00, 130.06it/s]
5it [00:00, 139.06it/s]
5it [00:00, 151.89it/s]

Average Test Accuracy:  0.55



