# Training/Testing of Generalized Model

In [24]:
import numpy as np
import pandas as pd
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.linear_model import BayesianRidge
from sklearn.feature_selection import SelectFromModel
from sklearn.kernel_approximation import RBFSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning

simplefilter("ignore", category=ConvergenceWarning)

In [26]:
# Location of the pre-computed mean-feature table.
directory = "cleaned_data/"
filename = "meanfeatures.csv"

# The first CSV column is used as the row index.
df = pd.read_csv(os.path.join(directory, filename), index_col=0)

In [27]:
# List the column names to see which identifier, label and feature columns were loaded.
df.columns

Index(['SubjectID', 'VideoID', 'predefinedlabel', 'user-definedlabel',
       'Attention', 'Mediation', 'Raw', 'Delta', 'Theta', 'Alpha1', 'Alpha2',
       'Beta1', 'Beta2', 'Gamma1', 'Gamma2'],
      dtype='object')

In [28]:
# Z-score every feature column: subtract the column mean and divide by the
# column standard deviation (pandas' default sample estimate, ddof=1).
# NOTE(review): the statistics are computed over all rows, i.e. including the
# subjects that are later held out for testing.
feature_columns = [
    "Beta1",
    "Alpha1",
    "Gamma2",
    "Theta",
    "Attention",
    "Mediation",
    "Raw",
    "Delta",
    "Beta2",
    "Gamma1",
    "Alpha2",
]
for feature in feature_columns:
    raw_values = df[feature]
    df[feature] = (raw_values - raw_values.mean()) / raw_values.std()

In [29]:
def pick_model(model_type, C=1, verb=False):
    """Return an unfitted estimator and its hyper-parameter grid.

    Parameters
    ----------
    model_type : str
        One of ``"logreg"``, ``"rvm"``, ``"knn"``, ``"svm"``,
        ``"random forest"``, ``"decision tree"`` or ``"naivebayes"``.
    C : float, default 1
        Initial ``C`` of the ``LinearSVC``; only used for ``"svm"`` (the
        returned grid also contains ``C``, so a grid search overrides it).
    verb : bool, default False
        Passed as ``verbose`` to ``LinearSVC``; ignored by every other model.

    Returns
    -------
    tuple
        ``(estimator, param_grid)`` where ``param_grid`` is a dict in the
        format accepted by ``GridSearchCV`` (empty for naive Bayes).

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names. Previously any
        unknown (e.g. misspelled) name silently produced a ``GaussianNB``.
    """
    if model_type == "logreg":
        clf = LogisticRegression()
        # liblinear is pinned because the grid mixes l1 and l2 penalties.
        param_grid = {
            "C": [0.1, 1, 10, 100],
            "penalty": ["l1", "l2"],
            "solver": ["liblinear"],
        }
    elif model_type == "rvm":
        # NOTE(review): BayesianRidge is a regression model, so this branch is
        # not a drop-in classifier for accuracy-based model selection.
        clf = BayesianRidge(compute_score=True)
        param_grid = {
            "alpha_1": [1e-6, 1e-5, 1e-4, 1e-3],
            "alpha_2": [1e-6, 1e-5, 1e-4, 1e-3],
            "lambda_1": [1e-6, 1e-5, 1e-4, 1e-3],
            "lambda_2": [1e-6, 1e-5, 1e-4, 1e-3],
        }
    elif model_type == "knn":
        clf = KNeighborsClassifier(n_neighbors=5)
        param_grid = {"n_neighbors": [1, 5, 10, 15, 20, 25, 30, 35, 40]}
    elif model_type == "svm":
        clf = LinearSVC(C=C, verbose=verb, max_iter=10000)
        # 18 log-spaced values from 1e-11 to 1e6.
        param_grid = {
            "C": np.logspace(-11, 6, 18),
            "penalty": ["l2"],
            "loss": ["hinge", "squared_hinge"],
        }
    elif model_type == "random forest":
        clf = RandomForestClassifier()
        param_grid = {
            "n_estimators": [10, 50, 100, 200, 500, 1000],
            "max_depth": [None, 5, 10, 20, 50, 100],
        }
    elif model_type == "decision tree":
        clf = DecisionTreeClassifier()
        param_grid = {
            "criterion": ["gini", "entropy", "log_loss"],
            "splitter": ["best", "random"],
        }
    elif model_type == "naivebayes":
        clf = GaussianNB()
        param_grid = {}
    else:
        # Fail loudly: a typo must not be evaluated as a different model.
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of 'logreg', 'rvm', "
            "'knn', 'svm', 'random forest', 'decision tree', 'naivebayes'."
        )
    return clf, param_grid

In [30]:
def studentIndependentCV(df, verb=False, model_type="svm", target="user-definedlabel"):
    """Leave-one-subject-out evaluation of a model on ``df``.

    For every distinct value in ``df["SubjectID"]`` the rows of that subject
    are held out as the test set. Hyper-parameters are tuned with a 2-fold
    accuracy-scored grid search on the rows of all remaining subjects, and the
    tuned model is scored on the held-out subject. The mean of the per-subject
    accuracies is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``SubjectID``, ``VideoID``, ``user-definedlabel`` and
        ``predefinedlabel``; every other column is used as a feature.
    verb : bool, default False
        Forwarded to ``pick_model`` (which only uses it for the SVM).
    model_type : str, default "svm"
        Model name understood by ``pick_model``.
    target : str, default "user-definedlabel"
        Name of the label column to predict.

    Raises
    ------
    ValueError
        If ``df`` contains fewer than two distinct subjects, since no
        train/test split across subjects is possible then.
    """
    # Identifier and label columns never enter the feature matrix.
    non_feature_cols = ["SubjectID", "VideoID", "user-definedlabel", "predefinedlabel"]

    # Derive the folds from the data instead of assuming subjects 0..9.
    subjects = df["SubjectID"].unique()
    if len(subjects) < 2:
        raise ValueError(
            "studentIndependentCV needs at least two distinct SubjectID values, "
            f"got {len(subjects)}."
        )
    subjects = np.sort(subjects)

    accs = np.zeros(len(subjects))
    for fold, subject in enumerate(subjects):
        is_held_out = df["SubjectID"] == subject
        test_rows = df.loc[is_held_out]
        train_rows = df.loc[~is_held_out]

        X_test = test_rows.drop(columns=non_feature_cols)
        y_test = test_rows[target].values
        X_train = train_rows.drop(columns=non_feature_cols)
        y_train = train_rows[target].values

        model, param_grid = pick_model(model_type, verb=verb)
        grid_search = GridSearchCV(model, param_grid, cv=2, scoring="accuracy")
        grid_search.fit(X_train, y_train)

        # With the default refit=True, best_estimator_ is already re-trained on
        # the full training set, so no additional fit is needed before predicting.
        y_pred = grid_search.best_estimator_.predict(X_test)
        accs[fold] = accuracy_score(y_test, y_pred)

    print("Average Test Accuracy: ", np.mean(accs))

# Logistic Regression

In [31]:
# Leave-one-subject-out accuracy: logistic regression on the `user-definedlabel` target.
studentIndependentCV(df, target="user-definedlabel", model_type="logreg")

Average Test Accuracy:  0.6399999999999999


In [32]:
# Leave-one-subject-out accuracy: logistic regression on the `predefinedlabel` target.
studentIndependentCV(df, target="predefinedlabel", model_type="logreg")

Average Test Accuracy:  0.43000000000000005


# RVM

In [33]:
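# NOTE(review): kept disabled. "rvm" maps to BayesianRidge, which is a regressor and is
# unlikely to work with the accuracy-scored grid search; confirm before re-enabling.
# studentIndependentCV(df,verb=False, model_type = 'rvm', target = 'user-definedlabel')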

In [34]:
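# NOTE(review): kept disabled. "rvm" maps to BayesianRidge, which is a regressor and is
# unlikely to work with the accuracy-scored grid search; confirm before re-enabling.
# studentIndependentCV(df,verb=False, model_type = 'rvm', target = 'predefinedlabel')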

# SVM

In [35]:
# Leave-one-subject-out accuracy: linear SVM on the `user-definedlabel` target.
studentIndependentCV(df, target="user-definedlabel", model_type="svm")

Average Test Accuracy:  0.63


In [36]:
# Leave-one-subject-out accuracy: linear SVM on the `predefinedlabel` target.
studentIndependentCV(df, target="predefinedlabel", model_type="svm")

Average Test Accuracy:  0.47000000000000003


# KNN

In [37]:
# Leave-one-subject-out accuracy: k-nearest neighbours on the `user-definedlabel` target.
studentIndependentCV(df, target="user-definedlabel", model_type="knn")

Average Test Accuracy:  0.7


In [38]:
# Leave-one-subject-out accuracy: k-nearest neighbours on the `predefinedlabel` target.
studentIndependentCV(df, target="predefinedlabel", model_type="knn")

Average Test Accuracy:  0.5200000000000001


# Random Forest

In [39]:
# Leave-one-subject-out accuracy: random forest on the `user-definedlabel` target.
studentIndependentCV(df, target="user-definedlabel", model_type="random forest")

Bad pipe message: %s [b'\xd1\xbb5\xae\xcd>O\x1c\x9a\xa9R\x0bj\xf4\x94\x07QH \x97N\xa3o8"t\xdc\x9b']
Bad pipe message: %s [b"<\xdd\xf2\x06\xaa\xdcL\xf1\x9e\xdb\x06\xdc\xf9\x8b8;R\x80\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x000\x00.\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x03\x03\x02\x03\x03\x01\x02\x01\x03\x02", b'\x04']
Bad pipe message: %s [b'46\xa1\x1as\xb

Average Test Accuracy:  0.74


In [40]:
# Leave-one-subject-out accuracy: random forest on the `predefinedlabel` target.
studentIndependentCV(df, target="predefinedlabel", model_type="random forest")

Average Test Accuracy:  0.5599999999999999


# Decision Tree

In [41]:
# Leave-one-subject-out accuracy: decision tree on the `user-definedlabel` target.
studentIndependentCV(df, target="user-definedlabel", model_type="decision tree")

Average Test Accuracy:  0.6500000000000001


In [42]:
# Leave-one-subject-out accuracy: decision tree on the `predefinedlabel` target.
studentIndependentCV(df, target="predefinedlabel", model_type="decision tree")

Average Test Accuracy:  0.6199999999999999


# Naive Bayes

In [43]:
# Leave-one-subject-out accuracy: Gaussian naive Bayes on the `user-definedlabel` target.
studentIndependentCV(df, target="user-definedlabel", model_type="naivebayes")

Average Test Accuracy:  0.65


In [44]:
# Leave-one-subject-out accuracy: Gaussian naive Bayes on the `predefinedlabel` target.
studentIndependentCV(df, target="predefinedlabel", model_type="naivebayes")

Average Test Accuracy:  0.5
