## Data Imports, Preprocessing

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import BayesianRidge
from sklearn.feature_selection import SelectFromModel
from sklearn.kernel_approximation import RBFSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

In [14]:
filename = 'meanfeatures.csv'

df = pd.read_csv(f'cleaned_data/{filename}', index_col=0)


df.head()

Unnamed: 0,SubjectID,VideoID,user-definedlabel,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2
0,0.0,0.0,0.0,55.256944,53.826389,46.986111,544315.097222,124965.590278,36693.701389,25875.298611,20108.791667,40268.763889,40729.284722,16817.0625
1,0.0,1.0,1.0,43.621429,48.621429,28.8,739737.292857,161064.228571,34918.028571,25078.935714,22157.307143,37410.728571,36758.7,14519.407143
2,0.0,2.0,1.0,43.978873,47.316901,13.15493,694078.084507,149816.873239,30493.873239,21667.591549,21888.338028,36446.43662,33908.873239,14545.84507
3,0.0,3.0,0.0,51.057377,51.844262,34.713115,600823.688525,162653.360656,33367.278689,26281.5,17224.278689,43706.52459,41438.213115,16558.631148
4,0.0,4.0,0.0,55.224138,47.474138,30.008621,546628.017241,126893.948276,23113.844828,17017.051724,15955.87931,36427.836207,36024.818966,14752.655172


## Baseline Logistic Regression

No feature selection, hyperparameter tuning, or cross validation

In [15]:
target = "user-definedlabel"

# set up x and y, train test split
X = df.drop(["SubjectID", "VideoID", "user-definedlabel"], axis=1)

# standardize X
X = (X - X.mean()) / X.std()

y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [16]:
log_reg = LogisticRegression()

log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

print(f"accuracy for baseline logistic regression: {accuracy_score(y_test, y_pred)}")

accuracy for baseline logistic regression: 0.65


In [17]:
## Splitting data into separate dataframes for each subject

subjects = df["SubjectID"].unique()

subject_dfs = []

for subject in subjects:
    subject_dfs.append(df[df["SubjectID"] == subject])

In [18]:
# run model for each subject dataframe

for df in subject_dfs:

    # running existing model - unsure if we need train test split again
    X = df.drop(["SubjectID", "VideoID", "user-definedlabel"], axis=1)
    y = df[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    y_pred = log_reg.predict(X_test)
    print(f"accuracy for baseline logistic regression for subject {df['SubjectID'].iloc[0]}: {accuracy_score(y_test, y_pred)}")

accuracy for baseline logistic regression for subject 0.0: 0.5
accuracy for baseline logistic regression for subject 1.0: 0.5
accuracy for baseline logistic regression for subject 2.0: 0.5
accuracy for baseline logistic regression for subject 3.0: 0.0
accuracy for baseline logistic regression for subject 4.0: 0.5
accuracy for baseline logistic regression for subject 5.0: 0.0
accuracy for baseline logistic regression for subject 6.0: 0.5
accuracy for baseline logistic regression for subject 7.0: 0.5
accuracy for baseline logistic regression for subject 8.0: 0.5
accuracy for baseline logistic regression for subject 9.0: 0.5
