## Data Imports, Preprocessing

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import BayesianRidge
from sklearn.feature_selection import SelectFromModel
from sklearn.kernel_approximation import RBFSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

In [2]:
filename = "meanfeatures.csv"

df = pd.read_csv(f"cleaned_data/{filename}", index_col=0)

print(f"size of df: {df.shape}")
df.head()

size of df: (100, 14)


Unnamed: 0,SubjectID,VideoID,user-definedlabel,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2
0,0.0,0.0,0.0,55.256944,53.826389,46.986111,544315.097222,124965.590278,36693.701389,25875.298611,20108.791667,40268.763889,40729.284722,16817.0625
1,0.0,1.0,1.0,43.621429,48.621429,28.8,739737.292857,161064.228571,34918.028571,25078.935714,22157.307143,37410.728571,36758.7,14519.407143
2,0.0,2.0,1.0,43.978873,47.316901,13.15493,694078.084507,149816.873239,30493.873239,21667.591549,21888.338028,36446.43662,33908.873239,14545.84507
3,0.0,3.0,0.0,51.057377,51.844262,34.713115,600823.688525,162653.360656,33367.278689,26281.5,17224.278689,43706.52459,41438.213115,16558.631148
4,0.0,4.0,0.0,55.224138,47.474138,30.008621,546628.017241,126893.948276,23113.844828,17017.051724,15955.87931,36427.836207,36024.818966,14752.655172


## Baseline Logistic Regression

No feature selection, hyperparameter tuning, or cross validation

In [3]:
target = "user-definedlabel"

# set up x and y, train test split
X = df.drop(["VideoID", "user-definedlabel"], axis=1)
# need to drop subject id later because of the way we're testing on each one

# standardize X
# X = (X - X.mean()) / X.std()

y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# save subject ids for later
X_train_subjects = X_train.copy()
X_test_subjects = X_test.copy()

# standardize
X_train = (X_train - X_train.mean()) / X_train.std()
X_test = (X_test - X_test.mean()) / X_test.std()

X_train = X_train.drop("SubjectID", axis=1)
X_test = X_test.drop("SubjectID", axis=1)

In [4]:
X_test_subjects["SubjectID"].value_counts()

SubjectID
2.0    5
4.0    4
8.0    3
0.0    3
5.0    3
7.0    3
6.0    3
3.0    2
9.0    2
1.0    2
Name: count, dtype: int64

In [5]:
log_reg = LogisticRegression()

log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

print(f"accuracy for baseline logistic regression: {accuracy_score(y_test, y_pred)}")

accuracy for baseline logistic regression: 0.6666666666666666


In [6]:
## Splitting data into separate dataframes for each subject

# use X_test to ensure no data leakage
subjects = X_test_subjects["SubjectID"].unique()

subject_dfs = []

for subject in subjects:
    subject_dfs.append(X_test_subjects[X_test_subjects["SubjectID"] == subject])

In [7]:
subject_dfs[4]

Unnamed: 0,SubjectID,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2
44,4.0,49.979167,57.604167,40.222222,426037.4375,124686.395833,28027.020833,26126.784722,17899.111111,14794.840278,11439.930556,9558.291667
48,4.0,36.633588,50.709924,39.641221,362813.549618,110029.900763,24901.870229,21531.21374,17510.221374,16318.152672,10363.687023,7859.381679
42,4.0,47.393443,62.540984,30.983607,437520.647541,128973.434426,28338.959016,18699.688525,19848.811475,13467.942623,9842.97541,8480.221311
45,4.0,58.463415,69.97561,23.113821,247162.113821,88324.512195,20752.512195,18107.382114,14827.056911,15787.186992,9590.813008,7072.113821


In [8]:
# run model for each subject dataframe

res = pd.DataFrame()
subjects = []
accuracy = []
num_samples = []

for df in subject_dfs:
    cur_subject = df["SubjectID"].iloc[0]

    assert len(df) > 0, "subject df is empty"

    df.drop("SubjectID", axis=1, inplace=True)

    y_pred = log_reg.predict(df)

    print(
        f"accuracy for baseline logistic regression for subject {cur_subject}: {np.round(accuracy_score(y_test, y_pred), 3)}"
    )

    # add subject and accuracy to subjects and accuracy lists
    subjects.append(cur_subject)
    accuracy.append(accuracy_score(y_test, y_pred))
    num_samples.append(len(df))

# res df =  subjects and accuracy
res["SubjectID"] = subjects
res["accuracy"] = accuracy
res["num_samples"] = num_samples

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop("SubjectID", axis=1, inplace=True)


ValueError: Found input variables with inconsistent numbers of samples: [30, 3]

In [None]:
res

Unnamed: 0,SubjectID,accuracy,num_samples
