In [9]:
# import sklearn logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import BayesianRidge
from sklearn.feature_selection import SelectFromModel
from sklearn.kernel_approximation import RBFSampler

In [13]:
# load in data
df = pd.read_csv("cleaned_data/meanfeatures.csv")

# drop unnamed 0
df = df.drop(["Unnamed: 0"], axis=1)

## Baseline logistic regression model

In [26]:
target = "user-definedlabel"

# set up x and y, train test split
X = df.drop(target, axis=1)
X = df.drop(["SubjectID", "VideoID"], axis=1)
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [27]:
df["user-definedlabel"].value_counts()

user-definedlabel
1.0    51
0.0    49
Name: count, dtype: int64

In [28]:
# Define the hyperparameters you want to tune and their respective values
param_grid = {"C": [0.1, 1, 10, 100], "penalty": ["l1", "l2"], "solver": ["liblinear"]}

# Create an instance of the logistic regression model
log_reg = LogisticRegression()

# Create an instance of the GridSearchCV object
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring="accuracy")

# Fit the GridSearchCV object to the data
grid_search.fit(X, y)

# Print the best hyperparameters and their accuracy score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

# Use the best hyperparameters to train and test the model
best_log_reg = grid_search.best_estimator_
best_log_reg.fit(X_train, y_train)
y_pred = best_log_reg.predict(X_test)

# check imbalance
# confusuion matrix>?
# good recall not a huge deal
# look at f1 score! best thing to look at

# feature selection - PCA
# LASSO by itself - check coefficients of this, put into model
# run l1 regularization
# if goof performance, look at coefficients (some will be pushed to zero)
# use features that it kept for other models

Best Hyperparameters: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Best Accuracy Score: 0.9800000000000001


## Relevance Vector Machine

In [22]:
# Define the hyperparameters you want to tune and their respective values
param_grid = {
    "classify__alpha_1": [1e-7, 1e-6, 1e-5],  # alpha is learning rate
    "classify__alpha_2": [1e-7, 1e-6, 1e-5],
    "classify__lambda_1": [
        1e-7,
        1e-6,
        1e-5,
    ],  # lambda is regularization term (.1 or .2 - penalizes models with lots of parameters)
    "classify__lambda_2": [1e-7, 1e-6, 1e-5],
    "classify__n_iter": [300, 400, 500],
    "feature_select__estimator": [BayesianRidge()],
    "feature_select__threshold": ["mean", "median", None],
    "kernel_approx__gamma": [0.1, 1, 10],
}

# Create a pipeline for the RVM model
pipe = Pipeline(
    [
        ("feature_select", SelectFromModel(BayesianRidge())),
        ("kernel_approx", RBFSampler(random_state=0)),
        ("classify", BayesianRidge()),
    ]
)

# Create an instance of the GridSearchCV object
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring="neg_mean_squared_error")

# Fit the GridSearchCV object to the data
grid_search.fit(X, y)

# Print the best hyperparameters and their accuracy score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy Score:", -grid_search.best_score_)

# Use the best hyperparameters to train and test the model
best_rvm = grid_search.best_estimator_
best_rvm.fit(X_train, y_train)
y_pred = best_rvm.predict(X_test)

Best Hyperparameters: {'classify__alpha_1': 1e-07, 'classify__alpha_2': 1e-05, 'classify__lambda_1': 1e-05, 'classify__lambda_2': 1e-07, 'classify__n_iter': 300, 'feature_select__estimator': BayesianRidge(), 'feature_select__threshold': 'median', 'kernel_approx__gamma': 0.1}
Best Accuracy Score: 0.24304549947924392


## Random Forest

In [23]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters you want to tune and their respective values
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, 30, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["auto", "sqrt", "log2", None],
    "bootstrap": [True, False],
}

# Create an instance of the random forest model
rf = RandomForestRegressor(random_state=0)

# Create an instance of the GridSearchCV object
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring="neg_mean_squared_error")

# Fit the GridSearchCV object to the data
grid_search.fit(X, y)

# Print the best hyperparameters and their accuracy score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy Score:", -grid_search.best_score_)

# Use the best hyperparameters to train and test the model
best_rf = grid_search.best_estimator_
best_rf.fit(X_train, y_train)
y_pred = best_rf.predict(X_test)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best Hyperparameters: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 100}
Best Accuracy Score: 0.21481599999999998
