In [9]:
# import sklearn logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import BayesianRidge
from sklearn.feature_selection import SelectFromModel
from sklearn.kernel_approximation import RBFSampler

In [13]:
# load in data
df = pd.read_csv('cleaned_data/meanfeatures.csv')

# drop unnamed 0
df = df.drop(['Unnamed: 0'], axis=1)

## Baseline logistic regression model

In [20]:
target = 'user-definedlabel'

# set up x and y, train test split
X = df.drop(target, axis=1)
y = df[target]
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [21]:
# Define the hyperparameters you want to tune and their respective values
param_grid = {'C': [0.1, 1, 10, 100], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}

# Create an instance of the logistic regression model
log_reg = LogisticRegression()

# Create an instance of the GridSearchCV object
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the data
grid_search.fit(X, y)

# Print the best hyperparameters and their accuracy score
print('Best Hyperparameters:', grid_search.best_params_)
print('Best Accuracy Score:', grid_search.best_score_)

# Use the best hyperparameters to train and test the model
best_log_reg = grid_search.best_estimator_
best_log_reg.fit(X_train, y_train)
y_pred = best_log_reg.predict(X_test)


Best Hyperparameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best Accuracy Score: 0.6799999999999999


Bad pipe message: %s [b'1\xabim\x89\x0b\xbb\x1eTb\x7f\x13}\xfbR\x17\xb2\x1a \x87\x98\xe96+w\x13\x7f\x8e\x06\x95\x9bd\xcev\x1c\x92\xc8i\x00\xac_U\xa7\x1a\x9b(4\xa8)\xa0\xf0\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f']
Bad pipe message: %s [b'\xd0\xab\xab=m\xc7\xcf\xdd\xe7"\xe5\xc6G\xa6\xee4\x1eg\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0\'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00']
Bad pipe message: %s [b'.\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x

## Relevance Vector Machine

In [22]:
# Define the hyperparameters you want to tune and their respective values
param_grid = {
    'classify__alpha_1': [1e-7, 1e-6, 1e-5],
    'classify__alpha_2': [1e-7, 1e-6, 1e-5],
    'classify__lambda_1': [1e-7, 1e-6, 1e-5],
    'classify__lambda_2': [1e-7, 1e-6, 1e-5],
    'classify__n_iter': [300, 400, 500],
    'feature_select__estimator': [BayesianRidge()],
    'feature_select__threshold': ['mean', 'median', None],
    'kernel_approx__gamma': [0.1, 1, 10]
}

# Create a pipeline for the RVM model
pipe = Pipeline([
    ('feature_select', SelectFromModel(BayesianRidge())),
    ('kernel_approx', RBFSampler(random_state=0)),
    ('classify', BayesianRidge())
])

# Create an instance of the GridSearchCV object
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the GridSearchCV object to the data
grid_search.fit(X, y)

# Print the best hyperparameters and their accuracy score
print('Best Hyperparameters:', grid_search.best_params_)
print('Best Accuracy Score:', -grid_search.best_score_)

# Use the best hyperparameters to train and test the model
best_rvm = grid_search.best_estimator_
best_rvm.fit(X_train, y_train)
y_pred = best_rvm.predict(X_test)


Best Hyperparameters: {'classify__alpha_1': 1e-07, 'classify__alpha_2': 1e-05, 'classify__lambda_1': 1e-05, 'classify__lambda_2': 1e-07, 'classify__n_iter': 300, 'feature_select__estimator': BayesianRidge(), 'feature_select__threshold': 'median', 'kernel_approx__gamma': 0.1}
Best Accuracy Score: 0.24304549947924392


## Random Forest

In [23]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters you want to tune and their respective values
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# Create an instance of the random forest model
rf = RandomForestRegressor(random_state=0)

# Create an instance of the GridSearchCV object
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the GridSearchCV object to the data
grid_search.fit(X, y)

# Print the best hyperparameters and their accuracy score
print('Best Hyperparameters:', grid_search.best_params_)
print('Best Accuracy Score:', -grid_search.best_score_)

# Use the best hyperparameters to train and test the model
best_rf = grid_search.best_estimator_
best_rf.fit(X_train, y_train)
y_pred = best_rf.predict(X_test)


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best Hyperparameters: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 100}
Best Accuracy Score: 0.21481599999999998
