In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings('ignore')

# --- Notebook configuration -------------------------------------------------
# Path of the CSV file that is loaded with pd.read_csv.
fname = "data.csv"
# Name of the column that holds the prediction target.
LABEL = "label"
# Value passed as test_size to train_test_split (share of rows held out).
SPLIT = 0.3

In [None]:
# Load the raw table from the configured CSV path and preview its first rows.
data = pd.read_csv(filepath_or_buffer=fname)
data.head(n=5)

In [None]:
# column order in CSV file
column_names = data.columns

# Target vector: the LABEL column. Feature matrix: every other column, with
# pd.get_dummies applied to one-hot encode the non-numeric ones.
Y = data[LABEL]
X = pd.get_dummies(data.drop(columns=LABEL))

# Hold out a SPLIT share of the rows for the final evaluation.
# random_state is fixed (same seed as the hyper-parameter search) so that a
# "Restart & Run All" reproduces the same split and therefore the same metrics.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=SPLIT, random_state=42
)

In [None]:
# Candidate estimators, keyed by the short name that is also used as the key of
# the hyper-parameter grids and as the row index of the comparison table.
# NOTE(review): LogisticRegression is a classifier while the other four entries
# are regressors -- confirm it is intended for this target.
models = {
    'LR': LogisticRegression(),
    'SVR': SVR(),
    'MLP': MLPRegressor(),
    'GBT': GradientBoostingRegressor(),
    'RF': RandomForestRegressor(),
}

In [None]:
# Compare all candidate models with 5-fold cross-validation.
# Only the training split is used, so the held-out test split (x_test / y_test)
# stays unseen until the final evaluation and cannot bias model selection.
scoring = ['explained_variance', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

rows = []
for m in tqdm(models):
    model = models[m]
    score = cross_validate(model, x_train, y_train, scoring=scoring, cv=5)
    # One row per model: the fold-average of every array cross_validate returned.
    rows.append({k: np.mean(v) for k, v in score.items()})

results = pd.DataFrame(rows, index=list(models.keys()))

# The "neg_*" scorers return negated errors (greater is better); flip them back
# to positive errors. After this the MSE column holds the root of the mean MSE.
# Column names are left unchanged.
results['test_neg_mean_absolute_error'] = -results['test_neg_mean_absolute_error']
results['test_neg_mean_squared_error'] = np.sqrt(-results['test_neg_mean_squared_error'])
# explained_variance and r2 are NOT negated scorers, so they are left as-is
# (negating them would flip their sign and make good models look bad).

# Smallest mean absolute error first; the top row is the model to tune.
results = results.sort_values('test_neg_mean_absolute_error')
best = results.index[0]
results

In [None]:
# Candidate values for the randomized hyper-parameter search.
# NOTE: the option names below follow current scikit-learn (>= 1.2). The older
# spellings 'auto' (max_features), 'ls' / 'lad' (loss) and 'none' (penalty)
# were deprecated and later removed, so candidates using them fail to fit.

# --- tree ensembles (random forest / gradient boosting) ---
n_estimators = list(range(200, 2001, 200))       # 200, 400, ..., 2000
max_features = [None, 'sqrt']                    # None = consider all features
max_depth = list(range(10, 111, 10)) + [None]    # 10, 20, ..., 110, unlimited
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# --- gradient boosting only ---
# learning_rate is also reused as the MLP's initial learning rate.
learning_rate = [0.5, 0.1, 0.05, 0.01, 0.001]
loss = ['squared_error', 'absolute_error', 'huber', 'quantile']

# --- logistic regression ---
# NOTE(review): 'l1' and 'elasticnet' are not supported by every solver;
# confirm the solver in use before relying on those candidates.
penalty = ['l1', 'l2', 'elasticnet', None]
C= [0.1, 0.5, 1, 2, 5, 10]

# --- support vector regression ---
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
degree = [2, 3, 4, 5, 6]
gamma = ['scale', 'auto']
epsilon = [0.5, 0.1, 0.05, 0.01, 0.001]

# --- multi-layer perceptron ---
hidden_layer_sizes = [(10,), (100,), (200,), (10, 10), (50, 50), (100, 100), (100, 100, 100), (50, 100, 50)]
activation = ['logistic', 'tanh', 'relu']
solver = ['lbfgs', 'sgd', 'adam']
batch_size = [32, 64, 128, 256]
learning_rate_schedule = ['constant', 'invscaling', 'adaptive']


# Search space per estimator: parameter name -> list of candidate values.

# Random forest.
rf_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

# Gradient boosting.
gbr_grid = dict(
    n_estimators=n_estimators,
    loss=loss,
    learning_rate=learning_rate,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

# Logistic regression.
lr_grid = dict(penalty=penalty, C=C)

# Support vector regression.
svr_grid = dict(kernel=kernel, degree=degree, gamma=gamma, epsilon=epsilon)

# Multi-layer perceptron: 'learning_rate' is the schedule name, while
# 'learning_rate_init' takes the numeric step sizes.
mlp_grid = dict(
    hidden_layer_sizes=hidden_layer_sizes,
    activation=activation,
    solver=solver,
    batch_size=batch_size,
    learning_rate=learning_rate_schedule,
    learning_rate_init=learning_rate,
)

# Lookup table keyed by the same short names as the `models` dictionary.
grids = dict(LR=lr_grid, SVR=svr_grid, MLP=mlp_grid, GBT=gbr_grid, RF=rf_grid)

In [None]:
# Randomized hyper-parameter search for the model that won the comparison.
# Up to 100 sampled parameter settings are each scored with 3-fold CV.
# The search is fitted on the training split only: tuning on the full data
# would leak the held-out test rows into the chosen hyper-parameters and make
# the final test metrics optimistic.
best_model = RandomizedSearchCV(
    estimator=models[best],
    param_distributions=grids[best],
    n_iter=100,
    cv=3,
    verbose=5,
    random_state=42,
    n_jobs=8,
)
best_model.fit(x_train, y_train)
best_model.best_params_

In [None]:
# Configure the winning estimator with the tuned hyper-parameters, train it on
# the training split and predict the held-out rows.
# Note: set_params returns the estimator itself, so final_model is the very
# object stored in models[best].
tuned_params = best_model.best_params_
final_model = models[best]
final_model.set_params(**tuned_params)
final_model.fit(x_train, y_train)
prediction = final_model.predict(x_test)

In [None]:
# Hold-out metrics for the tuned model.
mae = mean_absolute_error(y_test, prediction)
# Square root of the mean squared error (printed under the label 'MSRE').
msre = np.sqrt(mean_squared_error(y_test, prediction))
ev = explained_variance_score(y_test, prediction)

for tag, value in (('MAE', mae), ('MSRE', msre), ('EV', ev)):
    print(tag, value)