In [88]:
import pickle

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Load the prepared splits. `other_sets` is indexed by split number and each
# entry unpacks to a (features, target) pair; `test_sets` is loaded alongside it.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load data_prep.save when it comes from a trusted source.
# The context manager closes the handle even if unpickling raises.
with open('../data/data_prep.save', 'rb') as file:
    other_sets, test_sets = pickle.load(file)

In [75]:
def ML_Pipeline(preprocessor, clf, rs_index, param_grid, xgbc=False, rs=True):
    """Tune ``clf`` with a 5-fold, accuracy-scored grid search on one split.

    Parameters
    ----------
    preprocessor : transformer
        Preprocessing step placed in front of the classifier. It is cloned,
        so the caller's object is never modified.
    clf : estimator
        Classifier template. It is cloned before being seeded, so the
        caller's object is never modified.
    rs_index : int
        Index into the module-level ``other_sets``. Also determines the seed
        (``42 * rs_index``) used for the fold shuffling and the classifier.
    param_grid : dict
        Grid passed to ``GridSearchCV``; keys address pipeline steps
        (``'classifier__...'``).
    xgbc : bool, default False
        When true, the seed is set through the ``seed`` parameter.
    rs : bool, default True
        When true, the seed is set through the ``random_state`` parameter.

    Returns
    -------
    GridSearchCV
        The fitted search object. Its ``cv_results_`` table is also printed.
    """
    random_state = 42 * rs_index

    X_other, y_other = other_sets[rs_index]

    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

    # Seed a private copy: set_params mutates in place, and the same template
    # objects are passed in again for every split. Without the copy, every
    # returned grid's `estimator` would reference one shared, re-seeded object.
    clf = clone(clf)
    if xgbc:
        clf.set_params(seed=random_state)
    if rs:
        clf.set_params(random_state=random_state)

    pipe = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                           ('classifier', clf)])

    grid = GridSearchCV(pipe, param_grid=param_grid,
                        scoring='accuracy',
                        cv=kf, return_train_score=True, n_jobs=-1)

    grid.fit(X_other, y_other)

    results = pd.DataFrame(grid.cv_results_)
    print(results)

    return grid

In [76]:
# simple logistic regression
from sklearn.linear_model import LogisticRegression

def simpleLogisticPipeline(preprocessor, rs_index):
    """Fit an unpenalized logistic regression pipeline on one split.

    Parameters
    ----------
    preprocessor : transformer
        Preprocessing step placed in front of the classifier. A fresh clone
        is fitted on every call, so the caller's object is left untouched.
    rs_index : int
        Index into the module-level ``other_sets``. The classifier's
        ``random_state`` is ``rs_index * 42``.

    Returns
    -------
    Pipeline
        Pipeline fitted on ``other_sets[rs_index]``.
    """
    X_train, y_train = other_sets[rs_index]

    # NOTE(review): newer scikit-learn releases spell "no penalty" as
    # penalty=None rather than the string 'none' -- confirm against the
    # installed version before upgrading.
    clf = LogisticRegression(penalty='none', max_iter=100000, random_state=rs_index*42)

    # Pipeline.fit fits its steps in place. Passing the shared preprocessor
    # directly would make every returned pipeline reference one object that
    # is refitted on each call, leaving earlier pipelines with scaling
    # statistics from a later split. Clone it so each pipeline owns its own.
    pipe = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                           ('classifier', clf)])

    pipe.fit(X_train, y_train)

    return pipe

In [98]:
# Every column of the first split's feature frame is standardized.
std_ftrs = [*other_sets[0][0].columns]
preprocessor = ColumnTransformer(
    transformers=[
        ('std', StandardScaler(), std_ftrs),
    ],
)

# Hyper-parameter grids, one per model family. Keys are prefixed with
# 'classifier__' so they address the 'classifier' step of the pipeline.

# Values of C tried for all three penalized logistic regressions.
_C_VALUES = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
# Single-valued entry: fixes max_iter for every candidate.
_MAX_ITER = [100000]

# L1- and L2-penalized logistic regression search the same values.
param_grid_L1L = dict(
    classifier__C=list(_C_VALUES),
    classifier__max_iter=list(_MAX_ITER),
)
param_grid_L2L = dict(
    classifier__C=list(_C_VALUES),
    classifier__max_iter=list(_MAX_ITER),
)
# Elastic net additionally searches l1_ratio.
param_grid_ENL = dict(
    classifier__C=list(_C_VALUES),
    classifier__l1_ratio=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99],
    classifier__max_iter=list(_MAX_ITER),
)
# Random forest.
param_grid_RFC = dict(
    classifier__max_features=[1, 3, 5, 10, 20, None],
    classifier__max_depth=[1, 3, 5, 10, 20, None],
    classifier__min_samples_split=[2, 5, 10],
)
# Support vector classifier: C takes five log-spaced values from 1e-1 to 1e1.
param_grid_SVC = dict(
    classifier__gamma=[1e-2, 1e-1, 1e0, 1e1, 1e2, 'auto', 'scale'],
    classifier__C=np.logspace(-1, 1, 5),
)
# k-nearest neighbours.
param_grid_KNN = dict(
    classifier__n_neighbors=[1, 2, 3, 5, 10, 30, 100, 200],
    classifier__weights=['uniform', 'distance'],
)
# XGBoost: 6 * 4 * 5 * 4 * 5 = 2400 candidate combinations.
param_grid_XGB = dict(
    classifier__max_depth=[1, 3, 5, 10, 30, 100],
    classifier__min_child_weight=[1, 3, 5, 7],
    classifier__gamma=[0, 0.1, 0.2, 0.3, 0.4],
    classifier__subsample=[0.5, 0.66, 0.75, 1],
    classifier__colsample_bytree=[0.3, 0.4, 0.5, 0.7, 1],
)

In [99]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Unfitted estimator templates handed to ML_Pipeline. The trailing notes name
# the keyword through which ML_Pipeline seeds each one (KNN is not seeded).
l1l, l2l, enl = (
    LogisticRegression(penalty=penalty, solver='saga')  # random_state
    for penalty in ('l1', 'l2', 'elasticnet')
)
rfc = RandomForestClassifier()  # random_state
svc = SVC()  # random_state
knn = KNeighborsClassifier()
xgb = XGBClassifier()  # seed

In [100]:
# Fitted models per model family; position i in each list belongs to split i.
logistic_models = []
l1_models = []
l2_models = []
en_models = []
rf_models = []
sv_models = []
knn_models = []
xgb_models = []

# One row per tuned family:
# (estimator template, parameter grid, destination list, xgbc flag, rs flag).
# The flags select how ML_Pipeline seeds the estimator; KNN is not seeded.
tuned_specs = [
    (l1l, param_grid_L1L, l1_models, False, True),
    (l2l, param_grid_L2L, l2_models, False, True),
    (enl, param_grid_ENL, en_models, False, True),
    (rfc, param_grid_RFC, rf_models, False, True),
    (svc, param_grid_SVC, sv_models, False, True),
    (knn, param_grid_KNN, knn_models, False, False),
    (xgb, param_grid_XGB, xgb_models, True, False),
]

for i in tqdm(range(10)):

    # Untuned baseline first, then every tuned family in the order above.
    logistic_models.append(simpleLogisticPipeline(preprocessor, i))

    for clf, param_grid, models, xgbc, rs in tuned_specs:
        models.append(ML_Pipeline(preprocessor, clf, i, param_grid, xgbc=xgbc, rs=rs))


# Destination file for every model list. The XGBoost entry was previously
# missing, so those tuned models were trained but never written to disk.
save_targets = {
    '../results/log_models_tuned.save': logistic_models,
    '../results/l1_models_tuned.save': l1_models,
    '../results/l2_models_tuned.save': l2_models,
    '../results/en_models_tuned.save': en_models,
    '../results/rf_models_tuned.save': rf_models,
    '../results/sv_models_tuned.save': sv_models,
    '../results/knn_models_tuned.save': knn_models,
    '../results/xgb_models_tuned.save': xgb_models,
}

for path, models in save_targets.items():
    # The context manager closes the file even if pickling fails part-way.
    with open(path, 'wb') as file:
        pickle.dump(models, file)

100%|██████████| 10/10 [3:16:11<00:00, 1177.19s/it] 
