In [None]:
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from skopt import BayesSearchCV
from skopt.space import Real
from skopt.space import Integer

In [None]:
# Which classifier to tune, train and evaluate in the cells below.
# Tags handled by the training cell: 'LogReg', 'DT', 'GaussNB', 'KNN', 'SVM'.
# The tag is also used as the filename prefix for the saved artifacts.
model_tag = "LogReg"

In [None]:
# Load the D0 training matrix.
# NOTE(review): `load_data` is not defined or imported in the visible cells —
# confirm it is provided elsewhere (e.g. by a helper module) before running.
data = load_data(directory='D0', datafilename='trainD0')

# Split the matrix by column: first column -> integer IDs, last column -> target,
# everything in between -> feature matrix.
ID_train = data[:, 0].astype(int)
X_train = data[:, 1:-1]
y_train = data[:, -1]

# Standardize the features; the fitted scaler is kept so it can be saved and
# reused later with the trained model.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)


# Hyperparameter space: pick the estimator and the skopt search dimensions that
# correspond to `model_tag`. Exactly one branch runs; an unrecognised tag raises
# instead of leaving `model` / `param_space` undefined (or, worse, silently
# reusing values left in the kernel by a previous run of this cell).
if model_tag == 'LogReg':
  model = LogisticRegression()
  param_space = {
    'C': Real(1e-6, 1e+6, 'log-uniform'),  # default=1.0
    'solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],  # default=lbfgs
  }
elif model_tag == 'DT':
  model = DecisionTreeClassifier()
  param_space = {
    'criterion': ['gini', 'entropy', 'log_loss'],  # default: 'gini'
    'splitter': ['best', 'random'],  # default: 'best'
    'min_samples_split': Integer(2, 100),  # default: 2
    'min_samples_leaf': Integer(2, 100)  # default: 1
  }
elif model_tag == 'GaussNB':
  model = GaussianNB()
  param_space = {
    'var_smoothing': Real(1e-12, 1e-7, 'log-uniform'),  # default: 1e-9
  }
elif model_tag == 'KNN':
  model = KNeighborsClassifier()
  param_space = {
    'n_neighbors': Integer(1, 50),  # default: 5
    # Not searched at the moment:
    # 'weights': ['uniform', 'distance'],  # default: uniform
    # 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']  # default: 'auto'
  }
elif model_tag == 'SVM':
  model = SVC()
  param_space = {
    'C': Real(1e-6, 1e+6, 'log-uniform'),  # default: 1
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']  # default: rbf
  }
else:
  # Fail fast with a clear message rather than a later NameError / stale model.
  raise ValueError(
    f"Unknown model_tag {model_tag!r}; expected one of "
    "'LogReg', 'DT', 'GaussNB', 'KNN', 'SVM'."
  )

# Hyperparameter optimization: Bayesian search over `param_space` with 3-fold
# cross-validation, 100 sampled parameter settings (5 evaluated in parallel per
# step), using all available cores.
# Scoring is classification accuracy: every estimator selectable via `model_tag`
# is a classifier, so a regression metric such as mean squared error on the
# class labels is not an appropriate selection criterion.
opt = BayesSearchCV(
  estimator=model, search_spaces=param_space, n_iter=100, cv=3,
  n_jobs=-1, n_points=5, verbose=3, scoring='accuracy'
)
opt.fit(X_train, y_train)

# Persist the full search trace (one row per evaluated parameter setting).
results_df = pd.DataFrame(opt.cv_results_)
results_df.to_csv(f'{model_tag}_optResults.csv', index=False)
best_params = opt.best_params_

# With the default refit=True the search has already refit an estimator of the
# selected type on the whole training set using `best_params`; reuse it instead
# of constructing an unrelated model class from those parameters.
model = opt.best_estimator_

# Save the tuned model together with the scaler fitted on the training features,
# so that evaluation can apply the same preprocessing.
joblib.dump(model, f'{model_tag}.joblib', compress=3)
joblib.dump(scaler, f'{model_tag}_scaler.joblib', compress=3)

In [None]:
# Evaluate the persisted model and write the results as a tab-separated report.
results = []

# Reload the artifacts written by the training cell so that evaluation uses
# exactly what was saved to disk.
model = joblib.load(f'{model_tag}.joblib')
scaler = joblib.load(f'{model_tag}_scaler.joblib')

# NOTE(review): `ut` is not imported in the visible cells — confirm the helper
# module is imported elsewhere. The return value is assumed to be an iterable of
# rows (each row itself iterable), as required by the write-out below.
results += ut.test_allDatasets(model, scaler)

# One line per result row, fields separated by tabs; the context manager
# guarantees the file is closed even if formatting or writing fails.
with open('classification_AllExceptNN_results.txt', 'w') as f:
  f.write('\n'.join('\t'.join(str(field) for field in row) for row in results))