In [None]:
from descriptive_statistics import DiabetesDataBase
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from utils import grid_search, halving_random_search, validate
import numpy as np

## Hyperparameters

In [None]:
normalization_method = "Q" # ("S", "R", "Q") S for standard, R for robust, Q for quantile
match normalization_method:
    case "S":
        SCALER = StandardScaler()
    case "R":
        SCALER = RobustScaler() #Eliminated never better than S or Q and take 
    case "Q":
        SCALER = QuantileTransformer(n_quantiles=334)

RANDOM_STATE = 17

## Tensorboard

In [None]:
from datetime import datetime

from torch.utils.tensorboard import SummaryWriter

# Minute-resolution timestamp (year, full month name, day, hour, minute) that
# identifies this run.
# NOTE(review): the name `time` would shadow the stdlib module if it were ever
# imported; kept as-is because other cells may read it.
time = datetime.now().strftime("%Y%B%d_%H_%M")
print(time)

# Run directory = timestamp + scaler code + seed, concatenated without separators.
log_folder = f"logs/{time}{normalization_method}{RANDOM_STATE}"
writer = SummaryWriter(log_dir=log_folder)

## Read in and split data

In [None]:
# Build the dataset wrapper from the CSV with an 80/10/10 train/val/test split,
# seeded with RANDOM_STATE.
# NOTE(review): what `augment=True` does (and to which split) is defined in
# descriptive_statistics.DiabetesDataBase -- confirm it only touches training data.
csv_path = "diabetes.csv"
ddb = DiabetesDataBase(
    csv_path,
    train_split=0.8,
    val_split=0.1,
    test_split=0.1,
    random_state=RANDOM_STATE,
    augment=True,
)

# get_splits() yields the six arrays in this order: features first, then labels.
X_train, X_val, X_test, y_train, y_val, y_test = ddb.get_splits()

## KNN Model

In [None]:
knn = KNeighborsClassifier()

# Exhaustive grid: k = 1..30, two distance metrics, two weighting schemes.
# The "classifier__" prefix presumably addresses the estimator step of the
# pipeline built inside utils.grid_search -- confirm against that helper.
parameters = dict(
    classifier__n_neighbors=[*range(1, 31)],
    classifier__metric=["euclidean", "manhattan"],
    classifier__weights=["uniform", "distance"],
)

knn_cls = grid_search(knn, SCALER, parameters)
knn_cls.fit(X_train, y_train)

# Show the winning classifier configuration.
print(knn_cls.best_estimator_.get_params()["classifier"])

# Score on the validation split and log the returned scalars under the "knn" tag.
knn_avg, knn_cm = validate(knn_cls, X_val, y_val)
writer.add_scalars("knn", knn_avg)
writer.flush()

## Random Forest

In [None]:
random_forest = RandomForestClassifier(random_state=RANDOM_STATE)

# Search space adapted from:
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
parameters = dict(
    # 100 evenly spaced values in [100, 1000], truncated to int
    classifier__n_estimators=list(map(int, np.linspace(start=100, stop=1000, num=100))),
    classifier__max_features=["log2", "sqrt"],
    # 11 evenly spaced values in [10, 110], truncated to int
    classifier__max_depth=list(map(int, np.linspace(10, 110, num=11))),
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
    classifier__bootstrap=[True, False],
)

# The grid is large, so it goes through the halving random search helper
# rather than the exhaustive grid_search.
random_forest_cls = halving_random_search(
    random_forest,
    SCALER,
    parameters,
    random_state=RANDOM_STATE,
)
random_forest_cls.fit(X_train, y_train)

# Show the winning classifier configuration.
print(random_forest_cls.best_estimator_.get_params()["classifier"])

# Score on the validation split and log the returned scalars under "random_forest".
random_forest_avg, random_forest_cm = validate(random_forest_cls, X_val, y_val)
writer.add_scalars("random_forest", random_forest_avg)
writer.flush()

## Logistic Regression

In [None]:
log_reg = LogisticRegression()

# Only the regularisation choice is searched: no penalty vs. L2.
parameters = dict(classifier__penalty=[None, "l2"])

reg_cls = grid_search(log_reg, SCALER, parameters)
reg_cls.fit(X_train, y_train)

# Show the winning classifier configuration.
print(reg_cls.best_estimator_.get_params()["classifier"])

# Score on the validation split and log the returned scalars under "log_reg".
log_reg_avg, log_reg_cm = validate(reg_cls, X_val, y_val)
writer.add_scalars("log_reg", log_reg_avg)
writer.flush()

## SVM

In [None]:
from sklearn.svm import SVC

svm = SVC()

# Log-spaced C and gamma values crossed with three kernels.
# NOTE(review): every gamma value is also paired with the linear kernel, so
# those grid points repeat the same fit -- accepted here, the grid is unchanged.
parameters = dict(
    classifier__C=[0.01, 0.1, 1, 10, 100],
    classifier__gamma=[0.01, 0.1, 1, 10, 100],
    classifier__kernel=["linear", "rbf", "sigmoid"],
)
print(parameters)

svm_cls = grid_search(svm, SCALER, parameters)
svm_cls.fit(X_train, y_train)

# Show the winning classifier configuration.
print(svm_cls.best_estimator_.get_params()["classifier"])

# Score on the validation split and log the returned scalars under "svm".
svm_avg, svm_cm = validate(svm_cls, X_val, y_val)
writer.add_scalars("svm", svm_avg)
writer.flush()

## MLP

In [None]:
from sklearn.neural_network import MLPClassifier
# Not referenced directly in this cell: the search goes through
# utils.halving_random_search. Kept so the experimental feature stays enabled.
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

# Baseline estimator; the search below overrides solver, activation,
# learning_rate_init, hidden_layer_sizes, beta_1 and beta_2.
mlp = MLPClassifier(
    # architecture / optimiser
    activation="relu",
    solver="adam",
    learning_rate_init=0.001,
    alpha=0.0001,  # L2 regularisation strength
    # Adam-specific settings (the original note marks these as the "org" values)
    beta_1=0.9,  # exponential decay rate for the first-moment estimate
    beta_2=0.999,  # exponential decay rate for the second-moment estimate
    epsilon=1e-8,  # numerical-stability term
    # training loop
    max_iter=30000,
    batch_size="auto",
    shuffle=True,
    # Early stopping holds out part of the training data as an internal validation set, see
    # https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier
    early_stopping=True,
    random_state=RANDOM_STATE,
)

parameters = dict(
    classifier__solver=["adam", "sgd"],
    classifier__activation=["relu", "tanh", "logistic"],
    classifier__learning_rate_init=[0.0001, 0.001, 0.01, 0.005],
    classifier__hidden_layer_sizes=[[10, 10], [100, 10], [50, 100, 50], [100], [10]],
    # NOTE(review): this grid spans 0.090-0.094, a factor of ten below the
    # beta_1=0.9 set on the estimator above -- confirm the scale is intended
    # (0.90-0.94 may have been meant).
    classifier__beta_1=[0.09, 0.091, 0.092, 0.093, 0.094],
    classifier__beta_2=[0.985, 0.988, 0.991, 0.994, 0.997],
)
print(parameters)

mlp_cls = halving_random_search(
    mlp,
    SCALER,
    parameters,
    random_state=RANDOM_STATE,
)
mlp_cls.fit(X_train, y_train)

# Show the winning classifier configuration.
print(mlp_cls.best_estimator_.get_params()["classifier"])

# Score on the validation split and log the returned scalars under "mlp".
mlp_avg, mlp_cm = validate(mlp_cls, X_val, y_val)
writer.add_scalars("mlp", mlp_avg)
writer.flush()
# Last model of the notebook: the writer is closed here.
writer.close()

In [None]:
# Load the TensorBoard notebook extension and open the dashboard on the "logs" directory, where the runs above were written.
%load_ext tensorboard
%tensorboard --logdir=logs