# Catboost Tuning
## Summary
In this notebook I will primarily tune some models


### Importing Data and Required Packages

In [26]:
# Imports
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from catboost import CatBoostClassifier, metrics, cv
from sklearn.metrics import roc_auc_score, f1_score
import optuna

from functions import metrics as custom_metric

In [27]:
# Training Data
X_train = pd.read_csv('../Data/train/X_train.csv', index_col=0)
y_train = pd.read_csv('../Data/train/y_train.csv', index_col=0)

# Testing Data
X_test = pd.read_csv('../Data/test/X_test.csv', index_col=0)
y_test = pd.read_csv('../Data/test/y_test.csv', index_col=0)

In [28]:
# Currently the classes are labelled as "1" as dignosed ADHD and "2" as not diagnosed, but models seem to dislike this.
testing = {2: 0, 1: 1}
labels = y_train.replace(testing)
test_labels = y_test.replace(testing)

# This cell will be moved into the "Data cleaning" notebook in the future.

In [29]:
# Initiate Over sampler
ros = RandomOverSampler(random_state=15)

# Applying ONLY to training set to prevent data leakage.
X_train_os, y_train_os = ros.fit_resample(X_train, labels)

## Optimizing with the Optuna
For the hyperparameter tuning process, I'll be using the [Optuna](https://optuna.readthedocs.io/en/stable/index.html) library. I'll also be making use of Catboost's cross-validation in selecting the best model.

In [30]:
# Optuna requires us to define the "objective" as a function. This is, essentially, where we define the parameters to check.
def objective(trial):
    # Parameters to check
    param = {
        'iterations':trial.suggest_categorical('iterations', [100,200,300,500,1000]),
        'learning_rate':trial.suggest_float("learning_rate", 0.001, 0.3),
        'random_strength':trial.suggest_int("random_strength", 1,10),
        'bagging_temperature':trial.suggest_int("bagging_temperature", 0,10),
        'max_bin':trial.suggest_categorical('max_bin', [4,5,6,8,10,20,30]),
        'grow_policy':trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'min_data_in_leaf':trial.suggest_int("min_data_in_leaf", 1,10),
        "depth": trial.suggest_int("max_depth", 2,10),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-8, 100),
        'one_hot_max_size':trial.suggest_categorical('one_hot_max_size', [5,10,12,100]),
        'custom_metric' : ['AUC'],
    }

    trial_model = CatBoostClassifier(**param)

    trial_model.fit(X_train_os,
                    y_train_os,
                    eval_set=[(X_test, test_labels)],
                    verbose=0,
                    early_stopping_rounds=100)

    preds = trial_model.predict(X_test)
    Fscore = f1_score(test_labels, preds)
    return Fscore

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, timeout=600)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2021-11-29 17:50:03,845][0m A new study created in memory with name: no-name-61f41af4-8c54-431a-80ba-7f8428e3cab8[0m
[32m[I 2021-11-29 17:50:37,368][0m Trial 0 finished with value: 0.5871006630500302 and parameters: {'iterations': 300, 'learning_rate': 0.14252531134014657, 'random_strength': 8, 'bagging_temperature': 9, 'max_bin': 6, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 4, 'max_depth': 9, 'l2_leaf_reg': 0.0003216226621087834, 'one_hot_max_size': 12}. Best is trial 0 with value: 0.5871006630500302.[0m
[32m[I 2021-11-29 17:51:26,274][0m Trial 1 finished with value: 0.6287292817679557 and parameters: {'iterations': 200, 'learning_rate': 0.18193774591916695, 'random_strength': 1, 'bagging_temperature': 2, 'max_bin': 6, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 5, 'max_depth': 10, 'l2_leaf_reg': 0.11825332582825888, 'one_hot_max_size': 100}. Best is trial 1 with value: 0.6287292817679557.[0m
[32m[I 2021-11-29 17:52:13,441][0m Trial 2 finished with valu

In [None]:
final_model = CatBoostClassifier(verbose=False,
                                 **trial.params)

In [None]:
final_model.fit(X_test, test_labels)

In [None]:
custom_metric(test_labels, final_model.predict(X_test))