# Catboost Tuning
## Summary
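In this notebook I tune the hyperparameters of a CatBoost classifier with Optuna, using recall on the ADHD-diagnosis labels as the metric to maximize.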


### Importing Data and Required Packages

In [58]:
# Imports
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from catboost import CatBoostClassifier, metrics, cv
from sklearn.metrics import roc_auc_score, f1_score, recall_score
import optuna

from functions import metrics as custom_metric

In [59]:
# Read the feature and target CSVs for each split; column 0 of every file becomes the index.

# Training split
X_train, y_train = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../Data/train/X_train.csv', '../Data/train/y_train.csv')
)

# Testing split
X_test, y_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../Data/test/X_test.csv', '../Data/test/y_test.csv')
)

In [60]:
# The raw targets use 1 = diagnosed with ADHD and 2 = not diagnosed.
# Re-encode them as 1 = diagnosed, 0 = not diagnosed, which the models handle better.
testing = {2: 0, 1: 1}

# Apply the same mapping to the training and testing targets.
labels, test_labels = (
    target_frame.replace(testing) for target_frame in (y_train, y_test)
)

# This cell will be moved into the "Data cleaning" notebook in the future.

In [61]:
# Random over-sampler with a fixed seed so the resampled training set is repeatable.
ros = RandomOverSampler(random_state=15)

# Resample the TRAINING split only; the test split is left untouched to prevent data leakage.
resampled_split = ros.fit_resample(X_train, labels)
X_train_os, y_train_os = resampled_split

KeyboardInterrupt: 

## Optimizing with Optuna
For the hyperparameter tuning process, I'll be using the [Optuna](https://optuna.readthedocs.io/en/stable/index.html) library. I'll also be making use of CatBoost's cross-validation in selecting the best model.

In [None]:
# Optuna requires us to define the "objective function", which is called once per trial.
def objective(trial):
    """Train one CatBoost candidate and return its recall on the test set.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample every hyperparameter below.

    Returns
    -------
    float
        Recall of the fitted model's predictions for ``X_test`` against
        ``test_labels``. The study is expected to maximize this value.

    Notes
    -----
    Reads the notebook-level ``X_train_os``, ``y_train_os``, ``X_test`` and
    ``test_labels``.

    NOTE(review): the test set is used both for early stopping and for the
    score returned to Optuna, so the tuned hyperparameters are fitted to the
    test data. Consider a separate validation split or cross-validation.
    """
    # Dict of parameters to check. The first argument of every ``suggest_*`` call is
    # the name stored in ``trial.params``, so it must be a valid CatBoost keyword
    # if the best parameters are later unpacked into a new classifier.
    param = {
        # Metric used for model optimization.
        'loss_function': trial.suggest_categorical('loss_function', ['Logloss', 'CrossEntropy']),

        # The maximum number of trees that can be built.
        'iterations': trial.suggest_categorical('iterations', [100, 200, 300, 500, 1000]),

        # Learning rate for gradient descent calculations.
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3),

        # Coefficient at the L2 regularization term of the cost function (sampled on a log scale).
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 100, log=True),

        # Affects the speed and regularization of the tree.
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),

        # Frequency to sample weights and objects when building trees.
        # NOTE(review): some sampling_frequency / grow_policy combinations may be
        # rejected by CatBoost - confirm against the CatBoost parameter reference.
        'sampling_frequency': trial.suggest_categorical('sampling_frequency', ['PerTree', 'PerTreeLevel']),

        # Whether sampling weights are taken per object or per group.
        # NOTE(review): no group ids are passed to ``fit`` below, so 'Group' may be
        # invalid for this data - confirm before a long run.
        'sampling_unit': trial.suggest_categorical('sampling_unit', ['Object', 'Group']),

        # The amount of randomness to use for scoring splits.
        'random_strength': trial.suggest_int('random_strength', 1, 10),

        # The number of splits for numerical features.
        'max_bin': trial.suggest_categorical('max_bin', [4, 5, 6, 8, 10, 20, 30]),

        # Allowed depth of tree. Recorded under the name "max_depth" in ``trial.params``.
        'depth': trial.suggest_int('max_depth', 2, 16),

        # Defines how to perform greedy tree construction.
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),

        # Only one-hot encodes features if the number of unique values is <= the parameter value.
        'one_hot_max_size': trial.suggest_categorical('one_hot_max_size', [5, 10, 12, 100]),
    }

    # Certain parameters are "subparameters" and can only be set if their parent
    # parameter has a certain value.

    # Bootstrap types
    if param['bootstrap_type'] == 'Bayesian':
        # Use Bayesian bootstrapping to assign random weights to objects.
        param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif param['bootstrap_type'] in ('Bernoulli', 'MVS'):
        # Sample rate for bagging using the Bernoulli/MVS type.
        param['subsample'] = trial.suggest_float('subsample', 0.1, 1)

    # Grow policy params
    if param['grow_policy'] != 'SymmetricTree':
        # The minimum number of training samples in a leaf. Only sampled for the
        # non-symmetric policies, so it is deliberately absent from the dict above.
        param['min_data_in_leaf'] = trial.suggest_int('min_data_in_leaf', 1, 10)

        # The spelling must match the 'Lossguide' choice offered above, otherwise
        # this branch can never run.
        if param['grow_policy'] == 'Lossguide':
            # The maximum number of leaves in the tree.
            param['max_leaves'] = trial.suggest_int('max_leaves', 16, 64)

    # Create the trial model with the parameters specified above.
    trial_model = CatBoostClassifier(**param)

    # Fit the trial model on the over-sampled training data.
    trial_model.fit(
        X_train_os,
        y_train_os,
        eval_set=[(X_test, test_labels)],
        verbose=0,  # Stops CatBoost from printing training results.
        early_stopping_rounds=10,  # Rounds of no improvement needed before stopping.
    )

    # Create predictions for the test set and score them.
    preds = trial_model.predict(X_test)
    recall = recall_score(test_labels, preds)

    return recall

In [None]:
# Create the study and specify we want to MAXIMIZE the value returned by the objective function.
# The sampler is seeded so the sequence of suggested hyperparameters is repeatable between runs
# (the timeout below can still end the study after a different number of trials).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=15),
)

# Running up to 100 trials, with a timeout of 15 minutes (900 s) to prevent my computer from exploding.
study.optimize(objective, n_trials=100, timeout=900)

print(f"Number of finished trials: {len(study.trials)}")

# Best trial found by the study; later cells read its parameters through this name.
trial = study.best_trial

# "Prettify" our trial results
print("Best trial:")

# Print metric value achieved by the best trial
print(f"  Value: {trial.value}")

# Print all parameters from the best trial
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Rebuild a classifier from the best trial's hyperparameters, with CatBoost's logging
# switched off, and fit it on the over-sampled training data.
best_params = dict(trial.params)
final_model = CatBoostClassifier(**best_params, verbose=False)
final_model.fit(X_train_os, y_train_os)

In [None]:
# Predict the test set with the final model, then pass the true labels and the
# predictions to the project's custom metrics helper.
test_predictions = final_model.predict(X_test)
final_results = custom_metric(test_labels, test_predictions)