# Advanced CatBoost options tested on the Titanic dataset

## Import libraries

In [None]:
import os
import sys

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt

# ML
import sklearn

## Replace missing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

## Stratified
from sklearn.model_selection import StratifiedShuffleSplit

## Metric
from sklearn.metrics import roc_auc_score

## Train
from catboost import CatBoostClassifier, Pool

%load_ext autoreload
%autoreload 2

## Import data

In [None]:
# Load the Titanic training CSV into a DataFrame. The path is relative, so it
# is resolved against the directory the notebook is run from.
df_raw = pd.read_csv('../data/titanic/train.csv')

## Select columns

In [None]:
# Columns that are kept out of the feature set, and the target column.
id_cols = ['PassengerId', 'Name', 'Cabin', 'Ticket']
y_cols = ['Survived']

# Every other column of the raw frame is an input feature.
non_feature_cols = id_cols + y_cols
X_cols = [name for name in df_raw.columns if name not in non_feature_cols]

# Define what is a categorical feature: any feature pandas stored with the
# 'object' dtype. All remaining features are considered numerical.
type_list = df_raw[X_cols].dtypes
cat_cols = [name for name, dtype in type_list.items() if dtype == 'object']
num_cols = [name for name in X_cols if name not in cat_cols]

# Print both groups, each under a 50-dash rule and followed by an empty line.
for title, names in (('Categorical', cat_cols), ('Numerical', num_cols)):
    print(50 * '-')
    print(title)
    print(names)
    print('')

## Fill missing values

https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html


In [None]:
# Numerical features: replace missing values with the column median.
imputer_numeric = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])
# Categorical features: replace missing values with the most frequent value.
imputer_categoric = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Route each group of columns to its transformer. The output columns follow
# the order of this list: identifiers + target (untouched), then the imputed
# numerical features, then the imputed categorical features.
preprocessor = ColumnTransformer(transformers=[
    ('path_passthrough_id', 'passthrough', id_cols + y_cols),  # can use 'drop' to drop column
    ('imputer_numeric', imputer_numeric, num_cols),
    ('imputer_categoric', imputer_categoric, cat_cols),
])

# fit_transform returns a bare array, so the column names are re-attached in
# the same order as the transformers above.
df_train = pd.DataFrame(
    data=preprocessor.fit_transform(df_raw),
    columns=id_cols + y_cols + num_cols + cat_cols
)

# The text columns are stacked next to the numbers, so the array (and thus
# every column of the frame) comes back with the 'object' dtype. Give the
# numerical features a real float dtype again so downstream code does not
# have to parse Python objects value by value.
df_train = df_train.astype({col: float for col in num_cols})

## Define X and y

In [None]:
# Feature matrix (all input columns) and target frame (single 'Survived'
# column, kept as a DataFrame because y_cols is a list).
X, y = df_train[X_cols], df_train[y_cols]

## Train / validation split

In [None]:
# One stratified shuffle split, 80% train / 20% validation, with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

# n_splits=1, so the splitter yields exactly one (train, valid) pair.
train_index, valid_index = next(sss.split(X, y))

# The splitter returns row *positions*, not index labels, so select with
# .iloc; this stays correct even if X/y do not carry a default 0..n-1 index.
# Positions are sorted so each subset keeps the original row order.
train_index, valid_index = np.sort(train_index), np.sort(valid_index)
X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]

# The target is cast to int so it is a proper integer label column regardless
# of the dtype it has in df_train.
y_train, y_valid = y.iloc[train_index].astype(int), y.iloc[valid_index].astype(int)

## Catboost training

In [None]:
# Estimator budgets to compare; one classifier is trained per value.
nb_estimators = [2, 5, 10, 20, 50, 100, 200, 500, 1000]  #, 2000]
records = []

# Validation labels flattened to a 1-D array for the metric.
y_true = y_valid.values.ravel()

for i_esti, nb_esti in enumerate(nb_estimators):
    # Patience handed to the overfitting detector: grows with the log10 of
    # the estimator budget, but is never lower than 5.
    patience_from_budget = int(np.ceil(4 * np.log10(nb_esti)))
    early_stopping_step = np.max([5, patience_from_budget])

    # Define classifier
    catboost_params = dict(
        # Categorical features are passed by column name
        cat_features=cat_cols,
        # Overfitting detector parameters
        od_pval=None,
        od_wait=early_stopping_step,
        od_type='Iter',
        # Number of estimators for this run
        n_estimators=nb_esti,
        random_seed=42,
    )
    clf = CatBoostClassifier(**catboost_params)

    # Train classifier.
    # NOTE(review): the validation split is used both as eval_set here and
    # for the AUC below, so the reported AUC is not an independent estimate.
    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        plot=True,
        logging_level="Silent",
    )

    # Score the validation set with the positive-class probability (column 1)
    y_scores = clf.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_true, y_scores)

    # Feature importance as a one-row frame, one column per input feature
    df_feat_imp = pd.DataFrame(
        data=clf.get_feature_importance().reshape(1, -1),
        columns=X_train.columns
    )
    # Same importances ordered from most to least important: once as a wide
    # frame (for display / column ordering), once as a {feature: value} dict.
    feat_imp_desc = df_feat_imp.sort_values(by=0, ascending=False, axis=1)
    feat_imp_by_name = df_feat_imp.transpose().sort_values(by=0, ascending=False).to_dict()[0]

    # Save results: run settings first, then one key per feature importance
    model_record = {
        'nb_esti': nb_esti,
        'auc': auc,
        'clf': clf,
        'early_stopping_step': early_stopping_step,
    }
    model_record.update(feat_imp_by_name)
    records.append(model_record)

    # Log infos
    print(25*"-")
    print(f"nb_esti: {nb_esti}")
    print(f"early_stopping_step: {early_stopping_step}")
    print(f"AUC {auc}")
    print(feat_imp_desc)
    print(25*"=")
    print("")

    # The feature ordering of the first run is remembered for later display
    if i_esti == 0:
        sorted_input_cols = list(feat_imp_desc.columns)

## Order results

In [None]:
# Column layout of the results table: run settings first, then the features
# in the order stored in sorted_input_cols, and the classifier object last.
run_info_cols = ["nb_esti", "auc", "early_stopping_step"]
ordered_cols = run_info_cols + sorted_input_cols + ["clf"]

# One row per trained model, columns arranged as defined above.
df_records = pd.DataFrame(records)[ordered_cols]

# Show every column except the last one (the classifier object), indexed by
# the estimator budget.
visible_cols = df_records.columns[:-1]
display(df_records[visible_cols].set_index("nb_esti"))

## Compare results

In [None]:
# AUC as a function of the estimator budget; the x axis is logarithmic
# because the budgets span several orders of magnitude.
fig, ax = plt.subplots()
ax.plot(df_records['nb_esti'], df_records['auc'])
ax.set_xscale('log')
ax.set_xlabel('nb_esti')
ax.set_ylabel('AUC')
ax.set_title('Hyperoptimization')
plt.show()

## Select best model

In [None]:
# Highest validation AUC reached by any of the trained models.
max_auc = df_records['auc'].max()

# Take the first row that reaches that AUC as a plain dict. The orient is
# spelled out in full ('records'): the abbreviated 'r' form is rejected by
# recent pandas versions.
model_max = df_records[df_records['auc'] == max_auc].to_dict('records')[0]

# Keep a direct handle on the winning classifier, then show the full record.
clf_max = model_max['clf']
print(model_max)

---

# End of script