# Cloner la branche publique

In [None]:
# Start from a clean checkout: remove any previous copy, clone the public
# repository, then list what was fetched.
!rm -rf ActuarialThesis
!git clone https://github.com/aderdouri/ActuarialThesis.git
%ls -ltr ActuarialThesis

In [None]:
!mkdir ActuarialThesis/plots_clf
!ls -ltr ActuarialThesis/plots_clf

In [None]:
# Put the repository's src directory first on the module search path
import sys
sys.path[:0] = ['./ActuarialThesis/src/']

In [None]:
import installHelper

In [None]:
# Show every name exposed by the installHelper module.
print(dir(installHelper))

In [None]:
# List the working directory in long format, most recently modified last.
!ls -ltr

# Installer les packages nécessaires

In [None]:
# Run the repository's install helper.
# NOTE(review): presumably pip-installs the third-party packages this
# notebook needs — confirm in the installHelper module.
installHelper.installALL()

In [None]:
# On doit trouver tous les packages mentionés dans le grep
!pip list -v | grep -e catboost -e 'imbalanced-learn' -e 'optuna' -e 'catboost' -e 'lime' -e 'shap' -e kaleido

# Importer les packages nécessaires

In [None]:
from helper import *

In [None]:
# Apply the default seaborn theme.
# NOTE(review): `sns` is not imported in this notebook; presumably it comes
# from `from helper import *` — confirm.
sns.set_theme()

# Partir du dataset déjà encodé.

In [None]:
# Start from the already-encoded dataset shipped with the cloned repository.
df = pd.read_csv('ActuarialThesis/Data/encodedBASEAUTO.csv')
# Preview the first rows.
df.head()

In [None]:
# Separate the dataset into the feature matrix and the CHARGE target column.
X = df.drop(columns='CHARGE')
Y = df['CHARGE']

# Target variable mapping

In [None]:
# Treat the task as binary classification:
# class 0 when CHARGE <= 0, class 1 otherwise.
y = Y.map(lambda charge: int(not charge <= 0))

In [None]:
# Sanity check: show the distinct class labels produced by the mapping.
y.unique()

# Modeling

In [None]:
import numpy as np
import optuna
from optuna.integration import CatBoostPruningCallback

import catboost as cb
from sklearn.metrics import accuracy_score, f1_score, average_precision_score
from sklearn.model_selection import train_test_split

# Imported explicitly so this cell does not depend on what
# `from helper import *` happens to re-export.
from sklearn.utils.class_weight import compute_class_weight

# Fixed seed so the splits (and therefore the tuning results) are comparable
# from one run to the next.
SPLIT_SEED = 42

# 60 % train / 20 % validation / 20 % test.
# Both splits are stratified on the binary target so every subset keeps the
# class ratio of the full dataset.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=SPLIT_SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, stratify=y_holdout, random_state=SPLIT_SEED
)

# "balanced" weights computed from the training labels only; stored as a
# {class label: weight} mapping of plain Python numbers.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes.tolist(), weights.tolist()))


def objective(trial: optuna.Trial) -> float:
    """Train one CatBoost candidate and score it on the validation split.

    The trial samples the hyper-parameters, a ``CatBoostClassifier`` is fitted
    on ``X_train``/``y_train`` with ``X_val``/``y_val`` as evaluation set, and
    the validation average precision is returned to the study.

    Args:
        trial: Optuna trial used to sample hyper-parameters and, through the
            pruning callback, to report intermediate ``PRAUC`` values.

    Returns:
        Average precision of the predicted positive-class probabilities on
        the validation split.

    Raises:
        optuna.TrialPruned: Raised by ``check_pruned`` when the pruner has
            flagged this trial for early termination.
    """
    param = {
        "verbose": 0,
        # Fixed settings go through single-choice categoricals so that they
        # end up in ``study.best_params``, which is later unpacked straight
        # into the final classifier.
        "objective": trial.suggest_categorical("objective", ["Logloss"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "eval_metric": trial.suggest_categorical("eval_metric", ["PRAUC"]),
        # One choice only: listing the same mapping twice would register a
        # fake tunable parameter in the study.
        "class_weights": trial.suggest_categorical("class_weights", [class_weights]),
        "used_ram_limit": "3gb",
    }

    # Sampling parameters that are only suggested for the matching bootstrap type.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    gbm = cb.CatBoostClassifier(**param)

    pruning_callback = CatBoostPruningCallback(trial, "PRAUC")
    gbm.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        verbose=0,
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
    )

    # Invoke pruning manually once fitting has returned.
    pruning_callback.check_pruned()

    # Average precision summarises the whole precision-recall curve, so it is
    # fed the positive-class probabilities rather than thresholded 0/1 labels.
    positive_scores = gbm.predict_proba(X_val)[:, 1]
    return average_precision_score(y_val, positive_scores)

In [None]:
# Maximise the objective's return value; the median pruner is created with
# 5 warm-up steps.
study = optuna.create_study(
    direction="maximize",
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)
# Run 20 trials, with no wall-clock timeout.
study.optimize(objective, n_trials=20)

In [None]:
# Keep the winning hyper-parameters for the final model and print a summary
# of the study.
trial = study.best_trial
cb_hp = study.best_params

print(f"Number of finished trials: {len(study.trials)}")
print("Best trial:")
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Build the final classifier from the best trial's hyper-parameters and fit
# it silently on the training split.
optimized_cb_clf = cb.CatBoostClassifier(**cb_hp)
optimized_cb_clf.fit(X_train, y_train, verbose=0)

In [None]:
# Evaluate the tuned model on the test split: per-class report first, then
# the balanced accuracy to five decimals.
y_pred = optimized_cb_clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(f'Accuracy (balanced): {balanced_accuracy_score(y_test, y_pred):.5f}')

In [None]:
# Install kaleido, which the next cell imports.
# NOTE(review): presumably needed by `fig.write_image` for static export — confirm.
!pip install kaleido

In [None]:
# `kaleido` is imported but never referenced by name in this cell.
# NOTE(review): presumably the import only checks that the static-export
# backend used by `write_image` is available — confirm.
import kaleido
# Plot the hyper-parameter importances computed from the study.
fig = optuna.visualization.plot_param_importances(study)
# Save a PDF copy, then display the figure.
fig.write_image("./ActuarialThesis/plots_clf/myCatBoostParamImportances.pdf")
fig.show()

In [None]:
# Pair every training column with its importance from the fitted model and
# order the rows by ascending importance (stable, so ties keep column order).
feature_importances = pd.DataFrame(
    {
        'feature': X_train.columns,
        'importance': optimized_cb_clf.feature_importances_,
    }
).sort_values('importance', kind='stable', ignore_index=True)

# Draw the 10 largest importances on a wide canvas and export the chart as PDF.
fig = plt.gcf()
fig.set_size_inches(20, 6)
plt.title('Feature importances for CatBoostClassifier')
sns.barplot(x='importance', y='feature', data=feature_importances.tail(10))
plt.savefig('./ActuarialThesis/plots_clf/myCatBoostFeatureImportance.pdf')

In [None]:
# Draw the tuned model's PR-AUC figure with `plot_pr_auc`.
# NOTE(review): `plot_pr_auc` is not defined in this notebook; presumably it
# comes from `from helper import *` — check there for how the train/test
# arguments are used.
fig = plot_pr_auc(optimized_cb_clf, X_train, y_train, X_test, y_test, OurModelName= '_', title='')
# Widen the figure, save it as a tightly cropped PDF, then display it.
fig.set_size_inches(20, 6)
fig.savefig('./ActuarialThesis/plots_clf/myCatBoostGBMAUCPR.pdf', bbox_inches='tight')
plt.show();

In [None]:
# Build the classification-report / confusion-matrix figure for the test split.
# NOTE(review): `plot_classification_report_confusion_matrix` is not defined
# in this notebook; presumably it comes from `from helper import *` — confirm.
fig = plot_classification_report_confusion_matrix(optimized_cb_clf, X_test, y_test)
# Widen the figure and save it as a tightly cropped PDF.
fig.set_size_inches(20, 6)
fig.savefig('./ActuarialThesis/plots_clf/myCatBoostConfusionMatrix.pdf', bbox_inches='tight')