# Cloner le dépôt

In [None]:
# Start from a clean checkout: delete any previous clone, fetch the
# repository again, then list its contents to confirm the clone succeeded.
!rm -rf ActuarialThesis
!git clone https://github.com/aderdouri/ActuarialThesis.git
%ls -ltr ActuarialThesis

In [None]:
!mkdir ActuarialThesis/plots_clf
!ls -ltr ActuarialThesis/plots_clf

In [None]:
# Make the cloned repository's src directory importable, ahead of
# everything else on the module search path.
import sys

sys.path.insert(0, "./ActuarialThesis/src/")

In [None]:
import installHelper

In [None]:
# Show every name exposed by the installHelper module
# (dir() already returns a list).
print(dir(installHelper))

In [None]:
# List the working directory in long format, oldest entries first.
!ls -ltr

# Installer les packages nécessaires

In [None]:
# Run the project's install helper.
# NOTE(review): presumably installs the notebook's third-party
# dependencies — confirm in installHelper.
installHelper.installALL()

In [None]:
# On doit trouver tous les packages mentionés dans le grep
!pip list -v | grep -e catboost -e 'imbalanced-learn' -e 'optuna' -e 'catboost' -e 'lime' -e 'shap'

# Importer les packages nécessaires

In [None]:
from helper import *

In [None]:
# Apply the default plotting theme.
# NOTE(review): `sns` is not imported in this notebook; presumably it
# comes from `from helper import *` — confirm.
sns.set_theme()

# Partir du dataset déjà encodé.

In [None]:
# Start from the already-encoded dataset shipped in the cloned repository.
df = pd.read_csv('ActuarialThesis/Data/encodedBASEAUTO.csv')
# Preview the first rows.
df.head()

In [None]:
# Split the frame into the feature matrix (everything but CHARGE)
# and the raw target column.
X = df.drop(columns='CHARGE')
Y = df['CHARGE']

# Target variable mapping

In [None]:
# Treat the task as binary classification:
# class 0 when CHARGE <= 0, class 1 otherwise.
y = Y.le(0).map({True: 0, False: 1})

In [None]:
# Sanity check: display the distinct values present in y.
y.unique()

# Modeling

## 3.1 Handling the imbalances

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# 60/20/20 split: hold out 40% of the rows, then cut that hold-out in half
# to obtain the validation and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# 'balanced' weights are n_samples / (n_classes * n_samples_in_class).
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight = dict(zip(classes, weights))

# XGBoost expects scale_pos_weight ~ n_negative / n_positive. With balanced
# weights that ratio is exactly weight(class 1) / weight(class 0).
# The previous formula divided by sum(keys) == 0 + 1, i.e. it returned
# weight(0) + weight(1), which is not the negative/positive ratio.
scale_pos_weight = class_weight[1] / class_weight[0]
class_weight, scale_pos_weight

In [None]:
# Display the per-class weights and the scale_pos_weight value.
class_weight, scale_pos_weight

In [None]:
import numpy as np
import optuna
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import average_precision_score

def objective(trial):
    """Optuna objective: fit one XGBoost booster and score it on the validation split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val``, ``y_val``
    and ``scale_pos_weight``.

    Args:
        trial: Optuna trial used to sample hyper-parameters and to report
            intermediate validation AUC to the pruning callback.

    Returns:
        Average precision of the predicted scores on the validation set
        (higher is better).
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_val, label=y_val)

    # Single-choice / zero-width suggestions record fixed settings in
    # study.best_params, so each one is registered under the real XGBoost
    # parameter name. (The eval metric used to be recorded as "auc", and a
    # "class_weight" dict was suggested although XGBoost has no such
    # parameter; class imbalance is handled by scale_pos_weight.)
    param = {
        "objective": trial.suggest_categorical("objective", ["binary:logistic"]),
        "eval_metric": trial.suggest_categorical("eval_metric", ["auc"]),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", scale_pos_weight, scale_pos_weight),
        "tree_method": trial.suggest_categorical("tree_method", ["exact"]),
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
    }

    # Tree-specific parameters are only sampled for the tree boosters.
    if param["booster"] in ("gbtree", "dart"):
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        # NOTE(review): XGBoost documents grow_policy for tree_method
        # hist/approx; with "exact" it may have no effect — confirm.
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # The callback watches the "auc" metric of the eval set named
    # "validation" so that unpromising trials can be pruned.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")
    # No num_boost_round is passed, so xgb.train's default applies.
    bst = xgb.train(
        param,
        dtrain,
        evals=[(dvalid, "validation")],
        callbacks=[pruning_callback],
    )

    # Average precision ranks samples by score: feed it the raw predicted
    # probabilities. Rounding them to 0/1 first collapses the
    # precision-recall curve to a single operating point.
    preds = bst.predict(dvalid)
    return average_precision_score(y_val, preds)

In [None]:
# Maximise the objective; the median pruner leaves the first 5 reported
# steps of each trial alone before it is allowed to prune.
study = optuna.create_study(
    direction="maximize",
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)
study.optimize(objective, n_trials=1000)

In [None]:
# Keep the winning hyper-parameters for the final model fit.
xgb_hp = study.best_params
trial = study.best_trial

# Summarise the search: trial count, best score, best parameter values.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# study.best_params mirrors the names used in trial.suggest_*, which may
# include bookkeeping entries that are not XGBoost parameters:
#   - "class_weights": a dict XGBoost has no parameter for,
#   - "auc": the evaluation metric recorded under the wrong name.
# Drop the former and map the latter onto eval_metric before building the
# estimator, instead of forwarding them as unknown keyword arguments.
clf_params = dict(xgb_hp)
clf_params.pop("class_weights", None)
if "auc" in clf_params:
    clf_params.setdefault("eval_metric", clf_params.pop("auc"))

optimized_xgb_clf = xgb.XGBClassifier(**clf_params)
optimized_xgb_clf.fit(X_train, y_train)

In [None]:
# Print the XGBoost parameters held by the fitted classifier.
print(optimized_xgb_clf.get_xgb_params())

In [None]:
# Score the tuned classifier on the validation split: per-class report
# first, then the balanced accuracy to five decimals.
y_pred = optimized_xgb_clf.predict(X_val)
print(classification_report(y_val, y_pred))
print(f'Accuracy (balanced): {balanced_accuracy_score(y_val, y_pred):.5f}')

In [None]:
import kaleido

In [None]:
# Hyper-parameter importance chart for the finished study: save it as a
# PDF in the plots folder, then display it inline.
fig = optuna.visualization.plot_param_importances(study)
fig.write_image("./ActuarialThesis/plots_clf/myXGBoostParamImportances.pdf")
fig.show()

In [None]:
# Plot the feature importances of the fitted classifier, limited to
# 10 features, and save the chart as a PDF.
from xgboost import plot_importance
ax = plot_importance(optimized_xgb_clf, max_num_features=10)
# Resize the figure that owns the returned axes before saving it.
ax.figure.set_size_inches(20, 6)
ax.figure.savefig('./ActuarialThesis/plots_clf/myXGBoostFeatureImportance.pdf')

In [None]:
# Precision-recall AUC figure for the tuned classifier, built from the
# train and test splits and saved as a PDF.
# NOTE(review): plot_pr_auc is not defined in this notebook; presumably it
# comes from `from helper import *` — confirm its contract there.
fig = plot_pr_auc(optimized_xgb_clf, X_train, y_train, X_test, y_test, OurModelName= '_', title='')
fig.set_size_inches(20, 6)
fig.savefig('./ActuarialThesis/plots_clf/myXGBoostAUCPR.pdf', bbox_inches='tight')
# The trailing semicolon suppresses the notebook's echo of the return value.
plt.show();

In [None]:
# Classification report / confusion matrix figure on the test split,
# saved as a PDF.
# NOTE(review): plot_classification_report_confusion_matrix is not defined
# in this notebook; presumably it comes from `from helper import *` — confirm.
fig = plot_classification_report_confusion_matrix(optimized_xgb_clf, X_test, y_test)
fig.set_size_inches(20, 6)
fig.savefig('./ActuarialThesis/plots_clf/myXGBoostConfusionMatrix.pdf', bbox_inches='tight')