<a href="https://colab.research.google.com/github/aderdouri/ActuarialThesis/blob/master/myLightGBMClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification LightGBM

## Cloner la branche contenant le dataset et le code qui va avec.

In [None]:
# Start from a clean checkout: delete any previous copy of the repository,
# clone it again, then list its contents.
!rm -rf ActuarialThesis
!git clone https://github.com/aderdouri/ActuarialThesis.git
%ls -ltr ActuarialThesis

In [None]:
!mkdir ActuarialThesis/plots_clf
!ls -ltr ActuarialThesis/plots_clf

In [None]:
# Put the repository's src directory first on the module search path.
import sys

SRC_DIR = './ActuarialThesis/src/'
sys.path.insert(0, SRC_DIR)

In [None]:
import installHelper

In [None]:
# Print the names exposed by the installHelper module (dir() already returns a list).
print(dir(installHelper))

In [None]:
# List the current working directory.
!ls -ltr

# Installer les packages nécessaires

In [None]:
# Call installHelper.installALL() and report how long the call takes (%time).
%time installHelper.installALL()

In [None]:
# On doit trouver tous les packages mentionés dans le grep
!pip list -v | grep -e catboost -e 'imbalanced-learn' -e 'optuna' -e 'catboost' -e 'lime' -e 'shap' -e 'seaborn' -e 'scikit-plot'

# Importer les packages nécessaires

In [None]:
from helper import *

In [None]:
# Apply the default theme
sns.set_theme()

# Partir du dataset déjà encodé.

In [None]:
# Load the already-encoded dataset and preview its first rows.
ENCODED_DATA_CSV = 'ActuarialThesis/Data/encodedBASEAUTO.csv'
df = pd.read_csv(ENCODED_DATA_CSV)
df.head()

In [None]:
# Separate the features (every column except CHARGE) from the CHARGE target.
X = df.drop(columns=['CHARGE'])
Y = df['CHARGE']

# Target variable mapping

In [None]:
# Treat the task as binary classification:
# class 0 when CHARGE <= 0, class 1 otherwise.
# `~(Y <= 0)` (rather than `Y > 0`) keeps the original mapping for values that
# fail the `<= 0` comparison, e.g. NaN, which still land in class 1.
y = (~Y.le(0)).astype('int64')

In [None]:
# Show the distinct values present in the mapped target.
y.unique()

# Modeling

## Handling class imbalance

In [None]:
from sklearn.utils.class_weight import compute_class_weight

import optuna  # bound explicitly: `import optuna.integration.lightgbm as lgb` only binds `lgb`
import optuna.integration.lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import log_evaluation
import sklearn.datasets
# KFold stays imported in case other cells rely on the name.
from sklearn.model_selection import KFold, StratifiedKFold

# One seed shared by the data splits, the CV folds, LightGBM and Optuna so that
# "Restart & Run All" reproduces the same study.
SEED = 42
# Must be well below the tuner's default of 1000 boosting rounds; a patience of
# 1000 rounds can never trigger before training ends.
EARLY_STOPPING_ROUNDS = 100
LOG_PERIOD = 100

# 60 % train / 20 % validation / 20 % test.
# Both splits are stratified so each subset keeps the class ratio of `y`.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, stratify=y_holdout, random_state=SEED
)

# 'balanced' class weights computed on the training labels.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

# The native LightGBM API has no `class_weight` parameter, so the balancing is
# applied through per-row weights on the Dataset. This matches what
# LGBMClassifier(class_weight='balanced') does when the final model is fitted.
dtrain = lgb.Dataset(
    X_train,
    label=y_train,
    weight=y_train.map(class_weights).to_numpy(),
)

params = {
    "objective": "binary",
    #"metric": "binary_logloss",
    "metric": "auc",
    "verbosity": -1,
    "boosting_type": "gbdt",
    # Ignored by the native training API used during tuning; kept so that
    # tuner.best_params still carries it to the scikit-learn LGBMClassifier.
    "class_weight": "balanced",
    "seed": SEED,
}

# AUC is a score, hence direction='maximize'.
my_study = optuna.create_study(direction='maximize')

tuner = lgb.LightGBMTunerCV(
    params,
    dtrain,
    # Shuffled, stratified folds: every fold sees both classes in the same
    # proportion, independent of the row order of the dataset.
    folds=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED),
    callbacks=[early_stopping(EARLY_STOPPING_ROUNDS),
               log_evaluation(LOG_PERIOD)],
    study=my_study,
    optuna_seed=SEED,
)

tuner.run()

In [None]:
# Report the best cross-validated score and the parameters that produced it.
best_params = tuner.best_params
print("Best score:", tuner.best_score)
print("Best params:", best_params)
print("  Params: ")
for name in best_params:
    print(f"    {name}: {best_params[name]}")

In [None]:
# Build a classifier from the tuned parameters and fit it on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
optimized_lgbm_clf = lgbm.LGBMClassifier(**best_params).fit(X_train, y_train)
optimized_lgbm_clf

In [None]:
# Evaluate the tuned classifier on the validation split.
y_pred = optimized_lgbm_clf.predict(X_val)
val_balanced_accuracy = balanced_accuracy_score(y_val, y_pred)
print(classification_report(y_val, y_pred))
print(f'Accuracy (balanced): {val_balanced_accuracy:.5f}')

In [None]:
# Display the study object held by the tuner.
tuner.study

In [None]:
# NOTE(review): hyper-parameter importance plot, currently disabled; the reason
# is not recorded here. Presumably fig.write_image needs an extra image-export
# backend. Confirm before re-enabling, or delete this cell.
#fig = optuna.visualization.plot_param_importances(my_study)
#fig.write_image("./ActuarialThesis/plots_clf/myLightGBMClassifierParamImportances.pdf")
#fig.show()

In [None]:
from lightgbm import plot_importance

# Plot the 10 most important features of the tuned classifier and save the figure.
ax = plot_importance(optimized_lgbm_clf, max_num_features=10)
ax.figure.set_size_inches(20, 6)
# bbox_inches='tight' matches the other figure exports in this notebook and
# keeps long feature-name tick labels from being clipped in the PDF.
ax.figure.savefig(
    './ActuarialThesis/plots_clf/myLightGBMClassifierFeatureImportances.pdf',
    bbox_inches='tight',
)

In [None]:
# Precision-recall curves on the train and test splits, side by side.
# The figure is created directly at its final 20x6 size (it used to be created
# at 18x6 and resized straight away).
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(20, 6))

# Same plot for both splits; only the data, the target axis and the title differ.
for panel, features, labels, split_name in (
    (ax1, X_train, y_train, 'Train'),
    (ax2, X_test, y_test, 'Test'),
):
    y_probas = optimized_lgbm_clf.predict_proba(features)
    skplt.metrics.plot_precision_recall(
        labels, y_probas, ax=panel,
        title=f'Precision-Recall Curve on {split_name} set',
    )

fig.savefig('./ActuarialThesis/plots_clf/myLightGBMClassifierPrecisionRecallCurve.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Classification report and confusion matrix on the test split, built by the
# project helper, then resized and saved as a PDF.
CONFUSION_MATRIX_PDF = './ActuarialThesis/plots_clf/myLightGBMClassifierConfusionMatrix.pdf'

fig = plot_classification_report_confusion_matrix(
    optimized_lgbm_clf,
    X_test,
    y_test,
)
fig.set_size_inches(20, 6)
fig.savefig(CONFUSION_MATRIX_PDF, bbox_inches='tight')

In [None]:
# Cumulative gain (left) and lift curve (right) on the test split.
# The figure is created directly at its final 20x6 size (it used to be created
# at 18x6 and resized straight away).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 6))

# Both plots use the same predicted class probabilities.
y_probas = optimized_lgbm_clf.predict_proba(X_test)
skplt.metrics.plot_cumulative_gain(y_test, y_probas, ax=ax1)
skplt.metrics.plot_lift_curve(y_test, y_probas, ax=ax2)

fig.savefig('./ActuarialThesis/plots_clf/myLightGBMClassifier_cumulative_gain_lift_curve.pdf', bbox_inches='tight')
plt.show()