# XGBoost

In [None]:
import json
import pickle
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import SMOTENC, RandomOverSampler
from imblearn.pipeline import Pipeline as Pipeline_imb
from IPython.display import display
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import (GridSearchCV, ParameterGrid,
                                     RandomizedSearchCV)
from tqdm.notebook import trange, tqdm
from xgboost import XGBClassifier

from tools import (Modelisation, datasets, SearchCV, restauration_CV, 
                   graph_2scores_CV, graph_3scores_CV, graph_param_CV, 
                   best_score_CV, graph_2scores_CV_comp)

## Matplotlib:
# For interactive display (notably 3D) inside the notebook
# %matplotlib widget

# For interactive display (notably 3D) in a Qt window outside the notebook
# %matplotlib qt

# For simple display inside the notebook
# %config InlineBackend.figure_format = 'png'

# Start timestamp; the last cell subtracts it from the current time to report
# the notebook's total execution time.
t = time.time()

In [None]:
# Load the preprocessed training set and shuffle its rows (frac=1 keeps every row).
# The fixed random_state makes the row order identical on every
# "Restart Kernel -> Run All"; without it each run shuffles differently.
df = pd.read_csv('data/df_train_prepro.csv').sample(frac=1, random_state=1234)
df.shape

**Définition des variables quantitatives, des variables catégorielles et des datasets :**

In [None]:
# `datasets` (project helper from tools) returns a mapping; pull out the
# quantitative features, the categorical features, the full feature matrix
# and the target in one go.
datasets_df = datasets(df)
X_quant, X_cat, X, y = (datasets_df[key] for key in ('X_quant', 'X_cat', 'X', 'y'))

In [None]:
# Class imbalance: number of samples labelled 0 divided by number labelled 1.
n_neg = len(y[y == 0])
n_pos = len(y[y == 1])
n_neg / n_pos

## 1. Modèle brut
### 1.1. Variables quantitatives

In [None]:
# Baseline: XGBClassifier with default hyper-parameters (n_jobs=-1) on the
# quantitative features only, wrapped in the project's Modelisation helper,
# then display its confusion matrix and metric scores.
m = Modelisation(X_quant, y, XGBClassifier(n_jobs=-1))
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
# ROC curve for `m`, the model built on the quantitative features.
m.show_ROC()

In [None]:
# Feature-importance plot for `m` (default axes).
m.plot_importance()

Affichage d'un arbre (par défaut le premier parmi les autres) :

In [None]:
# Draw one tree of the model; per the notebook text, the first tree is shown by default.
m.show_graph()

### 1.2. Variables catégorielles

In [None]:
# Same baseline as for the quantitative features, this time on the categorical
# features only; `m` is rebound to the new Modelisation object.
m = Modelisation(X_cat, y, XGBClassifier(n_jobs=-1))
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
# ROC curve for `m`, the model built on the categorical features.
m.show_ROC()

In [None]:
# Feature-importance plot drawn on an explicit 10x8 inch axes.
fig, ax = plt.subplots(figsize=(10, 8))
m.plot_importance(ax=ax)

In [None]:
# Draw one tree of the categorical-features model (first tree by default, per the notebook text).
m.show_graph()

### 1.3. Variables quantitatives + catégorielles

In [None]:
# Baseline on the full feature matrix X (quantitative + categorical), default
# XGBClassifier hyper-parameters; shows the confusion matrix and metric scores.
m = Modelisation(X, y, XGBClassifier(n_jobs=-1))
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
# ROC curve for `m`, the model built on all features.
m.show_ROC()

In [None]:
# Feature-importance plot drawn on an explicit 8x6 inch axes.
fig, ax = plt.subplots(figsize=(8, 6))
m.plot_importance(ax=ax)

In [None]:
# Draw one tree of the all-features model (first tree by default, per the notebook text).
m.show_graph()

## 2. Tuning des hyperparamètres avec GridSearchCV

La recherche d'hyperparamètres (recherche aléatoire, `random=True`) est calculée sur une fraction réduite du dataset (`data_frac=0.02`) afin de garder un temps de calcul raisonnable.

### 2.1. Calcul et sauvegarde

https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [None]:
# Estimator handed to the search. It runs single-threaded (n_jobs=1); the
# SearchCV call in the next cell sets its own n_jobs.
model = XGBClassifier(booster='gbtree', objective='binary:logistic', n_jobs=1)

# Candidate values for each hyper-parameter.
cv_params = dict(
    n_estimators=[50, 100, 150, 200, 300],
    max_depth=[2, 4, 6, 8, 10],
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
    gamma=[0.5, 1, 1.5, 2, 5],
    min_child_weight=[1, 3, 5, 10],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    # A typical value to consider: sum(negative instances) / sum(positive instances)
    scale_pos_weight=[1, 16.5],
)

# Total number of combinations in the full grid.
print(len(ParameterGrid(cv_params)))

In [None]:
# Randomised search (random=True) over cv_params: 10000 sampled combinations,
# random_state=1234, 28 parallel jobs. data_frac=0.02 is presumably the fraction
# of the data used — confirm in tools.SearchCV. Per the section title the results
# are saved so they can be restored in section 2.2.
SearchCV(model, cv_params, data_frac=0.02, random=True, n_iter=10000, random_state=1234, n_jobs=28)

### 2.2. Restauration des résultats

In [None]:
# Restore the saved search. The name appears to encode n_iter (10000), the grid
# size (54000) and data_frac (0.02) — confirm against tools.SearchCV.
dico, results = restauration_CV('XGBoost_CV_Randomized10000_54000_0.02')

In [None]:
# First 10 rows of the restored results.
results.head(10)

**Graphiques XY avec 2 scores**

In [None]:
# Two-score plot of the search results: 'recall' and 'f1' (s is presumably the marker size).
graph_2scores_CV(dico, results, 'recall', 'f1', s=1)

In [None]:
# Same 'recall' / 'f1' plot with s=20 and zoom=0.1 (zoom semantics are defined in tools.graph_2scores_CV).
graph_2scores_CV(dico, results, 'recall', 'f1', s=20, zoom=0.1)

In [None]:
# Two-score plot: 'recall' and 'f3'.
graph_2scores_CV(dico, results, 'recall', 'f3', s=1)

In [None]:
# Two-score plot: 'recall' and 'f5'.
graph_2scores_CV(dico, results, 'recall', 'f5', s=1)

In [None]:
# Two-score plot: 'recall' and 'precision'.
graph_2scores_CV(dico, results, 'recall', 'precision', s=1)

**Graphique 3D avec 3 scores**

In [None]:
# 3D plot of three scores: 'recall', 'precision' and 'f1'.
graph_3scores_CV(dico, results, 'recall', 'precision', 'f1', s=0.1)

In [None]:
# 3D plot of three scores: 'f1', 'f3' and 'f5'.
graph_3scores_CV(dico, results, 'f1', 'f3', 'f5', s=0.1)

**Graphiques de l'effet des paramètres**

In [None]:
# Plots of the effect of the hyper-parameters, arranged on 2 columns
# (height / width are presumably the size of each panel — confirm in tools).
graph_param_CV(dico, results, ncols=2, height=3.5, width=6)

**Paramètres donnant le meilleur score**

In [None]:
# Parameters giving the best 'f3' score among the restored results.
best_params = best_score_CV(dico, results, 'f3')

In [None]:
# Refit with n_jobs=-1 instead of the n_jobs=1 used during the search.
best_params['n_jobs'] = -1
# Registry of the best parameters per model, written to JSON at the end of section 4.2.
PARAMS = {}
# NOTE: this stores a reference to best_params, not a copy — any later
# in-place change to best_params also changes this entry.
PARAMS[dico['model_name']] = best_params

**Analyse sur la base complète**

In [None]:
# Evaluate the selected parameters on the full dataset (X, y) and show the
# confusion matrix and metric scores.
m = Modelisation(X, y, XGBClassifier(**best_params))
m.show_conf_matrix()
m.show_metrics_score()

Essayons avec le `learning_rate` par défaut :

In [None]:
# Same evaluation with XGBoost's default learning_rate.
# Build a separate dict instead of deleting the key from best_params:
# best_params is the very object stored in PARAMS, so an in-place `del` would
# also drop 'learning_rate' from the parameters later exported to JSON.
best_params_default_lr = {name: value for name, value in best_params.items()
                          if name != 'learning_rate'}
m = Modelisation(X, y, XGBClassifier(**best_params_default_lr))
m.show_conf_matrix()
m.show_metrics_score()

Nous reproduisons le graphique f1 vs recall en tirant au hasard 25 combinaisons parmi les 10 % de meilleures combinaisons (selon le recall), et en ajustant le modèle sur 10 % des données.

In [None]:
%%time

df_small = pd.read_csv('data/df_train_prepro.csv').sample(frac=0.1)
datasets_df_small = datasets(df_small, verbose=False)
X_small = datasets_df_small['X']
y_small = datasets_df_small['y']

f1_list = []
recall_list = []
params_list = []


results_sort = results.sort_values(by=f'mean_test_recall', ascending=False)


nb_tot = int(0.10 * len(results))
nb = 25

random.seed(1)
sample = random.sample(list(range(nb_tot)), nb)
for j in trange(nb):
    i = sample[j]
    params = results_sort.iloc[i].params
    m = Modelisation(X_small, y_small, XGBClassifier(**params))
    params_list.append(params)
    f1_list.append(m.metrics_score['f1'])
    recall_list.append(m.metrics_score['recall'])
    
plt.figure(figsize=(14, 8))
plt.scatter(recall_list, f1_list, marker='o')
plt.xlabel('recall')
plt.ylabel('f1')
plt.show()
 
# dico_ = {'params': params_list, 'f1': f1_list, 'recall': recall_list}     
# r = pd.DataFrame(dico_).sort_values(by='recall', ascending=False)

Même graphique avec le `learning_rate` par défaut :

In [None]:
%%time

sample = random.sample(list(range(nb_tot)), nb)
for j in trange(nb):
    i = sample[j]
    params = results_sort.iloc[i].params
    if 'learning_rate' in params:
        del params['learning_rate']
    m = Modelisation(X_small, y_small, XGBClassifier(**params))
    params_list.append(params)
    f1_list.append(m.metrics_score['f1'])
    recall_list.append(m.metrics_score['recall'])
    
plt.figure(figsize=(14, 8))
plt.scatter(recall_list, f1_list, marker='o')
plt.xlabel('recall')
plt.ylabel('f1')
plt.show()

**Test de paramètres**

In [None]:
# Hand-picked parameter set to try on the full dataset; every parameter not
# listed here keeps the XGBClassifier default.
params = dict(
    colsample_bytree=1.0,
    gamma=1,
    min_child_weight=15,
    scale_pos_weight=16.5,
    subsample=0.8,
    n_jobs=-1,
)

In [None]:
# Evaluate the hand-picked `params` on the full dataset.
m = Modelisation(X, y, XGBClassifier(**params))
m.show_conf_matrix()
m.show_metrics_score()

## 3. XGBoost avec oversampling

### 3.1. GridSearchCV avec RandomOverSampler

In [None]:
# Random over-sampling step followed by the XGBoost classifier, chained in an
# imbalanced-learn pipeline. Step names ('over', 'model') are the prefixes used
# in the parameter grid below.
over = RandomOverSampler(random_state=1234)

pipeline = Pipeline_imb([
            ('over', over),
            ('model', XGBClassifier(booster='gbtree', objective='binary:logistic', n_jobs=1))
            ])

# Same XGBoost grid as in section 2.1, plus the over-sampling ratio.
cv_params = {
        "over__sampling_strategy": [0.1, 0.2, 0.4, 0.6, 0.8, 1],
        "model__n_estimators": [50, 100, 150, 200, 300],
        "model__max_depth": [2, 4, 6, 8, 10],
        "model__learning_rate": [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
        "model__gamma": [0.5, 1, 1.5, 2, 5],
        "model__min_child_weight": [1, 3, 5, 10],
        "model__subsample": [0.6, 0.8, 1.0],
        "model__colsample_bytree": [0.6, 0.8, 1.0],
        "model__scale_pos_weight": [1, 16.5]   # A typical value to consider: sum(negative instances) / sum(positive instances)
        }

# Scorers keyed by the names used in the result plots. f3 / f5 are F-beta
# scores built with sklearn.metrics.make_scorer / fbeta_score, which must be
# imported for this cell to run.
# NOTE(review): `scoring` is not passed to the SearchCV call of the next cell —
# confirm whether SearchCV defines its own scorers.
scoring = {'recall': 'recall',
           'precision': 'precision',
           'f1': 'f1',
           'f3': make_scorer(fbeta_score, beta=3),
           'f5': make_scorer(fbeta_score, beta=5)
          }

# Total number of combinations in the full grid.
print(len(ParameterGrid(cv_params)))

In [None]:
# Randomised search (random=True) on the over-sampling pipeline: 20000 sampled
# combinations, random_state=1234, 28 parallel jobs, data_frac=0.02.
SearchCV(pipeline, cv_params, data_frac=0.02, random=True, n_iter=20000, random_state=1234, n_jobs=28)

In [None]:
# Restore the saved over-sampling search; `dico` and `results` are rebound and
# now refer to this search.
dico, results = restauration_CV('RandomOver_XGBoost_CV_Randomized20000_324000_0.02')

In [None]:
# Two-score plot of the over-sampling search: 'recall' and 'f1'.
graph_2scores_CV(dico, results, 'recall', 'f1', s=0.2)

In [None]:
# Two-score plot of the over-sampling search: 'recall' and 'f3'.
graph_2scores_CV(dico, results, 'recall', 'f3', s=0.2)

In [None]:
# Two-score plot of the over-sampling search: 'recall' and 'f5'.
graph_2scores_CV(dico, results, 'recall', 'f5', s=0.2)

In [None]:
# Two-score plot of the over-sampling search: 'recall' and 'precision'.
graph_2scores_CV(dico, results, 'recall', 'precision', s=0.2)

In [None]:
# 3D plot of the over-sampling search: 'recall', 'precision' and 'f1'.
graph_3scores_CV(dico, results, 'recall', 'precision', 'f1', s=0.05)

In [None]:
# 3D plot of the over-sampling search: 'f1', 'f3' and 'f5'.
graph_3scores_CV(dico, results, 'f1', 'f3', 'f5', s=0.05)

In [None]:
# Plots of the effect of the hyper-parameters for the over-sampling search, on 2 columns.
graph_param_CV(dico, results, ncols=2, height=3.5, width=6)

In [None]:
# Parameters giving the best 'f3' score for the over-sampling search; rebinds best_params.
best_params = best_score_CV(dico, results, 'f3')

In [None]:
# Pipeline parameter names are prefixed with the step name, hence 'model__n_jobs'.
best_params['model__n_jobs'] = -1
# Add this model's parameters to the PARAMS registry created in section 2.2.
PARAMS[dico['model_name']] = best_params

In [None]:
# Configure the over-sampling pipeline with the selected parameters (set_params
# modifies `pipeline` in place) and evaluate it on the full dataset.
m = Modelisation(X, y, pipeline.set_params(**best_params))
m.show_conf_matrix()
m.show_metrics_score()

### 3.2. GridSearchCV avec SMOTE

### 3.3. Comparaison

In [None]:
# Reload both searches for comparison. Each dr* holds whatever restauration_CV
# returns (unpacked as `dico, results` elsewhere in this notebook).
# dr1: XGBoost alone; dr2: XGBoost with random over-sampling.
dr1 = restauration_CV('XGBoost_CV_Randomized10000_54000_0.02', verbose=False)
dr2 = restauration_CV('RandomOver_XGBoost_CV_Randomized20000_324000_0.02', verbose=False)

In [None]:
# Compare the two searches on 'recall' and 'f3'; `s` gives one value per search, in list order.
graph_2scores_CV_comp([dr2, dr1], 'recall', 'f3', s=[0.5, 1], alpha=0.3)

In [None]:
# Compare the two searches on 'recall' and 'precision'.
graph_2scores_CV_comp([dr2, dr1], 'recall', 'precision', s=[1, 2], alpha=0.3)

## 4. XGBoost avec RFECV
### 4.1. Sans oversampling

In [None]:
# Read the quantitative and categorical column lists stored in the RFECV backup.
# NOTE(review): the file is expected to exist already; nothing in this notebook writes it.
with open("backups/RFECV_XGBoost.json", 'r') as f:
    export = json.load(f)
    columns_quant_RFECV = export['columns_quant']
    columns_cat_RFECV = export['columns_cat']

In [None]:
# Same estimator and same parameter grid as in section 2.1 (identical values);
# only the columns passed to SearchCV in the next cell differ.
model = XGBClassifier(booster='gbtree', objective='binary:logistic', n_jobs=1)

cv_params = {
        'n_estimators': [50, 100, 150, 200, 300],    
        'max_depth': [2, 4, 6, 8, 10],
        'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'min_child_weight': [1, 3, 5, 10],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],    
        'scale_pos_weight': [1, 16.5]   # A typical value to consider: sum(negative instances) / sum(positive instances)
        }

# Total number of combinations in the full grid.
print(len(ParameterGrid(cv_params)))

In [None]:
# Randomised search restricted to the RFECV column lists; name='RFECV' is
# presumably used to tag the saved results — confirm in tools.SearchCV.
SearchCV(model, cv_params, columns_quant=columns_quant_RFECV, columns_cat=columns_cat_RFECV, data_frac=0.02, random=True, n_iter=10000, random_state=1234, n_jobs=28, name='RFECV')

In [None]:
# Restore the saved RFECV search (no over-sampling); rebinds `dico` and `results`.
dico, results = restauration_CV('XGBoost_RFECV_CV_Randomized10000_54000_0.02')

In [None]:
# Parameters giving the best 'f3' score for the RFECV search, called with display_table=False.
best_params = best_score_CV(dico, results, 'f3', display_table=False)

In [None]:
# Refit with n_jobs=-1 and add this model's parameters to the PARAMS registry.
best_params['n_jobs'] = -1
PARAMS[dico['model_name']] = best_params

In [None]:
# Rebuild the datasets from the full frame `df`, passing the RFECV column lists;
# X_RFECV is the resulting feature matrix.
datasets_df_RFECV = datasets(df, columns_quant=columns_quant_RFECV, columns_cat=columns_cat_RFECV)
X_RFECV = datasets_df_RFECV['X']

In [None]:
# Evaluate the selected parameters on the RFECV feature matrix (all rows of df).
m = Modelisation(X_RFECV, y, XGBClassifier(**best_params))
m.show_conf_matrix()
m.show_metrics_score()

### 4.2. Avec oversampling

In [None]:
# Same over-sampling pipeline and same parameter grid as in section 3.1
# (identical values); a fresh pipeline object is created here.
over = RandomOverSampler(random_state=1234)

pipeline = Pipeline_imb([
            ('over', over),
            ('model', XGBClassifier(booster='gbtree', objective='binary:logistic', n_jobs=1))
            ])

cv_params = {
        "over__sampling_strategy": [0.1, 0.2, 0.4, 0.6, 0.8, 1],
        "model__n_estimators": [50, 100, 150, 200, 300],    
        "model__max_depth": [2, 4, 6, 8, 10],
        "model__learning_rate": [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
        "model__gamma": [0.5, 1, 1.5, 2, 5],
        "model__min_child_weight": [1, 3, 5, 10],
        "model__subsample": [0.6, 0.8, 1.0],
        "model__colsample_bytree": [0.6, 0.8, 1.0],    
        "model__scale_pos_weight": [1, 16.5]   # A typical value to consider: sum(negative instances) / sum(positive instances)
        }

# Total number of combinations in the full grid.
print(len(ParameterGrid(cv_params)))

In [None]:
# Randomised search on the over-sampling pipeline, restricted to the RFECV
# column lists (20000 sampled combinations, 28 parallel jobs).
SearchCV(pipeline, cv_params, columns_quant=columns_quant_RFECV, columns_cat=columns_cat_RFECV, data_frac=0.02, random=True, n_iter=20000, random_state=1234, n_jobs=28, name='RFECV')

In [None]:
# Restore the saved RFECV + over-sampling search.
# NOTE(review): this name has a double underscore before 'RFECV', unlike
# 'XGBoost_RFECV_CV_...' — confirm it matches the file actually written by SearchCV.
dico, results = restauration_CV('RandomOver_XGBoost__RFECV_CV_Randomized20000_324000_0.02')

In [None]:
# Parameters giving the best 'f3' score for the RFECV + over-sampling search.
best_params = best_score_CV(dico, results, 'f3', display_table=False)

In [None]:
# Register the last model's parameters, then write the whole PARAMS registry
# (one entry per model name registered in this notebook) to a JSON file.
best_params['model__n_jobs'] = -1
PARAMS[dico['model_name']] = best_params
with open(f"backups/PARAMS_XGBoost.json", 'w') as f:
    json.dump(PARAMS, f, indent=2)

In [None]:
# Evaluate the tuned over-sampling pipeline on the RFECV feature matrix.
# These parameters were searched on the RFECV column subset, so the model is
# evaluated on X_RFECV (as in section 4.1) rather than on the full matrix X.
m = Modelisation(X_RFECV, y, pipeline.set_params(**best_params))
m.show_conf_matrix()
m.show_metrics_score()

### 4.3. Comparaison

In [None]:
# Reload the four saved searches for comparison.
# NOTE: dr2 is rebound here to the RFECV search without over-sampling; in
# section 3.3 the same name held the over-sampling search.
dr1 = restauration_CV('XGBoost_CV_Randomized10000_54000_0.02', verbose=False)
dr2 = restauration_CV('XGBoost_RFECV_CV_Randomized10000_54000_0.02', verbose=False)
dr3 = restauration_CV('RandomOver_XGBoost_CV_Randomized20000_324000_0.02', verbose=False)
dr4 = restauration_CV('RandomOver_XGBoost__RFECV_CV_Randomized20000_324000_0.02', verbose=False)

In [None]:
# Without over-sampling: all columns (dr1) vs RFECV columns (dr2), on 'recall' and 'f3'.
graph_2scores_CV_comp([dr1, dr2], 'recall', 'f3', s=[1, 1], alpha=0.3)

In [None]:
# Without over-sampling: all columns (dr1) vs RFECV columns (dr2), on 'recall' and 'precision'.
graph_2scores_CV_comp([dr1, dr2], 'recall', 'precision', s=[2, 2], alpha=0.3)

In [None]:
# With over-sampling: all columns (dr3) vs RFECV columns (dr4), on 'recall' and 'f3'.
graph_2scores_CV_comp([dr3, dr4], 'recall', 'f3', s=[0.5, 0.5], alpha=0.3)

In [None]:
# With over-sampling: all columns (dr3) vs RFECV columns (dr4), on 'recall' and 'precision'.
graph_2scores_CV_comp([dr3, dr4], 'recall', 'precision', s=[1, 1], alpha=0.3)

In [None]:
# Total run time since `t` was set in the first cell, as HH:MM:SS.
# Hours are computed arithmetically: formatting through time.gmtime() with %H
# wraps back to 00 after 24 hours and would under-report a long run.
elapsed = int(time.time() - t)
hours, remainder = divmod(elapsed, 3600)
minutes, seconds = divmod(remainder, 60)
print(f"Temps d'exécution total : {hours:02d}:{minutes:02d}:{seconds:02d}")