# Regression XGBoost

## Cloner la branche contenant le dataset et le code qui va avec.

In [None]:
# Start from a clean slate: delete any previous checkout, clone the repository
# again, then list the contents of the fresh clone.
!rm -rf ActuarialThesis
!git clone https://github.com/aderdouri/ActuarialThesis.git
%ls -ltr ActuarialThesis

In [None]:
!mkdir ActuarialThesis/plots_rgr
!ls -ltr ActuarialThesis/plots_rgr

In [None]:
# Make the modules in the repository's src/ folder importable.
import sys

# Guard against re-running the cell: without the membership check every
# execution would prepend another copy of the same entry to sys.path.
if './ActuarialThesis/src/' not in sys.path:
    sys.path.insert(0, './ActuarialThesis/src/')

In [None]:
import installHelper

In [None]:
# Show the names exposed by the installHelper module (dir() already returns a list)
print(dir(installHelper))

In [None]:
# Long listing of the current working directory, sorted by modification time
# with the oldest entries first.
!ls -ltr

## Installer les packages nécessaires

In [None]:
# Run the repository's install helper.
# NOTE(review): presumably this pip-installs the extra packages verified in the
# following cell — confirm against src/installHelper.py.
installHelper.installALL()

In [None]:
# On doit trouver tous les packages mentionés dans le grep
!pip list -v | grep -e catboost -e 'imbalanced-learn' -e 'optuna' -e 'catboost' -e 'lime' -e 'shap'

## Importer les packages nécessaires

In [None]:
from helper import *

In [None]:
# Apply seaborn's default theme to all subsequent plots
# (`sns` is expected to come from the `from helper import *` cell).
sns.set_theme()

## Partir du dataset déjà encodé.

In [None]:
# Start from the already-encoded dataset shipped in the cloned repository
# and preview its first rows.
df = pd.read_csv('ActuarialThesis/Data/encodedBASEAUTO.csv')
df.head()

In [None]:
# Target: the CHARGE column. Features: every remaining column of the frame.
y = df['CHARGE']
X = df.drop(columns='CHARGE')

Tout d'abord, nous séparons la cible du cadre de données avec des caractéristiques (df -> X, y).

Ensuite, nous divisons les données en ensembles train/val/test dans le rapport 60:20:20. L'idée est que nous utiliserons l'ensemble train pour entraîner nos modèles, l'ensemble val pour les valider et l'ensemble test pour calculer l'erreur finale du mélange. L'ensemble de test sera donc constitué de données totalement inédites.

Pour ce faire, utilisez un train_test_split régulier de sklearn pour diviser X et y en parties train et val/test dans le ratio 60:40. Ensuite, utilisez à nouveau train_test_split, mais pour diviser la partie val/test obtenue en validation et test dans un rapport 50:50. Dans chaque application de train_test_split, utilisez random_state=13 et les autres valeurs de paramètres par défaut.

Au final, vous devriez obtenir X_train, X_val, X_test avec les formes suivantes, respectivement : (23786, 58), (7929, 58), (7929, 58). La même logique s'applique à y_train, y_val, y_test.

In [None]:
# Stage 1: 60% of the rows go to training; the remaining 40% are parked in
# X_test / y_test for the moment.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.4,
    random_state=13,
)

# Stage 2: cut that 40% hold-out in half -> 20% validation, 20% final test.
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test,
    test_size=0.5,
    random_state=13,
)

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Baseline model: gradient-boosted trees trained with the squared-error objective.
# Early stopping is configured on the estimator (as in the Optuna objective of
# this notebook); passing it to fit() is deprecated and rejected by XGBoost 2.x.
xgb_rgr = XGBRegressor(objective='reg:squarederror',
                       n_estimators=200,
                       learning_rate=0.01,
                       max_depth=5,
                       random_state=13,
                       early_stopping_rounds=50)

# The validation split is the evaluation set monitored by early stopping.
xgb_rgr.fit(X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False)

y_pred_xgb_rgr = xgb_rgr.predict(X_val)
# RMSE = sqrt(MSE); computed explicitly because the `squared=False` shortcut
# is no longer accepted by recent scikit-learn releases.
rmse = mean_squared_error(y_val, y_pred_xgb_rgr) ** 0.5
print("RMSE: %.5f" % rmse)

In [None]:
# Same setup as the squared-error baseline, but with the Tweedie objective
# (variance power 1.7). Note that this overwrites `xgb_rgr` and
# `y_pred_xgb_rgr` from the previous cell.
# Early stopping is configured on the estimator (as in the Optuna objective of
# this notebook); passing it to fit() is deprecated and rejected by XGBoost 2.x.
xgb_rgr = XGBRegressor(objective='reg:tweedie',
                       tweedie_variance_power=1.7,
                       n_estimators=200,
                       learning_rate=0.01,
                       max_depth=5,
                       random_state=13,
                       early_stopping_rounds=50)

# The validation split is the evaluation set monitored by early stopping.
xgb_rgr.fit(X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False)

y_pred_xgb_rgr = xgb_rgr.predict(X_val)
# RMSE = sqrt(MSE); computed explicitly because the `squared=False` shortcut
# is no longer accepted by recent scikit-learn releases.
rmse = mean_squared_error(y_val, y_pred_xgb_rgr) ** 0.5
print("RMSE: %.5f" % rmse)

In [None]:
# Two side-by-side panels sharing the y axis (predicted values)
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(15, 4))

# Left panel: raw scatter of actual vs predicted values on the validation set
sns.scatterplot(ax=ax1, x=y_val, y=y_pred_xgb_rgr)
ax1.set_xlabel('Actual Values')
ax1.set_ylabel('Predicted Values')
ax1.set_title('Actual vs Predicted Values')

# Right panel: the same actual-vs-predicted data with a fitted regression line.
# The axes are labelled for what is really plotted: x is y_val (actual) and
# y is the prediction — this is not a residual plot.
sns.regplot(ax=ax2, x=y_val, y=y_pred_xgb_rgr, scatter_kws={'s': 10}, line_kws={'color': 'red'})
ax2.set_xlabel('Actual Values')
ax2.set_ylabel('Predicted Values')
ax2.set_title('Actual vs Predicted Values with Linear Fit');

In [None]:
# Configuration constants for the hyper-parameter search.
# NOTE(review): SAMPLE_RATE is not referenced by the cells visible here — confirm it is still needed.
SAMPLE_RATE = 0.4
# Seed value intended for stochastic steps.
RANDOM_SEED = 1
# Early-stopping patience passed to XGBRegressor inside the Optuna objective.
EARLY_STOPPING_ROUND = 100

In [None]:
def objective(trial):
    """Optuna objective: train one XGBoost regressor and score it on the validation split.

    Hyper-parameters are sampled from `trial`; the model is fitted on
    (X_train, y_train) with early stopping monitored on (X_val, y_val).

    Args:
        trial: the Optuna trial used to sample the hyper-parameters.

    Returns:
        The RMSE of the fitted model on the validation split (to be minimised).
    """
    param = {
        'objective': trial.suggest_categorical('objective', ['reg:squarederror', 'reg:tweedie']),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
    }

    # The variance power only exists for the Tweedie objective, so it is
    # sampled conditionally.
    if param["objective"] == "reg:tweedie":
        param["tweedie_variance_power"] = trial.suggest_float("tweedie_variance_power", 1.1, 1.9)

    # Early stopping is set on the estimator and monitored on the eval_set below.
    regressor = XGBRegressor(**param,
                             early_stopping_rounds=EARLY_STOPPING_ROUND)

    regressor.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  verbose=False)

    y_pred = regressor.predict(X_val)
    # RMSE = sqrt(MSE); computed explicitly because the `squared=False` shortcut
    # is no longer accepted by recent scikit-learn releases.
    return mean_squared_error(y_val, y_pred) ** 0.5

In [None]:
# Create the study. The sampler is TPE (Optuna's default algorithm), seeded
# with RANDOM_SEED so that the sequence of sampled hyper-parameters is
# repeatable across runs.
study = optuna.create_study(direction='minimize',
                            study_name='xgb_regression',
                            sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED))
# Minimise the validation RMSE returned by `objective` over 1000 trials.
study.optimize(objective, n_trials=1000)

In [None]:
# Keep the winning hyper-parameters and print a summary of the best trial
xgx_reg_hp = study.best_params
trial = study.best_trial

print(f"Number of finished trials: {len(study.trials)}")
print("Best trial:")
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Build Optuna's hyper-parameter importance figure for the finished study,
# export it as a PDF into the plots folder, then display it inline.
# NOTE(review): write_image presumably needs an image-export backend
# (e.g. kaleido) to be installed — confirm in the target environment.
fig = optuna.visualization.plot_param_importances(study)
fig.write_image("./ActuarialThesis/plots_rgr/myXGBoostRegressorParamImportances.pdf")
fig.show()

In [None]:
# Refit with the best hyper-parameters under the same protocol as the tuning
# trials: early stopping monitored on the validation split. Without it the model
# would always train the full `n_estimators` rounds and would not reproduce the
# model that obtained `study.best_value`.
optimized_xgb_rgr = XGBRegressor(**study.best_params,
                                 early_stopping_rounds=EARLY_STOPPING_ROUND)
optimized_xgb_rgr.fit(X_train, y_train,
                      eval_set=[(X_val, y_val)],
                      verbose=False)
y_pred_xgb_rgr = optimized_xgb_rgr.predict(X_val)
# RMSE = sqrt(MSE); computed explicitly because the `squared=False` shortcut
# is no longer accepted by recent scikit-learn releases.
print('RMSE: ', mean_squared_error(y_val, y_pred_xgb_rgr) ** 0.5)

In [None]:
# Plot the 10 most important features of the tuned model, enlarge the figure
# and save it as a PDF into the plots folder.
from xgboost import plot_importance
ax = plot_importance(optimized_xgb_rgr, max_num_features=10)
ax.figure.set_size_inches(20, 6)
ax.figure.savefig('./ActuarialThesis/plots_rgr/myXGBoostRegressorFeatureImportance.pdf')

In [None]:
# Two side-by-side panels sharing the y axis (predicted values)
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(15, 4))

# Left panel: raw scatter of actual vs predicted values on the validation set
sns.scatterplot(ax=ax1, x=y_val, y=y_pred_xgb_rgr)
ax1.set_xlabel('Actual Values')
ax1.set_ylabel('Predicted Values')
ax1.set_title('Actual vs Predicted Values')

# Right panel: the same actual-vs-predicted data with a fitted regression line.
# The axes are labelled for what is really plotted: x is y_val (actual) and
# y is the prediction — this is not a residual plot.
sns.regplot(ax=ax2, x=y_val, y=y_pred_xgb_rgr, scatter_kws={'s': 10}, line_kws={'color': 'red'})
ax2.set_xlabel('Actual Values')
ax2.set_ylabel('Predicted Values')
ax2.set_title('Actual vs Predicted Values with Linear Fit');

# Persist the figure next to the other regression plots
fig.savefig('./ActuarialThesis/plots_rgr/myXGBoostRegressorActualvsPredicted.pdf')

In [None]:
#filename = "optimized_xgb_rgr.pkl"
#pickle.dump(optimized_xgb_rgr, open(filename, "wb"))