# Regression CatBoost

## Cloner la branche contenant le dataset et le code qui va avec.

In [None]:
# Start from a clean state: delete any previous checkout, then clone the repository.
!rm -rf ActuarialThesis
!git clone https://github.com/aderdouri/ActuarialThesis.git
# List the cloned directory to confirm the clone succeeded.
%ls -ltr ActuarialThesis

In [None]:
!mkdir ActuarialThesis/plots_rgr
!ls -ltr ActuarialThesis/plots_rgr

In [None]:
# Put the repository's src directory first on the import path; the modules
# imported in the following cells are presumably located there.
import sys
sys.path.insert(0,'./ActuarialThesis/src/')

In [None]:
import installHelper

In [None]:
# Show every name exposed by the installHelper module (dir() already returns a list).
print(dir(installHelper))

In [None]:
# List the current working directory, oldest entries first.
!ls -ltr

## Installer les packages nécessaires

In [None]:
# Run the repository's install helper.
# NOTE(review): presumably installs the packages checked in the next cell —
# see installHelper for the exact list.
installHelper.installALL()

In [None]:
# Every package named in the grep patterns below should appear in the output.
# NOTE(review): 'catboost' is listed twice; the duplicate is harmless but may
# have been meant to be a different package — confirm.
!pip list -v | grep -e catboost -e 'imbalanced-learn' -e 'optuna' -e 'catboost' -e 'lime' -e 'shap'

## Importer les packages nécessaires

In [None]:
from helper import *

In [None]:
# Apply seaborn's default theme.
sns.set_theme()

## Partir du dataset déjà encodé.

In [None]:
# Load the already-encoded dataset and preview its first rows.
df = pd.read_csv('ActuarialThesis/Data/encodedBASEAUTO.csv')
df.head()

In [None]:
# Separate the regression target (the CHARGE column) from the feature columns.
y = df['CHARGE']
X = df.drop(columns='CHARGE')

Tout d'abord, nous séparons la cible du cadre de données avec des caractéristiques (df -> X, y).

Ensuite, nous divisons les données en ensembles train/val/test dans le rapport 60:20:20. L'idée est que nous utiliserons l'ensemble train pour entraîner nos modèles, l'ensemble val pour les valider et l'ensemble test pour calculer l'erreur finale du modèle. L'ensemble de test sera donc constitué de données totalement inédites.

Pour ce faire, utilisez un train_test_split régulier de sklearn pour diviser X et y en parties train et val/test dans le ratio 60:40. Ensuite, utilisez à nouveau train_test_split, mais pour diviser la partie val/test obtenue en validation et test dans un rapport 50:50. Dans chaque application de train_test_split, utilisez random_state=13 et les autres valeurs de paramètres par défaut.

Au final, vous devriez obtenir X_train, X_val, X_test avec les formes suivantes, respectivement : (xxx, xxx), (xxx, xx), (xxx, xxx). La même logique s'applique à y_train, y_val, y_test.

In [None]:
# First split: 60% for training, 40% held out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, random_state=13
)
# Second split: halve the held-out part into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=13
)

In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
import optuna

In [None]:
# Shared constants for the tuning cells below.
# NOTE(review): SAMPLE_RATE is not referenced in the visible cells — confirm it is still needed.
SAMPLE_RATE = 0.4
# Seed passed to CatBoost (random_state) and embedded in the Optuna study name.
RANDOM_SEED = 1
# Early-stopping patience passed to fit() inside the Optuna objective.
EARLY_STOPPING_ROUND = 100

In [None]:
# 'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2,log=True),
def objective(trial):
    """Optuna objective: fit a CatBoostRegressor and return its validation RMSE.

    Only ``loss_function``, ``learning_rate``, ``depth``, ``l2_leaf_reg`` and
    ``min_child_samples`` have more than one candidate value. The remaining
    settings are registered as single-choice categoricals, so they are
    sampled through the trial like the tuned ones and end up in the
    trial's parameters.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The root mean squared error of the fitted model on
        ``(X_val, y_val)``; lower is better.
    """
    # The dict values are evaluated top to bottom, so the suggest_* calls run
    # in the same order as before.
    param = {
        # Searched hyperparameters.
        'loss_function': trial.suggest_categorical(
            "loss_function", ['RMSE', 'Tweedie:variance_power=1.9']),
        'learning_rate': trial.suggest_float(
            "learning_rate", 0.001, 0.02, log=True),
        'depth': trial.suggest_int('depth', 9, 15),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 5.5, log=True),
        'min_child_samples': trial.suggest_categorical(
            'min_child_samples', [1, 4, 8, 16, 32]),
        # Fixed settings (a single candidate each).
        'grow_policy': trial.suggest_categorical('grow_policy', ['Depthwise']),
        'iterations': trial.suggest_categorical('iterations', [10000]),
        'use_best_model': trial.suggest_categorical('use_best_model', [True]),
        'eval_metric': trial.suggest_categorical('eval_metric', ['RMSE']),
        'od_type': trial.suggest_categorical('od_type', ['Iter']),
        'od_wait': trial.suggest_categorical('od_wait', [20]),
        'random_state': trial.suggest_categorical('random_state', [RANDOM_SEED]),
        'logging_level': trial.suggest_categorical('logging_level', ['Silent']),
    }

    regressor = CatBoostRegressor(**param)

    # NOTE(review): early_stopping_rounds is passed here in addition to the
    # od_type/od_wait constructor settings — confirm which patience CatBoost
    # actually applies.
    regressor.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  early_stopping_rounds=EARLY_STOPPING_ROUND)

    y_pred = regressor.predict(X_val)
    # Take the square root explicitly instead of passing squared=False, which
    # scikit-learn deprecated in 1.4 and removed in a later release.
    return mean_squared_error(y_val, y_pred) ** 0.5

In [None]:
%%time
study = optuna.create_study(study_name=f'catboost-seed{RANDOM_SEED}')
study.optimize(objective, n_trials=3, n_jobs=-1) #, timeout=24000)

In [None]:
# Summarise the study and keep the winning hyperparameters in hp_cb.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
hp_cb = study.best_params

# Objective value first, then one line per hyperparameter of the best trial.
print(f"  Value: {trial.value}")
print("  Params: ")
for name, setting in trial.params.items():
    print(f"    {name}: {setting}")

In [None]:
# NOTE(review): kaleido is imported but not referenced directly; presumably it
# is what write_image() below relies on for static export, so an ImportError
# here means it is not installed — confirm.
import kaleido
# Plot the hyperparameter importances of the study, save the figure as a PDF,
# then display it.
fig = optuna.visualization.plot_param_importances(study)
fig.write_image("./ActuarialThesis/plots_rgr/myCatBoostRegressorParamImportances.pdf")
fig.show()

In [None]:
#optuna.visualization.plot_edf(study)

In [None]:
# Refit a model with the best hyperparameters found by the study, validating
# on the same validation set. The early-stopping patience reuses
# EARLY_STOPPING_ROUND so that this fit stops under the same rule as the
# trials that selected these hyperparameters.
optimized_cb_rgr = CatBoostRegressor(**hp_cb)
optimized_cb_rgr.fit(X_train, y_train,
                     eval_set=[(X_val, y_val)],
                     early_stopping_rounds=EARLY_STOPPING_ROUND,
                     verbose=False,
                     )

In [None]:
# Evaluate the tuned model on the test set.
y_pred_cb_rgr = optimized_cb_rgr.predict(X_test)
# RMSE is computed as the square root of the MSE: the squared=False shortcut
# was deprecated in scikit-learn 1.4 and removed in a later release.
# Arguments follow the (y_true, y_pred) convention.
print("Best rmse:", mean_squared_error(y_test, y_pred_cb_rgr) ** 0.5)
print("R2 using CatBoost: ", r2_score(y_test, y_pred_cb_rgr))

In [None]:
# Pair each feature with its importance and sort ascending, so that the ten
# most important features are the last ten rows.
feature_importances = zip(X_train.columns, optimized_cb_rgr.feature_importances_)
feature_importances = sorted(feature_importances, key=lambda pair: pair[1])
feature_importances = pd.DataFrame(feature_importances, columns=['feature', 'importance'])

# Resize the current figure directly (no need to go through fig.figure).
fig = plt.gcf()
fig.set_size_inches(20, 6)
# The fitted model is a CatBoostRegressor, so the title names it as such.
plt.title('Feature importances for CatBoostRegressor')
sns.barplot(x='importance', y='feature', data=feature_importances.tail(10))
plt.savefig('./ActuarialThesis/plots_rgr/myCatBoostRegressorFeatureImportance.pdf')

In [None]:
# Two side-by-side panels sharing the y-axis (the predicted values).
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(15, 4))

# Left panel: scatter plot of actual against predicted values.
sns.scatterplot(ax=ax1, x=y_test, y=y_pred_cb_rgr)
ax1.set_xlabel('Actual Values')
ax1.set_ylabel('Predicted Values')
ax1.set_title('Actual vs Predicted Values')

# Right panel: the same points with a fitted regression line. It plots actual
# against predicted values (not residuals), so it is labelled accordingly.
sns.regplot(ax=ax2, x=y_test, y=y_pred_cb_rgr, scatter_kws={'s': 10}, line_kws={'color': 'red'})
ax2.set_xlabel('Actual Values')
ax2.set_ylabel('Predicted Values')
ax2.set_title('Regression Fit of Actual vs Predicted Values')

fig.savefig('./ActuarialThesis/plots_rgr/myCatBoostRegressorActualvsPredicted.pdf')

In [None]:
#filename = "optimized_cb_rgr.pkl"
#pickle.dump(optimized_cb_rgr, open(filename, "wb"))

In [None]:
from joblib import dump, load

In [None]:
# Persist the fitted model to disk with joblib.
# NOTE(review): assumes ./ActuarialThesis/Models already exists — confirm.
dump(optimized_cb_rgr, './ActuarialThesis/Models/optimized_cb_rgr.joblib') 

In [None]:
# Reload the model from the file written by the previous cell.
my_optimized_cb_rgr = load('./ActuarialThesis/Models/optimized_cb_rgr.joblib') 

In [None]:
# Re-evaluate on the test set using the model reloaded from disk.
y_pred_cb_rgr = my_optimized_cb_rgr.predict(X_test)
# RMSE is computed as the square root of the MSE: the squared=False shortcut
# was deprecated in scikit-learn 1.4 and removed in a later release.
# Arguments follow the (y_true, y_pred) convention.
print("Best rmse:", mean_squared_error(y_test, y_pred_cb_rgr) ** 0.5)
print("R2 using CatBoost: ", r2_score(y_test, y_pred_cb_rgr))

In [None]:
# Initialise SHAP's JavaScript support (presumably needed for its interactive
# notebook plots).
shap.initjs()

In [None]:
# Explain the model's predictions using SHAP values: build a tree explainer
# for the tuned CatBoost model and compute SHAP values on the test features.
explainer = shap.TreeExplainer(optimized_cb_rgr)
shap_values = explainer.shap_values(X_test)

In [None]:
# Force plot for the last test row (index -1): its SHAP values and feature
# values, relative to the explainer's expected value. show=False defers
# display so the figure is saved to PDF before plt.show().
shap.force_plot(base_value=explainer.expected_value,
                shap_values=shap_values[-1, :], 
                features=X_test.iloc[-1, :],
                matplotlib=True,
                show=False)
plt.savefig('./ActuarialThesis/plots_rgr/myCatBoostRegressorSHAP01.pdf', format='pdf', dpi=600, bbox_inches='tight')
plt.show()

In [None]:
# SHAP summary plot over the whole test set. show=False defers display so the
# figure is saved to PDF before plt.show().
shap.summary_plot(shap_values, X_test, show=False)
plt.savefig('./ActuarialThesis/plots_rgr/myCatBoostRegressorSHAP02.pdf', format='pdf', dpi=600, bbox_inches='tight')
plt.show()

In [None]:
# LIME has one explainer for all the models.
# It is built from the training features (the first argument is the training
# data), which keeps the test set for the instances being explained.
# class_names carries the name of the regression target column, CHARGE.
# NOTE: this rebinds `explainer`, which previously held the SHAP explainer.
explainer = LimeTabularExplainer(X_train.values,
                                 feature_names=X_train.columns.values.tolist(),
                                 class_names=['CHARGE'],
                                 verbose=True,
                                 mode='regression')

In [None]:
# Explain the model's prediction for the test row at positional index 5
# (i.e. the sixth row), asking LIME for num_features=6.
j = 5
exp = explainer.explain_instance(X_test.values[j], optimized_cb_rgr.predict, num_features=6)

In [None]:
# Render the LIME explanation inline in the notebook, with show_table enabled.
exp.show_in_notebook(show_table=True)

In [None]:
# Draw the LIME explanation as a matplotlib figure, save it as a PDF, then
# display it.
exp.as_pyplot_figure()
plt.savefig('./ActuarialThesis/plots_rgr/myCatBoostRegressorLime.pdf', format='pdf', dpi=600, bbox_inches='tight')
plt.show();