# Regression LightGBM

## Cloner la branche contenant le dataset et le code qui va avec.

In [None]:
# Start from a clean state: delete any previous checkout, clone the repository
# again, then list the contents of the fresh checkout.
!rm -rf ActuarialThesis
!git clone https://github.com/aderdouri/ActuarialThesis.git
%ls -ltr ActuarialThesis

In [None]:
!mkdir ActuarialThesis/plots_rgr
!ls -ltr ActuarialThesis/plots_rgr

In [None]:
# Put the repository's src directory first on the module search path so the
# project modules can be imported.
import sys
sys.path.insert(0,'./ActuarialThesis/src/')

In [None]:
import installHelper

In [None]:
# Show the names exposed by the installHelper module (dir() already returns a list).
print(dir(installHelper))

In [None]:
# List the current working directory (long format, oldest first).
!ls -ltr

## Installer les packages nécessaires

In [None]:
# Run the project's install helper.
# NOTE(review): presumably pip-installs the third-party packages used below — confirm in installHelper.
installHelper.installALL()

In [None]:
# Every package named in the grep patterns should appear in the output.
# NOTE(review): 'catboost' is listed twice and 'lightgbm' is not checked — confirm whether that is intended.
!pip list -v | grep -e catboost -e 'imbalanced-learn' -e 'optuna' -e 'catboost' -e 'lime' -e 'shap'

## Importer les packages nécessaires

In [None]:
from helper import *

In [None]:
# Apply the default plotting theme
sns.set_theme()

## Partir du dataset déjà encodé.

In [None]:
# Load the already-encoded dataset from the cloned repository and preview the first rows.
df = pd.read_csv('ActuarialThesis/Data/encodedBASEAUTO.csv')
df.head()

In [None]:
# Target is the CHARGE column; every other column is a feature.
y = df['CHARGE']
X = df.drop(columns=['CHARGE'])

Tout d'abord, nous séparons la cible du cadre de données avec des caractéristiques (df -> X, y).

Ensuite, nous divisons les données en ensembles train/val/test dans le rapport 60:20:20. L'idée est que nous utiliserons l'ensemble train pour entraîner nos modèles, l'ensemble val pour les valider et l'ensemble test pour calculer l'erreur finale du mélange. L'ensemble de test sera donc constitué de données totalement inédites.

Pour ce faire, utilisez un train_test_split régulier de sklearn pour diviser X et y en parties train et val/test dans le ratio 60:40. Ensuite, utilisez à nouveau train_test_split, mais pour diviser la partie val/test obtenue en validation et test dans un rapport 50:50. Dans chaque application de train_test_split, utilisez random_state=13 et les autres valeurs de paramètres par défaut.

Au final, vous devriez obtenir X_train, X_val, X_test avec les formes suivantes, respectivement : (23786, 58), (7929, 58), (7929, 58). La même logique s'applique à y_train, y_val, y_test.

In [None]:
# Step 1: keep 60% for training and set 40% aside.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, random_state=13
)
# Step 2: cut the 40% hold-out in half -> validation and test (60:20:20 overall).
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=13
)

In [None]:
from lightgbm import LGBMRegressor
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import mean_squared_error

In [None]:
# Baseline LightGBM regressor with fixed, hand-picked hyperparameters.
lgbm_rgr = LGBMRegressor(
    objective='regression',
    n_estimators=200,
    learning_rate=0.01,
    max_depth=5,
    random_state=13,
)

# Track RMSE on the validation split: stop once it has not improved for 50
# rounds and log the metric every 20 iterations.
lgbm_rgr.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=20, show_stdv=True),
    ],
)
y_pred_lgbm_rgr = lgbm_rgr.predict(X_val)

# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` flag of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = mean_squared_error(y_val, y_pred_lgbm_rgr) ** 0.5
print("RMSE: %.5f" % rmse)

In [None]:
# Two side-by-side panels sharing the y axis (the predicted values)
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True,  figsize=(15, 4))

# Left panel: raw scatter of actual (x) against predicted (y) on the validation split
sns.scatterplot(ax=ax1, x=y_val, y=y_pred_lgbm_rgr)
ax1.set_xlabel('Actual Values')
ax1.set_ylabel('Predicted Values')
ax1.set_title('Actual vs Predicted Values')

# Right panel: the same actual-vs-predicted data with a fitted regression line.
# It is not a residual plot, so the axes are labelled like the left panel.
sns.regplot(ax=ax2, x=y_val, y=y_pred_lgbm_rgr, scatter_kws={'s': 10}, line_kws={'color': 'red'})
ax2.set_xlabel('Actual Values')
ax2.set_ylabel('Predicted Values')
ax2.set_title('Actual vs Predicted Values with Regression Line');

In [None]:
def cross_validation_fcn(X_train, model, early_stopping_flag=False, y=None, cv=None):
    """
    Run K-fold cross-validation for ``model`` and return the mean fold RMSE.

    The model is re-fitted on every fold, so the object returned is the one
    fitted on the LAST fold only (its ``best_iteration_``, if any, refers to
    that fold).

    Parameters:
    - X_train: feature DataFrame to cross-validate on (indexed positionally)
    - model: estimator exposing ``fit`` and ``predict``
    - early_stopping_flag: when True, each fold is fitted with its validation
      part as ``eval_set`` and a LightGBM early-stopping callback (250 rounds)
    - y: target Series aligned row-by-row with ``X_train``; defaults to the
      notebook-level ``y_train``
    - cv: splitter exposing ``split(X)``; defaults to the notebook-level ``kf``

    Returns:
    - model: the estimator as fitted on the last fold
    - mean_rmse: the average root mean squared error across all folds
    """
    # Fall back to the notebook globals so existing two/three-argument calls keep working.
    if y is None:
        y = y_train
    if cv is None:
        cv = kf

    rmse_list = []
    for train_index, val_index in cv.split(X_train):
        # Positional split of features and target into this fold's train/validation parts
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

        fit_kwargs = {}
        if early_stopping_flag:
            # Monitor the fold's validation part and stop after 250 rounds without improvement
            fit_kwargs = {
                'eval_set': [(X_val_fold, y_val_fold)],
                'callbacks': [early_stopping(stopping_rounds=250, verbose=0)],
            }
        model.fit(X_train_fold, y_train_fold, **fit_kwargs)

        # Score the fold with RMSE. sqrt(MSE) is taken explicitly because the
        # `squared=False` flag is deprecated in scikit-learn 1.4 and removed in 1.6.
        y_pred = model.predict(X_val_fold)
        rmse_list.append(np.sqrt(mean_squared_error(y_val_fold, y_pred)))

    # Last-fold model and the average RMSE over all folds
    return model, np.mean(rmse_list)

In [None]:
from sklearn.metrics import r2_score, accuracy_score
from sklearn.model_selection import KFold

# Seed and shuffled K-fold splitter shared by the cross-validation helper and the Optuna search
seed = 2042
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

In [None]:
# define the objective function for Optuna optimization
def objective(trial):
    """
    Optuna objective: sample LightGBM hyperparameters, cross-validate, return the score.

    Parameters:
    - trial: the Optuna trial used to sample the hyperparameters

    Returns:
    - the mean cross-validation score returned by ``cross_validation_fcn``
      (to be minimized by the study)

    Side effect: stores the ``best_iteration_`` of the model returned by
    ``cross_validation_fcn`` in the trial's user attributes under
    'best_iteration'.
    """
    # Search space. Fixed values are passed through single-choice categoricals
    # so that they also end up in study.best_params.
    param = {
        'objective': trial.suggest_categorical('objective', ['regression', 'tweedie']),
        'metric': trial.suggest_categorical('metric', ['rmse']),
        'random_state': trial.suggest_categorical('random_state', [seed]),
        # Large upper bound; early stopping decides the effective number of trees
        'n_estimators': trial.suggest_categorical('n_estimators', [10000]),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10.0,log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10.0,log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # LightGBM ignores `subsample` unless bagging is enabled with a non-zero
        # `subsample_freq`; without this the `subsample` dimension has no effect.
        'subsample_freq': trial.suggest_categorical('subsample_freq', [1]),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2,log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'num_leaves': trial.suggest_int('num_leaves', 40, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'cat_smooth': trial.suggest_int('cat_smooth', 1, 100),
        'force_col_wise': trial.suggest_categorical('force_col_wise', [True])
    }

    # The variance power is only meaningful (and only sampled) for the tweedie objective
    if param["objective"] == "tweedie":
        param["tweedie_variance_power"] = trial.suggest_float("tweedie_variance_power", 1.1, 1.9)

    # create the LightGBM regressor with the sampled parameters
    model = LGBMRegressor(**param)

    # cross-validate it with early stopping on each fold
    model, mean_score = cross_validation_fcn(X_train,
                                             model,
                                             early_stopping_flag=True)

    # keep the best iteration of the returned model so the final refit can reuse it as n_estimators
    best_iteration = model.best_iteration_
    trial.set_user_attr('best_iteration', best_iteration)

    return mean_score

In [None]:
# Create an Optuna study that minimizes the objective value
study = optuna.create_study(direction="minimize",study_name="lgbm_opt")
# Run the search with the user-defined objective. Only 3 trials are run here;
# increase n_trials for a more thorough search.
study.optimize(objective, n_trials=3)

In [None]:
# Summary of the study
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

# Final hyperparameters: the best sampled values, with n_estimators replaced
# by the best iteration recorded for that trial
hp_lgbm = study.best_params
hp_lgbm["n_estimators"] = trial.user_attrs['best_iteration']

# Objective value of the best trial, then each of its sampled hyperparameters
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
# Plot the hyperparameter importances of the study, save the figure as a PDF
# in the plots directory and display it.
# NOTE(review): write_image presumably needs a static-image export backend (e.g. kaleido) — verify it is installed.
fig = optuna.visualization.plot_param_importances(study)
fig.write_image("./ActuarialThesis/plots_rgr/myLightGBMRegressorParamImportances.pdf")
fig.show()

In [None]:
# Refit on the whole training split with the tuned hyperparameters and score on validation.
optimized_lgbm_rgr = LGBMRegressor(**hp_lgbm)
optimized_lgbm_rgr.fit(X_train, y_train)
y_pred_lgbm_rgr = optimized_lgbm_rgr.predict(X_val)

# RMSE as sqrt(MSE), with arguments in (y_true, y_pred) order. The `squared=False`
# flag is deprecated in scikit-learn 1.4 and removed in 1.6.
print("Best rmse:", mean_squared_error(y_val, y_pred_lgbm_rgr) ** 0.5)
print("R2 using LightGBM: ", r2_score(y_val, y_pred_lgbm_rgr ))

In [None]:
# Same tuned hyperparameters, but trained on a three-feature subset only.
reduced_features = ['AGECOND', 'RM', 'ACV']
X_train_reduced = X_train[reduced_features]
X_val_reduced = X_val[reduced_features]

# Note: this rebinds optimized_lgbm_rgr and y_pred_lgbm_rgr to the reduced model
# and its validation predictions.
optimized_lgbm_rgr = LGBMRegressor(**hp_lgbm)
optimized_lgbm_rgr.fit(X_train_reduced, y_train)
y_pred_lgbm_rgr = optimized_lgbm_rgr.predict(X_val_reduced)

# RMSE as sqrt(MSE), with arguments in (y_true, y_pred) order. The `squared=False`
# flag is deprecated in scikit-learn 1.4 and removed in 1.6.
print("Best rmse:", mean_squared_error(y_val, y_pred_lgbm_rgr) ** 0.5)
print("R2 using LightGBM: ", r2_score(y_val, y_pred_lgbm_rgr ))

In [None]:
# Quick inspection: display the container type of X_train
type(X_train)

In [None]:
# Plot the feature importances of the current model (at most 10 features),
# widen the figure and save it as a PDF in the plots directory.
# NOTE(review): when the cells run in order, optimized_lgbm_rgr is the model refit on the
# three reduced features at this point — confirm this is the model meant to be plotted.
from lightgbm import plot_importance
ax = plot_importance(optimized_lgbm_rgr, max_num_features=10)
ax.figure.set_size_inches(20, 6)
ax.figure.savefig('./ActuarialThesis/plots_rgr/myLightGBMRegressorFeatureImportance.pdf')

In [None]:
# Two side-by-side panels sharing the y axis (the predicted values)
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True,  figsize=(15, 4))

# y_pred_lgbm_rgr holds predictions for the VALIDATION split, so it must be
# compared with y_val; pairing it with y_test would match unrelated samples.
# Left panel: raw scatter of actual (x) against predicted (y)
sns.scatterplot(ax=ax1, x=y_val, y=y_pred_lgbm_rgr)
ax1.set_xlabel('Actual Values')
ax1.set_ylabel('Predicted Values')
ax1.set_title('Actual vs Predicted Values')

# Right panel: the same actual-vs-predicted data with a fitted regression line.
# It is not a residual plot, so the axes are labelled like the left panel.
sns.regplot(ax=ax2, x=y_val, y=y_pred_lgbm_rgr, scatter_kws={'s': 10}, line_kws={'color': 'red'})
ax2.set_xlabel('Actual Values')
ax2.set_ylabel('Predicted Values')
ax2.set_title('Actual vs Predicted Values with Regression Line');

# Save both panels as a single PDF in the plots directory
fig.savefig('./ActuarialThesis/plots_rgr/myLightGBMRegressorActualvsPredicted.pdf')

In [None]:
#filename = "optimized_xgb_rgr.pkl"
#pickle.dump(optimized_xgb_rgr, open(filename, "wb"))