# Keep in mind
## Automated Hyperparameter Tuning: 
While Grid Search and Random Search are good starting points, exploring automated methods like Hyperopt, Optuna, or the hyperparameter tuning functionalities within libraries like XGBoost could be beneficial.

In [1]:
import os
import sys
import pandas as pd
import numpy as np

#Due to imbalance we are going to use stratified Kfold
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

from joblib import dump
import warnings
warnings.filterwarnings('ignore')

In [2]:
#%% Set project directory
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))
print(project_root)
sys.path.append(os.path.abspath(project_root))
from utils import load_config, dump_json
from scripts.Processing import preprocessing

/Users/aboubakr/ML-100-Projects/beginner/p2_TitanicSurvival
/Users/aboubakr/ML-100-Projects/beginner/p2_TitanicSurvival


In [3]:
#%% Fetch configs paths
config_path = os.path.join(project_root, 'config.json')
config = load_config(config_path)
train_path = os.path.join(project_root, config["train_path"])
train_path

'/Users/aboubakr/ML-100-Projects/beginner/p2_TitanicSurvival/data/train.csv'

In [4]:
titanic = pd.read_csv(train_path)

In [5]:
titanic.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [6]:
X,y = preprocessing(titanic, train=True)

In [7]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Age^2,Age Fare,Fare^2,Pclass_Fare,Log_fare,FamilySize
0,0.827377,True,-0.592481,0,0,-0.502445,2,2,-0.636573,-0.474875,-0.199305,-0.577965,-0.879741,0.05916
1,-1.566107,False,0.638789,0,0,0.786845,0,3,0.441412,0.939304,0.091101,0.382936,1.36122,0.05916
2,0.827377,False,-0.284663,2,0,-0.488854,2,1,-0.420976,-0.449052,-0.198713,-0.538682,-0.79854,-0.560975
3,-1.566107,False,0.407926,0,0,0.42073,2,3,0.195497,0.467628,-0.03951,0.030196,1.062038,0.05916
4,0.827377,True,0.407926,2,0,-0.486337,2,2,0.195497,-0.407058,-0.198598,-0.531407,-0.784179,-0.560975


In [8]:
y.value_counts(normalize=True)

Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64

In [9]:
## Now Let's train some stuffs
#from sklearn.model_selection import cross_val_score
#scores = cross_val_score(lr, X, Y, cv=5)
#print(scores)

In [10]:
# Initialize Logistic Regression model
lr = LogisticRegression()
rf = RandomForestClassifier(criterion = 'entropy', class_weight = {0:0.61, 1:0.39})
xgb = XGBClassifier()

# Initialize Stratified K-Fold with 5 splits
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# List to store the accuracy scores for each fold
accuracy_scores_lr = []
accuracy_scores_rf = []
accuracy_scores_xgb = []

# Perform Stratified K-Fold Cross-Validation
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # Train the model
    lr.fit(X_train, y_train)
    rf.fit(X_train, y_train)
    xgb.fit(X_train, y_train)

    # Make predictions
    y_pred_lr = lr.predict(X_test)
    y_pred_rf = rf.predict(X_test)
    y_pred_xgb = xgb.predict(X_test)
    
    # Calculate accuracy for the current fold
    accuracy_lr = accuracy_score(y_test, y_pred_lr)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

    accuracy_scores_lr.append(accuracy_lr)
    accuracy_scores_rf.append(accuracy_rf)
    accuracy_scores_xgb.append(accuracy_xgb)
# Print accuracy for each fold
for i in range(n_splits):
    print(f"Fold {i+1}:")
    print(f"Accuracy Logistic Regression = {accuracy_scores_lr[i]:.2f}")
    print(f"Accuracy Random Forest = {accuracy_scores_rf[i]:.2f}")
    print(f"Accuracy XGBoost = {accuracy_scores_xgb[i]:.2f}")
    if i < 4 : print("-"*10)

# Print mean accuracy across all folds
print("-"*10 + "Average Accuracy" + '-'*10)
print(f"Mean Accuracy Logistic Regression: {np.mean(accuracy_scores_lr):.2f}")
print(f"Mean Accuracy Random Forest: {np.mean(accuracy_scores_rf):.2f}")
print(f"Mean Accuracy XGBoost: {np.mean(accuracy_scores_xgb):.2f}")

Fold 1:
Accuracy Logistic Regression = 0.80
Accuracy Random Forest = 0.83
Accuracy XGBoost = 0.84
----------
Fold 2:
Accuracy Logistic Regression = 0.82
Accuracy Random Forest = 0.85
Accuracy XGBoost = 0.81
----------
Fold 3:
Accuracy Logistic Regression = 0.80
Accuracy Random Forest = 0.80
Accuracy XGBoost = 0.80
----------
Fold 4:
Accuracy Logistic Regression = 0.79
Accuracy Random Forest = 0.80
Accuracy XGBoost = 0.80
----------
Fold 5:
Accuracy Logistic Regression = 0.82
Accuracy Random Forest = 0.84
Accuracy XGBoost = 0.82
----------Average Accuracy----------
Mean Accuracy Logistic Regression: 0.81
Mean Accuracy Random Forest: 0.83
Mean Accuracy XGBoost: 0.81


In [11]:
#Let's try HyperparamOptimization with Optuna
import optuna
from sklearn.model_selection import cross_val_score

# Define the objective function
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 1000, step=50)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    gamma = trial.suggest_float('gamma', 0, 5)
    reg_alpha = trial.suggest_float('reg_alpha', 0, 10)
    reg_lambda = trial.suggest_float('reg_lambda', 0, 10)

    clf = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        gamma=gamma,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        eval_metric = 'logloss')
    return cross_val_score(clf, X, y, cv=5).mean()

# Run the optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25)

print("Best parameters:", study.best_params)
print("Best accuracy:", study.best_value)

# Visualization
optuna.visualization.plot_optimization_history(study).show()
optuna.visualization.plot_param_importances(study).show()


[I 2024-09-08 23:28:21,212] A new study created in memory with name: no-name-6f018506-687a-447a-bb35-7a742713e2c7
[I 2024-09-08 23:28:21,549] Trial 0 finished with value: 0.8148389931579938 and parameters: {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.28458896400564015, 'subsample': 0.5557708763026865, 'colsample_bytree': 0.685662004933344, 'gamma': 0.626451251596572, 'reg_alpha': 9.565393742189913, 'reg_lambda': 9.886389167803042}. Best is trial 0 with value: 0.8148389931579938.
[I 2024-09-08 23:28:21,807] Trial 1 finished with value: 0.8114744837110036 and parameters: {'n_estimators': 400, 'max_depth': 10, 'learning_rate': 0.06393703753078255, 'subsample': 0.6558507468100343, 'colsample_bytree': 0.6651415345943776, 'gamma': 1.4505414741226934, 'reg_alpha': 4.519401867411126, 'reg_lambda': 9.021015415617732}. Best is trial 0 with value: 0.8148389931579938.
[I 2024-09-08 23:28:21,949] Trial 2 finished with value: 0.7980101688531793 and parameters: {'n_estimators': 100, 'max_

Best parameters: {'n_estimators': 800, 'max_depth': 4, 'learning_rate': 0.12614942033795007, 'subsample': 0.5522715636181271, 'colsample_bytree': 0.7103451508534306, 'gamma': 0.3161442306930229, 'reg_alpha': 2.6439934793172037, 'reg_lambda': 4.200663493825497}
Best accuracy: 0.8350134957002071


In [12]:
#!pip install optuna

In [13]:
#Let's try hyperopt
from hyperopt import fmin, tpe, hp, Trials
from hyperopt.pyll.base import scope
from hyperopt.early_stop import no_progress_loss

# Define the objective function
def objective(params):
    clf = XGBClassifier(**params)
    accuracy = cross_val_score(clf, X, y, cv=5).mean()
    return -accuracy  # Minimize negative accuracy

# Define the search space
space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 1000, 50)),
    'max_depth': scope.int(hp.quniform('max_depth', 3, 10, 1)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'gamma': hp.uniform('gamma', 0, 5),
    'reg_alpha': hp.uniform('reg_alpha', 0, 10),
    'reg_lambda': hp.uniform('reg_lambda', 0, 10),
}

# Run the optimization
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials,
            early_stop_fn=no_progress_loss(50))
best['n_estimators'] = int(best['n_estimators'])
best['max_depth'] = int(best['max_depth'])
print("Best parameters:", best)


 94%|█████████▍| 94/100 [00:26<00:01,  3.48trial/s, best loss: -0.8451384093904967]
Best parameters: {'colsample_bytree': 0.923443106131321, 'gamma': 0.8084824521606915, 'learning_rate': 0.23446866003248415, 'max_depth': 5, 'n_estimators': 150, 'reg_alpha': 0.4518808253700202, 'reg_lambda': 4.546341774936819, 'subsample': 0.6130960711365323}


In [14]:
#!pip install hyperopt

In [15]:
#Good exercise with hyperopt and optuna that are hyperparam opt that use differant search algorithms to find the best params in the dedicated space
dump_json(best,os.path.join(project_root,config["xgb_params"]))

{'colsample_bytree': 0.923443106131321,
 'gamma': 0.8084824521606915,
 'learning_rate': 0.23446866003248415,
 'max_depth': 5,
 'n_estimators': 150,
 'reg_alpha': 0.4518808253700202,
 'reg_lambda': 4.546341774936819,
 'subsample': 0.6130960711365323}

In [16]:
best_xgb = XGBClassifier(**best)
accuracy_scores_best_xgb = []
# Perform Stratified K-Fold Cross-Validation
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # Train the model
    best_xgb.fit(X_train, y_train)

    # Make predictions
    y_pred_best_xgb = best_xgb.predict(X_test)
    
    # Calculate accuracy for the current fold
    accuracy_best_xgb = accuracy_score(y_test, y_pred_best_xgb)

    accuracy_scores_best_xgb.append(accuracy_best_xgb)
# Print accuracy for each fold
for i in range(n_splits):
    print(f"Fold {i+1}:")
    print(f"Accuracy Best XGBoost = {accuracy_scores_best_xgb[i]:.2f}")
    if i < 4 : print("-"*10)
    else : print("-"*20)

# Print mean accuracy across all folds
print(f"Mean Accuracy Best XGBoost: {np.mean(accuracy_scores_best_xgb):.2f}")

#Le cout n'en vaut peut à pas la peine mais pour l'exercice on va garder ce best_xgb pour notre model pipeline
xgb_filepath = os.path.join(project_root, config['xgb_save_path'])
dump(best_xgb,xgb_filepath)

Fold 1:
Accuracy Best XGBoost = 0.84
----------
Fold 2:
Accuracy Best XGBoost = 0.86
----------
Fold 3:
Accuracy Best XGBoost = 0.83
----------
Fold 4:
Accuracy Best XGBoost = 0.82
----------
Fold 5:
Accuracy Best XGBoost = 0.85
--------------------
Mean Accuracy Best XGBoost: 0.84


['/Users/aboubakr/ML-100-Projects/beginner/p2_TitanicSurvival/models/xgb.pkl']