# Keep in mind

## Custom Metrics: 
Sometimes custom metrics are needed (e.g., a weighted F1-score or other domain-specific metrics). Understanding how to implement and use them can be very useful.

## Automated Hyperparameter Tuning: 
While Grid Search and Random Search are good starting points, exploring automated methods like Hyperopt, Optuna, or the hyperparameter tuning functionalities within libraries like XGBoost could be beneficial.

In [15]:
import os
import sys
import warnings

import numpy as np
import pandas as pd

#Due to imbalance we are going to use stratified Kfold
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
# Install a blanket "ignore" filter so no warnings are printed in later cells.
# NOTE(review): this also hides potentially useful diagnostics (e.g. solver
# convergence or deprecation warnings) — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [16]:
#%% Set project directory
# The parent of the current working directory is used as the project root, so
# the project-local modules imported below can be resolved.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))
print(project_root)
# project_root is already absolute; register it only once so that re-running
# this cell does not keep appending duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
from utils import load_config
from scripts.Processing import preprocessing

/Users/aboubakr/ML-100-Projects/beginner/p2_TitanicSurvival


In [17]:
#%% Fetch configs paths
# Load the project configuration from config.json located in the project root.
config_path = os.path.join(project_root, 'config.json')
config = load_config(config_path)
# The "train_path" entry of the config is joined onto the project root to
# locate the training data file.
train_path = os.path.join(project_root, config["train_path"])
# Bare expression: shows the resolved path as the cell output.
train_path

'/Users/aboubakr/ML-100-Projects/beginner/p2_TitanicSurvival/data/train.csv'

In [18]:
# Read the training CSV into a DataFrame.
titanic = pd.read_csv(train_path)

In [19]:
# Preview the first 10 rows of the raw training data.
titanic.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [20]:
# Run the project's preprocessing helper on the raw frame; with train=True the
# result is unpacked into two objects, X and y.
# NOTE(review): presumably X holds the engineered features and y the target
# column — confirm against scripts/Processing.
X,y = preprocessing(titanic, train=True)

In [21]:
# Preview the first rows of X after preprocessing.
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Age^2,Age Fare,Fare^2,Pclass_Fare,Log_fare,FamilySize
0,0.827377,True,-0.592481,0,0,-0.502445,2,2,-0.636573,-0.474875,-0.199305,-0.577965,-0.879741,0.05916
1,-1.566107,False,0.638789,0,0,0.786845,0,3,0.441412,0.939304,0.091101,0.382936,1.36122,0.05916
2,0.827377,False,-0.284663,2,0,-0.488854,2,1,-0.420976,-0.449052,-0.198713,-0.538682,-0.79854,-0.560975
3,-1.566107,False,0.407926,0,0,0.42073,2,3,0.195497,0.467628,-0.03951,0.030196,1.062038,0.05916
4,0.827377,True,0.407926,2,0,-0.486337,2,2,0.195497,-0.407058,-0.198598,-0.531407,-0.784179,-0.560975


In [22]:
# Relative frequency of each value in y (normalize=True gives proportions
# instead of raw counts) — used to gauge class imbalance.
y.value_counts(normalize=True)

Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64

In [23]:
## Now let's train some models
# Quick alternative kept for reference (commented out; `lr` is only created in a later cell):
#from sklearn.model_selection import cross_val_score
#scores = cross_val_score(lr, X, y, cv=5)
#print(scores)

In [27]:
# Compare Logistic Regression and Random Forest with stratified K-fold
# cross-validation, recording accuracy and weighted F1 for every fold.

# Seed shared by the fold splitter and the random forest so that a fresh
# Restart & Run All reproduces the same scores.
RANDOM_STATE = 42

# Initialize the models
lr = LogisticRegression()
# NOTE(review): these class weights mirror the class frequencies
# (0: ~0.61, 1: ~0.39), which gives the majority class the larger weight.
# If the goal is to counter the imbalance, inverse weights or
# class_weight="balanced" would be the usual choice — confirm intent.
rf = RandomForestClassifier(
    criterion='entropy',
    class_weight={0: 0.61, 1: 0.39},
    random_state=RANDOM_STATE,
)

# Initialize Stratified K-Fold with 5 splits
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

# Per-fold scores for each model
accuracy_scores_lr = []
accuracy_scores_rf = []
wf1_scores_lr = []
wf1_scores_rf = []

# Perform Stratified K-Fold Cross-Validation
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train both models on the training part of the fold
    lr.fit(X_train, y_train)
    rf.fit(X_train, y_train)

    # Predict on the held-out part of the fold
    y_pred_lr = lr.predict(X_test)
    y_pred_rf = rf.predict(X_test)

    # Accuracy for the current fold
    accuracy_lr = accuracy_score(y_test, y_pred_lr)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)

    # Weighted F1 for the current fold. The notebook previously called
    # `weighted_f1_score`, which is not defined or imported in any visible
    # cell; scikit-learn's f1_score with average="weighted" (per-class F1
    # averaged by class support) is used instead.
    # NOTE(review): if a custom weighting was intended, define that function
    # in a cell above and swap it back in here.
    wf1_lr = f1_score(y_test, y_pred_lr, average="weighted")
    wf1_rf = f1_score(y_test, y_pred_rf, average="weighted")

    accuracy_scores_lr.append(accuracy_lr)
    accuracy_scores_rf.append(accuracy_rf)
    wf1_scores_lr.append(wf1_lr)
    wf1_scores_rf.append(wf1_rf)

# Print the scores for each fold
for i in range(n_splits):
    print(f"Fold {i+1}:")
    print(f"Accuracy Logistic Regression = {accuracy_scores_lr[i]:.2f}")
    print(f"Weighted F1-Score Logistic Regression = {wf1_scores_lr[i]:.2f}")
    print("-"*5)
    print(f"Accuracy Random Forest = {accuracy_scores_rf[i]:.2f}")
    print(f"Weighted F1-Score Random Forest= {wf1_scores_rf[i]:.2f}")
    # Separator between folds; derived from n_splits so it stays correct if
    # the number of folds changes (skipped after the last fold).
    if i < n_splits - 1:
        print("-"*10)

# Print mean scores across all folds
print("-"*10 + "Means" + '-'*10)
print(f"Mean Accuracy Logistic Regression: {np.mean(accuracy_scores_lr):.2f}")
print(f"Mean Weighted F1-Score Logistic Regression: {np.mean(wf1_scores_lr):.2f}")
print("-"*5)
print(f"Mean Accuracy Random Forest: {np.mean(accuracy_scores_rf):.2f}")
print(f"Mean Weighted F1-Score Random Forest: {np.mean(wf1_scores_rf):.2f}")

Fold 1:
Accuracy Logistic Regression = 0.80
Weighted F1-Score Logistic Regression = 0.80
-----
Accuracy Random Forest = 0.84
Weighted F1-Score Random Forest= 0.84
----------
Fold 2:
Accuracy Logistic Regression = 0.82
Weighted F1-Score Logistic Regression = 0.82
-----
Accuracy Random Forest = 0.84
Weighted F1-Score Random Forest= 0.84
----------
Fold 3:
Accuracy Logistic Regression = 0.80
Weighted F1-Score Logistic Regression = 0.80
-----
Accuracy Random Forest = 0.81
Weighted F1-Score Random Forest= 0.81
----------
Fold 4:
Accuracy Logistic Regression = 0.79
Weighted F1-Score Logistic Regression = 0.79
-----
Accuracy Random Forest = 0.79
Weighted F1-Score Random Forest= 0.79
----------
Fold 5:
Accuracy Logistic Regression = 0.82
Weighted F1-Score Logistic Regression = 0.82
-----
Accuracy Random Forest = 0.85
Weighted F1-Score Random Forest= 0.85
----------Means----------
Mean Accuracy Logistic Regression: 0.81
Mean Weighted F1-Score Logistic Regression: 0.81
-----
Mean Accuracy Random