# Day 9 — Model Optimization & Cross-Validation
**Goal:** Improve model performance using GridSearchCV & RandomizedSearchCV, compare tuned models, and validate fairness/stability across groups.


### 1.1 Import Libraries and Load Data

In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
import joblib
import json
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Reading cleaned data from csv file
df = pd.read_csv("../data/titanic_clean.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize,IsAlone,Title,Fare_per_person,AgeBin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,2,0,Mr,3.625,Adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,2,0,Mrs,35.64165,Adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1,1,Miss,7.925,Adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,2,0,Mrs,26.55,Adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,1,1,Mr,8.05,Adult


In [3]:
# Separating input and output features
X = df[['Pclass', 'Sex', 'Age', 'Fare', 'FamilySize', 'IsAlone']]
X['Sex'] = X['Sex'].map({'male': 1, 'female': 0})
y = df['Survived']

In [4]:
# Split data into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)


In [5]:
# Scaling numerical columns for Logistic Regression
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_train_scaled[['Age', 'Fare']] = scaler.fit_transform(X_train[['Age', 'Fare']])
X_test_scaled = X_test.copy()
X_test_scaled[['Age', 'Fare']] = scaler.transform(X_test[['Age', 'Fare']])

### Baseline Models Evaluation

In [6]:
# Models Evaluation function
def evaluate_model(m, X, y, name = "model"):
    y_pred = m.predict(X)
    y_proba  = m.predict_proba(X)[:, 1] if hasattr(m, "predict_proba") else None
    return{
        "accuracy": round(accuracy_score(y, y_pred),4),
        "roc_auc": round(roc_auc_score(y, y_proba), 4) if y_proba is not None else None,
        "f1": round(f1_score(y, y_pred),4),
        "precision": round(precision_score(y, y_pred),4),
        "recall": round(recall_score(y, y_pred), 4)
    }

# Baseline Logistic Regression and Random Forest
lr = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(random_state=42)

# Fitting both models on training data
lr.fit(X_train_scaled, y_train)
rf.fit(X_train, y_train)

print("Baseline Logistic Regression:", evaluate_model(lr, X_test, y_test, "LR"))
print("Baseline Random Forest:", evaluate_model(rf, X_test, y_test, "RF"))

Baseline Logistic Regression: {'accuracy': 0.6648, 'roc_auc': 0.7903, 'f1': 0.375, 'precision': 0.8182, 'recall': 0.2432}
Baseline Random Forest: {'accuracy': 0.8268, 'roc_auc': 0.9069, 'f1': 0.7862, 'precision': 0.8028, 'recall': 0.7703}


## Hyperparameter Optimization (Grid & Random)

In [7]:
## define cv for cross validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = 'roc_auc'

### Logistic Regression - GridSearchCV

In [8]:
# Parameters grid
lr_param_grid = {
    'C':[0.01, 0.1, 1, 5, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'saga']
}

# Logistic Regression Grid
lr_grid = GridSearchCV(LogisticRegression(max_iter=2000, random_state = 42),
                       lr_param_grid, scoring = scoring, cv = cv, n_jobs=-1, verbose=1)

# Fitting grid on training data
lr_grid.fit(X_train_scaled, y_train)
print(f"Best LR Params: {lr_grid.best_params_}, Best CV ROC-AUC: {lr_grid.best_score_:.4f}")
best_lr = lr_grid.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best LR Params: {'C': 5, 'penalty': 'l2', 'solver': 'lbfgs'}, Best CV ROC-AUC: 0.8421


### Random Forest - RandomizedSearchCV

In [9]:
# Initializing rf paramaters
from scipy.stats import randint as sp_randint
rf_param_dist = {
    'n_estimators': sp_randint(50, 300),
    'max_depth': sp_randint(3, 20),
    'min_samples_split': sp_randint(2, 10),
    'min_samples_leaf': sp_randint(1, 6),
    'max_features': ['sqrt', 'log2', 0.5, 0.8]
}

# Initializing Randomized Search CV
rf_random = RandomizedSearchCV(RandomForestClassifier(random_state = 42), rf_param_dist,
                               n_iter=60, scoring = scoring, cv = cv, random_state = 42, n_jobs = -1, verbose=1)

# fitting on training data
rf_random.fit(X_train, y_train)
print(f"Best RF Random Params: {rf_random.best_estimator_} Best CV ROC-AUC: {rf_random.best_score_:.4f}")
best_rf = rf_random.best_estimator_

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best RF Random Params: RandomForestClassifier(max_depth=7, max_features='log2', min_samples_leaf=2,
                       n_estimators=77, random_state=42) Best CV ROC-AUC: 0.8628


In [16]:
import joblib

# Save the model to a file
joblib.dump(best_rf, 'tuned_rf_model.pkl')
print("Model saved successfully!")

Model saved successfully!


### Advanced CV & Compare Models

In [10]:
best_lr

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,5
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,2000


In [11]:
models = {
    'LogisticRegression (tuned)': best_lr,
    'RandomForest (tuned)': best_rf
}

num_cols = ['Age', 'Fare']
def cv_metrics(model, X_data, y_data, cv=cv):
    # if model is LR expect scaled features
    if isinstance(model, LogisticRegression):
        X_data = X_data.copy()
        X_data[num_cols] = scaler.transform(X_data[num_cols])
    else:
        X_data = X_data.copy()
    scores = cross_val_score(model, X_data, y_data, scoring='roc_auc', cv=cv, n_jobs=-1)
    return {
        "cv_roc_auc_mean": scores.mean(),
        "cv_roc_auc_std": scores.std(),
        "cv_roc_auc_scores": [round(score, 3) for score in scores]
    }

summary = {}
for name, m in models.items():
    summary[name] = cv_metrics(m, X, y)
pd.DataFrame(summary).T


Unnamed: 0,cv_roc_auc_mean,cv_roc_auc_std,cv_roc_auc_scores
LogisticRegression (tuned),0.85397,0.023562,"[0.878, 0.864, 0.828, 0.824, 0.876]"
RandomForest (tuned),0.876701,0.02049,"[0.904, 0.885, 0.853, 0.853, 0.888]"


### Model Evaluation on Test Data

In [12]:
eval_results = {}
eval_results['LR_tuned'] = evaluate_model(best_lr, X_test_scaled, y_test)
eval_results['RF_tuned'] = evaluate_model(best_rf, X_test, y_test)
eval_results = pd.DataFrame(eval_results).T
eval_results

Unnamed: 0,accuracy,roc_auc,f1,precision,recall
LR_tuned,0.8101,0.8825,0.7571,0.803,0.7162
RF_tuned,0.8101,0.9049,0.7463,0.8333,0.6757


### Fairness Check & Robustness Re-run

In [13]:
# Creating data frame for test data
df_test = X_test.copy()
df_test['Survived'] = y_test.values

# LR Predictions
df_test['pred_lr'] = best_lr.predict(X_test_scaled)

# RF Predictions
df_test['pred_rf'] = best_lr.predict(X_test)

# function for checking grouped performance
def grouped_perf(df, group_col, pred_col):
    rows = []
    for val, g in df.groupby(group_col):
        rows.append({
            group_col:val,
            'accuracy': accuracy_score(g['Survived'], g[pred_col]),
            'precision': precision_score(g['Survived'], g[pred_col]),
            'recall': recall_score(g['Survived'], g[pred_col]),
            'auc_roc': roc_auc_score(g['Survived'], g[pred_col]),

        })
    return display(pd.DataFrame(rows))


print("LR grouped performance:")
grouped_perf(df_test, 'Sex', 'pred_lr')
print("RF grouped performance:")
grouped_perf(df_test, 'Sex', 'pred_rf')


LR grouped performance:


Unnamed: 0,Sex,accuracy,precision,recall,auc_roc
0,0,0.811594,0.809524,0.980769,0.637443
1,1,0.809091,0.666667,0.090909,0.539773


RF grouped performance:


Unnamed: 0,Sex,accuracy,precision,recall,auc_roc
0,0,0.449275,1.0,0.269231,0.634615
1,1,0.8,0.5,0.090909,0.534091


### Checking Stability With Noisy Data

In [14]:
np.random.seed(42)
X_noisy = X_test.copy()
X_noisy['Age'] = X_noisy['Age'] + np.random.normal(0, 0.2, size = len(X_noisy))
X_noisy['Fare'] = X_noisy['Fare'] * np.random.uniform(0.95, 1.05, size = len(X_noisy))
stab = (best_rf.predict(X_test) == best_rf.predict(X_noisy)).mean()
print(f"Prediction Stability (best_rf) under noise: {stab:.4f}")

Prediction Stability (best_rf) under noise: 0.9553


## Day 9 — Key Results

**Baseline (holdout)**  
- LR baseline Accuracy =  0.6648, ROC_AUC = 0.7903 
- RF baseline = Accuracy =  0.8268, ROC_AUC =  0.9069,

**Tuned Models (holdout)**  
- Logistic Regression (best params): `{'C': 5, 'penalty': 'l2', 'solver': 'lbfgs'}`  
  - Holdout metrics: Accuracy = 0.8101 ROC-AUC = 	0.8825 , F1 = 0.7571


- RandomForest (best params): `max_depth=7, max_features='log2', min_samples_leaf=2 n_estimators=77`  
  - Holdout metrics: Accuracy = 0.8101 ROC-AUC = 0.9049, F1 = 0.7463

**Cross-Validation**  
- LR cv roc_auc mean = 0.85397
- RF cv roc_auc mean = 0.86670

**Fairness Check (by Sex)**

**LR Fairness Check**
| Sex | Accuracy | Precision | Recall  | AUC_ROC  |
|-----|-----------|------------|---------|----------|
| 0   | 0.811594  | 0.809524   | 0.980769 | 0.637443 |
| 1   | 0.809091  | 0.666667   | 0.090909 | 0.539773 |

**RF Fairness Check**
| Sex | Accuracy | Precision | Recall  | AUC_ROC  |
|-----|-----------|------------|---------|----------|
| 0   | 0.449275  | 1.000000   | 0.269231 | 0.634615 |
| 1   | 0.800000  | 0.500000   | 0.090909 | 0.534091 |


**Actionable Takeaway**
We evaluated Random Forest performance separately for males (Sex = 0) and females (Sex = 1) in the Titanic survival prediction task.

Base Model:
The baseline Random Forest performed reasonably well, especially for males, with high recall indicating it correctly identified many survivors. However, performance for females was noticeably weaker with low recall.

Tuned Model:
After tuning, overall performance did not improve.

Male performance dropped substantially in accuracy and recall, meaning the model failed to correctly identify many male survivors.

Female performance still struggled, with recall remaining very low, indicating the model continues to miss a large proportion of surviving females.

Hyperparameter tuning did not benefit the model. It reduced its ability to correctly classify survivors in both groups, especially males.
The base model remains the more reliable option for predicting survival across gender groups.


### XGBoost - RandomizedSearchCV

In [15]:
# pip install xgboost if necessary
from xgboost import XGBClassifier
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_param = {
    'n_estimators': [50,100,200],
    'max_depth': [3,5,7],
    'learning_rate': [0.01,0.05,0.1],
    'subsample': [0.6,0.8,1.0]
}
xgb_rand = RandomizedSearchCV(xgb, xgb_param, n_iter=20, scoring='roc_auc', cv=cv, n_jobs=-1, random_state=42)
xgb_rand.fit(X_train, y_train)
best_xgb = xgb_rand.best_estimator_
print(f"XGB best CV: {xgb_rand.best_score_:.4f}")
# Evaluate on holdout
evaluate_model(best_xgb, X_test, y_test)


XGB best CV: 0.8630


{'accuracy': 0.8156,
 'roc_auc': 0.8841,
 'f1': 0.7591,
 'precision': 0.8254,
 'recall': 0.7027}

### Day 9 — Model Optimization & Cross-Validation

- Performed GridSearchCV for Logistic Regression (scaled features).
- Performed RandomizedSearchCV + refinement for Random Forest.
- Compared CV-ROC-AUC across models and validated on a holdout set.
- Performed post-tuning group fairness checks and quick robustness tests.
