# Introduction
This notebook summarizes and compares the results of all trained models, highlighting performance metrics, feature importance, and observations for model selection.


# Import Libraries

In [92]:
import pandas as pd
import pickle
import os
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

# Baseline Models

## Load Results

In [93]:
# Build the path of every baseline results CSV.
# main_dir is the parent of the current working directory; all files below
# live under <main_dir>/data/processed/Baseline.
main_dir = os.path.dirname(os.getcwd())
baseline_dir = os.path.join(main_dir, 'data', 'processed', 'Baseline')

# Untransformed inputs
lr_file_path = os.path.join(baseline_dir, 'lr_results.csv')
dtc_file_path = os.path.join(baseline_dir, 'dtc_results.csv')
rfc_file_path = os.path.join(baseline_dir, 'rfc_results.csv')
xgbc_file_path = os.path.join(baseline_dir, 'xgbc_results.csv')

# Log / standard-scaled / min-max-scaled variants
lr_log_file_path = os.path.join(baseline_dir, 'lr_log_results.csv')
dtc_log_file_path = os.path.join(baseline_dir, 'dtc_log_results.csv')
lr_standard_scaled_file_path = os.path.join(baseline_dir, 'lr_standard_scaled_results.csv')
dtc_standard_scaled_file_path = os.path.join(baseline_dir, 'dtc_standard_scaled_results.csv')
lr_min_max_scaled_file_path = os.path.join(baseline_dir, 'lr_min_max_scaled_results.csv')
dtc_min_max_scaled_file_path = os.path.join(baseline_dir, 'dtc_min_max_scaled_results.csv')

# Read each results file into its own DataFrame
lr_results = pd.read_csv(lr_file_path)
dtc_results = pd.read_csv(dtc_file_path)
rfc_results = pd.read_csv(rfc_file_path)
xgbc_results = pd.read_csv(xgbc_file_path)

lr_log_results = pd.read_csv(lr_log_file_path)
dtc_log_results = pd.read_csv(dtc_log_file_path)
lr_standard_scaled_results = pd.read_csv(lr_standard_scaled_file_path)
dtc_standard_scaled_results = pd.read_csv(dtc_standard_scaled_file_path)
lr_min_max_scaled_results = pd.read_csv(lr_min_max_scaled_file_path)
dtc_min_max_scaled_results = pd.read_csv(dtc_min_max_scaled_file_path)

## Models Evaluation

In [94]:
# Summarise one model's predictions as a single-row DataFrame of evaluation metrics
def model_evaluation(df, model_name='Model', features_used = 'features selected manually based on EDA', params = None, notes = None, y_val='y_val', y_pred='y_pred'):
    """Compute classification metrics for one set of predictions.

    Parameters
    ----------
    df : object indexable by column name (e.g. a DataFrame)
        Must contain the columns named by ``y_val`` and ``y_pred``.
    model_name : str
        Label stored in the ``model`` column.
    features_used : str
        Description of the feature set; a falsy value is stored as "default".
    params : object, optional
        Description of the hyperparameters; a falsy value is stored as 'default'.
    notes : str, optional
        Free-text remark; a falsy value is stored as None.
    y_val, y_pred : str
        Names of the true-label and predicted-label columns in ``df``.

    Returns
    -------
    pandas.DataFrame
        One row with the columns model, features_used, params_used,
        accuracy_score, f1_score, precision_score, recall_score,
        confusion_matrix and notes. Every metric function is called with
        only the two label columns, i.e. with its default settings.
    """
    actual = df[y_val]
    predicted = df[y_pred]

    # Each value is a one-element list so the dict becomes exactly one row;
    # insertion order fixes the column order of the result.
    row = {}
    row['model'] = [model_name]
    row['features_used'] = [features_used or "default"]
    row['params_used'] = [params or 'default']
    row['accuracy_score'] = [accuracy_score(actual, predicted)]
    row['f1_score'] = [f1_score(actual, predicted)]
    row['precision_score'] = [precision_score(actual, predicted)]
    row['recall_score'] = [recall_score(actual, predicted)]
    row['confusion_matrix'] = [confusion_matrix(actual, predicted)]
    row['notes'] = [notes or None]

    return pd.DataFrame(row)

# Evaluate every baseline result set, keeping the default feature/parameter labels
df_lr_eval = model_evaluation(lr_results, model_name='Logistic Regression')
df_dtc_eval = model_evaluation(dtc_results, model_name='Decision Tree Classifier')
df_rfc_eval = model_evaluation(rfc_results, model_name='Random Forest Classifier')
df_xgbc_eval = model_evaluation(xgbc_results, model_name='XGBoost Classifier')

# Same two base models on the log-transformed and scaled variants
df_lr_log_eval = model_evaluation(lr_log_results, model_name='Logistic Regression Log')
df_dtc_log_eval = model_evaluation(dtc_log_results, model_name='Decision Tree Classifier Log')
df_lr_standard_scaled_eval = model_evaluation(lr_standard_scaled_results, model_name='Logistic Regression Standard Scaled')
df_dtc_standard_scaled_eval = model_evaluation(dtc_standard_scaled_results, model_name='Decision Tree Classifier Standard Scaled')
df_lr_min_max_scaled_eval = model_evaluation(lr_min_max_scaled_results, model_name='Logistic Regression MinMax Scaled')
df_dtc_min_max_scaled_eval = model_evaluation(dtc_min_max_scaled_results, model_name='Decision Tree Classifier MinMax Scaled')

# Stack the one-row frames into a single comparison table with a fresh 0..n-1 index
baseline_evals = [
    df_lr_eval,
    df_dtc_eval,
    df_rfc_eval,
    df_xgbc_eval,
    df_lr_log_eval,
    df_dtc_log_eval,
    df_lr_standard_scaled_eval,
    df_dtc_standard_scaled_eval,
    df_lr_min_max_scaled_eval,
    df_dtc_min_max_scaled_eval,
]
df_metrics = pd.concat(baseline_evals, ignore_index=True)
df_metrics


Unnamed: 0,model,features_used,params_used,accuracy_score,f1_score,precision_score,recall_score,confusion_matrix,notes
0,Logistic Regression,features selected manually based on EDA,default,0.810056,0.767123,0.727273,0.811594,"[[89, 21], [13, 56]]",
1,Decision Tree Classifier,features selected manually based on EDA,default,0.815642,0.744186,0.8,0.695652,"[[98, 12], [21, 48]]",
2,Random Forest Classifier,features selected manually based on EDA,default,0.798883,0.731343,0.753846,0.710145,"[[94, 16], [20, 49]]",
3,XGBoost Classifier,features selected manually based on EDA,default,0.815642,0.759124,0.764706,0.753623,"[[94, 16], [17, 52]]",
4,Logistic Regression Log,features selected manually based on EDA,default,0.810056,0.767123,0.727273,0.811594,"[[89, 21], [13, 56]]",
5,Decision Tree Classifier Log,features selected manually based on EDA,default,0.815642,0.744186,0.8,0.695652,"[[98, 12], [21, 48]]",
6,Logistic Regression Standard Scaled,features selected manually based on EDA,default,0.810056,0.767123,0.727273,0.811594,"[[89, 21], [13, 56]]",
7,Decision Tree Classifier Standard Scaled,features selected manually based on EDA,default,0.815642,0.744186,0.8,0.695652,"[[98, 12], [21, 48]]",
8,Logistic Regression MinMax Scaled,features selected manually based on EDA,default,0.815642,0.772414,0.736842,0.811594,"[[90, 20], [13, 56]]",
9,Decision Tree Classifier MinMax Scaled,features selected manually based on EDA,default,0.815642,0.744186,0.8,0.695652,"[[98, 12], [21, 48]]",


## Comparisons and Observation

Model 0 – Logistic Regression
The model was trained using the base features: Pclass, Fare, Sex, and Embarked.
It achieves an accuracy of ~81%, which is acceptable as a baseline. However, since the goal is to predict passenger survival, we place greater emphasis on the Precision Score (TP / (TP + FP)), which currently stands at ~72.7%.

A higher precision indicates that when the model predicts a passenger will survive, there is a higher probability that this prediction is correct. This reduces the number of false positives (21 cases), where the model predicts survival but the passenger did not survive.

Model 1 – Decision Tree Classifier
The model was trained using the same features as Model 0. We observe a significant improvement in Precision, which increased from ~72.7% to 80%. Recall decreased from ~81% to ~70%, but since precision is our priority, the Decision Tree Classifier is considered superior in this context. Accuracy also improves slightly (from ~81.0% to ~81.6%).

Conclusion:
Based on our metrics and the prioritization of precision, the Decision Tree Classifier provides better predictive reliability for identifying survivors, making it the preferred baseline model for further experiments and feature engineering.

Model 2 - Random Forest Classifier
This is an ensemble model, so its scores would normally be expected to exceed those of the simpler baseline models. However, because we did not use a Pipeline and kept the model at its default settings, without feature engineering or hyperparameter tuning, its scores are similar to or slightly worse than those of Model 1.

Model 3 - XGBoost Classifier
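The same observation as for Model 2 applies: with default settings and no feature engineering, XGBoost matches Model 1 on accuracy (~81.6%) but does not beat it on precision (~76.5% vs. 80%).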

Next steps would be to begin the feature engineering phase to see how the models evolve.

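After log-transforming and scaling the "Fare" column, Models 4–9 show essentially no improvement: their scores are identical to those of the corresponding untransformed models, except for the MinMax-scaled Logistic Regression (Model 8), whose precision rises only marginally (from ~72.7% to ~73.7%). We will therefore stick with Model 1.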

# Advanced Models

## Load Results

In [95]:
# Build the path of every advanced-model results CSV.
# main_dir is the parent of the current working directory; all files below
# live under <main_dir>/data/processed/Advanced.
main_dir = os.path.dirname(os.getcwd())
advanced_dir = os.path.join(main_dir, 'data', 'processed', 'Advanced')

rfc_path = os.path.join(advanced_dir, 'rfc_results.csv')
rfc_hyperparam_path = os.path.join(advanced_dir, 'rfc_hyperparam_results.csv')
rfc_hyperparam_auto_path = os.path.join(advanced_dir, 'rfc_hyperparam_auto_results.csv')
rfc_auto_selected_path = os.path.join(advanced_dir, 'df_rfc_auto_selected_results.csv')

# Read each results file into its own DataFrame.
# NOTE: `rfc_results` rebinds the name that the Baseline section assigned from
# Baseline/rfc_results.csv, so re-running the baseline evaluation cell after this
# one would score the Advanced file instead.
rfc_results = pd.read_csv(rfc_path)
rfc_hyperparam_results = pd.read_csv(rfc_hyperparam_path)
rfc_hyperparam_auto_results = pd.read_csv(rfc_hyperparam_auto_path)
rfc_auto_selected_results = pd.read_csv(rfc_auto_selected_path)

## Model Evaluation

In [96]:
# Evaluate each advanced Random Forest variant.
# All four rows use the same model label ("Random Forest Classifier") so that
# grouping or filtering on the `model` column treats them as one model; the
# variants are distinguished by `features_used`, `params_used` and `notes`.
#
# NOTE: `df_rfc_eval` rebinds the name used for the baseline Random Forest
# evaluation earlier in the notebook; the baseline row itself is already stored
# in `df_metrics`.
#
# NOTE(review): in the saved output, the confusion matrices of the first two
# result sets sum to 179 rows while the last two sum to 713 rows, so they appear
# to have been scored on different splits. Confirm that all four files hold
# validation-set predictions before comparing their metrics.
df_rfc_eval = model_evaluation(rfc_results, "Random Forest Classifier")
df_rfc_hyperparam_eval = model_evaluation(
    rfc_hyperparam_results,
    "Random Forest Classifier",
    params = 'RandomizedSearchCV',
    notes = 'Parameters were selected base on RandomizedSearchCV',
)
df_rfc_hyperparam_auto_eval = model_evaluation(
    rfc_hyperparam_auto_results,
    "Random Forest Classifier",
    features_used = 'All features',
    notes = 'Include all features and let the model to select them. Review Feature Importance from 02_ML_Models',
)
df_rfc_auto_selected_eval = model_evaluation(
    rfc_auto_selected_results,
    "Random Forest Classifier",
    features_used = 'Auto selected features',
)


# Combine the results in a DataFrame
df_advanced_metrics = pd.concat([
    df_rfc_eval,
    df_rfc_hyperparam_eval,
    df_rfc_hyperparam_auto_eval,
    df_rfc_auto_selected_eval
]).reset_index(drop = True)

df_advanced_metrics

Unnamed: 0,model,features_used,params_used,accuracy_score,f1_score,precision_score,recall_score,confusion_matrix,notes
0,Random Forest Classifier,features selected manually based on EDA,default,0.776536,0.733333,0.743243,0.723684,"[[84, 19], [21, 55]]",
1,Ranom Forest Classifier,features selected manually based on EDA,RandomizedSearchCV,0.77095,0.732026,0.727273,0.736842,"[[82, 21], [20, 56]]",Parameters were selected base on RandomizedSea...
2,Ranom Forest Classifier,All features,default,0.781206,0.706767,0.728682,0.686131,"[[369, 70], [86, 188]]",Include all features and let the model to sele...
3,Ranom Forest Classifier,Auto selected features,default,0.796634,0.724858,0.754941,0.69708,"[[377, 62], [83, 191]]",


## Comparisons and Observation

Model 0 – Random Forest Classifier
Although Random Forest is an ensemble model that typically outperforms baseline algorithms, in this case it was trained with a simple pipeline, default hyperparameters, and without advanced feature engineering. As a result, the evaluation metrics did not improve significantly compared to the baseline models.

In fact, the Decision Tree Classifier (baseline) still provides the best precision score, which aligns with our initial objective of minimizing false positives.

Model 1 – Random Forest Classifier with hyperparameter tuning
Hyperparameters were optimized using RandomizedSearchCV, but tuning alone was not sufficient to improve the precision score meaningfully. This confirms that, given the simplicity of the current features, the model cannot extract significantly more predictive power without additional informative features.

Next Steps
The logical next step is feature engineering: creating additional features derived from the existing columns (e.g., Title from Name, FamilySize, IsAlone, binned Age, interaction terms, etc.) in the 01_EDA notebook. After generating these new features, the pipeline and hyperparameter tuning can be re-applied to evaluate the true potential of Random Forest and other ensemble models.