# **MODEL DEVELOPMENT NOTEBOOK**

## Objectives

* Write here your notebook objective, for example, "Fetch data from Kaggle and save as raw data", or "engineer features for modelling"

## Inputs

* Write here which data or information you need to run the notebook 

## Outputs

* Write here which files, code or artefacts you generate by the end of the notebook 

## Additional Comments

* In case you have any additional comments that don't fit in the previous bullets, please state them here. 


---

# Set Project Root Directory

Centralise the base path using project_root

In [None]:
import os
from pathlib import Path

# Anchor every path at the repository root: when the kernel was started from
# inside the "jupyter_notebooks" folder, step one level up; otherwise keep cwd.
project_root = Path.cwd()
project_root = (
    project_root.parent
    if project_root.name == "jupyter_notebooks"
    else project_root
)

# Import Libraries

In this section, all necessary standard libraries are imported so that their functions can be used.

Import Libraries with necessary Settings

In [None]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings

# Settings
%matplotlib inline
sns.set(style="whitegrid")

# ML libraries
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier


---

# Load Train & Test Sets

In this section, the train and test sets are loaded to be able to access the prepared data.

In [None]:
# Folder that holds the prepared train/test CSV files
data_path = project_root.joinpath("outputs", "data")

# Feature matrices are kept as DataFrames
X_train, X_test = (
    pd.read_csv(data_path / file_name)
    for file_name in ("X_train.csv", "X_test.csv")
)

# Targets are flattened to 1-D arrays
y_train, y_test = (
    pd.read_csv(data_path / file_name).values.ravel()
    for file_name in ("y_train.csv", "y_test.csv")
)

---

# Data Preparation

In this section, the class imbalance in the training data is handled with SMOTE and the features are standardised.

## Handle Data Imbalance

In [None]:
# Oversample the training data with SMOTE (fixed seed for reproducibility).
# NOTE(review): per the imbalanced-learn docs, sampling_strategy='minority'
# resamples only the single smallest class; if the target has more than two
# classes the others keep their original counts - confirm this is intended.
smote = SMOTE(
    random_state=42,
    sampling_strategy='minority',
)
X_train_resampled, y_train_resampled = smote.fit_resample(X=X_train, y=y_train)

* 

In [None]:
# Persist the resampled training data; create the target folder if needed
data_path = project_root.joinpath("outputs", "data")
data_path.mkdir(parents=True, exist_ok=True)

# Features can be written directly
X_train_resampled.to_csv(data_path / "X_train_resampled.csv", index=False)

# The target is wrapped in a Series first so that to_csv is available
pd.Series(y_train_resampled).to_csv(
    data_path / "y_train_resampled.csv",
    index=False,
)

print("✅ Saved resampled train sets to outputs/data/")

## Standardise Data

In [None]:
# Learn the scaling statistics from the (resampled) training features only,
# then apply the same transformation to both the training and the test set
scaler = StandardScaler().fit(X_train_resampled)
X_train_scaled_arr = scaler.transform(X_train_resampled)
X_test_scaled_arr = scaler.transform(X_test)

# The scaler returns numpy arrays; rebuild DataFrames with the original column names
X_train_scaled = pd.DataFrame(
    data=X_train_scaled_arr,
    columns=X_train_resampled.columns,
)
X_test_scaled = pd.DataFrame(
    data=X_test_scaled_arr,
    columns=X_test.columns,
)

In [None]:
# Sanity check: report the shapes of the prepared feature and target sets
print(f"✅ X_train_scaled: {X_train_scaled.shape}")
print(f"✅ y_train_resampled: {y_train_resampled.shape}")
print(f"✅ X_test_scaled: {X_test_scaled.shape}")
print(f"✅ y_test: {y_test.shape}")

* Scale X_train_resampled and X_test for Logistic Regression and LinearSVC
* Not for Random Forest or XGBoost, which are tree-based and scale-invariant

In [None]:
# Persist the standardised feature sets; create the target folder if needed
data_path = project_root.joinpath("outputs", "data")
data_path.mkdir(parents=True, exist_ok=True)

X_train_scaled.to_csv(
    data_path / "X_train_scaled.csv",
    index=False,
)
X_test_scaled.to_csv(
    data_path / "X_test_scaled.csv",
    index=False,
)

print("✅ Saved scaled train and test set to outputs/data/")

## Helper Function for Result Collection

In [None]:
results = []


def collect_results(model_name, acc, balanced_acc, f1_macro, f1_weighted):
    """Append one row of evaluation metrics to the module-level ``results`` list.

    Args:
        model_name: Label stored under the ``'Model'`` key.
        acc: Accuracy score.
        balanced_acc: Balanced accuracy score.
        f1_macro: Macro-averaged F1 score.
        f1_weighted: Weighted-average F1 score.

    Every metric is rounded to four decimal places before it is stored.
    """
    metrics = {
        'Accuracy': acc,
        'Balanced Accuracy': balanced_acc,
        'F1 Macro': f1_macro,
        'F1 Weighted': f1_weighted,
    }
    row = {'Model': model_name}
    row.update({label: round(score, 4) for label, score in metrics.items()})
    results.append(row)

* 

# Model Training

In this section, several baseline models are trained and evaluated with a shared helper function.

## Helper Function for Training Models

In [None]:
def train_and_evaluate(model, model_name, X_train_input, y_train_input,
                       X_test_input, y_test_input, balance_test=False):
    """Fit ``model``, score it on the test data, print a report and log the metrics.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        model_name: Label used in the printed header and in the results row.
        X_train_input: Training features.
        y_train_input: Training targets.
        X_test_input: Test features.
        y_test_input: Test targets.
        balance_test: When True, the test set is oversampled with
            ``RandomOverSampler`` (seed 42) before prediction.

    Returns:
        Tuple ``(model, acc)`` with the fitted model and its accuracy.

    Side effects:
        Prints the metrics, confusion matrix and classification report, and
        records the metrics through ``collect_results``.
    """
    print(f"\n🧪 {model_name}")

    # Optionally oversample the test data before scoring
    if balance_test:
        oversampler = RandomOverSampler(random_state=42)
        X_test_input, y_test_input = oversampler.fit_resample(X_test_input, y_test_input)
        print("🔁 Test set was balanced for evaluation")

    # Fit on the training data, then predict the test data
    model.fit(X_train_input, y_train_input)
    y_pred = model.predict(X_test_input)

    # UndefinedMetricWarning is silenced to keep the printed report clean
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UndefinedMetricWarning)

        acc = accuracy_score(y_test_input, y_pred)
        balanced_acc = balanced_accuracy_score(y_test_input, y_pred)
        f1_macro = f1_score(y_test_input, y_pred, average='macro')
        f1_weighted = f1_score(y_test_input, y_pred, average='weighted')

        # Scalar metrics, rounded to four decimals for display
        scalar_report = (
            ("✅ Accuracy:", acc),
            ("🎯 Balanced Accuracy:", balanced_acc),
            ("🧮 F1 Score (macro):", f1_macro),
            ("🧮 F1 Score (weighted):", f1_weighted),
        )
        for label, score in scalar_report:
            print(label, round(score, 4))

        print("📉 Confusion Matrix:\n", confusion_matrix(y_test_input, y_pred))
        print("📋 Classification Report:\n", classification_report(y_test_input, y_pred))

        # Store the metrics in the shared results list
        collect_results(model_name, acc, balanced_acc, f1_macro, f1_weighted)

    return model, acc

* 

## Train Multiple Models

To start, multiple models are trained with different algorithms to find the best one for this data.

In [None]:
# All baseline models are scored on the same, untouched test set so that the
# rows collected in `results` are directly comparable. Scoring only some of
# the models on an oversampled test set would make the comparison table
# (and the choice of the "top 2" models) unreliable.

# Logistic Regression (scale-sensitive, so it uses the standardised features)
logreg = LogisticRegression(max_iter=2000, random_state=42)
train_and_evaluate(logreg, "Logistic Regression", X_train_scaled, y_train_resampled,
                   X_test_scaled, y_test,
                   balance_test=False)

# Random Forest (tree-based, so it uses the unscaled features)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
train_and_evaluate(rf, "Random Forest", X_train_resampled, y_train_resampled,
                   X_test, y_test,
                   balance_test=False)

# Linear Support Vector Classifier (scale-sensitive, so it uses the standardised features)
# random_state is fixed like for the other models so that reruns are reproducible
linear_svc = LinearSVC(max_iter=10000, random_state=42)
train_and_evaluate(linear_svc, "Linear Support Vector Classifier", X_train_scaled, y_train_resampled,
                   X_test_scaled, y_test,
                   balance_test=False)

# XGBoost (tree-based, so it uses the unscaled features)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
train_and_evaluate(xgb, "XGBoost", X_train_resampled, y_train_resampled,
                   X_test, y_test,
                   balance_test=False)

* 

## Save Model Training Results

In [None]:
# Turn the collected metric rows into a table
df_training_results = pd.DataFrame(data=results)

# Make sure the evaluation folder exists, then write the table as CSV
evaluation_path = project_root.joinpath("outputs", "evaluation")
evaluation_path.mkdir(parents=True, exist_ok=True)

df_training_results.to_csv(
    evaluation_path / "model_training_results.csv",
    index=False,
)
print("📁 Results saved to model_training_results.csv")

*

## Save Top 2 Models

In [None]:
# Folder for serialised models; created on first use
ml_path = project_root.joinpath("outputs", "ml_pipeline")
ml_path.mkdir(parents=True, exist_ok=True)

# Persist the fitted default Random Forest and XGBoost models
joblib.dump(rf, ml_path / "default_rf.pkl")
joblib.dump(xgb, ml_path / "default_xgb.pkl")
print("📁 Top 2 default models saved.")

*

# Top Model Development

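In this section, the two best-performing models (Random Forest and XGBoost) are tuned with a grid search and re-evaluated.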

## Hyperparameter Tuning

In [None]:
# Search spaces for the two candidate models
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[None, 20],
    min_samples_split=[2, 5],
)

xgb_params = dict(
    n_estimators=[100, 200],
    max_depth=[6, 10],
    learning_rate=[0.1, 0.01],
    subsample=[0.8, 1.0],
)

# NOTE(review): both searches cross-validate on the already-resampled training
# data, so validation folds may contain synthetic samples - confirm this is
# acceptable for model selection.

# Random Forest: 3-fold grid search optimising macro F1 on all available cores
print("🔍 Tuning Random Forest...")
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
)
rf_grid.fit(X_train_resampled, y_train_resampled)
tuned_rf = rf_grid.best_estimator_
print("✅ Best RF Parameters:", rf_grid.best_params_)

# XGBoost: same search settings as for the Random Forest
print("🔍 Tuning XGBoost...")
xgb_grid = GridSearchCV(
    estimator=XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, random_state=42),
    param_grid=xgb_params,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
)
xgb_grid.fit(X_train_resampled, y_train_resampled)
tuned_xgb = xgb_grid.best_estimator_
print("✅ Best XGB Parameters:", xgb_grid.best_params_)

*

## Evaluate Results after Tuning

Use helper function to train and evaluate models after tuning

In [None]:
# Start a fresh results table for the tuned models
results = []

# Tuned Random Forest: best estimator found by the grid search
tuned_rf = rf_grid.best_estimator_
res_rf = train_and_evaluate(
    tuned_rf,
    "Tuned Random Forest",
    X_train_resampled,
    y_train_resampled,
    X_test,
    y_test,
)

# Tuned XGBoost: best estimator found by the grid search
tuned_xgb = xgb_grid.best_estimator_
res_xgb = train_and_evaluate(
    tuned_xgb,
    "Tuned XGBoost",
    X_train_resampled,
    y_train_resampled,
    X_test,
    y_test,
)

* 

## Save Evaluation Results and Tuned Models

In [None]:
# --- Metrics of the tuned models -------------------------------------------
df_tuning_results = pd.DataFrame(data=results)

evaluation_path = project_root.joinpath("outputs", "evaluation")
evaluation_path.mkdir(parents=True, exist_ok=True)

df_tuning_results.to_csv(
    evaluation_path / "model_tuning_results.csv",
    index=False,
)
print("📁 Results saved to model_tuning_results.csv")

# --- Serialised tuned models -----------------------------------------------
ml_path = project_root.joinpath("outputs", "ml_pipeline")
ml_path.mkdir(parents=True, exist_ok=True)

joblib.dump(tuned_rf, ml_path / "tuned_rf.pkl")
joblib.dump(tuned_xgb, ml_path / "tuned_xgb.pkl")
print("📁 Top 2 tuned models saved.")

* 

---

# Best Model Development

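In this section, the tuned Random Forest is saved as the current best model, its feature importances are inspected, and a variant trained without low-importance features is evaluated.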

## Save Best Performing Default Model

In [None]:
# Folder for serialised models; created on first use
final_model_path = project_root.joinpath("outputs", "ml_pipeline")
final_model_path.mkdir(parents=True, exist_ok=True)

# The tuned Random Forest is stored as the current best model
joblib.dump(
    tuned_rf,
    final_model_path / "default_best_model.pkl",
)
print("📁 Best tuned RF model saved as default_best_model.pkl")

*

## Check Feature Importance

In [None]:
# Feature names come from the frame the Random Forest was trained on
feature_names = X_train_resampled.columns

# Importances exposed by the fitted forest
importances = tuned_rf.feature_importances_

# One row per feature, most important first
df_feature = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

# Create the output folder up front so the figure can be saved before display
figures_path = project_root / "outputs" / "eda" / "figures"
os.makedirs(figures_path, exist_ok=True)

# Plot top 15 features (order reversed so the most important bar ends up on top)
top_features = df_feature.head(15).iloc[::-1]
plt.figure(figsize=(10, 6))
plt.barh(top_features['feature'], top_features['importance'])
plt.title("Top 15 Feature Importances (Random Forest)")
plt.xlabel("Importance")
plt.tight_layout()

# Save BEFORE plt.show(): once the figure has been shown it is no longer the
# current figure, and a later savefig would write an empty image.
plt.savefig(figures_path / "best_model_feature_importance.png", dpi=300)
print("📁 Saved best model feature importance to outputs/eda/figures")

plt.show()

* 

## Advanced Feature Engineering

Drop features with importance below a threshold near zero

In [None]:
# Features whose Random Forest importance is below the near-zero threshold
low_imp_feats = df_feature.loc[df_feature['importance'] < 0.001, 'feature']

# Remove them from the training features ...
X_train_fe = X_train_resampled.drop(columns=low_imp_feats)

# ... and from the matching test features. The unscaled X_test is used here
# because X_train_resampled is unscaled too; mixing an unscaled training set
# with a standardised test set would put the split thresholds learned by the
# forest on a different scale than the data it is evaluated on.
X_test_fe = X_test.drop(columns=low_imp_feats)

print(f"Dropped {len(low_imp_feats)} low-importance features.")

* 

## Retrain Default Best Model with Advanced Feature Engineering

In [None]:
# Start a fresh results table for this experiment
results = []

# New, unfitted Random Forest that reuses the tuned model's hyperparameters
feature_rf = RandomForestClassifier(**tuned_rf.get_params())

# train_and_evaluate fits the model itself before scoring it, so a separate
# feature_rf.fit(...) call would only train the same forest a second time.
train_and_evaluate(feature_rf, "Random Forest + FE", X_train_fe, y_train_resampled, X_test_fe, y_test, balance_test=False)

* Dropping the low-importance features did not improve performance; it significantly worsened it, which suggests that those “low importance” features likely had:

* Small but meaningful contributions

* Interactions with other features

* Redundant but stabilizing effects on splits

* Conclusion: Feature Engineering and removing features with low importance did not increase model performance. The original default_best_model can be considered the best_model with the best performance. 



Save evaluation results

In [None]:
# Turn the collected metric rows into a table
df_fe_results = pd.DataFrame(data=results)

# Make sure the evaluation folder exists, then write the table as CSV
evaluation_path = project_root.joinpath("outputs", "evaluation")
evaluation_path.mkdir(parents=True, exist_ok=True)

df_fe_results.to_csv(
    evaluation_path / "best_model_fe_results.csv",
    index=False,
)
print("📁 Results saved to best_model_fe_results.csv")

*

Save feature engineered rf model 

In [None]:
# Folder for serialised models; created on first use
model_path = project_root.joinpath("outputs", "ml_pipeline")
model_path.mkdir(parents=True, exist_ok=True)

# Persist the Random Forest that was retrained on the reduced feature set
joblib.dump(
    feature_rf,
    model_path / "feature_rf.pkl",
)
print("📁 Best tuned RF model saved as feature_rf.pkl")

---

# Final Model Evaluation

In this section, the saved best model is reloaded and evaluated on the test set.

## Evaluate Final Model with Original Test Set

Load default best model to get clean version

In [None]:
# Reload the persisted best model from disk.
# Note: model_path now points at the model file itself, not at the folder.
model_path = project_root.joinpath("outputs", "ml_pipeline", "default_best_model.pkl")
defaul_best_model = joblib.load(filename=model_path)

*

Evaluate default best model with original test set

In [None]:
# Start a fresh results table for the final evaluation
results = []

# Evaluate on the original, unscaled test set. The model is fitted on the
# unscaled X_train_resampled, so the test features must be unscaled as well;
# passing the standardised test set would feed the forest values on a
# different scale than the one its split thresholds were learned on.
# Note that train_and_evaluate re-fits the loaded model before scoring it.
train_and_evaluate(
    defaul_best_model,
    model_name="Final Random Forest (Test Set)",
    X_train_input=X_train_resampled,
    y_train_input=y_train_resampled,
    X_test_input=X_test,
    y_test_input=y_test,
    balance_test=False
)

Poor performance reasoning and conclusion: 
* The model was trained on a resampled (balanced) training set, but the test set remains imbalanced.
* The model learned to expect classes in roughly equal frequency.
* When evaluated on the real-world imbalanced test set, its predictions don’t align well, especially for minority classes like class 2.
* The model has zero precision and recall for class 2, meaning it's never predicting it. This could be due to little distinguishing signal for that class, overlapping features across classes, or underrepresentation in the test data.

Save evaluation results

In [None]:
# Save results
df_final_results = pd.DataFrame(results)

evaluation_path = project_root / "outputs" / "evaluation"
os.makedirs(evaluation_path, exist_ok=True)

df_final_results.to_csv(evaluation_path / "best_model_final_results.csv", index=False)
print("📁 Results saved to best_model_final_results.csv")

* 

---

# Conclusion and Next Steps

* 