In [1]:
import pandas as pd

df = pd.read_csv("../data/processed/final_labeled_ward_data.csv")
features = [
    "population_density", "stop_density", "route_count", "route_density",
    "overlap_score", "border_overlap_score", "avg_fare"
]
X = df[features]
y = df["label"]


In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

le = LabelEncoder()
y_encoded = le.fit_transform(y)

imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(X)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)


In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, stratify=y_encoded, test_size=0.2, random_state=42
)

sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

for name, model in models.items():
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=le.classes_))



📌 Training and Evaluating: Random Forest
              precision    recall  f1-score   support

   cluttered       1.00      0.97      0.98        61
     optimal       0.50      1.00      0.67         5
 underserved       1.00      0.57      0.73         7

    accuracy                           0.93        73
   macro avg       0.83      0.85      0.79        73
weighted avg       0.97      0.93      0.94        73


📌 Training and Evaluating: Logistic Regression
              precision    recall  f1-score   support

   cluttered       1.00      0.82      0.90        61
     optimal       0.31      0.80      0.44         5
 underserved       0.60      0.86      0.71         7

    accuracy                           0.82        73
   macro avg       0.64      0.83      0.68        73
weighted avg       0.91      0.82      0.85        73


📌 Training and Evaluating: XGBoost
              precision    recall  f1-score   support

   cluttered       1.00      0.97      0.98        61
   

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [5]:
import joblib
import os

os.makedirs("../models", exist_ok=True)

joblib.dump(models["XGBoost"], "../models/xgboost_model.pkl")

joblib.dump(scaler, "../models/scaler.pkl")
joblib.dump(le, "../models/label_encoder.pkl")

print(" Model, Scaler, and LabelEncoder saved successfully.")


✅ Model, Scaler, and LabelEncoder saved successfully.
