In [1]:
import pandas as pd
import numpy as np
import mlflow
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, precision_score, recall_score, confusion_matrix
from mlflow.models.signature import infer_signature
import warnings
import logging

warnings.filterwarnings("ignore", message="Hint: Inferred schema contains integer column")
warnings.filterwarnings("ignore", message=".*Saving model in the UBJSON format.*")
logging.getLogger("mlflow.models.model").setLevel(logging.ERROR)
logging.getLogger("xgboost").setLevel(logging.ERROR)


In [2]:
print("--- Part 1: Starting Data Preparation ---")
df_raw = pd.read_csv('smart_traffic_management_dataset.csv')
df_processed = df_raw.copy()


df_processed['timestamp'] = pd.to_datetime(df_processed['timestamp'])
df_processed['hour'] = df_processed['timestamp'].dt.hour
df_processed['day_of_week'] = df_processed['timestamp'].dt.dayofweek  # Monday=0, Sunday=6
df_processed['month'] = df_processed['timestamp'].dt.month
df_processed['is_weekend'] = (df_processed['timestamp'].dt.dayofweek >= 5).astype(int)
df_processed['is_rush_hour'] = ((df_processed['hour'] >= 7) & (df_processed['hour'] <= 9) | (df_processed['hour'] >= 17) & (df_processed['hour'] <= 19)).astype(int)   # Rush hours: 7-9 AM and 5-7 PM

print("Data preparation complete. The 'df_processed' DataFrame is ready.")
print("Shape of processed data:", df_processed.shape)
df_processed.head()

--- Part 1: Starting Data Preparation ---
Data preparation complete. The 'df_processed' DataFrame is ready.
Shape of processed data: (2000, 17)


Unnamed: 0,timestamp,location_id,traffic_volume,avg_vehicle_speed,vehicle_count_cars,vehicle_count_trucks,vehicle_count_bikes,weather_condition,temperature,humidity,accident_reported,signal_status,hour,day_of_week,month,is_weekend,is_rush_hour
0,2024-01-01 00:00:00,4,504,53.124162,142,24,44,Cloudy,33.334387,36.390698,0,Red,0,0,1,0,0
1,2024-01-01 00:01:00,5,209,44.94785,862,50,23,Cloudy,17.92683,37.640927,0,Green,0,0,1,0,0
2,2024-01-01 00:02:00,3,572,63.179229,317,12,10,Windy,33.483375,84.26261,1,Red,0,0,1,0,0
3,2024-01-01 00:03:00,5,699,42.269697,709,43,21,Sunny,19.212941,61.550978,0,Yellow,0,0,1,0,0
4,2024-01-01 00:04:00,5,639,72.185791,594,34,14,Cloudy,11.349244,77.494506,0,Red,0,0,1,0,0


In [3]:
print("\n--- Part 2: Starting Traffic Volume Forecasting ---")


df_forecast = pd.get_dummies(df_processed, columns=['weather_condition', 'signal_status'], drop_first=True, dtype=int)
target_forecast = 'traffic_volume'
X_forecast = df_forecast.drop(columns=[target_forecast, 'timestamp', 'location_id'])
y_forecast = df_forecast[target_forecast].astype(float)
int_cols = X_forecast.select_dtypes(include=['int64', 'int32']).columns.tolist()
X_forecast[int_cols] = X_forecast[int_cols].astype(float)

# Split data
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(X_forecast, y_forecast, test_size=0.2, random_state=42)

# Start MLflow experiment
mlflow.set_experiment("Smart Traffic Project")

with mlflow.start_run(run_name="Forecaster_XGBoost_v2"): 
    mlflow.set_tag("model_type", "XGBoost")
    mlflow.set_tag("task", "Forecasting")

    # Train model
    model_xgb = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=6, random_state=42)
    model_xgb.fit(X_train_f, y_train_f)

    # Evaluate
    predictions = model_xgb.predict(X_test_f)
    rmse = np.sqrt(mean_squared_error(y_test_f, predictions))
    mae = mean_absolute_error(y_test_f, predictions)

    # Log metrics
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_params(model_xgb.get_params())

    input_example = X_train_f.head(5)
    signature = infer_signature(input_example, model_xgb.predict(input_example))
    model_path = "xgboost-forecaster"
    model_xgb.save_model(model_path)
    mlflow.log_artifact(model_path)

    # Log the model with signature and input example
    mlflow.xgboost.log_model(
        xgb_model=model_xgb,
        artifact_path="xgboost-forecaster",
        input_example=input_example,
        signature=signature,
        registered_model_name="XGBoost_Forecaster"
    )

    # Print the run ID for dashboard use
    run_id = mlflow.active_run().info.run_id
    print(f"\nModel saved with Run ID: {run_id}")


--- Part 2: Starting Traffic Volume Forecasting ---

Model saved with Run ID: a5b21df4485a4fc7a5d6947e397b1af4


Registered model 'XGBoost_Forecaster' already exists. Creating a new version of this model...
Created version '5' of model 'XGBoost_Forecaster'.


In [4]:
print("\n--- Part 3: Starting Accident Prediction ---")


df_accident = pd.get_dummies(df_processed, columns=['weather_condition', 'signal_status'], drop_first=True, dtype=int)
target_accident = 'accident_reported'
X_accident = df_accident.drop(columns=[target_accident, 'timestamp', 'location_id'])
y_accident = df_accident[target_accident]

scale_pos_weight = y_accident.value_counts()[0] / (y_accident.value_counts().get(1, 1))
print(f"Calculated scale_pos_weight for imbalanced data: {scale_pos_weight:.2f}")
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X_accident, y_accident, test_size=0.2, random_state=42, stratify=y_accident)
X_tr, X_val, y_tr, y_val = train_test_split(X_train_a, y_train_a, test_size=0.2, random_state=42, stratify=y_train_a)

with mlflow.start_run(run_name="Accident_Predictor_LGBM_v2"): 
    mlflow.set_tag("model_type", "LightGBM")
    mlflow.set_tag("task", "Classification")
    model_lgbm = lgb.LGBMClassifier(objective='binary', scale_pos_weight=scale_pos_weight, random_state=42)
    model_lgbm.fit(X_tr, y_tr)

    val_probs = model_lgbm.predict_proba(X_val)[:, 1]
    thresholds = np.linspace(0.01, 0.99, 99)
    best_thresh = 0.5
    best_f1 = -1
    from sklearn.metrics import f1_score
    for t in thresholds:
        preds_t = (val_probs >= t).astype(int)
        f1_t = f1_score(y_val, preds_t, zero_division=0)
        if f1_t > best_f1:
            best_f1 = f1_t
            best_thresh = t

    print(f"Chosen probability threshold (validation): {best_thresh:.2f} with F1: {best_f1:.3f}")

    # Evaluate on test set using chosen threshold
    test_probs = model_lgbm.predict_proba(X_test_a)[:, 1]
    predictions = (test_probs >= best_thresh).astype(int)

    accuracy = accuracy_score(y_test_a, predictions)
    f1 = f1_score(y_test_a, predictions, zero_division=0)
    precision = precision_score(y_test_a, predictions, zero_division=0)
    recall = recall_score(y_test_a, predictions, zero_division=0)
    cm = confusion_matrix(y_test_a, predictions)

    # Log all relevant metrics
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("threshold", float(best_thresh))

    print("Confusion Matrix:")
    print(cm)
    from sklearn.metrics import classification_report
    print("Classification Report:\n", classification_report(y_test_a, predictions, zero_division=0))

    # Log the model WITH an input example and signature
    try:
        input_example = X_tr.head()
        signature = infer_signature(input_example, model_lgbm.predict_proba(input_example))
        mlflow.lightgbm.log_model(
            lgb_model=model_lgbm,
            artifact_path="lightgbm-accident-predictor",
            input_example=input_example,
            signature=signature,
            registered_model_name="LightGBM_Accident_Predictor"
        )
    except Exception as e:
        print("Failed to log LightGBM model to MLflow:", e)

    print(f"LGBM Accident Predictor Run Complete. Accuracy: {accuracy:.2f}, F1-Score: {f1:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")


--- Part 3: Starting Accident Prediction ---
Calculated scale_pos_weight for imbalanced data: 17.52
[LightGBM] [Info] Number of positive: 69, number of negative: 1211
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1461
[LightGBM] [Info] Number of data points in the train set: 1280, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.053906 -> initscore=-2.865095
[LightGBM] [Info] Start training from score -2.865095
Chosen probability threshold (validation): 0.01 with F1: 0.070
Confusion Matrix:
[[291  87]
 [ 19   3]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.77      0.85       378
           1       0.03      0.14      0.05        22

    accuracy                           0.73       400


Registered model 'LightGBM_Accident_Predictor' already exists. Creating a new version of this model...
Created version '5' of model 'LightGBM_Accident_Predictor'.


In [7]:
print("\n--- Part 4: Model Serving and Dashboard ---")

# Use registered model instead of run ID
model_name = "XGBoost_Forecaster"
model_version = "5"  # Latest version
logged_model_uri = f"models:/{model_name}/{model_version}"

try:
    loaded_model = mlflow.pyfunc.load_model(logged_model_uri)
    print(f"\nModel '{model_name}' version {model_version} loaded successfully from MLflow!")
    sample_input = X_test_f.head(1)
    prediction_result = loaded_model.predict(sample_input)
    print(f"Sample Prediction for Traffic Volume: {prediction_result[0]:.0f}")
    
except Exception as e:
    print(f"\nCould not load model. Error: {e}")
    print("\nAlternatively, try loading from the most recent run:")
    print("You can find valid run IDs in the mlruns/593742697345910866/ directory")


--- Part 4: Model Serving and Dashboard ---

Model 'XGBoost_Forecaster' version 5 loaded successfully from MLflow!
Sample Prediction for Traffic Volume: 598
