In [7]:
# ============================================================
# NOTEBOOK 03: LOAD AND USE SAVED MODEL
# ============================================================

# Cell 1: Load Required Libraries
import pandas as pd
import numpy as np
import joblib
import os

# Cell 2: Load Saved Model
# The fitted estimator and its metadata mapping are stored as two separate
# joblib files inside MODEL_DIR.
MODEL_DIR = r'output/models'
model_filename = os.path.join(MODEL_DIR, 'xgboost_final.pkl')
metadata_filename = os.path.join(MODEL_DIR, 'model_metadata.pkl')

banner = "=" * 60
print(banner)
print("LOADING SAVED MODEL")
print(banner)

# NOTE: joblib.load unpickles the file contents, so only point it at
# artefacts that come from a trusted source.
loaded_model = joblib.load(model_filename)
print(f" Model loaded from: {model_filename}")

metadata = joblib.load(metadata_filename)
print("\n=== MODEL METADATA ===")
for key, value in metadata.items():
    # The feature list is long; it is used later for column selection
    # rather than being echoed here.
    if key == 'features':
        continue
    print(f"{key}: {value}")

print("\n Model ready for predictions")

# Cell 3: Load New Data (Example)
# Read the dataset to score, parse its timestamp column once, report the
# covered date range, and derive the calendar columns from the timestamp.
DATA_PATH = r"FINAL_DATASET_FOR_TRAINING.csv"
df_new = pd.read_csv(DATA_PATH)

# Parse datetime a single time (the column was previously converted twice,
# which repeats the work over the whole column for no effect).
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
df_new['collision_hour'] = pd.to_datetime(df_new['collision_hour'], utc=True, errors='coerce')

# Surface coerced/missing timestamps instead of letting them pass silently:
# such rows end up with NaN in the derived calendar columns below.
n_unparsed = int(df_new['collision_hour'].isna().sum())
if n_unparsed:
    print(f"WARNING: {n_unparsed:,} rows have a missing or unparseable collision_hour (NaT)")

# Compute the range endpoints once and reuse them for both printed lines.
first_timestamp = df_new['collision_hour'].min()
last_timestamp = df_new['collision_hour'].max()
print(f"\nDate range: {first_timestamp} to {last_timestamp}")

total_days = (last_timestamp - first_timestamp).days
print(f"Total years: {total_days / 365.25:.1f}")

# Calendar columns derived from the parsed timestamp.
df_new['year'] = df_new['collision_hour'].dt.year
df_new['month'] = df_new['collision_hour'].dt.month
df_new['hour'] = df_new['collision_hour'].dt.hour
df_new['dow'] = df_new['collision_hour'].dt.dayofweek

print("\n=== NEW DATA LOADED ===")
print(f"Shape: {df_new.shape}")

# Cell 4: Make Predictions on New Data
# The column list stored in the metadata under 'features' decides which
# columns of df_new are handed to the model.
feature_cols = metadata['features']
X_new = df_new[feature_cols]

print("\n=== GENERATING PREDICTIONS ===")

# Keep column 1 of the predict_proba output (presumably the positive
# class - confirm against the model's class order).
class_probabilities = loaded_model.predict_proba(X_new)
predictions_proba = class_probabilities[:, 1]

# Binarise with the threshold stored alongside the model.
decision_threshold = metadata['threshold']
predictions_class = (predictions_proba >= decision_threshold).astype(int)

# Attach both the raw score and the thresholded label to the dataframe.
df_new['predicted_risk'] = predictions_proba
df_new['predicted_collision'] = predictions_class


# Cell 5: Example - Identify High-Risk Hours
# Rows whose predicted risk is at or above the 95th percentile of all
# predicted risks are treated as "high risk".
high_risk_threshold = df_new['predicted_risk'].quantile(0.95)
is_high_risk = df_new['predicted_risk'].ge(high_risk_threshold)
high_risk_records = df_new.loc[is_high_risk]

print("\n=== HIGH RISK HOURS (Top 5%) ===")
print(f"Threshold: {high_risk_threshold:.4f}")
print(f"High-risk hours: {high_risk_records.shape[0]:,}")

# Columns shown in the preview of the high-risk rows.
preview_columns = [
    'collision_hour', 'region', 'borough_name',
    'temp', 'visibility', 'heavy_rain_flag',
    'freezing_risk_flag', 'predicted_risk',
]
print("\nSample high-risk predictions:")
print(high_risk_records[preview_columns].head(10))

# Cell 6: Save Predictions (Optional)
# Write the identifying columns plus the two prediction columns to CSV.
output_file = r'output/tables/predictions_new_data.csv'

# to_csv does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_file), exist_ok=True)

export_columns = ['collision_hour', 'region', 'borough_name',
                  'predicted_risk', 'predicted_collision']
df_new[export_columns].to_csv(output_file, index=False)

print(f"\n Predictions saved to: {output_file}")
print("\n" + "=" * 60)
print("MODEL DEPLOYMENT COMPLETE ")
print("=" * 60)


LOADING SAVED MODEL
 Model loaded from: output/models\xgboost_final.pkl

=== MODEL METADATA ===
model_type: XGBoost (sklearn Pipeline)
training_date: 2026-01-10 05:40:25
train_samples: 3768864
val_samples: 754392
test_samples: 1131072
test_roc_auc: 0.732346820110733
test_pr_auc: 0.14217710750714488
threshold: 0.6363959908485413
scale_pos_weight: 12.884601498662697

 Model ready for predictions

Date range: 2010-01-01 00:00:00+00:00 to 2024-12-31 23:00:00+00:00
Total years: 15.0

=== NEW DATA LOADED ===
Shape: (5654328, 25)

=== GENERATING PREDICTIONS ===

=== HIGH RISK HOURS (Top 5%) ===
Threshold: 0.7356
High-risk hours: 282,717

Sample high-risk predictions:
                collision_hour          region borough_name  temp  visibility  \
296  2010-01-13 08:00:00+00:00  greater london     Lewisham  -0.6         0.6   
752  2010-02-01 08:00:00+00:00  greater london     Lewisham  -0.9        18.4   
800  2010-02-03 08:00:00+00:00  greater london     Lewisham   1.0        10.1   
824  20