In [0]:
import etl
import pandas as pd
import numpy as np

In [0]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, cohen_kappa_score

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
import joblib

In [0]:
def inference_pipeline(model, data, preprocessor, ignore_vars=None, ordinal_feats=None):
    """Transform ``data`` with a fitted preprocessor and return model predictions.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    data : pandas.DataFrame
        Input rows. A ``"Severity"`` column, if present, is treated as the
        target and excluded from the features. ``data`` itself is not modified.
    preprocessor : object
        Fitted transformer exposing ``transform``; it receives the feature
        frame after column exclusion.
    ignore_vars : str or iterable of str, optional
        Column name(s) to exclude from the features. A single string is
        treated as one column name. Names not present in ``data`` are ignored.
    ordinal_feats : list, optional
        Accepted for backward compatibility with existing callers; this
        function does not use it.

    Returns
    -------
    The value returned by ``model.predict`` on the transformed features.
    """
    # Normalise ignore_vars into a set. A bare string must become a single
    # name; otherwise `col not in ignore_vars` would be a substring test.
    if ignore_vars is None:
        excluded = set()
    elif isinstance(ignore_vars, str):
        excluded = {ignore_vars}
    else:
        excluded = set(ignore_vars)

    # The target column must never reach the preprocessor.
    excluded.add("Severity")

    # A boolean mask (rather than a list of labels) keeps each remaining
    # column exactly once, even if the frame has duplicate column names.
    keep = [col not in excluded for col in data.columns]
    X = data.loc[:, keep]

    X_transformed = preprocessor.transform(X)

    return model.predict(X_transformed)

In [0]:
def evaluate(y_true, y_pred):
    """Print accuracy, weighted F1 and Cohen's kappa for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    None
        The three scores are printed on one line, each with three decimals.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # 'weighted' averages the per-class F1 scores weighted by class support.
    f1 = f1_score(y_true, y_pred, average='weighted')
    kappa = cohen_kappa_score(y_true, y_pred)

    summary = f"Accuracy: {accuracy:.3f}, F1 Score: {f1:.3f}, Cohen's Kappa: {kappa:.3f}"
    print(summary)

### Load model

In [0]:
# Load the persisted model and its preprocessor (both files share the same
# timestamp suffix, so they are expected to come from the same training run).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts from a trusted source.
model = joblib.load("models/xgb_model_20241001_144429.pkl")
preprocessor = joblib.load("models/xgb_preprocessor_20241001_144429.pkl")

### Load data

In [0]:
# Read the accidents CSV and keep only the rows for Boston, MA.
df = (
    pd.read_csv("data/US_Accidents_March23.csv")
    .loc[lambda accidents: (accidents["State"] == "MA") & (accidents["City"] == "Boston")]
)
df.shape

In [0]:
# Run the project's ETL step on the Boston subset.
# NOTE: this overwrites `df`, so re-running the cell feeds already-preprocessed
# data back into etl.preprocess — re-run the load cell first if it must be repeated.
df = etl.preprocess(df)
df.shape

In [0]:
# Class distribution of the target column.
df["Severity"].value_counts()

### Inference

In [0]:
# Encode the target with a fixed category order, so severities 1, 2, 3, 4 map
# to codes 0, 1, 2, 3 (OrdinalEncoder returns floats by default).
# NOTE(review): assumes the loaded model was trained on this same encoding — confirm.
target_transformer = OrdinalEncoder(categories=[[1, 2, 3, 4]])
y_true = target_transformer.fit_transform(df["Severity"].values.reshape(-1, 1)).ravel()

In [0]:
# Sanity check: y_true is 1-D (after ravel) with one encoded label per row of df.
y_true.shape

In [0]:
%%time
# Time the full inference pass (column selection, transform, predict).
# NOTE: ordinal_feats is accepted by inference_pipeline but not read in its body.
y_pred = inference_pipeline(model=model, data=df, preprocessor=preprocessor, ignore_vars=None, ordinal_feats=["Month"])

In [0]:
# Print accuracy, weighted F1 and Cohen's kappa for the predictions.
evaluate(y_true, y_pred)

In [0]:
# Per-class precision/recall/F1. y_true holds the encoded codes 0-3, so the
# class names in the report are codes, not the raw severities 1-4.
print(classification_report(y_true, y_pred))

In [0]:
def plot_confusion_matrix(y_true, y_pred, labels=(0, 1, 2, 3), display_labels=(1, 2, 3, 4)):
    """Plot a row-normalised confusion matrix as an annotated heatmap.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class codes.
    y_pred : array-like
        Predicted class codes, aligned with ``y_true``.
    labels : sequence, optional
        Class codes to include, in row/column order. The default ``(0, 1, 2, 3)``
        matches targets encoded with ``OrdinalEncoder(categories=[[1, 2, 3, 4]])``.
    display_labels : sequence, optional
        Tick labels shown for each entry of ``labels``; defaults to the
        severity levels ``(1, 2, 3, 4)``.

    Raises
    ------
    ValueError
        If ``labels`` and ``display_labels`` differ in length.
    """
    labels = list(labels)
    display_labels = list(display_labels)
    if len(labels) != len(display_labels):
        raise ValueError("labels and display_labels must have the same length")

    # Pin the label set explicitly. Without it, confusion_matrix only includes
    # classes that occur in y_true or y_pred, so a missing class would shrink
    # the matrix and shift every tick label onto the wrong row/column.
    cm = confusion_matrix(y_true, y_pred, labels=labels, normalize='true')

    fig, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(
        cm,
        annot=True,
        fmt=".2f",
        cmap="Blues",
        cbar=False,
        xticklabels=display_labels,
        yticklabels=display_labels,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix')
    fig.tight_layout()
    plt.show()

In [0]:
# Row-normalised confusion matrix for the predictions.
plot_confusion_matrix(y_true, y_pred)