In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle
import shap
import lime.lime_tabular
import matplotlib.pyplot as plt
import numpy as np
import pickle
import os


# Training cell: read the dataset, fit a random forest on an 80/20 split, and
# persist both the fitted model and the held-out features to disk.

# Read output.csv and discard exact duplicate rows in one step.
data = pd.read_csv("output.csv").drop_duplicates()

# 'gravity' is the prediction target.
y = data['gravity']

# Everything except the target and the listed columns becomes a feature.
# errors='ignore' lets the drop succeed even if some of these columns are
# absent from the CSV.
# NOTE(review): AccID / vehicleID / num_veh look like identifier columns -
# confirm that is why they are excluded.
X = data.drop(
    columns=['gravity', 'AccID', 'vehicleID', 'num_veh'],
    errors='ignore',
)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest (fit() returns the estimator itself).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Serialize the fitted model to model.pkl.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# Write the held-out features (without the index) so they can be reloaded
# later, e.g. from a Streamlit app.
X_test.to_csv("X_test.csv", index=False)


In [None]:
# Explanation cell: SHAP plots for one class plus a LIME explanation of a
# single prediction. Relies on `model` and `X_train` created by the training
# cell, so that cell must have been run in this kernel first.

# Reload the held-out features written to X_test.csv by the training cell.
X_test = pd.read_csv("X_test.csv")

# Explain only a small random subset to keep SHAP's runtime manageable.
# The size is capped at the number of available rows because
# DataFrame.sample raises ValueError when n exceeds the frame length.
X_sample = X_test.sample(n=min(50, len(X_test)), random_state=42)

# Create SHAP explainer for the tree ensemble and compute the attributions.
explainer = shap.TreeExplainer(model)
raw_shap_values = explainer.shap_values(X_sample)

# Normalize to "one (n_samples, n_features) array per class".
# Depending on the installed SHAP release, multiclass output is either a list
# of per-class arrays or a single 3-D array with the classes on the last axis.
# Indexing the 3-D form with [class_index] would pick a sample, not a class.
if isinstance(raw_shap_values, list):
    shap_values = raw_shap_values
else:
    raw_shap_values = np.asarray(raw_shap_values)
    if raw_shap_values.ndim == 3:
        shap_values = [
            raw_shap_values[:, :, k] for k in range(raw_shap_values.shape[2])
        ]
    else:
        # Single-output case: treat the 2-D array as the only "class".
        shap_values = [raw_shap_values]

# Class whose attributions are plotted; change to visualize another class.
class_index = 0
if not 0 <= class_index < len(shap_values):
    raise IndexError(
        f"class_index {class_index} is out of range for {len(shap_values)} SHAP output(s)"
    )

# Print SHAP value shapes to check correctness
for i, sv in enumerate(shap_values):
    print(f"Class {i}: {sv.shape}")

# SHAP summary plot for the selected class; show=False keeps the figure open
# so a handle to it can be captured.
shap.summary_plot(shap_values[class_index], X_sample, show=False)
fig_summary = plt.gcf()

# SHAP dependence plot of the first feature for the same class.
feature_name = X_sample.columns[0]
shap.dependence_plot(feature_name, shap_values[class_index], X_sample, show=False)
fig_dependence = plt.gcf()

# LIME explanation for a single prediction.
explainer_lime = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train.values,
    feature_names=X_train.columns.tolist(),  # plain list instead of a pandas Index
    class_names=['Class 0', 'Class 1', 'Class 2', 'Class 3'],  # Adjust class names as needed
    mode='classification'
)

# Pass the row as a 1-D NumPy array: LIME indexes the row by integer
# position, which on a string-labelled Series relies on deprecated
# positional fallback.
exp = explainer_lime.explain_instance(X_test.iloc[0].to_numpy(), model.predict_proba)
fig_lime = exp.as_pyplot_figure()

# Display every open figure. pyplot.show() does not take a figure argument,
# so a single call renders the summary, dependence and LIME figures.
plt.show()
