### Using SHAP for Feature Drift Analysis
**Description**: Utilize SHapley Additive exPlanations (SHAP) values to analyze feature
importance changes over time, indicating feature drift.

In [2]:
pip install shap

Defaulting to user installation because normal site-packages is not writeable
Collecting shap
  Downloading shap-0.47.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Collecting numba>=0.54 (from shap)
  Downloading numba-0.61.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.8 kB)
Collecting cloudpickle (from shap)
  Downloading cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting llvmlite<0.45,>=0.44.0dev0 (from numba>=0.54->shap)
  Downloading llvmlite-0.44.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)
Downloading shap-0.47.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (992 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m992.3/992.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading slicer-0.0.8-py3-none-any.whl (15 kB)
Download

In [4]:
# write your code from here
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Step 1: Create synthetic train and test datasets
np.random.seed(42)

# Training dataset
train_df = pd.DataFrame({
    'feature1': np.random.normal(0, 1, 1000),
    'feature2': np.random.normal(5, 1, 1000),
    'feature3': np.random.randint(0, 2, 1000),
    'label': np.random.randint(0, 2, 1000)
})

# Testing dataset with drift in feature distributions
test_df = pd.DataFrame({
    'feature1': np.random.normal(0.5, 1, 1000),   # Drifted mean
    'feature2': np.random.normal(6, 1, 1000),     # Drifted mean
    'feature3': np.random.randint(0, 2, 1000),
    'label': np.random.randint(0, 2, 1000)
})

# Step 2: Train a model on the train data
X_train = train_df.drop(columns=['label'])
y_train = train_df['label']

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 3: Initialize SHAP explainer
explainer = shap.TreeExplainer(model)

# SHAP values for train and test
shap_values_train = explainer.shap_values(X_train)[1]  # Class 1
shap_values_test = explainer.shap_values(test_df.drop(columns=['label']))[1]

# Step 4: Calculate mean absolute SHAP values for each feature
mean_shap_train = np.abs(shap_values_train).mean(axis=0)
mean_shap_test = np.abs(shap_values_test).mean(axis=0)

# Combine into a DataFrame for comparison
feature_names = X_train.columns
shap_df = pd.DataFrame({
    'feature': feature_names,
    'mean_abs_shap_train': mean_shap_train,
    'mean_abs_shap_test': mean_shap_test,
    'shap_diff': np.abs(mean_shap_train - mean_shap_test)
}).sort_values('shap_diff', ascending=False)

print("SHAP Feature Drift Summary:")
print(shap_df)

# Step 5: Plot
plt.figure(figsize=(10, 6))
index = np.arange(len(shap_df))
bar_width = 0.35

plt.bar(index, shap_df['mean_abs_shap_train'], bar_width, label='Train SHAP')
plt.bar(index + bar_width, shap_df['mean_abs_shap_test'], bar_width, label='Test SHAP')

plt.xlabel('Features')
plt.ylabel('Mean |SHAP| Value')
plt.title('SHAP Feature Drift Analysis')
plt.xticks(index + bar_width / 2, shap_df['feature'], rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


ValueError: All arrays must be of the same length