### Using SHAP for Feature Drift Analysis
**Description**: Utilize SHapley Additive exPlanations (SHAP) values to analyze feature
importance changes over time, indicating feature drift.

In [2]:
pip install shap

Defaulting to user installation because normal site-packages is not writeable
Collecting shap
  Downloading shap-0.47.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Collecting numba>=0.54 (from shap)
  Downloading numba-0.61.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.8 kB)
Collecting cloudpickle (from shap)
  Downloading cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting llvmlite<0.45,>=0.44.0dev0 (from numba>=0.54->shap)
  Downloading llvmlite-0.44.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)
Downloading shap-0.47.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (992 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m992.3/992.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading slicer-0.0.8-py3-none-any.whl (15 kB)
Download

In [5]:
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import NotFittedError

def generate_data():
    np.random.seed(42)
    train_df = pd.DataFrame({
        'feature1': np.random.normal(0, 1, 1000),
        'feature2': np.random.normal(5, 1, 1000),
        'feature3': np.random.randint(0, 2, 1000),
        'label': np.random.randint(0, 2, 1000)
    })
    test_df = pd.DataFrame({
        'feature1': np.random.normal(0.5, 1, 1000),
        'feature2': np.random.normal(6, 1, 1000),
        'feature3': np.random.randint(0, 2, 1000),
        'label': np.random.randint(0, 2, 1000)
    })
    return train_df, test_df

def validate_data(train_df, test_df):
    if train_df.empty or test_df.empty:
        raise ValueError("One or both input DataFrames are empty.")
    required_columns = {'feature1', 'feature2', 'feature3', 'label'}
    if not required_columns.issubset(train_df.columns) or not required_columns.issubset(test_df.columns):
        raise ValueError("Missing required columns in input data.")

def analyze_shap_drift(train_df, test_df):
    try:
        validate_data(train_df, test_df)

        X_train = train_df.drop(columns=['label'])
        y_train = train_df['label']
        X_test = test_df.drop(columns=['label'])

        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        explainer = shap.TreeExplainer(model)

        shap_values_train = explainer.shap_values(X_train)[1]
        shap_values_test = explainer.shap_values(X_test)[1]

        mean_train = np.abs(shap_values_train).mean(axis=0)
        mean_test = np.abs(shap_values_test).mean(axis=0)

        shap_df = pd.DataFrame({
            'feature': X_train.columns,
            'mean_abs_shap_train': mean_train,
            'mean_abs_shap_test': mean_test,
            'shap_diff': np.abs(mean_train - mean_test)
        }).sort_values('shap_diff', ascending=False)

        return shap_df

    except NotFittedError:
        raise RuntimeError("Model training failed.")
    except Exception as e:
        raise RuntimeError(f"SHAP analysis failed: {str(e)}")


In [6]:
def plot_shap_drift(shap_df):
    try:
        plt.figure(figsize=(10, 6))
        index = np.arange(len(shap_df))
        bar_width = 0.35

        plt.bar(index, shap_df['mean_abs_shap_train'], bar_width, label='Train SHAP')
        plt.bar(index + bar_width, shap_df['mean_abs_shap_test'], bar_width, label='Test SHAP')

        plt.xlabel('Features')
        plt.ylabel('Mean |SHAP| Value')
        plt.title('SHAP Feature Drift Analysis')
        plt.xticks(index + bar_width / 2, shap_df['feature'], rotation=45)
        plt.legend()
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"Plotting failed: {str(e)}")


In [7]:
import unittest

class TestShapDrift(unittest.TestCase):
    def setUp(self):
        self.train_df, self.test_df = generate_data()

    def test_data_validation_success(self):
        try:
            validate_data(self.train_df, self.test_df)
        except Exception:
            self.fail("validate_data() raised Exception unexpectedly!")

    def test_empty_data(self):
        with self.assertRaises(ValueError):
            validate_data(pd.DataFrame(), pd.DataFrame())

    def test_missing_columns(self):
        df = pd.DataFrame({'a': [1], 'b': [2]})
        with self.assertRaises(ValueError):
            validate_data(df, df)

    def test_shap_analysis_output(self):
        shap_df = analyze_shap_drift(self.train_df, self.test_df)
        self.assertFalse(shap_df.empty)
        self.assertIn('shap_diff', shap_df.columns)

if __name__ == "__main__":
    unittest.main(argv=[''], exit=False)


...E
ERROR: test_shap_analysis_output (__main__.TestShapDrift)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_6825/736230695.py", line 50, in analyze_shap_drift
    shap_df = pd.DataFrame({
  File "/home/vscode/.local/lib/python3.10/site-packages/pandas/core/frame.py", line 733, in __init__
    mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
  File "/home/vscode/.local/lib/python3.10/site-packages/pandas/core/internals/construction.py", line 503, in dict_to_mgr
    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
  File "/home/vscode/.local/lib/python3.10/site-packages/pandas/core/internals/construction.py", line 114, in arrays_to_mgr
    index = _extract_index(arrays)
  File "/home/vscode/.local/lib/python3.10/site-packages/pandas/core/internals/construction.py", line 677, in _extract_index
    raise ValueError("All arrays must be of the sa