In [1]:
!pip install shap

Collecting shap
  Downloading shap-0.46.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.46.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (540 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.1/540.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.46.0 slicer-0.0.8


In [4]:
import shap
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone

def create_churn_prediction_pipeline(df):
    numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()

    if 'churn' in numeric_features:
        numeric_features.remove('churn')
    if 'churn' in categorical_features:
        categorical_features.remove('churn')

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))  # 'ignore' is crucial here
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    classifier = RandomForestClassifier(n_estimators=100, random_state=42)

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])

    return pipeline


def train_model_and_compute_shap(pipeline, df):
    X = df.drop('churn', axis=1)
    y = df['churn']

    pipeline.fit(X, y)

    X_preprocessed = pipeline.named_steps['preprocessor'].transform(X)
    model = pipeline.named_steps['classifier']

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_preprocessed)

    return pipeline, explainer, shap_values, X_preprocessed


def update_pipeline_for_dominant_features(pipeline, dominant_features):
    preprocessor = pipeline.named_steps['preprocessor']

    numeric_features = [f for f in preprocessor.transformers[0][2] if f in dominant_features]
    categorical_features = [f for f in preprocessor.transformers[1][2] if f in dominant_features]

    new_pipeline = clone(pipeline)

    # Clone the transformers
    new_numeric_transformer = clone(new_pipeline.named_steps['preprocessor'].transformers[0][1])
    new_categorical_transformer = clone(new_pipeline.named_steps['preprocessor'].transformers[1][1])

    new_preprocessor = ColumnTransformer(
        transformers=[
            ('num', new_numeric_transformer, numeric_features),
            ('cat', new_categorical_transformer, categorical_features)
        ]
    )

    new_pipeline.named_steps['preprocessor'] = new_preprocessor
    return new_pipeline



def get_dominant_features(shap_values, X_preprocessed, feature_names, top_n=5):
    mean_abs_shap = np.mean(np.abs(shap_values[1]), axis=0)
    if len(feature_names) != len(mean_abs_shap):
      if len(feature_names) < len(mean_abs_shap):
        # If feature names are shorter, truncate mean_abs_shap
        mean_abs_shap = mean_abs_shap[:len(feature_names)]
      else:
        # If mean_abs_shap is shorter, truncate feature_names
        feature_names = feature_names[:len(mean_abs_shap)]

    shap_importance = pd.DataFrame({
        'feature': feature_names,
        'mean_abs_shap': mean_abs_shap
    })

    shap_importance = shap_importance.sort_values(by='mean_abs_shap', ascending=False)

    dominant_features = shap_importance.head(top_n)['feature'].tolist()

    return dominant_features

# Sample Data (Replace with your actual data)
data1 = pd.DataFrame({
    'age': [25, 30, 35, 40],
    'tenure': [2, 5, 8, 10],
    'gender': ['Male', 'Female', 'Male', 'Female'],
    'partner': ['Yes', 'No', 'Yes', 'Yes'],
    'churn': [True, False, True, False]
})

pipeline = create_churn_prediction_pipeline(data1)  # Create the pipeline

pipeline, explainer, shap_values, X_preprocessed = train_model_and_compute_shap(pipeline, data1)

feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

dominant_features = get_dominant_features(shap_values, X_preprocessed, feature_names, top_n=2)
updated_pipeline = update_pipeline_for_dominant_features(pipeline, dominant_features)

print(updated_pipeline)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'tenure']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unkno

In [6]:
updated_pipeline.fit(data1.drop('churn', axis=1), data1['churn'])

new_user = pd.DataFrame({
    'age': [32],
    'tenure': [3],
    'gender': ['Female'],
    'partner': ['No']
})

prediction = updated_pipeline.predict(new_user)
if prediction[0] == 1:
    print("The new user is likely to churn.")
else:
    print("The new user is likely to stay.")

The new user is likely to stay.
