<a href="https://colab.research.google.com/github/ambersus/Interpretable-Machine-Learning/blob/main/quantitative_shap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

df = pd.read_csv("/content/heart_disease_data_imputed.csv")
y = df['num'].apply(lambda x: 1 if x > 0 else 0)
X = df.drop(columns=['num'])


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)


explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)

if isinstance(shap_values, list):
    shap_values_positive = np.array(shap_values[1])
    mean_abs_shap = np.abs(shap_values_positive).mean(axis=0)
else:
    shap_values_array = np.array(shap_values)
    if len(shap_values_array.shape) == 3:
        mean_abs_shap = np.abs(shap_values_array).mean(axis=(0,2))
    else:
        mean_abs_shap = np.abs(shap_values_array).mean(axis=0)

if len(mean_abs_shap) != len(X.columns):
    raise ValueError(f"Dimension mismatch: Expected {len(X.columns)} features, got {len(mean_abs_shap)} SHAP values")

feature_importance = pd.Series(mean_abs_shap, index=X.columns, name='SHAP_importance')


sorted_features = feature_importance.sort_values(ascending=False)
print("\nFeature Importance (SHAP values):")
print(sorted_features)

top_features = sorted_features.index.tolist()

best_acc = 0
optimal_features = []
current_features = []

for feature in top_features:
    if len(current_features) == 10:
        break

    current_features.append(feature)
    X_temp = X_train[current_features]

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    pipe.fit(X_temp, y_train)
    acc = accuracy_score(y_test, pipe.predict(X_test[current_features]))

    if acc > best_acc:
        best_acc = acc
        optimal_features = current_features.copy()



optimal_features = optimal_features[:10]

final_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(
        n_estimators=300,
        max_depth=10,
        min_samples_split=5,
        random_state=42,
        class_weight='balanced'
    ))
])

final_pipe.fit(X_train[optimal_features], y_train)
y_pred = final_pipe.predict(X_test[optimal_features])
y_proba = final_pipe.predict_proba(X_test[optimal_features])[:,1]


def predict_heart_disease():
    """
    Function to collect user input and make a prediction using the trained model
    """
    print("Please provide the following information for heart disease prediction:")


    user_data = {}
    for feature in optimal_features:
        while True:
            try:
                value = float(input(f"Enter value for {feature}: "))
                user_data[feature] = value
                break
            except ValueError:
                print("Please enter a valid number.")


    user_df = pd.DataFrame([user_data])


    prediction = final_pipe.predict(user_df)
    probability = final_pipe.predict_proba(user_df)[:,1][0]


    print("\nPrediction Results:")
    if prediction[0] == 1:
        print("Prediction: High risk of heart disease")
    else:
        print("Prediction: Low risk of heart disease")
    print(f"Probability: {probability:.2%}")


predict_heart_disease()



Feature Importance (SHAP values):
id          0.167710
cp          0.075753
exang       0.046724
oldpeak     0.040056
slope       0.036780
ca          0.032506
age         0.032083
thal        0.029333
dataset     0.027879
thalch      0.026973
chol        0.019358
sex         0.018571
trestbps    0.010798
restecg     0.008373
fbs         0.007056
Name: SHAP_importance, dtype: float64
New best accuracy 0.7663 with features: ['id']
New best accuracy 0.8043 with features: ['id', 'cp']
New best accuracy 0.8424 with features: ['id', 'cp', 'exang']
New best accuracy 0.8533 with features: ['id', 'cp', 'exang', 'oldpeak']
New best accuracy 0.8750 with features: ['id', 'cp', 'exang', 'oldpeak', 'slope', 'ca']
New best accuracy 0.9076 with features: ['id', 'cp', 'exang', 'oldpeak', 'slope', 'ca', 'age']
Please provide the following information for heart disease prediction:
Enter value for id: 87
Enter value for cp: 0
Enter value for exang: 0
Enter value for oldpeak: 2.5
Enter value for slope: 3

In [None]:
import pandas as pd
import numpy as np
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline


df = pd.read_csv("/content/preprocessed_car_claim.csv")


y = df['OUTCOME'].apply(lambda x: 1 if x > 0 else 0)
X = df.drop(columns=['OUTCOME'])


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)


explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_train)


print("\nSHAP Values Structure Information:")
print(f"Type: {type(shap_values)}")
print(f"SHAP values shape: {shap_values.shape}")


mean_shap_values = np.abs(shap_values[:,:,1]).mean(axis=0)

mean_shap_values = mean_shap_values.flatten()


if len(mean_shap_values) != len(X_train.columns):
    print(f"\nDebug: Features ({len(X_train.columns)}): {X_train.columns.tolist()}")
    print(f"Debug: SHAP values shape: {mean_shap_values.shape}")
    if len(mean_shap_values) == 2 * len(X_train.columns):
        mean_shap_values = mean_shap_values[:len(X_train.columns)]
    else:
        raise ValueError(f"SHAP values dimension mismatch. Expected {len(X_train.columns)} features, got {len(mean_shap_values)} SHAP values")


shap_feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'SHAP_Importance': mean_shap_values
}).sort_values(by="SHAP_Importance", ascending=False)


selected_features = shap_feature_importance.head(12)['Feature'].tolist()
print("\nTop 12 Features Selected Based on SHAP Values:")
print(selected_features)


X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

final_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=42))
])


final_pipeline.fit(X_train_selected, y_train)


def predict_from_user_input():
    print("\nPlease enter values for the following features:")
    user_data = {}
    for feature in selected_features:
        value = float(input(f"Enter value for {feature}: "))
        user_data[feature] = [value]

    user_df = pd.DataFrame(user_data)
    prediction = final_pipeline.predict(user_df)
    probability = final_pipeline.predict_proba(user_df)[:, 1]

    print("\nPrediction Results:")
    print(f"Predicted Class: {'Claim' if prediction[0] == 1 else 'No Claim'}")
    print(f"Probability of Claim: {probability[0]:.4f}")


predict_from_user_input()



SHAP Values Structure Information:
Type: <class 'numpy.ndarray'>
SHAP values shape: (7964, 18, 2)

Top 12 Features Selected Based on SHAP Values:
['DRIVING_EXPERIENCE', 'VEHICLE_OWNERSHIP', 'VEHICLE_YEAR', 'POSTAL_CODE', 'AGE', 'GENDER', 'SPEEDING_VIOLATIONS', 'PAST_ACCIDENTS', 'CREDIT_SCORE', 'MARRIED', 'ANNUAL_MILEAGE', 'ID']

Please enter values for the following features:
Enter value for DRIVING_EXPERIENCE: 0
Enter value for VEHICLE_OWNERSHIP: 0
Enter value for VEHICLE_YEAR: 2000
Enter value for POSTAL_CODE: 10238
Enter value for AGE: 16
Enter value for GENDER: 0
Enter value for SPEEDING_VIOLATIONS: 3
Enter value for PAST_ACCIDENTS: 3
Enter value for CREDIT_SCORE: 0.87
Enter value for MARRIED: 0
Enter value for ANNUAL_MILEAGE: 0.18
Enter value for ID: 567876

Prediction Results:
Predicted Class: Claim
Probability of Claim: 0.5300
