In [1]:
# Import necessary libraries
import warnings

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier

# Suppress warnings
warnings.filterwarnings('ignore')

In [2]:
# --- Data loading -----------------------------------------------------------
# Read the development set; adjust the relative path if the notebook is moved.
data = pd.read_csv('../2 - Data/develop.csv')
target = 'Ins'  # name of the label column

# Separate the label from the predictors, then one-hot encode the categorical
# predictors (dropping the first level of each to avoid redundant columns).
y = data[target]
X = pd.get_dummies(data.drop(columns=[target]), drop_first=True)

# --- Hold-out split ---------------------------------------------------------
# 80/20 split, stratified on the label so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# --- Class balancing --------------------------------------------------------
# Oversample the training part only; the test part is left untouched.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [3]:
# Define parameter grid for Random Forest
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize Random Forest
rf = RandomForestClassifier(random_state=42)

# Oversample *inside* the pipeline. Running the search on data that was
# SMOTE-resampled up front lets synthetic points interpolated from training-fold
# rows land in the validation folds, which inflates the cross-validated AUC.
# With the sampler as a pipeline step it is refit on the training folds of each
# split and the validation fold is scored as-is.
rf_pipeline = Pipeline(steps=[('smote', SMOTE(random_state=42)), ('rf', rf)])

# Same grid as rf_params, re-keyed so it addresses the 'rf' pipeline step.
rf_param_grid = {f'rf__{name}': values for name, values in rf_params.items()}

# Cross-validation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform GridSearchCV for Random Forest on the original (non-resampled) training data
rf_grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)
rf_grid_search.fit(X_train, y_train)

# Best parameters and model.
# The refit pipeline oversamples the full training split before fitting, so the
# extracted step is a plain fitted RandomForestClassifier.
best_rf = rf_grid_search.best_estimator_.named_steps['rf']
best_rf_params = {
    name.split('__', 1)[1]: value
    for name, value in rf_grid_search.best_params_.items()
}
print(f"Best Random Forest Parameters: {best_rf_params}")

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Random Forest Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


In [5]:
# Define parameter grid for XGBoost
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_child_weight': [1, 3, 5]
}

# Initialize XGBoost
xgb = XGBClassifier(random_state=42)

# Oversample *inside* the pipeline so SMOTE is refit on the training folds of
# each CV split. Searching on data that was resampled before splitting leaks
# synthetic neighbours of training rows into the validation folds and inflates
# the cross-validated AUC.
xgb_pipeline = Pipeline(steps=[('smote', SMOTE(random_state=42)), ('xgb', xgb)])

# Same grid as xgb_params, re-keyed so it addresses the 'xgb' pipeline step.
xgb_param_grid = {f'xgb__{name}': values for name, values in xgb_params.items()}

# Perform GridSearchCV for XGBoost on the original (non-resampled) training data
xgb_grid_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)
xgb_grid_search.fit(X_train, y_train)

# Best parameters and model.
# The refit pipeline oversamples the full training split before fitting, so the
# extracted step is a plain fitted XGBClassifier.
best_xgb = xgb_grid_search.best_estimator_.named_steps['xgb']
best_xgb_params = {
    name.split('__', 1)[1]: value
    for name, value in xgb_grid_search.best_params_.items()
}
print(f"Best XGBoost Parameters: {best_xgb_params}")

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best XGBoost Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 300}


In [6]:
# Score the held-out test set with both tuned models.
# These were previously read from variables that no cell in the notebook
# defines, so the cell failed on a fresh kernel. Column 1 of predict_proba is
# the probability of the positive class.
rf_pred = best_rf.predict_proba(X_test)[:, 1]
xgb_pred = best_xgb.predict_proba(X_test)[:, 1]

# Ensemble Model (Simple Averaging)
ensemble_pred = (rf_pred + xgb_pred) / 2

# Hard labels at the 0.5 cut-off, computed once and reused by every
# threshold-dependent metric below.
ensemble_label = (ensemble_pred > 0.5).astype(int)

ensemble_auc = roc_auc_score(y_test, ensemble_pred)
ensemble_acc = accuracy_score(y_test, ensemble_label)

# Print Results
print(f"Ensemble Model AUC: {ensemble_auc:.3f}")
print(f"Ensemble Model Accuracy: {ensemble_acc:.3f}")

# Classification Report
print("\nClassification Report:\n", classification_report(y_test, ensemble_label))

# Confusion Matrix
cm = confusion_matrix(y_test, ensemble_label)
print("Confusion Matrix:")
print(cm)

Ensemble Model AUC: 0.814
Ensemble Model Accuracy: 0.757

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.85      0.82      4218
           1       0.67      0.58      0.62      2235

    accuracy                           0.76      6453
   macro avg       0.73      0.71      0.72      6453
weighted avg       0.75      0.76      0.75      6453

Confusion Matrix:
[[3593  625]
 [ 943 1292]]


In [None]:
import shap


def _positive_class_shap(values):
    """Return a 2-D (rows, features) SHAP array for the positive class.

    Depending on the shap release, a binary classifier's SHAP values come back
    as a list with one array per class, as a 3-D array whose last axis is the
    class, or as a plain 2-D array. All three are reduced to the same 2-D shape
    so the per-model arrays can be averaged element-wise.
    """
    if isinstance(values, list):
        return np.asarray(values[1])
    values = np.asarray(values)
    if values.ndim == 3:
        return values[:, :, 1]
    return values


# Use a uniform float matrix: get_dummies may emit bool columns, and a frame of
# mixed bool/numeric columns turns into an object array once converted to NumPy.
X_explain = X_test.astype(float)

# Initialize SHAP for each tuned model (the estimators selected by the grid searches)
rf_explainer = shap.TreeExplainer(best_rf)
xgb_explainer = shap.TreeExplainer(best_xgb)

# Calculate SHAP values for each model, reduced to the positive class
rf_shap_values = _positive_class_shap(rf_explainer.shap_values(X_explain))
xgb_shap_values = _positive_class_shap(xgb_explainer.shap_values(X_explain))

# Average SHAP values for ensemble
# NOTE(review): by default the two explainers may report on different scales
# (probability for the forest, raw margin for XGBoost), so this average is a
# rough importance ranking rather than an exact attribution — confirm before
# quoting magnitudes.
ensemble_shap_values = (rf_shap_values + xgb_shap_values) / 2

# SHAP summary plot
shap.summary_plot(ensemble_shap_values, X_explain, plot_type="bar")

# SHAP dependence plot for a specific feature; change "CD" to inspect another column
shap.dependence_plot("CD", ensemble_shap_values, X_explain)

In [None]:
from alibi.explainers import AnchorTabular


def ensemble_predict(features):
    """Predict hard 0/1 labels with the averaged RF/XGBoost ensemble.

    Accepts a 2-D array whose columns are in the same order as ``X_train`` and
    returns an integer array with one label per row, using the 0.5 cut-off on
    the mean positive-class probability of the two tuned models.
    """
    # Restore the column names so both models see the layout they were fitted on.
    frame = pd.DataFrame(features, columns=X_train.columns)
    positive = (
        best_rf.predict_proba(frame)[:, 1] + best_xgb.predict_proba(frame)[:, 1]
    ) / 2
    return (positive > 0.5).astype(int)


# The predictor is passed positionally because its keyword name differs between
# alibi releases.
explainer = AnchorTabular(ensemble_predict, feature_names=X_train.columns.tolist())

# Cast to float so the explainer receives a numeric matrix even when the
# one-hot columns are stored as bool.
explainer.fit(X_train.to_numpy(dtype=float), disc_perc=(25, 50, 75))  # Discretize features
explanation = explainer.explain(X_test.to_numpy(dtype=float)[0])
print(explanation)

In [None]:
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel


def ensemble_predict_proba(features):
    """Return class probabilities from the averaged RF/XGBoost ensemble.

    Accepts a 2-D array or DataFrame with the same columns as ``X_train`` and
    returns an array of shape (rows, 2): column 0 is the negative-class
    probability, column 1 the positive-class probability.
    """
    # Restore the column names so both models see the layout they were fitted on.
    frame = pd.DataFrame(features, columns=X_train.columns)
    positive = (
        best_rf.predict_proba(frame)[:, 1] + best_xgb.predict_proba(frame)[:, 1]
    ) / 2
    return np.column_stack([1 - positive, positive])


# Feature importance of the ensemble, measured on the held-out test set.
interpreter = Interpretation(X_test, feature_names=X_test.columns)
model = InMemoryModel(ensemble_predict_proba, examples=X_train)
plots = interpreter.feature_importance.plot_feature_importance(model)

In [None]:
from dice_ml import Data, Dice, Model


class _EnsembleClassifier:
    """Minimal sklearn-style classifier that averages the tuned RF and XGBoost models."""

    def predict_proba(self, features):
        """Return an (rows, 2) array of [negative, positive] class probabilities."""
        positive = (
            best_rf.predict_proba(features)[:, 1]
            + best_xgb.predict_proba(features)[:, 1]
        ) / 2
        return np.column_stack([1 - positive, positive])

    def predict(self, features):
        """Return hard 0/1 labels using the 0.5 cut-off on the positive class."""
        return (self.predict_proba(features)[:, 1] > 0.5).astype(int)


# Describe this notebook's own training data to DiCE: encoded features plus the
# label column. (The bundled adult-income sample that was loaded here before is
# an unrelated dataset.)
dice_frame = X_train.astype(float).assign(**{target: y_train})

# NOTE(review): every encoded column is declared continuous so the values reach
# the models unchanged; counterfactuals may therefore suggest fractional values
# for one-hot columns — confirm this is acceptable or restrict the varied features.
dice_data = Data(
    dataframe=dice_frame,
    continuous_features=X_train.columns.tolist(),
    outcome_name=target,
)

# Wrap the ensemble in DiCE's model interface.
dice_model = Model(model=_EnsembleClassifier(), backend="sklearn")

# Initialize DICE with the model and data
d = Dice(dice_data, dice_model, method="random")
explanation = d.generate_counterfactuals(
    X_test.iloc[0:1].astype(float), total_CFs=5, desired_class="opposite"
)
explanation.visualize_as_dataframe()