## Install libraries

In [None]:
# pip install torch

In [None]:
# pip install dice-ml

## Import libraries

In [None]:
# %% Imports
from torch.utils.data import DataLoader
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score

import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [None]:
# path = '/content/data/healthcare-dataset-stroke-data.csv'

## Load the data

In [None]:
# %% Custom DataLoader
class CustomDataLoader:
    """Load a CSV dataset, preprocess it and hand out train/test splits.

    Intended call order: ``load_dataset()`` -> ``preprocess_data()`` ->
    ``get_data_split()`` -> ``oversample()``.
    """

    def __init__(self, filepath):
        """Remember where the CSV lives; nothing is read until ``load_dataset()``.

        Args:
            filepath: Path (or any object accepted by ``pd.read_csv``) of the CSV file.
        """
        # Source of the CSV read by load_dataset().
        self.filepath = filepath
        # Current state of the dataset as a DataFrame; None until load_dataset() runs.
        self.data = None

    def load_dataset(self):
        """Read the CSV at ``self.filepath`` into ``self.data``."""
        self.data = pd.read_csv(self.filepath)

    def preprocess_data(self, drop_columns=("id",)):
        """Drop incomplete rows and identifier columns, then one-hot encode.

        The result replaces ``self.data``; the previously loaded frame object is
        left untouched (no in-place mutation), so re-running the step on a kept
        reference to the raw frame stays possible.

        Args:
            drop_columns: Column names removed before encoding. Names that are
                not present are silently ignored. Pass ``()`` to keep every
                column.
        """
        # Rows are filtered first so the set of kept rows does not depend on
        # which columns are dropped afterwards.
        complete_rows = self.data.dropna()
        # NOTE(review): the stroke CSV is expected to carry a row-identifier
        # column named `id` -- confirm against the file. A row identifier has no
        # predictive meaning and must not be perturbed by the counterfactual
        # explainer, so it is removed before it can become a feature.
        without_ids = complete_rows.drop(columns=list(drop_columns), errors="ignore")
        self.data = pd.get_dummies(without_ids)

    def get_data_split(self, test_size=0.2, random_state=42):
        """Split ``self.data`` into train and test parts with ``stroke`` as target.

        Args:
            test_size: Forwarded to ``train_test_split``.
            random_state: Forwarded to ``train_test_split``.

        Returns:
            The value of ``train_test_split(X, y, ...)``, unpacked by callers as
            ``X_train, X_test, y_train, y_test``.
        """
        X = self.data.drop('stroke', axis=1)
        y = self.data['stroke']
        return train_test_split(X, y, test_size=test_size, random_state=random_state)

    def oversample(self, X_train, y_train, random_state=42):
        """Resample the training data with imblearn's SMOTE.

        Args:
            X_train: Training features.
            y_train: Training labels.
            random_state: Seed handed to ``SMOTE``; defaults to the previously
                hard-coded value 42.

        Returns:
            The ``(X_res, y_res)`` pair produced by ``SMOTE.fit_resample``.
        """
        smote = SMOTE(random_state=random_state)
        X_res, y_res = smote.fit_resample(X_train, y_train)
        return X_res, y_res

# %% Load and preprocess data
# Build the loader for the stroke CSV, read it into memory, then preprocess it.
# Order matters: preprocess_data() operates on the frame that load_dataset() stores.
data_loader = CustomDataLoader('/content/data/healthcare-dataset-stroke-data.csv')
data_loader.load_dataset()
data_loader.preprocess_data()

## Train-Test data split

In [None]:
# Hold out a test set first, so that only the training part is oversampled.
X_train, X_test, y_train, y_test = data_loader.get_data_split()
X_train, y_train = data_loader.oversample(X_train, y_train)

# Give every part a fresh 0..n-1 index so positions and index labels coincide.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in (X_train, X_test, y_train, y_test)
)


## Random Forest Classifier

In [None]:
# %% Fit blackbox model
# The forest is seeded so that the reported scores -- and the predictions the
# counterfactual cells below explain -- are the same on every Restart & Run All.
# 42 matches the seed already used for the split and for SMOTE.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the held-out (not oversampled) test set.
y_pred = rf.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")

In [None]:
# Wrap labels and predictions as Series so both can be filtered the same way
y_test_series, y_pred_series = pd.Series(y_test), pd.Series(y_pred)

# Index labels of the rows whose true / predicted class is 1
test_indices = y_test_series.index[(y_test_series == 1).to_numpy()].tolist()
pred_indices = y_pred_series.index[(y_pred_series == 1).to_numpy()].tolist()

print("Test indices:", test_indices)
print("Prediction indices:", pred_indices)

## Create Counterfactual Explanations

In [None]:
# %% Create diverse counterfactual explanations
import dice_ml

# DiCE dataset wrapper around the preprocessed frame. The listed features are
# declared continuous (this drives the perturbation strategy); 'stroke' is the
# outcome column.
data_dice = dice_ml.Data(
    dataframe=data_loader.data,
    outcome_name='stroke',
    continuous_features=['age', 'avg_glucose_level', 'bmi'],
)

## Creating the Data and Model Objects for DiCE (Diverse Counterfactual Explanations)

In [None]:
# Wrap the fitted random forest for DiCE; "sklearn" selects the scikit-learn
# backend (other backends exist for tf, torch, ...).
rf_dice = dice_ml.Model(backend="sklearn", model=rf)

# Explainer over the data/model pair. "random" selects random sampling; other
# methods include a genetic algorithm and kd-tree.
explainer = dice_ml.Dice(data_dice, rf_dice, method="random")

## Generating and Visualizing Counterfactual Explanations:

In [None]:
# %% Create explanation
# Query instance: the test row at position 10, kept as a one-row DataFrame.
input_datapoint = X_test.iloc[10:11]

# Request three counterfactuals for that row, targeting the opposite class of
# the blackbox model's prediction.
cf = explainer.generate_counterfactuals(
    input_datapoint,
    desired_class="opposite",
    total_CFs=3,
)

In [None]:
# Show the query instance the counterfactuals were generated for (test row 10),
# so the row displayed here is the same one that is explained.
print(input_datapoint)

In [None]:
# Visualize it
# Going by the argument name, show_only_changes=True limits the table to the
# feature values that changed; pass show_only_changes=False for the full rows.
cf.visualize_as_dataframe(show_only_changes=True)

## Creating Feasible (Conditional) Counterfactuals

In [None]:
# Index labels of the test rows whose age exceeds 70
indices_above_70 = X_test.query("age > 70").index.tolist()

print("Indices of people whose age is above 70:", indices_above_70)

In [None]:
# %% Create feasible (conditional) Counterfactuals
# Only these columns may be changed when searching for counterfactuals.
features_to_vary = ['avg_glucose_level', 'bmi', 'smoking_status_smokes']
# Allowed [lower, upper] bounds for the continuous features that may vary.
permitted_range = {'avg_glucose_level': [40, 300], 'bmi': [15, 45]}

# Position of the test row to explain.
i = 139
input_datapoint2 = X_test.iloc[i:i + 1]

print("Label of test data: ", y_test[i])
print(input_datapoint2.to_string(index=False))

# Generate counterfactuals restricted to the features and ranges defined above.
cf = explainer.generate_counterfactuals(
    input_datapoint2,
    total_CFs=10,
    desired_class="opposite",
    features_to_vary=features_to_vary,
    permitted_range=permitted_range,
)

# Visualize it
cf.visualize_as_dataframe(show_only_changes=True)

In [None]:
# Ground-truth label of the test row selected by `i` in the conditional
# counterfactual cell above.
print(y_test[i])