In [None]:
#############################################################################################################
##### Notebook Explainability
##### Baseado em:
##  Dataset: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset
##
##############################################################################################################
## Objetivos:
##   Demonstrar os principais metodos de explainability

In [3]:
#!pip install xgboost
#!pip install dice-ml
!pip install interpret

Collecting interpret
  Downloading interpret-0.6.3-py3-none-any.whl.metadata (1.1 kB)
Collecting interpret-core==0.6.3 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret)
  Downloading interpret_core-0.6.3-py3-none-any.whl.metadata (2.8 kB)
Collecting shap>=0.28.5 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret)
  Downloading shap-0.44.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting SALib>=1.3.3 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret)
  Downloading salib-1.4.8-py3-none-any.whl.metadata (11 kB)
Collecting aplr>=10.5.1 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret)
  Downloading aplr-10.6.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB)
Collecting dash>=1.0.0 (from interpret-core[apl

In [4]:
import pandas as pd
import imblearn
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from interpret.blackbox import LimeTabular
from interpret import show

pd.set_option('display.max_columns', None)

import shap
import dice_ml

In [5]:
class DataLoader():
    """Load, preprocess, split and rebalance the stroke-prediction dataset.

    Intended call order: ``load_dataset()`` -> ``preprocess_data()`` ->
    ``get_data_split()`` -> ``oversample()``. ``preprocess_data()`` is meant
    to be called once per loaded dataset (it removes the columns it encodes).
    """

    def __init__(self):
        # Filled in by load_dataset(); None means "nothing loaded yet".
        self.data = None

    def load_dataset(self, path="C:/Users/dealbuqc/Desktop/ontomqol/Datasets/stroke/healthcare-dataset-stroke-data.csv"):
        """Read the CSV file at ``path`` into ``self.data``.

        NOTE(review): the default is a machine-specific absolute path, kept
        only for backward compatibility. Pass ``path`` explicitly when running
        anywhere else.
        """
        self.data = pd.read_csv(path)

    def _require_data(self):
        """Return ``self.data``, raising a clear error if nothing was loaded."""
        if self.data is None:
            raise RuntimeError("No dataset loaded: call load_dataset() first.")
        return self.data

    def preprocess_data(self):
        """One-hot encode the categorical columns, drop ``id`` and fill missing BMI.

        The result replaces ``self.data``. Column order is: the one-hot
        columns first, then the remaining original columns in their original
        order, so the last original column stays last.

        Raises:
            RuntimeError: if no dataset has been loaded.
        """
        data = self._require_data()

        # One-hot encode every categorical column
        categorical_cols = ["gender",
                            "ever_married",
                            "work_type",
                            "Residence_type",
                            "smoking_status"]
        encoded = pd.get_dummies(data[categorical_cols],
                                 prefix=categorical_cols, dtype=float)

        # Replace the categorical columns with their encoded versions and
        # drop `id`, which carries no predictive information.
        remaining = data.drop(columns=categorical_cols + ["id"])
        processed = pd.concat([encoded, remaining], axis=1)

        # Fill missing BMI values.
        # NOTE(review): 0 is not a plausible BMI; a median fill may be a better
        # choice, but the original constant is kept so results do not change.
        processed["bmi"] = processed["bmi"].fillna(0)

        self.data = processed

    def get_data_split(self, test_size=0.20, random_state=2021):
        """Split ``self.data`` into train and test sets.

        The last column is used as the target and all other columns as
        features.

        Args:
            test_size: fraction of rows held out for testing.
            random_state: seed forwarded to ``train_test_split``.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` as returned by
            ``train_test_split``.

        Raises:
            RuntimeError: if no dataset has been loaded.
        """
        data = self._require_data()
        X = data.iloc[:, :-1]
        y = data.iloc[:, -1]
        return train_test_split(X, y, test_size=test_size, random_state=random_state)

    def oversample(self, X_train, y_train, random_state=2021):
        """Randomly oversample the minority class of a training set.

        Args:
            X_train: feature DataFrame.
            y_train: target Series.
            random_state: seed for the sampler, so the resampled set is
                reproducible (defaults to the same seed as the split).

        Returns:
            ``(x_over, y_over)``: a DataFrame with the columns of ``X_train``
            and a Series named like ``y_train``, both with a fresh index.
        """
        sampler = RandomOverSampler(sampling_strategy='minority',
                                    random_state=random_state)
        # Resample on plain numpy arrays, then rebuild the pandas objects
        x_np, y_np = sampler.fit_resample(X_train.to_numpy(), y_train.to_numpy())
        x_over = pd.DataFrame(x_np, columns=X_train.columns)
        y_over = pd.Series(y_np, name=y_train.name)
        return x_over, y_over

In [6]:
# Load and preprocess the data
data_loader = DataLoader()
data_loader.load_dataset()
data_loader.preprocess_data()

# Split into training and evaluation sets, then oversample the training set
X_train, X_test, y_train, y_test = data_loader.get_data_split()
X_train, y_train = data_loader.oversample(X_train, y_train)
print(X_train.shape, X_test.shape, sep="\n")

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/dealbuqc/Desktop/ontomqol/Datasets/stroke/healthcare-dataset-stroke-data.csv'

In [None]:
# %% Train the black-box model (any classifier could be used here)
# A fixed seed keeps the forest, and every explanation derived from it,
# reproducible across runs. 2021 is the same seed used for the train/test split.
rf = RandomForestClassifier(random_state=2021)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")

In [None]:
# Apply LIME

# Classification: predict_proba. Regression: predict.
# The training set is passed in because it is needed to generate the perturbations.
lime = LimeTabular(rf, X_train)

# Build the local explanations for the last 20 test rows and display them
lime_local = lime.explain_local(
    X_test.tail(20),
    y_test.tail(20),
    name='LIME',
)
show(lime_local)

In [None]:
# Apply SHAP

import xgboost

# Fit a 20-estimator XGBoost classifier on the training data
model = xgboost.XGBClassifier(n_estimators=20).fit(X_train, y_train)

# Compute SHAP values over the training set and summarise them in a beeswarm plot
explainer = shap.Explainer(model)
shap_values = explainer(X_train)
shap.plots.beeswarm(shap_values)

In [None]:
# Apply DiCE (Diverse Counterfactual Explanations)

# Dataset wrapper: the continuous features must be named explicitly
# (they are the ones perturbed as numeric values).
data_dice = dice_ml.Data(
    dataframe=data_loader.data,
    continuous_features=['age', 'avg_glucose_level', 'bmi'],
    outcome_name='stroke',
)

In [None]:
# Model wrapper (other backends: tf, torch, ...)
rf_dice = dice_ml.Model(model=rf, backend="sklearn")

# Counterfactual search method: random sampling, genetic algorithm, kd-tree, ...
# See github.com/interpretml/DICE for other options, including deep-learning models.
explainer = dice_ml.Dice(data_dice, rf_dice, method="random")

In [7]:
# %% Create explanations
# Explain the first row of the test set (kept as a one-row DataFrame)
input_datapoint = X_test.head(1)
cf = explainer.generate_counterfactuals(
    input_datapoint,
    total_CFs=3,
    desired_class="opposite",
)

# Visualize
cf.visualize_as_dataframe(show_only_changes=True)


NameError: name 'X_test' is not defined

In [None]:
# %% Create conditional counterfactuals
# Only these features may change, and two of them only within the given ranges
features_to_vary = ['avg_glucose_level', 'bmi', 'smoking_status_smokes']
permitted_range = {
    'avg_glucose_level': [80, 250],
    'bmi': [18, 35],
}

cf = explainer.generate_counterfactuals(
    input_datapoint,
    total_CFs=3,
    desired_class="opposite",
    features_to_vary=features_to_vary,
    permitted_range=permitted_range,
)
# Visualization
cf.visualize_as_dataframe(show_only_changes=True)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input, decode_predictions
from tensorflow.keras.applications.vgg16 import VGG16
from innvestigate import create_analyzer

# Load the VGG16 model with its pre-trained ImageNet weights
# NOTE(review): this rebinds `model`, which an earlier cell assigned to the XGBoost classifier.
model = VGG16(weights='imagenet')

In [None]:
# Load a sample image, resized to 224x224
img_path = 'C:/Users/dealbuqc/Desktop/ontomqol/Datasets/brain-mri/Testing/glioma_tumor/image(1).jpg'
img = image.load_img(img_path, target_size=(224, 224))

# Convert to an array, add a leading batch axis, then apply the VGG16 preprocessing
x = preprocess_input(np.expand_dims(image.img_to_array(img), axis=0))

In [None]:
# Display the loaded image (the resized PIL image, before preprocessing)
plt.imshow(img)

In [None]:
# Classify the image and print the top-3 decoded predictions
preds = model.predict(x)
print('Predicted:', decode_predictions(preds, top=3)[0])

In [None]:
## THIS CODE WILL RAISE AN ERROR. TRY TO UNDERSTAND WHY!

# Create an analyzer: the LRP variant is left commented out and the
# "gradient" analyzer is the one actually used below.
#analyzer = create_analyzer("lrp.z", model)
analyzer = create_analyzer("gradient", model)

# Apply the analyzer to the image
analysis = analyzer.analyze(x)

# Plot the heatmap
plt.imshow(analysis.squeeze(), cmap='viridis')
plt.colorbar()
plt.show()

# For a correct implementation (in PyTorch), see the link below:
# https://www.kaggle.com/code/gustavkeppler/layer-wise-relevance-propagation-lrp-on-vgg16

In [None]:
# Ver um demo em: 
### https://lrpserver.hhi.fraunhofer.de/image-classification