In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Change the working directory to the notebook folder. The path below is a placeholder that
# must be replaced before running. NOTE(review): presumably this is what makes the local
# `configuration` and `utils` modules imported further down resolvable -- confirm.
%cd /#YOUR PATH TO THE NOTEBOOK FOLDER IN GOOGLE COLAB

In [None]:
!pip install shap
!pip install umap-learn

In [None]:
from pathlib import Path
import numpy as np
import random
import pandas as pd
import joblib
import itertools
import shap
import tensorflow as tf
# Single seed shared by every random number generator seeded below; change it in one place.
SEED = 42
# Seed Python's built-in random module
random.seed(SEED)
# Seed NumPy's global random generator
np.random.seed(SEED)
# Seed TensorFlow's global random generator
tf.random.set_seed(SEED)

In [None]:
from configuration import data_path as DATA_PATH
from configuration import results_root as RESULTS_ROOT
from configuration import results_path as RESULTS_PATH
from configuration import model_name as MODEL_NAME
from configuration import hyperparameter_tuning as HYPERPARAMETER_TUNNING
from configuration import feature_selection as FEATURE_SELECTION

from configuration import batchsize as BATCH_SIZE
from configuration import epochs as EPOCHS

In [None]:
from utils import load_data, load_selected_features, load_hyperparameters, init_model, dimensionality_reduction_2D

In [None]:
# Echo the imported configuration values as the cell output (bare last expression).
DATA_PATH, RESULTS_ROOT, MODEL_NAME

In [None]:
# Display settings for printed DataFrames: no limit on the number of columns shown,
# and cell contents truncated to a width of 10 characters.
pd.set_option(
    'display.max_columns', None,
    'display.max_colwidth', 10,
)

FOOTPRINT

In [None]:
class Footprint:
  """Train a multi-target regression model and derive Shapley-value based meta-representations.

  Typical call order: ``train_model`` -> ``calculate_shapley_values`` ->
  ``calculate_shapley_feature_importance`` -> ``concat_metarepresentations`` ->
  ``metarepresentations_2d``. Each step stores its result on the instance.

  Attributes:
    model_name: Name of the model; ``"neural_network"`` is fitted with batch size and epochs.
    hyperparameter_tuning: Forwarded to ``load_hyperparameters``.
    feature_selection: If truthy, train/test features are restricted to the loaded selection.
    model: The fitted model (``None`` until ``train_model`` has run).
    data: Dict with keys ``X_train``, ``X_test``, ``y_train``, ``y_test``.
    shapley_values: Dict ``target_id -> DataFrame`` of Shapley values (rows follow ``X_test``).
    expected_values: Dict ``target_id -> expected value`` reported by the explainer.
    explainer: The ``shap.KernelExplainer`` used to compute the Shapley values.
    shapley_feature_importance: Dict ``target_id -> DataFrame`` with columns
      ``feature_name``, ``feature_importance`` and ``rank``.
    meta_representations: All Shapley values stacked, with ``target_id`` as extra index level.
    meta_representations_2d: First return value of ``dimensionality_reduction_2D``.
    dim_reduction: Second return value of ``dimensionality_reduction_2D``.
  """

  def __init__(self, model_name, hyperparameter_tuning, feature_selection):
    self.model_name = model_name
    self.hyperparameter_tuning = hyperparameter_tuning
    self.feature_selection = feature_selection
    self.model = None
    self.data = {}
    self.shapley_values = None
    # Declared here so the attributes exist even before calculate_shapley_values() is called.
    self.expected_values = None
    self.explainer = None
    self.shapley_feature_importance = None
    self.meta_representations = None
    self.meta_representations_2d = None
    self.dim_reduction = None

  def train_model(self, data_path, results_path, **kwargs):
    """Load the data, build the model and fit it on the training split.

    Args:
      data_path: Passed to ``load_data`` as ``path``.
      results_path: Root folder used to locate selected features and hyperparameters.
      **kwargs: ``batch_size`` and ``epochs`` are required when ``model_name`` is
        ``"neural_network"``; they are ignored for every other model.

    Returns:
      self, with ``model`` and ``data`` populated.

    Raises:
      KeyError: If ``model_name`` is ``"neural_network"`` and ``batch_size`` or ``epochs``
        is missing from ``kwargs``.
    """
    # Fail early (before loading data / building the model) with an explicit message instead
    # of a bare KeyError('batch_size') raised in the middle of training.
    if self.model_name == "neural_network":
      missing = [key for key in ("batch_size", "epochs") if key not in kwargs]
      if missing:
        raise KeyError(f"train_model() requires keyword argument(s) {missing} for model_name='neural_network'")

    # Load data
    X_train, y_train, X_test, y_test = load_data(path=data_path)

    if self.feature_selection:
      # NOTE(review): the folder name is fixed to 'hyperparameter_tuning=True-feature_selection=True'
      # regardless of self.hyperparameter_tuning -- confirm this is intended.
      selected_features = load_selected_features(path=f"{results_path}/{self.model_name}/hyperparameter_tuning=True-feature_selection=True", n=None)
      print(f"\nLoading selected features: {selected_features}")

      X_train = X_train[selected_features]
      X_test = X_test[selected_features]

    # Load previously tuned hyperparameters or set default parameters
    params = load_hyperparameters(model_name=self.model_name, results_root=results_path, hyperparameter_tuning=self.hyperparameter_tuning)
    print(f"\nLoading hyperparameters: {params}")

    # Initialize model
    model = init_model(params=params, model_name=self.model_name, num_features=X_train.shape[1], num_targets=y_train.shape[1])

    if self.model_name == "neural_network":
      model.fit(X_train, y_train, batch_size=kwargs['batch_size'], epochs=kwargs['epochs'], verbose=1)
    else:
      model.fit(X_train, y_train)

    self.model = model
    self.data['X_train'] = X_train
    self.data['X_test'] = X_test
    self.data['y_train'] = y_train
    self.data['y_test'] = y_test
    return self

  def calculate_shapley_values(self):
    """Calculate Shapley values of the fitted model with respect to each target.

    A ``shap.KernelExplainer`` is built with the ``X_test`` values as background data and is
    used to explain those same ``X_test`` rows. Results are stored per target column of
    ``y_test`` in ``shapley_values`` and ``expected_values``; the explainer itself is kept
    in ``explainer``.

    Returns:
      self
    """
    X_test = self.data["X_test"]
    target_ids = list(self.data["y_test"].columns)

    explainer = shap.KernelExplainer(model=self.model.predict, data=X_test.values)
    raw_values = explainer.shap_values(X=X_test.values)

    # Recent shap releases return one (samples, features, outputs) array for multi-output
    # models, older ones a list with one (samples, features) array per output. Accept both.
    if isinstance(raw_values, list):
      per_target_values = list(raw_values)
    else:
      raw_values = np.asarray(raw_values)
      if raw_values.ndim == 2:
        # Single-output model: one (samples, features) matrix.
        per_target_values = [raw_values]
      else:
        per_target_values = [raw_values[:, :, i] for i in range(len(target_ids))]

    self.shapley_values = {
      target_id: pd.DataFrame(values, columns=X_test.columns, index=X_test.index)
      for target_id, values in zip(target_ids, per_target_values)
    }
    # atleast_1d keeps the zip valid when the explainer reports a scalar expected value.
    self.expected_values = dict(zip(target_ids, np.atleast_1d(explainer.expected_value)))
    self.explainer = explainer
    return self

  def calculate_shapley_feature_importance(self):
    """Calculate Shapley importance as the mean absolute Shapley value per feature.

    For every target a DataFrame with columns ``feature_name``, ``feature_importance`` and
    ``rank`` (1 = most important) is stored in ``shapley_feature_importance``.

    Returns:
      self
    """
    shapley_feature_importance = {}
    for target_id, shapley_values in self.shapley_values.items():
      importance = (
        shapley_values.abs().mean()
        # Name the axis explicitly: a plain reset_index() only yields an 'index' column when
        # the feature columns are unnamed, so the rename would silently not apply otherwise.
        .rename_axis("feature_name")
        .reset_index(name="feature_importance")
        .sort_values(by="feature_importance", ascending=False)
      )
      importance['rank'] = np.arange(1, len(importance) + 1)
      shapley_feature_importance[target_id] = importance
    self.shapley_feature_importance = shapley_feature_importance
    return self

  def concat_metarepresentations(self):
    """Concatenate the Shapley values of all targets into a single DataFrame.

    The per-target frames are stacked row-wise and ``target_id`` is appended to the index
    as an additional level. The result is stored in ``meta_representations``.

    Returns:
      The concatenated DataFrame (not ``self``).
    """
    frames = []
    for target_id, shap_values in self.shapley_values.items():
      print(f"\ntarget_id: {target_id}")
      frames.append(shap_values.assign(target_id=target_id))
    # Concatenate once instead of growing a DataFrame inside the loop.
    data = pd.concat(frames, axis=0)
    data = data.set_index("target_id", append=True)
    self.meta_representations = data
    return data

  def metarepresentations_2d(self, dim_reduction_name="PCA"):
    """Reduce the concatenated meta-representations to two components.

    Args:
      dim_reduction_name: Method name forwarded to ``dimensionality_reduction_2D``
        (default ``"PCA"``, the previously hard-coded value).

    Returns:
      self, with ``meta_representations_2d`` and ``dim_reduction`` populated.
    """
    data_2d, dim_reduction = dimensionality_reduction_2D(data=self.meta_representations, dim_reduction_name=dim_reduction_name, n_components=2)
    self.meta_representations_2d = data_2d
    self.dim_reduction = dim_reduction
    return self

RUN

In [None]:
def _print_datasets(footprint):
  """Print head and shape of the train/test features and targets stored on ``footprint``."""
  print(f"\n\n--- Load data ---")
  for dataset in ["train", "test"]:
    print(f"{dataset}: \n")
    print(footprint.data[f"X_{dataset}"].head())
    print(footprint.data[f"X_{dataset}"].shape)
    print("\n")
    print(footprint.data[f"y_{dataset}"].head())
    print(footprint.data[f"y_{dataset}"].shape)


def run_footprint():
  """Run the full footprint pipeline and persist its results under ``RESULTS_PATH``.

  If ``{RESULTS_PATH}/footprint.joblib`` exists, the stored ``Footprint`` (fitted model plus
  Shapley values) is loaded; otherwise the model is trained, Shapley values are computed and
  the object is saved. In both cases feature importance, the concatenated
  meta-representations and their 2D reduction are then (re)computed, the Shapley values are
  written to one CSV per target, and the footprint is saved again.
  """
  footprint_file = f"{RESULTS_PATH}/footprint.joblib"
  my_file = Path(footprint_file)
  print(my_file.is_file())
  if my_file.is_file():
    # NOTE: joblib.load unpickles the file and can execute arbitrary code -- only load
    # footprint files that were produced by this notebook / a trusted source.
    footprint = joblib.load(footprint_file)
    _print_datasets(footprint)
  else:
    # Make sure the output folder exists *before* the expensive training / SHAP run, so the
    # results are not lost to a failing joblib.dump afterwards.
    Path(RESULTS_PATH).mkdir(parents=True, exist_ok=True)

    footprint = Footprint(model_name=MODEL_NAME, hyperparameter_tuning=HYPERPARAMETER_TUNNING, feature_selection=FEATURE_SELECTION)

    # batch_size / epochs are required by train_model for the neural network and ignored for
    # every other model; without them the neural network run fails with a KeyError.
    footprint.train_model(data_path=DATA_PATH, results_path=RESULTS_ROOT, batch_size=BATCH_SIZE, epochs=EPOCHS)
    _print_datasets(footprint)

    footprint.calculate_shapley_values()
    print(f"\n\n--- Calculate Shapley values ---")
    for target_id, shapley_values in footprint.shapley_values.items():
      print(f"\n{target_id}")
      print(shapley_values.head())
      print(shapley_values.shape)

    # save fitted model together with the Shapley values (the expensive part of the pipeline)
    joblib.dump(footprint, footprint_file)

  footprint.calculate_shapley_feature_importance()
  print(f"--- Calculate Shapley importance --- \n")
  for target_id, shapley_feature_importance in footprint.shapley_feature_importance.items():
    print(f"{target_id} \n")
    print(shapley_feature_importance.head(30))
    print(shapley_feature_importance.shape)

  footprint.concat_metarepresentations()
  print(f"--- Concatenate meta-representations --- \n")
  print(footprint.meta_representations.head())
  print(footprint.meta_representations.shape)

  footprint.metarepresentations_2d()
  print(f"--- Meta-representations 2D --- \n")
  print(footprint.meta_representations_2d.head())
  print(footprint.meta_representations_2d.shape)

  # save to csv (one file per target)
  for target_id, shapley_values in footprint.shapley_values.items():
    shapley_values.reset_index().to_csv(f"{RESULTS_PATH}/shapley_values-{target_id}.csv")

  # save the footprint again, now including importance and meta-representations
  joblib.dump(footprint, footprint_file)

# Execute the pipeline defined above when this cell is run.
run_footprint()