In [None]:
!pip install transformers
!pip install shap
!pip install lime
!pip install sentencepiece 
!pip install emoji
!pip install rouge_score
!pip install captum

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m100.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.0-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.2/224.2 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.0 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, htt

In [None]:
# General purpose packages
import datetime
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import pprint
import time
from abc import ABC, abstractmethod
from collections import defaultdict
from datetime import datetime
from tqdm import tqdm
from pprint import pprint
from math import exp
from functools import partial
from typing import Any, Dict, List

# Required imports for the supervised models
import torch
import sentencepiece
import transformers
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, TFAutoModelForSequenceClassification

# Required imports for the explainability frameworks
import shap
from lime.lime_text import LimeTextExplainer
from captum.attr import (LayerIntegratedGradients, ShapleyValueSampling, Occlusion, 
                         LayerDeepLiftShap, TokenReferenceBase, LayerGradientXActivation, LimeBase)
from captum._utils.models.linear_model import SkLearnLasso, SkLearnLinearModel

# Import to connect with google drive's content
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 0. Define the configuration

In [None]:
# GLOBAL CONFIGURATION
# IS_ALE switches BASE_PATH between the two Google Drive folder layouts used below.
IS_ALE       = False
BASE_PATH    = "drive/MyDrive/NLU Spring 2023 - Final Project" if IS_ALE else "drive/MyDrive/NYU/NLU Spring 2023 - Final Project"
DATA_PATH    = f"{BASE_PATH}/data"
RESULTS_PATH = f"{BASE_PATH}/results"
# DATASET is used as a sub-folder name under both DATA_PATH and RESULTS_PATH.
DATASET      = "1. Tweet Sentiment" # One of ["1. Tweet Sentiment", "2. Movie Rationales"]
# When SUBSAMPLE is True the data-loading cell keeps only the first 2 texts in text_corpus.
SUBSAMPLE    = True

# Hugging Face model identifiers of the supervised sentiment classifiers under study.
# The trailing comment on each entry lists the label names the model exposes and its model card.
SUPERVISED_MODELS = [
          "distilbert-base-uncased-finetuned-sst-2-english", # POSITIVE, NEGATIVE -  https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english
          "cardiffnlp/twitter-roberta-base-sentiment", # LABEL_0, LABEL_1, LABEL_2 - https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
          "ProsusAI/finbert", # Positive, Negative, Neutral - https://huggingface.co/ProsusAI/finbert
          "cardiffnlp/twitter-xlm-roberta-base-sentiment", # Positive, Negative, Neutral - https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment
          "finiteautomata/bertweet-base-sentiment-analysis", # Pos, Neu, Neg - https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis
          "yiyanghkust/finbert-tone", # https://huggingface.co/yiyanghkust/finbert-tone
         ]

# Dotted path of the embedding layer of each model, written relative to a variable named `model`.
# The layer-based attribution methods resolve this string against their `model` argument.
# This dict also lists models that are not in SUPERVISED_MODELS.
EMBEDDINGS = {'distilbert-base-uncased-finetuned-sst-2-english': 'model.distilbert.embeddings',
              'cardiffnlp/twitter-roberta-base-sentiment': 'model.roberta.embeddings',
              'ProsusAI/finbert': 'model.bert.embeddings',
              'cardiffnlp/twitter-xlm-roberta-base-sentiment': 'model.roberta.embeddings',
              'finiteautomata/bertweet-base-sentiment-analysis': 'model.roberta.embeddings',
              'cardiffnlp/twitter-roberta-base-sentiment-latest': 'model.roberta.embeddings',
              'yiyanghkust/finbert-tone': 'model.bert.embeddings',
              'j-hartmann/emotion-english-distilroberta-base': 'model.roberta.embeddings'}

# SPECIFIC CONFIGURATION
LIME_NUM_FEATURES = 100 # Number of tokens to extract LIME weights for
LIME_NUM_SAMPLES  = 1000 # Number of neighbors to generate local predictions for

# 1. Load corresponding data

## 1.1 Read Datasets

- Load datasets from files
- Filter non-neutral data

In [None]:
# Read the train/test splits of the dataset selected in the configuration cell.
train_data = pd.read_csv(f"{DATA_PATH}/{DATASET}/train.csv")
test_data  = pd.read_csv(f"{DATA_PATH}/{DATASET}/test.csv")

# Keep only rows whose sentiment is not 'neutral' and renumber the index from 0.
train_data  = train_data[train_data.sentiment != 'neutral'].reset_index(drop=True)
test_data   = test_data[test_data.sentiment != 'neutral'].reset_index(drop=True)

# Plain list of the training texts; truncated to the first 2 entries when SUBSAMPLE is on.
text_corpus = train_data["text"].tolist()
if SUBSAMPLE:
  text_corpus = text_corpus[:2]

# Last expression of the cell: displays the first rows of the filtered training data.
train_data.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
1,088c60f138,my boss is bullying me...,bullying me,negative
2,9642c003ef,what interview! leave me alone,leave me alone,negative
3,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
4,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive


In [None]:
train_data[train_data.sentiment == 'negative'][:3]

Unnamed: 0,textID,text,selected_text,sentiment
0,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
1,088c60f138,my boss is bullying me...,bullying me,negative
2,9642c003ef,what interview! leave me alone,leave me alone,negative


In [None]:
train_data[train_data.sentiment == 'positive'][:3]

Unnamed: 0,textID,text,selected_text,sentiment
4,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive
5,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive
6,16fab9f95b,I really really like the song Love Story by Ta...,like,positive


# 2. Supervised Models

In [None]:
class PreTrainedSentimentModel(object):
  ''' Helper around a pre-trained Hugging Face sentiment classifier.

      It loads the pipeline, tokenizer and sequence-classification model for `model_name`,
      works out which of the model's labels mean 'positive' and 'negative', and exposes a
      batched `predict` that reports predictions using those two sentiment names.
  '''
  def __init__(self, model_name):
    self.model_name         = model_name
    self.pipeline           = pipeline(model=self.model_name)
    self.label2id           = self.pipeline.model.config.label2id
    self.positive_label     = self._positive_label()
    self.negative_label     = self._negative_label()
    self.positive_label_idx = self.label2id[self.positive_label]
    self.negative_label_idx = self.label2id[self.negative_label]
    # NOTE: defaultdict(None, ...) has no default factory, so a missing key still raises KeyError
    # on item access (e.g. sentiment2id['neutral']); only .get() returns None.
    self.sentiment2id       = defaultdict(None, {'positive': self.positive_label_idx, 'negative': self.negative_label_idx })
    self.id2sentiment       = defaultdict(None, { self.positive_label_idx: 'positive', self.negative_label_idx: 'negative'})
    self.tokenizer          = AutoTokenizer.from_pretrained(self.model_name)
    self.model              = AutoModelForSequenceClassification.from_pretrained(self.model_name)
    self.embeddings         = EMBEDDINGS[self.model_name]


  def predict(self, text_samples, padding = True):
    ''' Predicts label and probability for each sample received.

        Returns two values: a list with the sentiment assigned to each sample ('negative' or
        'positive'; None when the winning class is neither of the two) and a numpy array with
        the softmax probability of the winning class of each sample.
        E.g.: ['positive', 'negative'], [.85, .99]

        The attention mask produced by the tokenizer is forwarded to the model so that padding
        tokens are ignored. Without it, the scores of every sample shorter than the longest one
        in the batch drift away from what the pipeline reports for the same text.

        args:
          text_samples Series of strings (any object exposing `.values`, or a plain iterable of strings)
          padding      padding strategy forwarded to the tokenizer
    '''
    texts   = list(getattr(text_samples, 'values', text_samples))
    encoded = self.tokenizer(texts, padding = padding)

    input_ids      = torch.tensor(encoded['input_ids'])
    attention_mask = torch.tensor(encoded['attention_mask']) if 'attention_mask' in encoded else None

    # Inference only: no autograd graph is needed
    with torch.no_grad():
      logits = self.model(input_ids, attention_mask = attention_mask).logits

    # torch.softmax is numerically stable for large logits, unlike exp(x) / sum(exp(x))
    probs       = torch.softmax(logits, dim=1).numpy()
    pred_labels = [self.id2sentiment.get(label_id) for label_id in probs.argmax(axis=1)]
    pred_scores = np.amax(probs, axis=1)

    return pred_labels, pred_scores

  def _positive_label(self):
    ''' Name of the model label that stands for positive sentiment. '''
    return self._find_label(keyword='pos', sample_text='I really really like the song Love Story')

  def _negative_label(self):
    ''' Name of the model label that stands for negative sentiment. '''
    return self._find_label(keyword='neg', sample_text='Sooo SAD I will miss you here in San Diego!!!')

  def _find_label(self, keyword='', sample_text=''):
    ''' Finds the model label associated with a sentiment.

        If exactly one key of label2id contains `keyword` (case-insensitive) that key is returned.
        Otherwise (e.g. generic names such as LABEL_0) the label the pipeline predicts for
        `sample_text` is used instead.
    '''
    labels = []
    if keyword:
      labels = [label for label in self.label2id.keys() if keyword in label.lower()]
    return labels[0] if (len(labels)==1) else self._predict_from_pipeline(sample_text)[0]['label']

  def _predict_from_pipeline(self, text_samples):
    ''' Predicts label and probability for a single string or a list of strings using the pipeline.
        Returns a list with one dictionary per sample holding the predicted label and its score.
        E.g.: [{'label': 'POSITIVE', 'score': 0.9855217337608337}]
        args:
          text_samples single string or list of strings
    '''
    return self.pipeline(text_samples)

  def __str__(self) -> str:
    return f"model_name:         {self.model_name}\n" + \
           f"positive_label:     {self.positive_label}\n" + \
           f"positive_label_idx: {self.positive_label_idx}\n" + \
           f"negative_label:     {self.negative_label}\n" + \
           f"negative_label_idx: {self.negative_label_idx}\n" + \
           f"sentiment2id:       {self.sentiment2id}\n"

In [None]:
for model_name in SUPERVISED_MODELS:
  pretrainedModel = PreTrainedSentimentModel(model_name)
  print(pretrainedModel)

## 2.1 Predict

In [None]:
#Generate a model to use in the prediction examples
pretrainedModel          = PreTrainedSentimentModel(SUPERVISED_MODELS[0])
print(pretrainedModel)

model_name:         distilbert-base-uncased-finetuned-sst-2-english
positive_label:     POSITIVE
positive_label_idx: 1
negative_label:     NEGATIVE
negative_label_idx: 0
sentiment2id:       defaultdict(None, {'positive': 1, 'negative': 0})



In [None]:
# Generate some data for the predictions examples
batch_size = 10

# Testing something... 
for batch_i in tqdm(range(1)):
  # Set batch indexes
  start_batch_idx =  batch_i * batch_size
  end_batch_idx   = (batch_i+1) * batch_size 

  # Process batch data
  text_samples    = train_data['text'][start_batch_idx:end_batch_idx]
  selected_texts  = train_data['selected_text'][start_batch_idx:end_batch_idx]
  sentiments      = train_data['sentiment'][start_batch_idx:end_batch_idx]

100%|██████████| 1/1 [00:00<00:00, 489.53it/s]


### 2.1.1 From Model/Tokenizer


In [None]:
pred_labels, pred_scores = pretrainedModel.predict(text_samples, padding=True)

print(pred_labels)
print(pred_scores)

['positive', 'negative', 'negative', 'negative', 'positive', 'negative', 'positive', 'negative', 'negative', 'negative']
[0.9712993  0.9949237  0.9782493  0.9962517  0.9982126  0.85659474
 0.99801165 0.9983179  0.9996627  0.9880779 ]


### 2.1.2 From Pipeline

In [None]:
# print("Prediction")
# print(pretrainedModel._predict_from_pipeline(text_samples.values[0])) #INPUT = SINGLE STRING

print("Prediction")
pprint(pretrainedModel._predict_from_pipeline(list(text_samples.values)))  #INPUT = LIST OF STRINGS

Prediction
[{'label': 'POSITIVE', 'score': 0.9855217337608337},
 {'label': 'NEGATIVE', 'score': 0.9993494153022766},
 {'label': 'NEGATIVE', 'score': 0.9985242486000061},
 {'label': 'NEGATIVE', 'score': 0.9981385469436646},
 {'label': 'POSITIVE', 'score': 0.9996920824050903},
 {'label': 'NEGATIVE', 'score': 0.8565945625305176},
 {'label': 'POSITIVE', 'score': 0.9996702671051025},
 {'label': 'NEGATIVE', 'score': 0.9989368319511414},
 {'label': 'NEGATIVE', 'score': 0.9997137188911438},
 {'label': 'NEGATIVE', 'score': 0.9977685213088989}]


## Question for ALE 

-  Do you know what the difference is between these two ways of predicting?
I see slight differences in the probability scores. Maybe it is only rounding, but I want to double-check with you...

# 3. Explainability Frameworks Refactored

- For the previous implementations (non-batched+batched), see v1 version of this file (everything is working code there)

## 3.1 Attribution Methods

- There is an abstract Attribution method class which defines the signature of the method: 'attribute'
- We have a class that extends this abstract class for every explainability method, each class defines its own version of attribute

- Reminders for TODOs:
  - Make LIME Work for batch
  - Once LIME is working, if we don't use attn_masks, remove it from all signatures because that was the only reason why it was added.
  - Review class vs object methods once done
  - Review n_steps if we are not getting good results
  - Find a better class name; I added the word "Method" just to avoid a name collision with Captum's classes


In [None]:
class AttributionMethod(ABC):
  ''' Common interface of every explainability method: subclasses implement `attribute`. '''

  @abstractmethod
  def attribute(self, input_ids, targets, model, embeddings = None, attn_masks = None):
    '''
    Computes the attributions that the method assigns to `input_ids` for the given model.

    args:
      input_ids   tensor of token ids, one row per sample; ModelExplainer passes the padded
                  tokenizer output, i.e. shape (num_samples, seq_length)
      targets     1D tensor with the target class index of each sample
      model       model to use for the explanations
      embeddings  dotted path of the model's embedding layer (used by the layer-based methods)
      attn_masks  attention masks matching input_ids

    The base implementation does nothing and returns None.
    '''

### 3.1.1 LimeMethod [TODO]

In [None]:
# class LimeMethod(AttributionMethod):

#   def attribute(self, input_ids, targets, model, embeddings = None, attn_masks = None):
#     '''
#     Returns the attributions assigned by the Layer Integrated Gradients method
#     given the model and input_ids received.
#     args:
#       model      model to use for the explanations 
#       input_ids  3D tensor with shape (num_samples, seq_length, embedding_size)
#       targets    1D tensor with the targets for the explanations
#     '''
#     pass

### 3.1.2 DeepLiftSHAPMethod

In [None]:
class DeepLiftSHAPMethod(AttributionMethod):
  ''' Token attributions computed with Captum's LayerDeepLiftShap on the model's embedding layer. '''

  def attribute(self, input_ids, targets,  model, embeddings = None, attn_masks = None):
    '''
        Returns the attributions assigned by the Layer DeepLiftSHAP method
        given the model and input_ids received.
        args:
          input_ids  tensor of token ids, one row per sample
          targets    1D tensor with the targets for the explanations
          model      model to use for the explanations
          embeddings dotted path of the embedding layer relative to `model`
                     (e.g. 'model.bert.embeddings'); required
          attn_masks accepted for interface compatibility; not used
        returns:
          tensor with one attribution per token position (embedding dimension summed out)
        raises:
          ValueError if `embeddings` is not provided
    '''
    if not embeddings:
      raise ValueError("DeepLiftSHAPMethod needs the dotted path of the embedding layer, e.g. 'model.bert.embeddings'")

    # Resolve the dotted path with getattr instead of eval(): the first component names the
    # `model` argument itself, the remaining ones are nested attributes of it.
    layer = model
    for attr_name in embeddings.split('.')[1:]:
      layer = getattr(layer, attr_name)

    class ModelWrapper(torch.nn.Module):
      # Captum's DeepLift needs an nn.Module whose forward returns a plain tensor of logits
      def __init__(self, res):
          super().__init__()
          self.model = res
          self.model.zero_grad()
          # Inputs arrive as floats (see below) and are cast back to token ids here
          self.forward = lambda x: res.forward(x.long()).logits

    # Instantiate a DeepLiftShap explainer object
    explainer = LayerDeepLiftShap(ModelWrapper(model), layer)

    # Explain the model's prediction on the input text.
    # Two stacked all-zero baselines are passed (token id 0 at every position).
    baselines    = torch.cat([ torch.zeros_like(input_ids), torch.zeros_like(input_ids)]).float()
    attributions = explainer.attribute( input_ids.float(),
                                        baselines = baselines,
                                        target    = targets)

    # Sum across embeddings
    attributions = attributions.sum(dim=2)

    return attributions

### 3.1.3 LayerGradientActivationMethod

In [None]:
class LayerGradientActivationMethod(AttributionMethod):
  ''' Token attributions computed with Captum's LayerGradientXActivation on the embedding layer. '''

  def attribute(self, input_ids, targets,  model, embeddings = None, attn_masks = None):
    '''
        Returns the attributions assigned by the Layer Gradient x Activation method
        given the model and input_ids received.
        args:
          input_ids  tensor of token ids, one row per sample
          targets    1D tensor with the targets for the explanations
          model      model to use for the explanations
          embeddings dotted path of the embedding layer relative to `model`
                     (e.g. 'model.roberta.embeddings'); required
          attn_masks accepted for interface compatibility; not used
        returns:
          tensor with one attribution per token position (embedding dimension summed out)
        raises:
          ValueError if `embeddings` is not provided
    '''
    if not embeddings:
      raise ValueError("LayerGradientActivationMethod needs the dotted path of the embedding layer, e.g. 'model.bert.embeddings'")

    # Resolve the dotted path with getattr instead of eval(): the first component names the
    # `model` argument itself, the remaining ones are nested attributes of it.
    layer = model
    for attr_name in embeddings.split('.')[1:]:
      layer = getattr(layer, attr_name)

    class ModelWrapper(torch.nn.Module):
          # Wraps the model so that its forward returns a plain tensor of logits
          def __init__(self, res):
              super().__init__()
              self.model = res
              self.model.zero_grad()
              self.forward = lambda x: res.forward(x.long()).logits

    # Define the Layer Gradient Activation attribution algorithm
    lga = LayerGradientXActivation(forward_func = ModelWrapper(model),
                                   layer        = layer
    )

    # Compute the attribution scores for each token
    attributions = lga.attribute(input_ids, target = targets)

    # Sum across embeddings
    attributions = attributions.sum(dim=2)

    return attributions

### 3.1.4 LayerIntegratedGradientsMethod

In [None]:
class LayerIntegratedGradientsMethod(AttributionMethod):
  ''' Token attributions computed with Captum's LayerIntegratedGradients on the embedding layer. '''

  def attribute(self, input_ids, targets,  model, embeddings = None, attn_masks = None):
    '''
        Returns the attributions assigned by the Layer Integrated Gradients method
        given the model and input_ids received.
        args:
          input_ids  tensor of token ids, one row per sample
          targets    1D tensor with the targets for the explanations
          model      model to use for the explanations
          embeddings dotted path of the embedding layer relative to `model`
                     (e.g. 'model.distilbert.embeddings'); required
          attn_masks accepted for interface compatibility; not used
        returns:
          tensor with one attribution per token position (embedding dimension summed out)
        raises:
          ValueError if `embeddings` is not provided
    '''
    if not embeddings:
      raise ValueError("LayerIntegratedGradientsMethod needs the dotted path of the embedding layer, e.g. 'model.bert.embeddings'")

    # Resolve the dotted path with getattr instead of eval(): the first component names the
    # `model` argument itself, the remaining ones are nested attributes of it.
    layer = model
    for attr_name in embeddings.split('.')[1:]:
      layer = getattr(layer, attr_name)

    # Define the Integrated Gradients attribution algorithm
    lig = LayerIntegratedGradients(lambda x: model(x).logits, layer)

    # Compute the attribution scores for each token.
    # The convergence delta was never used, so it is no longer requested.
    attributions = lig.attribute(input_ids,
                                 n_steps = 50,
                                 target  = targets
                                 )

    # Sum across embeddings
    attributions = attributions.sum(dim=2)

    return attributions

### 3.1.5 ShapleyValueSamplingMethod

In [None]:
class ShapleyValueSamplingMethod(AttributionMethod):
    ''' Token attributions estimated with Captum's ShapleyValueSampling. '''

    def attribute(self, input_ids, targets,  model, embeddings = None, attn_masks = None):
      '''
          Runs ShapleyValueSampling over the token ids and returns its attributions unchanged.
          args:
            input_ids  tensor of token ids, one row per sample
            targets    1D tensor with the targets for the explanations
            model      model to use for the explanations
            embeddings accepted for interface compatibility; not used
            attn_masks accepted for interface compatibility; not used
      '''
      def logits_of(token_ids):
        # Captum needs a forward function that returns a plain tensor of class scores
        return model(token_ids).logits

      sampler = ShapleyValueSampling(logits_of)

      # One attribution per token position, computed with respect to each sample's target
      return sampler.attribute(input_ids, target=targets)

### 3.1.6 OcclusionMethod

In [None]:
class OcclusionMethod(AttributionMethod):
    ''' Token attributions computed with Captum's Occlusion, hiding one token position at a time. '''

    def attribute(self, input_ids, targets,  model, embeddings = None, attn_masks = None):
      '''
          Runs Occlusion over the token ids and returns its attributions unchanged.
          args:
            input_ids  tensor of token ids, one row per sample
            targets    1D tensor with the targets for the explanations
            model      model to use for the explanations
            embeddings accepted for interface compatibility; not used
            attn_masks accepted for interface compatibility; not used
      '''
      def logits_of(token_ids):
        # Captum needs a forward function that returns a plain tensor of class scores
        return model(token_ids).logits

      occluder = Occlusion(logits_of)

      # A window of shape (1,) slides over the sequence dimension, i.e. a single token per step
      return occluder.attribute(input_ids,
                                sliding_window_shapes=(1,),
                                target=targets)

## 3.2 Model Explainer

- The "ModelExplainer" class contains all the COMMON (repeated) code to get the explanations from the explainability methods and delegates to the "AttributionMethod" class the specific things of the explainability method/class (including the nuances due to Captum not implementing a very consistent API).

In [None]:
class ModelExplainer(object):
  ''' Runs an attribution method over a batch of texts and pairs every token with its weight.

      args:
        preTrainedModel      helper exposing `tokenizer`, `model`, `embeddings` and `sentiment2id`
        normalize            when True every sample's attribution vector is scaled to unit L2 norm
        skip_special_tokens  when True special tokens (as listed by the tokenizer's
                             `all_special_ids`) are left out of the output
  '''

  def __init__(self, preTrainedModel, normalize = True, skip_special_tokens = True):
    self.preTrainedModel     = preTrainedModel
    self.normalize           = normalize
    self.skip_special_tokens = skip_special_tokens  # previously hard-coded to True

  def explain(self, texts, sentiments, explainer_method = 'svs'):
    '''
        Returns attributions for input text with respect to targets received
        args:
          texts             Series of strings (anything exposing `.values`), one text sample per entry
          sentiments        iterable with the sentiment ('positive'/'negative') of each text in 'texts'
          explainer_method  explanation method to be used
                            (possible values are: 'dls', 'lga', 'lig', 'svs', 'occlusion')
        returns:
          list with one entry per text; each entry is a list of (token, weight) tuples.
          Padding positions are always dropped; special tokens are dropped when
          skip_special_tokens is True.
    '''
    tokenizer = self.preTrainedModel.tokenizer

    # Prepare inputs for attribution method
    inputs     = tokenizer(list(texts.values), padding=True)
    input_ids  = torch.tensor(inputs['input_ids'])
    attn_masks = torch.tensor(inputs['attention_mask'])
    targets    = torch.tensor([self.preTrainedModel.sentiment2id[sentiment] for sentiment in sentiments])

    # Attribute
    explainer    = self._get_explainer(explainer_method)
    attributions = explainer.attribute(input_ids  = input_ids,
                                       targets    = targets,
                                       model      = self.preTrainedModel.model,
                                       embeddings = self.preTrainedModel.embeddings,
                                       attn_masks = attn_masks)  #TODO: remove if finally we dont use it

    # Normalize Attributions (the clamp keeps an all-zero row at 0 instead of turning it into NaN)
    if self.normalize:
      norms        = torch.norm(attributions, dim=1, keepdim=True).clamp_min(1e-12)
      attributions = torch.div(attributions, norms)
    attributions = attributions.detach().numpy()

    # Format output - tuples of (token, weights).
    # Tokens are converted WITHOUT dropping anything so that position j of the tokens lines up
    # with position j of the attributions; filtering happens afterwards, per position.
    special_ids = set(tokenizer.all_special_ids) if self.skip_special_tokens else set()
    results = []
    for ids_i, mask_i, attr_i in zip(input_ids.tolist(), attn_masks.tolist(), attributions):
      tokens_i = tokenizer.convert_ids_to_tokens(ids_i)
      results.append([(token, attr)
                      for token_id, token, is_real, attr in zip(ids_i, tokens_i, mask_i, attr_i)
                      if is_real and token_id not in special_ids])

    return results


  def _get_explainer(self, explainer_method):
    ''' Instantiates the AttributionMethod subclass registered under `explainer_method`. '''
    # TODO: might need factory class if explainability constructors go non-trivial (TBD)
    # TODO: We can later put this dict in a general config
    EXPLAINER_METHOD_CLASSES = {
        #'lime'     : 'LimeMethod', #Not working yet
        'dls'      : 'DeepLiftSHAPMethod',
        'lga'      : 'LayerGradientActivationMethod',
        'lig'      : 'LayerIntegratedGradientsMethod',
        'svs'      : 'ShapleyValueSamplingMethod',
        'occlusion': 'OcclusionMethod',
    }

    assert explainer_method in EXPLAINER_METHOD_CLASSES, f"Explainability method ({explainer_method}) is invalid!"
    # Look the class up by name in the module namespace (no eval needed); only the requested
    # class has to be defined at call time.
    explainer = globals()[EXPLAINER_METHOD_CLASSES[explainer_method]]()

    return explainer

## 3.3 Example of usage of Explainer

- The following lines show how to use any of the explainers above.
- Note that there is a single model explainer (generic for any model) and then one "AttributionMethod" subclass for every explanation method.

In [None]:
text_samples      = train_data['text'][:2] 
sentiments        = train_data['sentiment'][:2]

pretrainedModel   = PreTrainedSentimentModel(model_name = SUPERVISED_MODELS[0]) 
model_explainer   = ModelExplainer(pretrainedModel)

model_explainer.explain(text_samples, sentiments, explainer_method = 'lig') 

#Execution Times (2 samples, no GPU)
#lig ~2secs
#svs ~22.52secs
#occlusion ~1secs
#lga ~.15secs
#dls ~.52secs

3.6677894592285156


# 4. Consolidation of results table
In this section we create a table containing the following fields 

- text: Sample input text
- selected_text: Ground truth explanation
- supervised_model: Identifier of the supervised model used
- expl_framework: Identifier of the explainability framework used
- label: Prediction from the supervised model
- score: Score assigned by the supervised model
- weights: List of explainability weights assigned by the framework

Once we have this table, we will be able to extract metrics about the goodness of fit of the explainability methods as well as the similarity / dissimilarity between them.

## 4.1 Batched Refactored

- For previous version of consolidating results table please see v1 of this file (all code is in a working state there). 

- This will only use the new implementation of Explainer/Attributor Methods.


In [None]:
# So far we have run: 0, 1, 3, 4 from list below
SUPERVISED_MODELS

['distilbert-base-uncased-finetuned-sst-2-english',
 'cardiffnlp/twitter-roberta-base-sentiment',
 'ProsusAI/finbert',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment',
 'finiteautomata/bertweet-base-sentiment-analysis',
 'yiyanghkust/finbert-tone']

In [None]:
EXPLAINABILITY_METHODS = [ 'dls', 'lga', 'lig', 'svs', 'occlusion']  # missing 'lime' yet

In [None]:
# We will use this subset of 1k samples for the first run (balanced number of classes)
# Note that test_data does not have the anotations
train_data[:1000].sentiment.value_counts()

positive    528
negative    472
Name: sentiment, dtype: int64

In [None]:
batch_size                = 10
data                      = train_data[:1000]
SUPERVISED_MODELS_REDUCED = []

# For every selected model: predict and explain the data in batches with every explainability
# method, then persist one results table and one timing table per model.
for model_i, model_name in enumerate(SUPERVISED_MODELS[3:5]):

  metrics = { key:[] for key in ['time', 'supervised_model', 'expl_framework', 'batch_size'] }
  results = { key:[] for key in ['texts', 'selected_text', 'supervised_model', 'expl_framework', 'predicted_label', 'predicted_score', 'explanations' ] }

  # Load our pretrainedModel helper
  pretrainedModel   = PreTrainedSentimentModel(model_name)
  model_explainer   = ModelExplainer(pretrainedModel)

  # Integer division: a trailing partial batch (if any) is not processed
  for batch_i in tqdm(range(data.shape[0]//batch_size)):
    # Set batch indexes
    start_batch_idx =  batch_i * batch_size
    end_batch_idx   = (batch_i+1) * batch_size

    # Process batch data
    text_samples    = data['text'][start_batch_idx:end_batch_idx]
    selected_texts  = data['selected_text'][start_batch_idx:end_batch_idx]
    sentiments      = data['sentiment'][start_batch_idx:end_batch_idx]
    num_samples     = len(text_samples)

    # Predict
    pred_labels, pred_scores = pretrainedModel.predict(text_samples, padding = True)

    # Explain
    for e in tqdm(EXPLAINABILITY_METHODS):
      start_time        = time.time()
      explanations      = model_explainer.explain(text_samples, sentiments, explainer_method = e)
      end_time          = time.time()

      # Collect METRICS (one row per batch and method) and RESULTS (one row per sample and method)
      metrics['time'].append(end_time-start_time)
      metrics['batch_size'].append(batch_size)
      metrics['supervised_model'].append(model_name)
      metrics['expl_framework'].append(e)

      results['texts'].extend(text_samples)
      results['selected_text'].extend(selected_texts)
      results['supervised_model'].extend([model_name] * num_samples)
      results['expl_framework'].extend([e] * num_samples)
      results['predicted_label'].extend(list(pred_labels))
      results['predicted_score'].extend(list(pred_scores))
      results['explanations'].extend(explanations)

  # Build the tables once per model. They used to be rebuilt from the full accumulated lists
  # after every single method call, which is quadratic in the number of batches.
  results_df = pd.DataFrame(results)
  metrics_df = pd.DataFrame(metrics)
  label      = datetime.now().strftime("%Y%m%d_%H%M%S")

  # NOTE(review): the loop iterates over SUPERVISED_MODELS[3:5] but numbers the output files
  # from 2, so the file number does not match the model's position in SUPERVISED_MODELS.
  # Left unchanged to keep file names stable - confirm which numbering is intended.
  model_number = model_i + 2
  results_df.astype(str).to_parquet(f"{RESULTS_PATH}/{DATASET}/results_{model_number}_{label}.parquet")
  metrics_df.astype(str).to_parquet(f"{RESULTS_PATH}/{DATASET}/metrics_{model_number}_{label}.parquet")

  results_df.to_csv(f"{RESULTS_PATH}/{DATASET}/results_{model_number}_{label}_csv.csv")
  metrics_df.to_csv(f"{RESULTS_PATH}/{DATASET}/metrics_{model_number}_{label}_csv.csv")

  0%|          | 0/100 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:07<00:29,  7.31s/it][A
 40%|████      | 2/5 [00:07<00:09,  3.20s/it][A
 60%|██████    | 3/5 [00:17<00:12,  6.39s/it][A
 80%|████████  | 4/5 [01:12<00:25, 25.42s/it][A
100%|██████████| 5/5 [01:14<00:00, 14.91s/it]
  1%|          | 1/100 [01:14<2:03:15, 74.70s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:14<00:57, 14.43s/it][A
 40%|████      | 2/5 [00:14<00:18,  6.27s/it][A
 60%|██████    | 3/5 [00:34<00:24, 12.20s/it][A
 80%|████████  | 4/5 [03:51<01:25, 85.36s/it][A
100%|██████████| 5/5 [04:00<00:00, 48.09s/it]
  2%|▏         | 2/100 [05:15<4:41:25, 172.30s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:10<00:41, 10.26s/it][A
 40%|████      | 2/5 [00:10<00:13,  4.48s/it][A
 60%|██████    | 3/5 [00:24<00:17,  8.58s/it][A
 80%|████████  | 4/5 [02:22<00:51, 51.79s/it][A
100%|██████████| 5/5 [02:26<00:00, 29.32s/it]
  3%|▎    

Downloading (…)lve/main/config.json:   0%|          | 0.00/949 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/338 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

  0%|          | 0/100 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:06<00:25,  6.40s/it][A
 40%|████      | 2/5 [00:06<00:08,  2.80s/it][A
 60%|██████    | 3/5 [00:15<00:11,  5.74s/it][A
 80%|████████  | 4/5 [01:18<00:28, 28.35s/it][A
100%|██████████| 5/5 [01:21<00:00, 16.28s/it]
  1%|          | 1/100 [01:21<2:14:29, 81.51s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:50, 12.54s/it][A
 40%|████      | 2/5 [00:12<00:16,  5.38s/it][A
 60%|██████    | 3/5 [00:30<00:21, 10.99s/it][A
 80%|████████  | 4/5 [03:22<01:14, 74.56s/it][A
100%|██████████| 5/5 [03:29<00:00, 41.83s/it]
  2%|▏         | 2/100 [04:50<4:15:56, 156.70s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:09<00:38,  9.75s/it][A
 40%|████      | 2/5 [00:10<00:12,  4.17s/it][A
 60%|██████    | 3/5 [00:22<00:15,  7.95s/it][A
 80%|████████  | 4/5 [02:06<00:45, 45.97s/it][A
100%|██████████| 5/5 [02:10<00:00, 26.14s/it]
  3%|▎    

# 5. Metrics
In this section we compute metrics that assess the goodness of fit of the results extracted in Section 4.

### 5.1 Ground-truth faithfulness metrics

In [None]:
import scipy

def feature_agreement(explanation, selected_text, k):
  """
  Fraction of top k tokens in the predicted saliency map that are common with the ground truth explanation

  args:
    explanation: Predicted explanation, a sequence of (token, weight) pairs
    selected_text: Ground truth annotated explanation
    k: Cutoff of number of tokens to keep (positive integer). Tokens tied with the k-th largest
       absolute weight are all kept, and a k larger than the explanation keeps every token.

  returns:
    feat_agreement: Fraction of the kept tokens that occur (case-insensitively, as a substring)
                    in selected_text; 0.0 for an empty explanation

  raises:
    ValueError: if k is smaller than 1
  """
  # k = 0 used to select every token silently, because lst[-0:] is the whole list
  if k < 1:
    raise ValueError(f"k must be a positive integer, got {k}")
  if len(explanation) == 0:
    return 0.0

  # Define value at which to cut to keep k records: the k-th largest absolute weight
  cutoff = sorted(abs(item[1]) for item in explanation)[-k:][0]

  # Select tokens in explanation with higher weight than cutoff
  top_tokens = [item[0] for item in explanation if abs(item[1]) >= cutoff]

  # Compute coincidence with selected text (both sides lowercased so cased tokenizers can match)
  reference   = selected_text.lower()
  coincidence = [1 if token.lower() in reference else 0 for token in top_tokens]

  # Compute feature agreement
  return sum(coincidence) / len(coincidence)


def rank_correlation(explanation, selected_text):
  """
  Spearman's rank correlation coefficient between the predicted token weights and the ground truth

  args:
    explanation: Predicted explanation, a sequence of (token, weight) pairs
    selected_text: Ground truth annotated explanation

  returns:
    sperman: Correlation coefficient between the coincidence indicator (1 when the token occurs,
             case-insensitively and as a substring, in selected_text) and the signed explanation
             weights. NaN when the correlation is undefined: fewer than two tokens, or either
             the weights or the indicator are constant.
  """
  # Imported here so the function does not depend on `scipy.stats` having been loaded elsewhere
  from scipy.stats import spearmanr

  # Compute coincidence with selected text
  reference   = selected_text.lower()
  coincidence = [1 if item[0].lower() in reference else 0 for item in explanation]

  # The weights are the second element of each pair; the tokens themselves used to be passed here
  weights = [float(item[1]) for item in explanation]

  # Spearman is undefined for constant input
  if len(weights) < 2 or len(set(weights)) < 2 or len(set(coincidence)) < 2:
    return float('nan')

  # Compute the rank correlation
  sperman = float(spearmanr(weights, coincidence)[0])

  return sperman


def iou(explanation, selected_text, k):
  """
  For two spans of text (human explanation and model explanation), it is defined as the size of the overlap of the tokens of the two
  spans divided by the size of their union.

  args:
    explanation: Predicted explanation, a sequence of (token, weight) pairs
    selected_text: Ground truth annotated explanation
    k: Cutoff of number of tokens to keep (positive integer). Tokens tied with the k-th largest
       absolute weight are all kept.

  returns:
    Overlap divided by union, where the overlap counts the kept tokens that occur
    (case-insensitively, as a substring) in selected_text and the union is
    kept tokens + whitespace-separated words of selected_text - overlap.
    0.0 for an empty explanation or an empty union.

  raises:
    ValueError: if k is smaller than 1
  """
  # k = 0 used to select every token silently, because lst[-0:] is the whole list
  if k < 1:
    raise ValueError(f"k must be a positive integer, got {k}")
  if len(explanation) == 0:
    return 0.0

  # Define value at which to cut to keep k records: the k-th largest absolute weight
  cutoff = sorted(abs(item[1]) for item in explanation)[-k:][0]

  # Select tokens in explanation with higher weight than cutoff
  top_tokens = [item[0] for item in explanation if abs(item[1]) >= cutoff]

  # Compute overlap size (both sides lowercased so cased tokenizers can match)
  reference = selected_text.lower()
  overlap   = sum(1 for token in top_tokens if token.lower() in reference)

  # Compute union size; split() ignores repeated/leading/trailing blanks, which split(' ')
  # counted as extra (empty) words
  union = len(top_tokens) + len(selected_text.split()) - overlap

  # Compute IoU
  return overlap / union if union > 0 else 0.0

### 5.2 Predictive faithfulness metrics

In [None]:
# Load every supervised model and its tokenizer once, keyed by model name, so that pgi/pgu can
# look them up instead of instantiating them again on every call.
# NOTE: this loads all the models listed in SUPERVISED_MODELS at the same time.
models = {m: AutoModelForSequenceClassification.from_pretrained(m) for m in SUPERVISED_MODELS}
tokenizers = {m: AutoTokenizer.from_pretrained(m) for m in SUPERVISED_MODELS}

def pgi(text, supervised_model, explanation, k):
  """
  Prediction Gap on Important feature perturbation (PGI): Metric that assesses whether all features were needed to make a prediction,
  measures as the difference in prediction probability that results from perturbing the features deemed as influential

  args:
    text: Original input text
    supervised_model: Name of the supervised model; key into the module-level `models` / `tokenizers` dicts
    explanation: Predicted explanation, a non-empty sequence of (token, weight) pairs
    k: Number of most influential tokens to remove (positive integer). Tokens tied with the
       k-th largest absolute weight are all removed.

  returns:
    prob_diff: Absolute difference between the probability of the originally predicted class
               on the full text and on the text rebuilt without the top k tokens

  raises:
    ValueError: if k is smaller than 1 or the explanation is empty
  """
  # k = 0 used to remove every token silently, because lst[-0:] is the whole list
  if k < 1:
    raise ValueError(f"k must be a positive integer, got {k}")
  if len(explanation) == 0:
    raise ValueError("explanation must contain at least one (token, weight) pair")

  # Instantiate the current supervised model
  tokenizer = tokenizers[supervised_model]
  model     = models[supervised_model]

  def class_probabilities(sample):
    # Single-sample forward pass; inference only, so no autograd graph is built
    token_ids = torch.tensor(tokenizer.encode(sample)).unsqueeze(0)
    with torch.no_grad():
      logits = model(token_ids).logits[0]
    # torch.softmax is numerically stable for large logits, unlike exp(x) / sum(exp(x))
    return torch.softmax(logits, dim=0).numpy()

  # Compute model prediction with text as is
  probs      = class_probabilities(text)
  pred_label = probs.argmax()
  pred_score = probs[pred_label]

  # Remove top k influential elements from the text; tokens containing '[' (how BERT-style
  # special tokens such as [CLS] are written) are dropped as well
  cutoff   = sorted(abs(item[1]) for item in explanation)[-k:][0]
  new_text = " ".join([c[0] for c in explanation if (abs(c[1]) < cutoff) and ('[' not in c[0])])

  # Compute new model probability for the originally predicted class
  new_pred_score = class_probabilities(new_text)[pred_label]

  # Compute difference
  prob_diff = abs(pred_score - new_pred_score)

  return prob_diff


def pgu(text, supervised_model, explanation, k):
  """
  Prediction Gap on Unimportant feature perturbation: Metric that assesses whether the extracted rationales contain enough signal
  to come to a disposition, measured as the difference in prediction probability that results from removing the unimportant features

  args:
    text: Original input text
    supervised_model: Name of the supervised model; key into the module-level `models` / `tokenizers` dicts
    explanation: Predicted explanation, a non-empty sequence of (token, weight) pairs
    k: Number of most influential tokens to keep (positive integer). Tokens tied with the
       k-th largest absolute weight are all kept.

  returns:
    prob_diff: Absolute difference between the probability of the originally predicted class
               on the full text and on the text rebuilt from the top k tokens only

  raises:
    ValueError: if k is smaller than 1 or the explanation is empty
  """
  # k = 0 used to keep every token silently, because lst[-0:] is the whole list
  if k < 1:
    raise ValueError(f"k must be a positive integer, got {k}")
  if len(explanation) == 0:
    raise ValueError("explanation must contain at least one (token, weight) pair")

  # Instantiate the current supervised model
  tokenizer = tokenizers[supervised_model]
  model     = models[supervised_model]

  def class_probabilities(sample):
    # Single-sample forward pass; inference only, so no autograd graph is built
    token_ids = torch.tensor(tokenizer.encode(sample)).unsqueeze(0)
    with torch.no_grad():
      logits = model(token_ids).logits[0]
    # torch.softmax is numerically stable for large logits, unlike exp(x) / sum(exp(x))
    return torch.softmax(logits, dim=0).numpy()

  # Compute model prediction with text as is
  probs      = class_probabilities(text)
  pred_label = probs.argmax()
  pred_score = probs[pred_label]

  # Keep only the top k influential elements of the text; tokens containing '[' (how BERT-style
  # special tokens such as [CLS] are written) are dropped as well
  cutoff   = sorted(abs(item[1]) for item in explanation)[-k:][0]
  new_text = " ".join([c[0] for c in explanation if (abs(c[1]) >= cutoff) and ('[' not in c[0])])

  # Compute new model probability for the originally predicted class
  new_pred_score = class_probabilities(new_text)[pred_label]

  # Compute difference
  prob_diff = abs(pred_score - new_pred_score)

  return prob_diff

### 5.3 Stability metrics

In [None]:
def ris():
