# Preliminary procedures

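First, we install the Hugging Face Transformers library, clone the HateXplain repository to obtain its dataset, and choose which model (Bert or DistilBert) to evaluate.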

In [None]:
#Notebook banner: names the dataset and the model family being evaluated
for banner_line in (
    '#####################################################################',
    '#                           HateXplain                              #',
    '#####################################################################',
    '#                    (distil)BERT Evaluation                        #',
    '#####################################################################',
):
    print(banner_line)

#####################################################################
#                           HateXplain                              #
#####################################################################
#                    (distil)BERT Evaluation                        #
#####################################################################


In [None]:
# Silence every UserWarning for the rest of the session (the filter is global,
# so it also hides UserWarnings raised by later cells), then install the
# Hugging Face transformers library through a shell escape.
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
! pip install transformers



In [None]:
#Clone the HateXplain dataset
!git clone https://github.com/hate-alert/HateXplain/
!mv '/content/HateXplain/Data' '/content/Data'

#Choose between Bert and DistilBert
model_used = 'Bert'

fatal: destination path 'HateXplain' already exists and is not an empty directory.
mv: cannot stat '/content/HateXplain/Data': No such file or directory


# Prepare HateXplain Dataset

We now load the HateXplain dataset. We extract the samples that are labeled 'hate speech' or 'normal', preprocess the respective text, and create the ground truth vector.

We then split the samples into three datasets: train, validation, and test (70% - 10% - 20%).

In [None]:
import json
import numpy as np
import pandas as pd
import re
from scipy.special import softmax

#Import the tokenizer / classifier classes that match `model_used` and load the
#matching pretrained uncased tokenizer. Only the selected family is imported,
#so the class names of the other family stay undefined for the rest of the notebook.
if model_used == 'Bert':
    from transformers import BertTokenizerFast
    from transformers import BertForSequenceClassification, Trainer, TrainingArguments
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
else:
    #Any value other than 'Bert' selects DistilBert
    from transformers import DistilBertTokenizerFast 
    from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments 
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


#Preprocess text tokens by removing the characters <, > and * and replacing
#every standalone number with the word 'number'
def preprocess(tokens):
    """Return a cleaned copy of ``tokens``; the input list is left untouched.

    Parameters
    ----------
    tokens : list of str
        Tokens of a single post.

    Returns
    -------
    list of str
        A new list of the same length. In every token the characters ``<``,
        ``>`` and ``*`` are deleted, then each run of digits delimited by word
        boundaries is replaced with ``'number'``. A token made only of the
        deleted characters becomes the empty string.
    """
    cleaned = []
    for token in tokens:
        token = re.sub(r"[<\*>]", "", token)
        cleaned.append(re.sub(r"\b\d+\b", "number", token))
    return cleaned


#Function to extract ground truth attention vector from the annotator rationales
def getGroundTruth(key, tokens):
    """Build the ground truth vector of one sample at BERT-token granularity.

    Reads the rationales of sample ``key`` from the global ``data`` dictionary
    and uses the global ``tokenizer`` to find out into how many pieces every
    entry of ``tokens`` is split.

    Returns a list of 0/1 integers: a position is 1 when at least one rationale
    has a truthy weight there. Each rationale weight is repeated once per
    tokenizer piece of the token it is paired with (pairing stops at the
    shorter of the rationale and the token list).
    """
    annotator_rationales = data[key]['rationales']

    #Number of tokenizer pieces produced by each original token
    pieces_per_token = [len(tokenizer.tokenize(token)) for token in tokens]

    #Stretch every rationale so that it has one weight per tokenizer piece
    stretched_rationales = []
    for rationale in annotator_rationales:
        stretched = []
        for weight, piece_count in zip(rationale, pieces_per_token):
            stretched.extend([weight] * piece_count)
        stretched_rationales.append(stretched)

    #Union of the annotator rationales, position by position
    return [int(any(column)) for column in zip(*stretched_rationales)]

#Load HateXplain dataset
with open('Data/dataset.json', 'r') as fp:
    data = json.load(fp)

X, y, ground_truth = [], [], []

#For each sample
for key in data:
    sample = data[key]

    #Construct the string from the preprocessed tokens
    tokens = preprocess(sample['post_tokens'])
    text = ' '.join(tokens)

    #Get the labels of the 3 annotators and the label chosen most often
    annotator_labels = [sample['annotators'][annotator_id]['label'] for annotator_id in range(3)]
    final_label = max(annotator_labels, key=annotator_labels.count)

    #Skip samples whose most frequent label was given by a single annotator only
    if annotator_labels.count(final_label) == 1:
        continue

    #Keep only "hate speech" and "normal speech" samples
    if final_label == 'hatespeech':
        X.append(text)
        y.append(1)
        ground_truth.append(getGroundTruth(key, tokens))
    elif final_label == 'normal':
        X.append(text)
        y.append(0)
        ground_truth.append(0) #Placeholder: these ground truths are not used

X = np.array(X)
y = np.array(y)

class_names = ['noHateSpeech', 'hateSpeech']

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
#Class balance of the extracted samples (label 1 = hate speech)
total_samples = len(y)
hate_samples = sum(y)
print("Total amount:", total_samples)
print("Hate speech:", hate_samples)
print("Non Hate speech:", total_samples - hate_samples)
print()

#Show text, label and ground truth vector of one sample
print("Example sample")
for sample_field in (X, y, ground_truth):
    print(sample_field[3])

Total amount: 13749
Hate speech: 5935
Non Hate speech: 7814

Example sample
user i am bit confused coz chinese ppl can not access twitter thn how this ching chong using it i think he pakistani 🤔 🤔 🤔
1
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
#Split dataset into train, validation, and test (70% - 10% - 20%)
from sklearn.model_selection import train_test_split

#Split the sample positions together with texts and labels, so that the test
#samples can be mapped back to their ground truth entries afterwards
indices = np.arange(len(y))
#First split: 20% of all samples become the test set (stratified on the label)
train_texts, test_texts, train_labels, test_labels, _, test_indexes = train_test_split(list(X), y, indices, stratify=y, test_size=.2, random_state=42)
#Keep ground truth for test samples only
#NOTE: this rebinds `ground_truth` to the test subset, so running this cell a
#second time without re-running the loading cell indexes the already reduced list
ground_truth = [ground_truth[x] for x in test_indexes]

#Second split: express 10% of the full dataset as a fraction of the remaining
#training samples, then carve the validation set out of them
size = (0.1 * len(y)) / len(train_labels)
train_texts, validation_texts, train_labels, validation_labels = train_test_split(list(train_texts), train_labels, stratify=train_labels, test_size=size, random_state=42)

print("Training samples: ",len(train_labels))
print("Validation samples: ",len(validation_labels))
print("Test samples: ",len(test_labels))

Training samples:  9624
Validation samples:  1375
Test samples:  2750


In [None]:
#Construct hard ground truth
import more_itertools as mit

#Return start and end index of each hard rationale
def find_rationale_range(iterable):
    """Yield a half-open ``(start, end)`` pair for every run of consecutive integers.

    Parameters
    ----------
    iterable : iterable of int
        Token positions, e.g. ``[2, 3, 4, 7]``.

    Yields
    ------
    tuple of (int, int)
        ``(first, last + 1)`` of each maximal run in which every value equals
        the previous value plus one, e.g. ``(2, 5)`` and ``(7, 8)``.
        Nothing is yielded for an empty input.

    The scan is self-contained: it needs no helper library, and a run of a
    single value needs no special case because ``first == last`` there.
    """
    run_start = run_end = None
    for value in iterable:
        if run_start is None:
            #First value opens the first run
            run_start = run_end = value
        elif value == run_end + 1:
            #Value extends the current run
            run_end = value
        else:
            #Gap (or repeated / decreasing value): close the run, open a new one
            yield (run_start, run_end + 1)
            run_start = run_end = value
    if run_start is not None:
        yield (run_start, run_end + 1)

def extract_hard_truth(ground_truth):
    """Turn each 0/1 ground truth vector into a list of ``(start, end)`` spans.

    Entries equal to ``0`` (the placeholder stored for non hate speech samples)
    are copied as ``0``. Every other entry is replaced by the list of spans that
    ``find_rationale_range`` yields for the positions whose weight equals 1.
    """
    spans_per_sample = []
    for token_mask in ground_truth:
        if token_mask == 0:
            spans_per_sample.append(0)
            continue
        flagged_positions = [position for position, flag in enumerate(token_mask) if flag == 1]
        spans_per_sample.append(list(find_rationale_range(flagged_positions)))
    return spans_per_sample

hard_truth = extract_hard_truth(ground_truth)
#Show the same test sample as a 0/1 vector and as spans
for truth_view in (ground_truth, hard_truth):
    print(truth_view[3])

#Calculate ground truth average across hate speech samples
#(entries equal to 0 are the placeholders of non hate speech samples)
avg_per_sample = [np.average(sample_truth) for sample_truth in ground_truth if sample_truth != 0]
ground_truth_avg = np.average(avg_per_sample)
print("Ground truth average: ",ground_truth_avg)

[0, 0, 1, 1, 1, 1, 1]
[(2, 7)]
Ground truth average:  0.4442347618299537


# Prepare (Distil)Bert

We create the train, validation, and test datasets using the (Distil)Bert tokenizer, and set the training parameters.

In [None]:
import torch

#Set training arguments
#Evaluation, checkpointing and logging all happen once per epoch.
#NOTE(review): no option for restoring the best checkpoint is passed here, so
#presumably the model kept after training is the one from the last epoch - confirm
training_args = TrainingArguments(
    evaluation_strategy='epoch',     # evaluation frequency
    save_strategy='epoch',           # model checkpoint frequency
    logging_strategy='epoch',        # logging frequency
    log_level='warning',             # logging level
    output_dir='./results',          # output directory
    num_train_epochs=6,              # total number of training epochs
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=200,                # number of warmup steps for learning rate scheduler
    weight_decay=0.001,              # strength of weight decay
    logging_dir='./logs'             # directory for storing logs
)

In [None]:
#Create the train and test datasets
class HateSpeechDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with one label per sample.

    Parameters
    ----------
    encodings : mapping
        Maps a field name (e.g. 'input_ids') to an indexable container holding
        one entry per sample: nested lists, arrays or tensors.
    labels : sequence
        One label per sample; its length defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a new tensor.

        Values that already are tensors (the case when a dataset is built from
        DataLoader batches) are copied with ``clone().detach()``, which gives
        the same result as ``torch.tensor(value)`` without the copy-construct
        UserWarning.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field of sample ``idx`` plus 'labels'."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

#Tokenize the three splits with identical settings (truncation and padding on)
train_encodings, validation_encodings, test_encodings = (
    tokenizer(list(split_texts), truncation=True, padding=True)
    for split_texts in (train_texts, validation_texts, test_texts)
)

#Pair the encodings of each split with its labels
train_dataset = HateSpeechDataset(train_encodings, train_labels)
validation_dataset = HateSpeechDataset(validation_encodings, validation_labels)
test_dataset = HateSpeechDataset(test_encodings, test_labels)

# Create the explanation methods

Four methods are provided: 

1.   Averaging attention across all layers
2.   Averaging attention only across last layer
3.   LIME
4.   Integrated Gradients

In [None]:
#Explanation methods to evaluate. Every entry must be its own string: without
#the comma between 'LIME' and 'IG' Python joins them into the single name 'LIMEIG'.
#Due to time constraints, we can run the rest of the techniques and then run LIME standalone
#on the trained model, after loading it with output_attentions=False:
#methods = ['attention_all', 'attention_last', 'IG']
methods = ['attention_all', 'attention_last', 'LIME', 'IG']


#Import LIME
! pip install lime
import lime
from lime.lime_text import LimeTextExplainer

#Import Integrated Gradients
! pip install transformers-interpret
from transformers_interpret import SequenceClassificationExplainer

#Predictor function for LIME
def predictor(texts):
    """Return the class probabilities of the model for a batch of texts.

    The texts are tokenized with the global ``tokenizer``, wrapped in a
    ``HateSpeechDataset`` with all-zero dummy labels, and passed to
    ``model_predict``; the resulting logits are turned into probabilities with
    a softmax over axis 1.
    """
    encodings = tokenizer(list(texts), truncation=True, padding=True)
    dummy_labels = np.zeros(len(texts), dtype=int)
    batch_logits = model_predict(HateSpeechDataset(encodings, dummy_labels))
    return softmax(batch_logits, axis = 1)

#Base function for explanations
def explainTexts(texts, method):
    """Compute, for every text, a list of ``(bert_token, score)`` pairs.

    Parameters
    ----------
    texts : sequence of str
        Documents to explain.
    method : str
        'LIME', 'IG', 'attention_all' or 'attention_last'.

    Returns
    -------
    list of list of tuple
        One list per text. For 'IG' and the attention methods the first and
        last entry (the special tokens) are removed.

    Raises
    ------
    ValueError
        If ``method`` is not one of the four supported names. Without this
        check a misspelled name would fall into the attention branch and
        return the raw attention array as scores.
    """
    supported_methods = ('LIME', 'IG', 'attention_all', 'attention_last')
    if method not in supported_methods:
        raise ValueError("Unknown explanation method %r, expected one of %s" % (method, ', '.join(supported_methods)))

    attributions = []
    if method == 'LIME':
        explainer = LimeTextExplainer(class_names=class_names, split_expression=r'\s+', bow=False)
        for sample_id, test_phrase in enumerate(texts):
            if sample_id % 10 == 0:
                print("Current sample:", sample_id)
            exp = explainer.explain_instance(test_phrase, predictor, num_features=200, num_samples=2000)
            #Weights of the first explained label, keyed by word position
            explanation_dict = dict(list(exp.as_map().values())[0])
            #Assign scores to bert tokens
            scores = []
            #Split on runs of whitespace, like the explainer's split_expression,
            #so that word positions match the keys of explanation_dict even when
            #the text contains repeated spaces
            for word_id, word in enumerate(test_phrase.split()):
                #Words without a weight (more words than num_features) score 0
                word_weight = explanation_dict.get(word_id, 0.0)
                bert_ids = tokenizer.encode(word, add_special_tokens = False)
                #Every bert token inherits the weight of the word it came from
                for bert_token in tokenizer.convert_ids_to_tokens(bert_ids):
                    scores.append((bert_token, word_weight))
            attributions.append(scores)
    elif method == 'IG':
        explainer = SequenceClassificationExplainer(model, tokenizer, custom_labels = class_names)
        for test_phrase in texts:
            scores = explainer(test_phrase)
            #Delete CLS and SEP tokens
            scores.pop(0)
            scores.pop(-1)
            attributions.append(scores)
    else:
        for text_id in range(len(texts)):
            #Encode a single text as a batch of size one
            encodings = tokenizer(texts[text_id])
            encodings['input_ids'] = np.reshape(encodings['input_ids'], (1,-1))
            encodings['attention_mask'] = np.reshape(encodings['attention_mask'], (1,-1))
            dataset = HateSpeechDataset(encodings, np.zeros(1, dtype=int))
            #assumes the array is (layers, batch, heads, query, key) - TODO confirm
            attention_matrix = model_predict(dataset, return_attention=True)
            if method == 'attention_all':
                #Average over the first axis, then twice over axis 1 of the result
                attention_matrix = attention_matrix.mean(axis=0).mean(axis=1).mean(axis=1)
            elif method == 'attention_last':
                #Last entry of the first axis, index 0 of the third axis of that
                #entry, averaged over its axis 1
                attention_matrix = np.mean(attention_matrix[-1][:,:,0,:],axis=1)
            tokens = tokenizer.convert_ids_to_tokens(encodings['input_ids'][0])
            scores = [(tokens[token_id], attention_matrix[0][token_id]) for token_id in range(len(tokens))]
            #Delete the first and last token (special tokens)
            scores.pop(0)
            scores.pop(-1)
            attributions.append(scores)
    return attributions



# Define performance and interpretability metrics

For performance, we import accuracy, precision, recall and F1_score from sklearn, and define specificity and sensitivity.

For interpretability, we define rationale based metrics, average nonzero-weights, robustness and faithfulness.

In [None]:
#Define performance metrics
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score

def specificity(y_true, y_pred):
    """Return the true negative rate ``tn / (tn + fp)`` for 0/1 labels.

    Returns 0 when ``y_true`` contains no negative sample.
    """
    #labels=[0, 1] keeps the matrix 2x2 even when only one class occurs in the
    #inputs; otherwise the four-way unpacking below fails
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    if (tn+fp) > 0:
        return tn/(tn+fp)
    return 0

def sensitivity(y_true, y_pred):
    """Return the true positive rate ``tp / (tp + fn)`` for 0/1 labels.

    Returns 0 when ``y_true`` contains no positive sample.
    """
    #labels=[0, 1] keeps the matrix 2x2 even when only one class occurs in the
    #inputs; otherwise the four-way unpacking below fails
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    if (tp+fn) > 0:
        return tp/(tp+fn)
    return 0

In [None]:
#Define ground truth based metrics
#Code based on ERASER benchmark: https://github.com/jayded/eraserbenchmark/blob/master/rationale_benchmark/metrics.py
from sklearn.metrics import precision_recall_curve, auc
from sklearn.preprocessing import maxabs_scale

#Extract only weights, discarding tokens in each tuple
def extract_weights(attributions):
    """Return, for every explanation, the list of the second element of its pairs.

    ``attributions`` is a list of explanations, each a list of
    ``(token, weight)`` pairs; new lists are built, the input is not modified.
    """
    return [[pair[1] for pair in explanation] for explanation in attributions]

#Scale weights to [-1, 1]
def scale_weights(attributions):
    """Return a new list with every explanation scaled by its maximum absolute weight.

    Parameters
    ----------
    attributions : list of sequences of numbers
        One sequence of weights per explanation. The list is not modified.

    Returns
    -------
    list of 1-D numpy arrays
        ``maxabs_scale`` applied to each explanation separately. An explanation
        without weights yields an empty array instead of being passed to
        ``maxabs_scale``, which does not accept arrays without samples.
    """
    scaled = []
    for explanation in attributions:
        if len(explanation) == 0:
            scaled.append(np.array([], dtype=float))
            continue
        #maxabs_scale works on columns: scale as a single column, then flatten
        column = maxabs_scale(np.reshape(explanation, (-1,1)))
        scaled.append(np.reshape(column, -1))
    return scaled

#Extract rationales with weight >= threshold
def extract_hard_rationales(attributions, threshold):
    """Return, per explanation, the spans of adjacent positions scoring >= threshold.

    The weights are first passed through ``scale_weights``. Each span is a
    half-open ``(start, end)`` tuple; a position directly following the current
    span extends it, any other qualifying position starts a new span.
    """
    hard_rationales = []
    for weights in scale_weights(attributions):
        spans = []
        for position, weight in enumerate(weights):
            if not weight >= threshold:
                continue
            if spans and spans[-1][1] == position:
                #Position is adjacent to the last span: grow it by one
                spans[-1] = (spans[-1][0], position + 1)
            else:
                spans.append((position, position + 1))
        hard_rationales.append(spans)
    return hard_rationales

#AUPRC metric for soft rationales
def auprc(attributions, ground_truth):
    """Return the mean area under the precision-recall curve over all samples.

    For every (weights, truth) pair a precision-recall curve is computed with
    the truth as labels and the weights as scores; the areas under these curves
    are averaged.
    """
    curves = (precision_recall_curve(truth, prediction) for prediction, truth in zip(attributions, ground_truth))
    return np.average([auc(recall, precision) for precision, recall, _ in curves])

def calculate_macro_f1(true_positives, hard_rationales, hard_truth):
    """Return the F1 score of the macro-averaged precision and recall.

    Parameters
    ----------
    true_positives : sequence of numbers
        Number of correct predictions of every sample.
    hard_rationales : sequence of sequences
        Predictions of every sample; their count is the precision denominator.
    hard_truth : sequence of sequences
        Ground truth of every sample; their count is the recall denominator.

    Returns
    -------
    The harmonic mean of the averaged per-sample precision and recall, or 0
    when either average is 0. A sample without predictions has precision 0 and,
    in the same way, a sample without ground truth has recall 0 instead of
    raising ZeroDivisionError.
    """
    recall_per_sample = [true_positives[i] / len(truth) if len(truth) > 0 else 0 for i, truth in enumerate(hard_truth)]
    precision_per_sample = [true_positives[i] / len(rationales) if len(rationales) > 0 else 0 for i, rationales in enumerate(hard_rationales)]
    macro_recall = np.average(recall_per_sample)
    macro_precision = np.average(precision_per_sample)
    if macro_recall == 0 or macro_precision == 0:
        return 0
    return 2 * (macro_precision * macro_recall) / (macro_precision + macro_recall)

def IOU_F1(hard_rationales, hard_truth):
    """Return the macro F1 score of span-level matches.

    For every predicted span the best IOU (Intersection Over Union, measured in
    token positions) against the truth spans of the same sample is computed; a
    predicted span counts as a true positive when that IOU is at least 0.5.
    The counts are handed to ``calculate_macro_f1``.
    """
    match_threshold = 0.5
    true_positives = []
    for sample_id, predicted_spans in enumerate(hard_rationales):
        matched_spans = 0
        for predicted in predicted_spans:
            predicted_tokens = set(range(predicted[0], predicted[1]))
            best_iou = 0.0
            for truth in hard_truth[sample_id]:
                truth_tokens = set(range(truth[0], truth[1]))
                union_size = len(predicted_tokens | truth_tokens)
                iou = 0 if union_size == 0 else len(predicted_tokens & truth_tokens) / union_size
                if iou > best_iou:
                    best_iou = iou
            matched_spans += int(best_iou >= match_threshold)
        true_positives.append(matched_spans)
    return calculate_macro_f1(true_positives, hard_rationales, hard_truth)

def token_F1(hard_rationales, hard_truth):
    """Return the macro F1 score at token level.

    Predicted and truth spans are expanded into the token positions they cover;
    the true positives of a sample are the positions present in both. The
    counts and the expanded lists are handed to ``calculate_macro_f1``.
    """
    def covered_positions(spans):
        #All token positions inside the half-open (start, end) spans
        return [position for span in spans for position in range(span[0], span[1])]

    token_rationales = []
    token_truth = []
    for sample_id in range(len(hard_rationales)):
        token_rationales.append(covered_positions(hard_rationales[sample_id]))
        token_truth.append(covered_positions(hard_truth[sample_id]))

    true_positives = [len(set(predicted) & set(expected)) for predicted, expected in zip(token_rationales, token_truth)]
    return calculate_macro_f1(true_positives, token_rationales, token_truth)

def hard_rationale_metrics(attributions, hard_truth, ground_truth_avg):
    """Return IOU F1 and token F1 for the thresholds 0.5, 0.33 and ``ground_truth_avg``.

    The result maps 'IOU F1 t=<threshold>' and 'Token F1 t=<threshold>' to the
    scores of the hard rationales extracted with that threshold.
    """
    dictionary = {}
    for threshold in (0.5, 0.33, ground_truth_avg):
        hard_rationales = extract_hard_rationales(attributions, threshold)
        suffix = str(threshold)
        iou_score = IOU_F1(hard_rationales, hard_truth)
        token_score = token_F1(hard_rationales, hard_truth)
        dictionary['IOU F1 t=' + suffix] = iou_score
        dictionary['Token F1 t=' + suffix] = token_score
    return dictionary

def rationales_metrics(attributions, ground_truth, hard_truth, ground_truth_avg):
    """Return AUPRC and the hard rationale metrics for the hate speech samples.

    Only samples whose ground truth is not the placeholder 0 are evaluated, and
    negative weights are replaced with 0 before the metrics are computed.
    """
    all_weights = extract_weights(attributions)

    #Get only hate speech samples
    hate_ids = [sample_id for sample_id, truth in enumerate(ground_truth) if truth != 0]
    hate_weights = [all_weights[sample_id] for sample_id in hate_ids]
    hate_ground_truth = [ground_truth[sample_id] for sample_id in hate_ids]
    hate_hard_truth = [hard_truth[sample_id] for sample_id in hate_ids]

    #Keep only positive weights, turn negative weights to 0
    for weights in hate_weights:
        for position, weight in enumerate(weights):
            if weight < 0:
                weights[position] = 0

    metrics = {'AUPRC': auprc(hate_weights, hate_ground_truth)}
    metrics.update(hard_rationale_metrics(hate_weights, hate_hard_truth, ground_truth_avg))
    return metrics

In [None]:
#Define functional based metrics

def nonzero_weights(attributions):
    """Return the average number of weights per explanation that are not close to zero.

    The weights are scaled with ``scale_weights``; a weight counts when its
    absolute value is above 0.001.
    """
    threshold = 0.001
    scaled_weights = scale_weights(extract_weights(attributions))
    counted = 0
    for explanation in scaled_weights:
        counted += (np.abs(explanation) > threshold).sum()
    return counted / len(scaled_weights)

def robustness(attributions, method):
    """Return the average change of the explanations after appending one extra token.

    An emoji is appended to every text of the global ``test_texts`` (presumably
    mapped to the UNK token by the tokenizer - confirm), the texts are
    explained again with ``method``, and for every sample the absolute
    differences between the scaled new and scaled initial weights are summed.
    """
    initial_weights = extract_weights(attributions)

    #Add the extra token to the end of each test document
    tweaked_texts = [text + ' 😁' for text in test_texts]

    #Get new weights
    new_weights = extract_weights(explainTexts(tweaked_texts, method))

    #Scale weights to [-1, 1]
    initial_weights = scale_weights(initial_weights)
    new_weights = scale_weights(new_weights)

    differences = []
    for sample_id in range(len(initial_weights)):
        #Pad initial weights with zeros up to the length of the new weights
        padded_initial = initial_weights[sample_id]
        while len(new_weights[sample_id]) > len(padded_initial):
            padded_initial = np.append(padded_initial, 0.0)
        sample_change = np.sum(np.absolute(np.subtract(new_weights[sample_id], padded_initial)))
        differences.append(sample_change)
    return np.average(differences)

def faithfulness(attributions, method):
    """Return the average probability drop after masking the top-weighted tokens.

    The probabilities of the global ``test_dataset`` are compared with those of
    a copy of the test set in which, for every sample, the attention mask is
    set to 0 at the positions holding the maximum weight. The drop is measured
    on the class originally predicted for each sample. ``method`` is not used.
    """
    #Get original probabilities
    original_probabilities = softmax(model_predict(test_dataset), axis = 1)

    #Create new attention masks to ignore most important tokens for each test document
    tweaked_encodings = tokenizer(list(test_texts), truncation=True, padding=True)
    for sample_id, explanation in enumerate(extract_weights(attributions)):
        max_weight = max(explanation)
        #Shift by one: the explanations do not contain the leading special token
        max_indexes = [position + 1 for position, value in enumerate(explanation) if value == max_weight]
        sample_mask = tweaked_encodings['attention_mask'][sample_id]
        for masked_position in max_indexes:
            sample_mask[masked_position] = 0

    #Get tweaked probabilities
    tweaked_dataset = HateSpeechDataset(tweaked_encodings, test_labels)
    tweaked_probabilities = softmax(model_predict(tweaked_dataset), axis = 1)

    #Get predicted class of each original prediction
    y_preds = [np.argmax(sample_probabilities) for sample_probabilities in original_probabilities]

    #Get probability of predicted class for each sample
    original_y = original_probabilities[np.arange(len(original_probabilities)), y_preds]
    tweaked_y = tweaked_probabilities[np.arange(len(tweaked_probabilities)), y_preds]

    return np.average(original_y - tweaked_y)

# Evaluate (Distil)BERT 

We now train the (Distil)Bert model, save its weights, and then run the performance and interpretability evaluation metrics on the test dataset and output the results.

In [None]:
#Run a prediction on a dataset and return the logits or the attention matrix
def model_predict(dataset, return_attention=False):
    """Predict ``dataset`` with the global ``trainer``.

    With ``return_attention=True`` the whole dataset is predicted at once and
    element 1 of the predictions is returned as a numpy array. Otherwise the
    dataset is predicted in batches of 100 samples and element 0 of the
    predictions of every batch (the logits) is collected into one numpy array.
    """
    if return_attention:
        attention_matrix = trainer.predict(dataset).predictions[1]
        return np.array(list(attention_matrix))

    all_logits = []
    #Change batch_size accordingly
    for batch in torch.utils.data.DataLoader(dataset, batch_size=100):
        batch_encodings = {key:batch[key] for key in ['input_ids','attention_mask']}
        batch_dataset = HateSpeechDataset(batch_encodings, batch['labels'])
        all_logits.extend(trainer.predict(batch_dataset).predictions[0])
    return np.array(all_logits)

#Load the pretrained classifier that matches `model_used`.
#output_attentions=True is requested so that the attention based explanation
#methods can read the attention matrices from the predictions
if model_used == 'Bert':
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", output_attentions=True)
else:
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", output_attentions=True)

#Train model
#`trainer` is also the global object that model_predict uses for every prediction
trainer = Trainer(
            model=model,                         # the instantiated Transformers model to be trained
            args=training_args,                  # training arguments
            train_dataset=train_dataset,         # training dataset
            eval_dataset=validation_dataset      # evaluation dataset
)
trainer.train()

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch,Training Loss,Validation Loss
1,0.5429,0.410997
2,0.4369,0.399812
3,0.3003,0.502414
4,0.2124,0.671766
5,0.1525,0.6294
6,0.0971,0.66348


TrainOutput(global_step=14436, training_loss=0.2903610202010301, metrics={'train_runtime': 2553.7235, 'train_samples_per_second': 22.612, 'train_steps_per_second': 5.653, 'total_flos': 5311644874505280.0, 'train_loss': 0.2903610202010301, 'epoch': 6.0})

In [None]:
#Save model in a directory named after the selected model ('bert_model/' for
#Bert, 'distilbert_model/' for DistilBert), so that a DistilBert run is not
#stored under the Bert name
model.save_pretrained(model_used.lower() + "_model/")

In [None]:
logits = model_predict(test_dataset)

#Get predicted labels (index of the largest logit of every sample)
y_preds = [np.argmax(sample_logits) for sample_logits in logits]

#Compute performance metrics
name = model_used
performance_scores = {name:{}}
performance_scores[name]['F1'] = f1_score(test_labels, y_preds, average='macro')
performance_scores[name]['Precision'] = precision_score(test_labels, y_preds, average='macro')
performance_scores[name]['Recall'] = recall_score(test_labels, y_preds, average='macro')
performance_scores[name]['Accuracy'] = accuracy_score(test_labels, y_preds)
performance_scores[name]['Specificity'] = specificity(test_labels, y_preds)
performance_scores[name]['Sensitivity'] = sensitivity(test_labels, y_preds)

#Save metrics in a file named after the selected model ('bert_performance.csv'
#for Bert, 'distilbert_performance.csv' for DistilBert) and display them
pd.set_option('display.max_columns', None)
df = pd.DataFrame(performance_scores).T
df.to_csv(name.lower() + '_performance.csv')
df

In [None]:
#Compute explanation vectors for test dataset, one entry per method
attributions = {}
for method in methods:
    print("Current method: ",method)
    attributions[method] = explainTexts(test_texts, method)

#Print an example: the explanation of the last test sample for every method
for method, explanations in attributions.items():
    print(method)
    print(explanations[-1])

Current method:  attention_all


Current method:  attention_last
Current method:  IG
attention_all
[('user', 0.018255835), ('china', 0.01690993), ('##man', 0.026505345), ('satan', 0.045628767), ('##ist', 0.022424813), ('lee', 0.01888416), ('hs', 0.017990084), ('##ien', 0.016633952), ('lo', 0.013514362), ('##ong', 0.019136101), ('malaysian', 0.042037945), ('criminals', 0.039498292), ('have', 0.02232611), ('been', 0.02109984), ('gossip', 0.02260688), ('##ing', 0.017562084), ('false', 0.021904739), ('information', 0.019355634), ('to', 0.020091066), ('the', 0.018966367), ('nt', 0.01700514), ('##uc', 0.016089827), ('supermarket', 0.031740677), ('se', 0.014989028), ('##mba', 0.018736701), ('##wang', 0.01750711), ('mart', 0.016503518), ('staff', 0.021204531), ('since', 0.020392813), ('mid', 0.015896866), ('number', 0.018105108), ('swift', 0.0152528975), ('discoveries', 0.021038383), ('if', 0.019939095), ('questioned', 0.02578365)]
attention_last
[('user', 0.02724658), ('china', 0.027133947), ('##man', 0.027255243), ('satan',

In [None]:
#Compute interpretability metrics: one dictionary of scores per method
interpretability_scores = {}
for method in methods:
    print("Current method: ",method)
    method_scores = interpretability_scores.setdefault(method, {})
    method_scores.update(rationales_metrics(attributions[method], ground_truth, hard_truth, ground_truth_avg))
    method_scores['average_nonzero_weights'] = nonzero_weights(attributions[method])
    method_scores['robustness'] = robustness(attributions[method], method)
    method_scores['faithfulness'] = faithfulness(attributions[method], method)

#Save metrics in a file named after the selected model ('bert_interpretations.csv'
#for Bert, 'distilbert_interpretations.csv' for DistilBert) and display them
df = pd.DataFrame(interpretability_scores).T
df.to_csv(model_used.lower() + '_interpretations.csv')
df

Current method:  attention_all


Current method:  attention_last
Current method:  IG
