In [1]:
!pip install transformers_interpret --quiet
import pandas as pd
import time, datetime, numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from transformers_interpret import SequenceClassificationExplainer
import torch
from torch.utils.data import Dataset, DataLoader
import time

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 kB[0m [31m545.2 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from google.colab import drive
# Mount Google Drive so the shared project folder is reachable under /content/drive.
drive.mount('/content/drive')

# Read esnli_test.csv from the project folder; later cells use its
# Sentence1, Sentence2, Explanation_1 and gold_label columns.
test = pd.read_csv('/content/drive/MyDrive/[CS4248] Project Folder/data/esnli_test.csv')

Mounted at /content/drive


In [3]:
def select_cols(df, col_list):
    """
    Return the subset of `df` made up of the columns named in `col_list`.

    The selection is delegated to `df[col_list]`, so the columns come back
    in the order given by `col_list`.
    """
    selected = df[col_list]
    return selected

def combine_sentences(df, col_list):
    """
    Return a copy of `df` with an extra 'combined_text' column.

    For every row, the values of the columns in `col_list` are converted to
    strings, joined with the literal '[SEP]' and prefixed with the literal
    '[CLS]'. The input frame itself is left unmodified.
    """
    # Row-wise join of the selected columns, e.g. 'a[SEP]b[SEP]c'.
    joined = df[col_list].astype(str).agg('[SEP]'.join, axis=1)
    return df.assign(combined_text='[CLS]' + joined)

In [5]:
target_cols = ['Sentence1', 'Sentence2', 'Explanation_1', 'gold_label'] # Premise, Hypothesis, Explanation, and the gold label (last, so target_cols[:-1] is text only)
# Keep only the columns needed downstream.
test_df = select_cols(test, target_cols)

In [6]:
# Integer class id for each gold label string.
lables = dict(entailment=0, neutral=1, contradiction=2)

# Build 'combined_text' from every target column except the trailing gold_label.
test_df = combine_sentences(test_df, target_cols[:-1])

# gold_label values missing from the mapping become NaN in 'labels'.
test_df['labels'] = test_df['gold_label'].map(lables)

In [None]:
# Tokenizer and 3-way sequence classifier built on bert-base-uncased.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False,
)

# Freeze every parameter except encoder layer 11 and the classifier head.
# Note: the pooler is frozen as well. The layer is matched on the explicit
# 'encoder.layer.11.' prefix rather than the bare substring '11', so an
# unrelated parameter name that happens to contain "11" is never left trainable.
for name, param in model.named_parameters():
    if 'classifier' not in name and 'encoder.layer.11.' not in name:
        param.requires_grad = False

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [10]:
# Name of the fine-tuned variant; also reused for the result file names below.
model_type = 'premise_hypothesis_explanation'
model_save_path = f"/content/drive/MyDrive/[CS4248] Project Folder/models/{model_type}.pth"
# map_location remaps the stored tensors onto the device chosen earlier, so a
# checkpoint saved on a GPU still loads on a CPU-only runtime.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
model.load_state_dict(torch.load(model_save_path, map_location=device))

<All keys matched successfully>

In [11]:
# Debug switches read by compute_segment_attributions.
VERBOSE = False  # when True, print each row's segments and the attribution entries used
PROGRESS_VERBOSE = True  # when True, print the elapsed time whenever the row index is a multiple of 100

def normalize_dict(dictionary):
    """
    Normalize a dictionary so that its values sum to 1.

    Every value is replaced by abs(value) / sum(abs(v) for v in values), so
    the sign of each value is discarded and the result is non-negative.

    Input:
      - dictionary: mapping of key -> numeric value
    Output: a new dict with the same keys. When the absolute values sum to 0
    (all values are zero, or the dict is empty) every key maps to 0.0
    instead of raising ZeroDivisionError.
    """
    total_abs_sum = sum(abs(value) for value in dictionary.values())

    if total_abs_sum == 0:
        # Nothing to scale: avoid dividing by zero.
        return {key: 0.0 for key in dictionary}

    return {key: abs(value) / total_abs_sum for key, value in dictionary.items()}

"""
Computes how much each segment contributes to the output of the model.
Each segment is separated by a [SEP] token.

Input:
  - df: dataframe to test on; ensure it contains the cols "combined_text" and "labels"
  - model: target fine-tuned model
  - col_names: list of column names to compute attributions for
  - tokenizer
  - (Optional) flag to normalize the attributions to sum to 1
Output: A dict containing a break down of the weightage each segment contributes to the output of the model.
"""
def compute_segment_attributions(df, model, tokenizer, col_names, normalize=False):
    """
    Sum the word attributions of each '[SEP]'-separated segment of every row.

    Input:
      - df: dataframe with the columns "combined_text" (segments joined by
        the literal '[SEP]', optionally prefixed with '[CLS]') and "labels"
        (the true class index). Its index must support `% 100`, because
        progress is reported on multiples of 100.
      - model: fine-tuned model handed to SequenceClassificationExplainer
      - tokenizer: tokenizer handed to the explainer and used to count the
        tokens of each segment
      - col_names: one output key per segment, in segment order
      - normalize: when True, each row's sums are passed through normalize_dict
    Output: a tuple (results, false_results), both keyed by row index.
      - results holds the per-segment sums for rows whose predicted class
        equals the label, and an empty dict for every other row
      - false_results holds the per-segment sums for the other rows only
    """
    cls_explainer = SequenceClassificationExplainer(model, tokenizer)
    results, false_results = {}, {}

    start_time = time.time()
    for index, row in df.iterrows():
        text = row['combined_text']
        true_label = row['labels']

        # Report (and restart) the timer on every index that is a multiple of 100.
        if index % 100 == 0:
            if PROGRESS_VERBOSE:
                print(f"Processing index {index}, Time: {time.time() - start_time}")
            start_time = time.time()

        # Sequence of (token, attribution) pairs for the whole input.
        word_attributions = cls_explainer(text)

        segments = text.split('[SEP]')
        segments[0] = segments[0].replace('[CLS]', '').strip()
        if VERBOSE:
            print(segments)

        attribution_sums = {}
        # Start after the first attribution entry.
        # NOTE(review): presumably the [CLS] the explainer adds itself -- confirm.
        token_index = 1

        for idx, segment in enumerate(segments):
            segment_tokens = tokenizer.tokenize(segment.strip())
            # The window is one entry wider than the segment's own tokens, to
            # cover the special-token entry next to it; the '[CLS]'/'[SEP]'
            # entries themselves are filtered out below.
            end_index = token_index + len(segment_tokens) + 1
            window = word_attributions[token_index:end_index]

            if VERBOSE:
                print(f"Attributes: {window}")

            segment_attributions = []
            for word, attr in window:
                if word not in ('[CLS]', '[SEP]'):
                    if VERBOSE:
                        print(f"Using the word {word}")
                    segment_attributions.append(attr)

            # NOTE(review): a row with more segments than col_names raises
            # IndexError here.
            attribution_sums[col_names[idx]] = sum(segment_attributions)
            token_index = end_index

        row_result = normalize_dict(attribution_sums) if normalize else attribution_sums

        if cls_explainer.predicted_class_index == true_label:
            results[index] = row_result
        else:
            # Keep a placeholder so every processed row has an entry in results.
            results[index] = {}
            false_results[index] = row_result

    return results, false_results


# combined_text layout: [CLS] input_1 [SEP] input_2 [SEP] input_3 -- one name per segment, in order.
col_names = ['Premise', 'Hypothesis', 'Explanation_1']
results, false_results = compute_segment_attributions(
    df=test_df,
    model=model,
    tokenizer=tokenizer,
    col_names=col_names,
    normalize=True,
)
print('Results Processed!')

Processing index 0, Time: 0.001895904541015625
[('[CLS]', 0.0), ('[CLS]', 0.06864141061919121), ('not', 0.15255894013968443), ('all', 0.9008015207734533), ('churches', 0.24129000456430919), ('have', 0.2345774864022904), ('cracks', 0.0496907296013012), ('in', 0.11631427748064567), ('the', 0.09463425266536953), ('ceiling', -0.14956423951850517), ('[SEP]', 0.0)]
Processing index 100, Time: 13.62921667098999
Processing index 200, Time: 11.370181322097778
Processing index 300, Time: 10.928741693496704
Processing index 400, Time: 10.72820496559143
Processing index 500, Time: 11.383729219436646
Processing index 600, Time: 11.641113758087158
Processing index 700, Time: 11.320428133010864
Processing index 800, Time: 11.603541612625122
Processing index 900, Time: 12.454941749572754
Processing index 1000, Time: 10.529061079025269
Processing index 1100, Time: 11.46530532836914
Processing index 1200, Time: 12.531065464019775
Processing index 1300, Time: 11.957156419754028
Processing index 1400, Tim

In [None]:
import csv

file_path = f"/content/drive/MyDrive/[CS4248] Project Folder/results/{model_type}.csv"
false_file_path = f"/content/drive/MyDrive/[CS4248] Project Folder/results/{model_type}_false.csv"

fieldnames = ['Index', 'Premise', 'Hypothesis', 'Explanation_1']


def _dump_attributions(path, rows_by_index):
    """Write a header plus one CSV row per entry of `rows_by_index`, with the key stored under 'Index'."""
    with open(path, 'w', newline='') as handle:
        out = csv.DictWriter(handle, fieldnames=fieldnames)
        out.writeheader()
        out.writerows({'Index': key, **values} for key, values in rows_by_index.items())


# Attributions of the correctly classified rows (other rows are written with empty cells).
_dump_attributions(file_path, results)
print(f"CSV file '{file_path}' has been created successfully.")

# Attributions of the misclassified rows.
_dump_attributions(false_file_path, false_results)
print(f"False CSV file '{false_file_path}' has been created successfully.")

## Post-process the saved attribution CSVs (average the rows in groups of 10)

In [13]:
# Reload the saved attributions, drop rows with missing cells, and tag each
# remaining row with its row label // 10. dropna does not renumber the rows,
# so a group can hold fewer than 10 rows.
results_df = (
    pd.read_csv(f'/content/drive/MyDrive/[CS4248] Project Folder/results/{model_type}.csv')
    .dropna()
    .assign(GroupIndex=lambda frame: frame.index // 10)
)
# Mean of every column (including 'Index') within each group.
averages_df = results_df.groupby('GroupIndex').mean()

averages_df.to_csv(f'/content/drive/MyDrive/[CS4248] Project Folder/results/{model_type}_processed.csv')

In [14]:
# Same post-processing for the misclassified rows: reload, drop rows with
# missing cells, and tag each remaining row with its row label // 10.
results_df = (
    pd.read_csv(f'/content/drive/MyDrive/[CS4248] Project Folder/results/{model_type}_false.csv')
    .dropna()
    .assign(GroupIndex=lambda frame: frame.index // 10)
)
# Mean of every column (including 'Index') within each group.
averages_df = results_df.groupby('GroupIndex').mean()

averages_df.to_csv(f'/content/drive/MyDrive/[CS4248] Project Folder/results/{model_type}_false_processed.csv')