In [1]:
# Install required packages

!pip3 install -r ../requirements.txt
!pip3 install transquest
!pip3 install lime





In [2]:
import os
import sys
import numpy as np
import json
from tqdm import tqdm
from scipy.stats import pearsonr
from IPython.core.display import display, HTML

In [3]:
# Change this according to your setup.
# The value is a relative path, so it is resolved against the notebook's
# working directory. It is set before the cell that imports transquest/torch
# runs — presumably so the model download lands in this directory (confirm
# against the transformers documentation for your installed version).

os.environ['TRANSFORMERS_CACHE'] = 'cache'

In [4]:
# JSON file used to cache the explanations: it is written by explain_dataset()
# and re-loaded instead of recomputing when it already exists.
# NOTE(review): the name does not encode the language pair or split, so delete
# or rename the file when changing the settings below.
RESULTS_FNAME = 'results.json'
# Source / target language codes; they select both the data directory and the
# pretrained model name used further down.
SRC_LANG = 'ro'
TGT_LANG = 'en'
# Dataset split; used as directory name and as file-name prefix of the data files.
SPLIT = 'dev'

## Load dataset

In [5]:
def _read_lines(path):
    """Return the lines of the text file at ``path`` with surrounding whitespace stripped.

    The file is opened inside a context manager so the handle is closed as soon
    as reading finishes, and it is decoded explicitly as UTF-8 so that the
    result does not depend on the platform's default encoding (the source
    sentences contain non-ASCII characters).
    """
    with open(path, encoding='utf-8') as fh:
        return [line.strip() for line in fh]


# All files of one split live in the same directory and share the split name as prefix.
data_dir = f'../data/{SPLIT}/{SRC_LANG}-{TGT_LANG}-{SPLIT}'
# Source sentences and their machine translations, one per line.
src = _read_lines(f'{data_dir}/{SPLIT}.src')
tgt = _read_lines(f'{data_dir}/{SPLIT}.mt')
# Word-level labels: whitespace-separated integers, one sequence per line.
wor = [list(map(int, line.split())) for line in _read_lines(f'{data_dir}/{SPLIT}.tgt-tags')]
# Sentence-level scores: one float per line.
sen = [float(line) for line in _read_lines(f'{data_dir}/{SPLIT}.da')]
# The four files must be line-aligned; a mismatch means the data is corrupt.
assert len(src) == len(tgt) == len(wor) == len(sen)
dataset = {'src': src, 'tgt': tgt, 'word_labels': wor, 'sent_labels': sen}

## Load Sentence-Level Model

In [6]:
import torch
from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel

# Instantiate the pretrained sentence-level model for the configured language
# pair; it runs on the GPU only when torch reports that CUDA is available.
model = MonoTransQuestModel(
    'xlmroberta',
    f'TransQuest/monotransquest-da-{SRC_LANG}_{TGT_LANG}-wiki',
    num_labels=1,
    use_cuda=torch.cuda.is_available(),
)

# Score every [source, translation] pair of the dataset and report the Pearson
# correlation between the predictions and the sentence-level labels.
predictions, raw_outputs = model.predict(
    [[source, target] for source, target in zip(dataset['src'], dataset['tgt'])]
)
print(pearsonr(predictions, dataset['sent_labels']))

  '"sox" backend is being deprecated. '
  f"use_multiprocessing automatically disabled as {model_type}"


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

(-1.0, 1.0)


## LIME Explanation

In [7]:
from lime.lime_text import LimeTextExplainer

# Shared LIME explainer.
# - class_names: two identical names, because the prediction wrapper used by
#   explain_instance() duplicates the single model score into two columns.
# - bow=False / split_expression=' ': per LIME's documentation, tokens are
#   obtained by splitting on single spaces and each position is perturbed
#   separately rather than as a bag of words.
# - random_state: fixed seed so that the sampled perturbations, and therefore
#   the explanations written to the results cache, are reproducible across runs.
explainer = LimeTextExplainer(
    class_names=['score', 'score'],
    bow=False,
    split_expression=' ',
    random_state=42,
)

def explain_instance(model, text_a, text_b):
    """Score one sentence pair and explain the score over the tokens of ``text_b``.

    Args:
        model: object whose ``predict`` takes a list of ``[text_a, text_b]``
            pairs and returns a ``(predictions, raw_outputs)`` tuple.
        text_a: first text of the pair; kept fixed while ``text_b`` is perturbed.
        text_b: second text of the pair; the text that is explained.

    Returns:
        A tuple ``(predictions, explanation_map)``: the model's predictions for
        the unperturbed pair, and the result of ``as_map()`` on the explanation
        produced by the module-level ``explainer`` for label index 1.
    """
    def predict_proba(texts):
        """Score each perturbed text against ``text_a`` as a two-column array."""
        pairs = [[text_a, candidate] for candidate in texts]
        scores, _ = model.predict(pairs)
        # The explainer is configured with two classes, so the single score is
        # repeated to give an array of shape (len(texts), 2).
        return np.vstack((scores, scores)).T

    predictions, _ = model.predict([[text_a, text_b]])
    # Ask for one feature per whitespace-separated token of the explained text.
    n_tokens = len(text_b.split())
    explanation = explainer.explain_instance(
        text_b,
        predict_proba,
        num_features=n_tokens,
        labels=(1, ),
    )
    return predictions, explanation.as_map()

In [15]:
def explain_dataset():
    """Explain every sentence pair of ``dataset`` and cache the results as JSON.

    For each index, ``explain_instance`` is called on the source/target pair
    and the per-token weights for label 1 are written, negated, into a dense
    vector with one entry per whitespace-separated target token.
    NOTE(review): the negation presumably makes tokens that lower the predicted
    score receive high values, to line up with the word-level labels — confirm.

    Returns:
        A list with one dict per example, holding the keys ``'pred'`` (float
        prediction), ``'expl'`` (list of floats, one per target token),
        ``'ground_truth_word'`` and ``'ground_truth_sent'``. The same list is
        written to ``RESULTS_FNAME``.

    Raises:
        AssertionError: if the number of explained features differs from the
            number of target tokens.
    """
    results = []
    for idx in tqdm(range(len(dataset['src']))):
        tgt_text = dataset['tgt'][idx]
        pred_score, expl = explain_instance(model, dataset['src'][idx], tgt_text)
        expl = expl[1]
        # Count tokens with split(), exactly like explain_instance() does when
        # it sets num_features; split(' ') would count empty strings between
        # repeated spaces and make this check fail spuriously.
        n_tokens = len(tgt_text.split())
        assert len(expl) == n_tokens, (
            f'example {idx}: got {len(expl)} feature weights for {n_tokens} target tokens'
        )
        feature_maps = np.zeros(n_tokens)
        for k, v in expl:
            feature_maps[k] = -v
        results.append(
            {
                'pred': float(pred_score),
                # tolist() yields plain Python floats, i.e. the same types one
                # gets when the results are re-loaded from the JSON cache.
                'expl': feature_maps.tolist(),
                'ground_truth_word': dataset['word_labels'][idx],
                'ground_truth_sent': dataset['sent_labels'][idx],
            }
        )
    # The context manager guarantees the cache file is flushed and closed.
    with open(RESULTS_FNAME, 'w') as fh:
        json.dump(results, fh)
    return results
              

# Computing the explanations is expensive, so they are cached on disk and only
# recomputed when the cache file is missing.
# NOTE(review): the cache is looked up by file name only; remove the file after
# changing the language pair, split or model, otherwise stale results are used.
if not os.path.exists(RESULTS_FNAME):
    results = explain_dataset()
else:
    # Context manager so the file handle is closed right after loading.
    with open(RESULTS_FNAME) as fh:
        results = json.load(fh)

## Evaluate LIME Explanations

In [16]:
# Put the parent directory of the working directory on the import path (once),
# so that modules living next to the notebook folder can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from scripts.evaluate import evaluate_word_level

In [17]:
# Split the result records into two parallel lists: the word-level labels and
# the LIME-derived per-token scores of each example.
gold_expls = [record['ground_truth_word'] for record in results]
model_expls = [record['expl'] for record in results]

# Compare the scores with the labels; the recorded output of this cell reports
# AUC, AP and recall-at-top-K.
evaluate_word_level(gold_expls, model_expls)

AUC score: 0.353
AP score: 0.300
Recall at top-K: 0.310


## Visualizing the results

In [18]:
def colorize_twoway(words, color_array, max_width_shown=600):
    """Render ``words`` as an HTML snippet with one coloured box per word.

    Args:
        words: sequence of words to show.
        color_array: sequence of numbers, one per word, expected in [-1, 1].
            Positive values give a red background, all other values a blue
            one; the absolute value is used as the opacity of the colour.
        max_width_shown: width in pixels of the enclosing ``<div>``.

    Returns:
        A string holding a ``<div>`` with one ``<span>`` per word.
    """
    # Local import keeps the function self-contained in this cell.
    from html import escape

    span = (
        '<span class="barcode" style="color: black; background-color: rgba({rgb}, {alpha}); '
        'display:inline-block;">&nbsp;{word}&nbsp;</span>'
    )
    pieces = []
    for word, color in zip(words, color_array):
        rgb = '255, 0, 0' if color > 0 else '0, 0, 255'
        # Words come from raw text and may contain '<', '>' or '&'; they are
        # escaped so they are displayed literally instead of being parsed as
        # markup. abs() also avoids emitting "-0.0" as opacity for zero scores.
        pieces.append(span.format(rgb=rgb, alpha=abs(color), word=escape(str(word))))
    return '<div style="width:%dpx">' % max_width_shown + ''.join(pieces) + '</div>'

In [19]:
def display_example(idx):
    """Print one example and show its word-level labels and LIME scores as HTML.

    Reads the cached record ``results[idx]`` and the matching entries of
    ``dataset``, prints source, target, ground-truth and predicted sentence
    score, and renders the target tokens twice with ``colorize_twoway``: once
    coloured by the ground-truth word labels and once by the LIME scores
    rescaled so that the largest absolute score becomes 1.

    Args:
        idx: position of the example in ``results`` / ``dataset``.
    """
    res = results[idx]
    score, expl = res['pred'], res['expl']
    src_txt, tgt_txt = dataset['src'][idx], dataset['tgt'][idx]
    gt_label, gt_words = dataset['sent_labels'][idx], dataset['word_labels'][idx]

    expl_arr = np.asarray(expl, dtype=float)
    # Rescale to [-1, 1]. When every score is zero (or there are no scores)
    # there is nothing to rescale; dividing anyway would produce NaN colours
    # (or raise on an empty list), so the values are used unchanged.
    peak = np.abs(expl_arr).max() if expl_arr.size else 0.0
    expl_adjusted = expl_arr / peak if peak > 0 else expl_arr

    tokens = tgt_txt.split()
    print(f'Example id: {idx}')
    print(f'Source: {src_txt}')
    print(f'Target: {tgt_txt}')
    print(f'Ground truth: {gt_label}')
    print(f'Predicted: {score}')
    print('Ground truth annotation')
    display(HTML(colorize_twoway(tokens, gt_words)))
    print('LIME scores')
    display(HTML(colorize_twoway(tokens, expl_adjusted)))
    print('-'*50)

In [20]:
# Render a single example; change `idx` to inspect a different sentence pair.
idx = 1
display_example(idx)

Example id: 1
Source: Dischetele sunt numite dischete de 4 MB , referitor la capacitatea lor neformatată .
Target: The floor coverings are called 4 MB disks related to their unformatable capacity .
Ground truth: 34.833333333333336
Predicted: 0.556640625
Ground truth annotation


LIME scores


--------------------------------------------------
