# Integrated Gradients analyses

## Set-up

In [1]:
from collections import defaultdict
import json
import numpy as np
import os
import pandas as pd

from feature_importance import IntegratedGradients

In [2]:
# Relative path of the JSON batch file analysed below (one directory level up).
batch_basename = "ig-embeddings-batch01.json"
src_filename = os.path.join("..", batch_basename)

In [3]:
# JSON text is UTF-8; pin the encoding rather than relying on the platform's
# locale default so the file decodes identically on every machine.
with open(src_filename, encoding="utf-8") as f:
    data = json.load(f)

In [4]:
# Number of top-level records in the loaded JSON.
len(data)

20000

In [5]:
# Display the first three records via the project's IntegratedGradients helper.
IntegratedGradients.visualize(data[: 3])

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
contradiction,contradiction (1.00),,10.23,[CLS] emptystring some soft nurse does not emptystring carries emptystring every mexican helmet [SEP] emptystring every soft nurse does not randomly carries not every emptystring helmet [SEP]
,,,,
entailment,entailment (1.00),,9.79,[CLS] emptystring some quiet robber does not lightly sees not every emptystring television [SEP] not every quiet robber emptystring emptystring lightly sees not every dull television [SEP]
,,,,
neutral,neutral (1.00),,-1.05,[CLS] emptystring some polite smith does not emptystring passes emptystring no mongolian carpet [SEP] not every polite smith emptystring emptystring frequently passes emptystring some mongolian carpet [SEP]
,,,,


In [6]:
# Inspect the field names of the first record.
data[0].keys()

dict_keys(['attr_class', 'attr_score', 'convergence_score', 'input_ids', 'pred_class', 'pred_probs', 'raw_input', 'true_class', 'word_attributions'])

## Class distribution for the sample

In [7]:
# Count how many records fall under each true class.
pd.Series(
    [example['true_class'] for example in data]
).value_counts()

entailment       6714
contradiction    6698
neutral          6588
dtype: int64

## Gather token-level data

In [8]:
# Flatten every record into one row per token: the token text, its index in
# the sequence, its attribution score, and the record's true class.
tok_level_data = [
    {'token': tok, 'position': i, 'score': score, 'true_class': d['true_class']}
    for d in data
    for i, (tok, score) in enumerate(zip(d['raw_input'], d['word_attributions']))
]

tok_df = pd.DataFrame(tok_level_data)

In [9]:
# Preview the first rows of the token-level frame.
tok_df.head()

Unnamed: 0,token,position,score,true_class
0,[CLS],0,-0.022728,contradiction
1,emptystring,1,-0.070899,contradiction
2,some,2,0.073817,contradiction
3,soft,3,-0.05105,contradiction
4,nurse,4,-0.062294,contradiction


## Which tokens have the highest average score?

In [10]:
# Mean attribution score per token. Selecting the column and using the
# built-in aggregation is vectorised and avoids routing each group's whole
# frame through a Python lambda via `apply`.
# The result is a Series indexed by token and named 'score'.
tok_mu = tok_df.groupby('token')['score'].mean()

tok_mu.sort_values(ascending=False).head(20)

token
some       0.137986
no         0.111057
not        0.059782
thanks     0.057558
likes      0.054528
passes     0.045743
smells     0.044833
[SEP]      0.031804
chooses    0.025257
admits     0.024749
boasts     0.020690
films      0.019855
sends      0.008490
every      0.007833
asks       0.007785
offers     0.004550
bird       0.003863
finds     -0.001112
hears     -0.001921
misses    -0.002526
dtype: float64

## Which tokens have the highest average score by class?

In [11]:
# Mean attribution score for every (token, true_class) pair, reshaped to one
# row per token and one column per class. A single grouped aggregation always
# yields a DataFrame (NaN where a token never occurs with a class), whereas
# nested `apply` calls can collapse to a stacked Series when some token is
# missing from a class, which would break the column-wise sorts that follow.
tok_cls_mu = (
    tok_df
    .groupby(['token', 'true_class'])['score']
    .mean()
    .unstack('true_class')
)

In [12]:
# Ten tokens with the highest mean score in the 'contradiction' column.
tok_cls_mu.sort_values('contradiction', ascending=False).head(10)

true_class,contradiction,entailment,neutral
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
some,0.192145,0.196716,0.036903
no,0.131191,0.187325,0.006676
likes,0.127036,0.023303,0.027088
thanks,0.101049,0.064681,0.019483
smells,0.079022,0.030987,0.025416
films,0.067858,0.009308,-0.011701
not,0.064272,0.112694,0.00231
boasts,0.059288,0.005235,-0.001974
admits,0.041505,0.037796,-0.004841
every,0.035533,0.025916,-0.038922


In [13]:
# Ten tokens with the highest mean score in the 'entailment' column.
tok_cls_mu.sort_values('entailment', ascending=False).head(10)

true_class,contradiction,entailment,neutral
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
some,0.192145,0.196716,0.036903
no,0.131191,0.187325,0.006676
not,0.064272,0.112694,0.00231
thanks,0.101049,0.064681,0.019483
passes,0.032713,0.059448,0.043415
admits,0.041505,0.037796,-0.004841
smells,0.079022,0.030987,0.025416
every,0.035533,0.025916,-0.038922
chooses,0.02917,0.024591,0.021147
likes,0.127036,0.023303,0.027088


In [14]:
# Ten tokens with the highest mean score in the 'neutral' column.
tok_cls_mu.sort_values('neutral', ascending=False).head(10)

true_class,contradiction,entailment,neutral
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
[SEP],-0.029788,-0.022754,0.150028
safely,-0.061791,-0.083912,0.119294
rapidly,-0.078072,-0.090816,0.113432
occasionally,-0.072438,-0.077781,0.093891
door,-0.046942,-0.06186,0.092781
carefully,-0.069043,-0.093944,0.090341
badly,-0.070255,-0.08929,0.090143
quickly,-0.073228,-0.089163,0.089204
white,-0.073256,-0.093553,0.088619
freely,-0.06549,-0.095578,0.088147


## Which positions have the highest average score?

In [15]:
# Mean attribution score per token position. Selecting the column and using
# the built-in aggregation is vectorised and avoids routing each group's whole
# frame through a Python lambda via `apply`.
# The result is a Series indexed by position and named 'score'.
pos_mu = tok_df.groupby('position')['score'].mean()

pos_mu.sort_values(ascending=False).head(20)

position
10    0.105636
23    0.102519
26    0.088942
2     0.045139
1     0.030606
11    0.023826
5     0.012824
15    0.008798
14    0.008029
3     0.005109
9    -0.003561
7    -0.003574
6    -0.004453
8    -0.013069
16   -0.016245
24   -0.023034
19   -0.024039
13   -0.025333
21   -0.025647
12   -0.026046
dtype: float64

In [16]:
# Use a specific example to get a sense of what these positions are:

list(enumerate(data[0]['raw_input']))

[(0, '[CLS]'),
 (1, 'emptystring'),
 (2, 'some'),
 (3, 'soft'),
 (4, 'nurse'),
 (5, 'does'),
 (6, 'not'),
 (7, 'emptystring'),
 (8, 'carries'),
 (9, 'emptystring'),
 (10, 'every'),
 (11, 'mexican'),
 (12, 'helmet'),
 (13, '[SEP]'),
 (14, 'emptystring'),
 (15, 'every'),
 (16, 'soft'),
 (17, 'nurse'),
 (18, 'does'),
 (19, 'not'),
 (20, 'randomly'),
 (21, 'carries'),
 (22, 'not'),
 (23, 'every'),
 (24, 'emptystring'),
 (25, 'helmet'),
 (26, '[SEP]')]