In [1]:
%load_ext autoreload
%autoreload 2
import os
import os.path
import numpy as np
import sklearn
import spacy
import sys
from anchor import anchor_text
from myUtils import *
import pickle
import matplotlib.pyplot as plt
import pandas as pd
from simpletransformers.classification import ClassificationModel

In [2]:
# dataset from http://www.cs.cornell.edu/people/pabo/movie-review-data/
# Link: http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz
def load_polarity(path='sentiment-sentences'):
    """Load the sentence-polarity corpus as raw byte strings with integer labels.

    Reads ``rt-polarity.neg`` and ``rt-polarity.pos`` from ``path``. Every line
    that is valid UTF-8 is kept (whitespace-stripped, still as ``bytes``);
    lines that cannot be decoded are skipped.

    Args:
        path: Directory containing the two ``rt-polarity.*`` files.

    Returns:
        A ``(data, labels)`` pair of equal-length lists. ``data`` holds the
        stripped lines as ``bytes``; ``labels`` holds ``0`` for lines read from
        the ``.neg`` file and ``1`` for lines read from the ``.pos`` file.
    """
    data = []
    labels = []
    f_names = ['rt-polarity.neg', 'rt-polarity.pos']
    for label, f_name in enumerate(f_names):
        # Binary mode on purpose: undecodable lines are filtered out below
        # instead of aborting the whole read.
        with open(os.path.join(path, f_name), 'rb') as handle:
            for line in handle:
                try:
                    line.decode('utf8')
                except UnicodeDecodeError:
                    # Only decoding failures are skipped; any other error propagates.
                    continue
                data.append(line.strip())
                labels.append(label)
    return data, labels

In [3]:
# Load the spaCy 'en_core_web_sm' pipeline; it is passed to AnchorText when the explainer is built.
nlp = spacy.load('en_core_web_sm')

In [4]:
# Read the corpus, then hold out 20% of it for testing; the fixed seed keeps the split reproducible.
data, labels = load_polarity()
train, test, train_labels, test_labels = sklearn.model_selection.train_test_split(
    data, labels, test_size=.2, random_state=42
)
# Convert both label lists to NumPy arrays.
train_labels, test_labels = np.array(train_labels), np.array(test_labels)

In [5]:
# Decode the raw byte strings to text. Items that are already `str` are kept
# as they are, so re-running this cell does not raise AttributeError.
train = [example.decode('utf-8') if isinstance(example, bytes) else example for example in train]
test = [example.decode('utf-8') if isinstance(example, bytes) else example for example in test]

In [6]:
# Two-column frames ("text", "labels") built row-wise from the sentence/label pairs.
frame_columns = ["text", "labels"]
train_df = pd.DataFrame(list(zip(train, train_labels)), columns=frame_columns)

test_df = pd.DataFrame(list(zip(test, test_labels)), columns=frame_columns)

In [7]:
def get_and_train():
    """Build a 'squeezebert' ClassificationModel from 'squeezebert/squeezebert-uncased'.

    Returns:
        The freshly constructed ClassificationModel.

    NOTE(review): the ``train_model`` call below is commented out, so despite
    the function name the returned model is not fine-tuned here — confirm this
    is intended.
    """
    model_args = {
        'num_labels': 2,
        'num_train_epochs': 3,
        'weight_decay': 0.0002,
        'max_seq_length': 64,
    }
    model = ClassificationModel('squeezebert', 'squeezebert/squeezebert-uncased', args=model_args)
    #model.train_model(train_df)
    return model

In [8]:
# Toggle: True loads the model stored under 'outputs'; False builds one via get_and_train().
load_trained = False

In [None]:
# Either reload the model saved in 'outputs' or construct a new one, depending on load_trained.
if load_trained:
    model = ClassificationModel('squeezebert',  'outputs')
else:
    model = get_and_train()

# Runtime options set on the model's args object after construction.
model.args.silent = True
model.args.dynamic_quantize = True

Some weights of the model checkpoint at squeezebert/squeezebert-uncased were not used when initializing SqueezeBertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing SqueezeBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SqueezeBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of SqueezeBertForSequenceClassification were no

In [17]:
def predict_lr(texts):
    """Return the model's predictions for ``texts`` as a NumPy array.

    Only the first element of what ``model.predict`` returns is used; the
    remaining elements are discarded.
    """
    prediction_outputs = model.predict(texts)
    predicted_labels = prediction_outputs[0]
    return np.array(predicted_labels)

In [18]:
# Anchor explainer for text. The class names are ordered to match the labels from
# load_polarity (index 0 = 'negative', index 1 = 'positive').
explainer = anchor_text.AnchorText(nlp, ['negative', 'positive'], use_unk_distribution=False)

In [19]:
# Keep training sentences whose length is strictly between 20 and 70, then take
# positions 23..299 of that filtered list as the examples to explain.
anchor_examples = [example for example in train if 20 < len(example) < 70][23:300]

In [20]:
# Persist the held-out sentences and labels. The context managers guarantee the
# files are flushed and closed; the output directory is created if it is missing.
os.makedirs("results", exist_ok=True)
with open("results/text_test_deep.pickle", "wb") as out_file:
    pickle.dump(test, out_file)
with open("results/text_test_labels_deep.pickle", "wb") as out_file:
    pickle.dump(test_labels, out_file)

In [None]:
# TextUtils is only reachable through the star-import of myUtils; its arguments are positional here.
# NOTE(review): the last argument looks like the file the explanations are written to — confirm in myUtils.
my_utils = TextUtils(anchor_examples, test, explainer, predict_lr, "results/text_exps_bert.pickle")
# Request an explanation for every entry of anchor_examples, addressed by its position in the list.
explanations = my_utils.compute_explanations(list(range(len(anchor_examples))))

In [None]:
# Store the full list of explanations; the context manager flushes and closes the file.
with open("results/text_exps_bert_list.pickle", "wb") as out_file:
    pickle.dump(explanations, out_file)

## Loading Data

In [5]:
# SECURITY: pickle.load can execute arbitrary code from the file — only load pickles you created yourself.
# NOTE(review): these paths lack the "_deep"/"_bert" suffixes used where this notebook
# writes its own results — confirm the intended files are being loaded.
with open("results/text_test.pickle", "rb") as in_file:
    test = pickle.load(in_file)
with open("results/text_test_labels.pickle", "rb") as in_file:
    test_labels = pickle.load(in_file)
# Left open on purpose: the following cell reads the explanations from this handle one pickle at a time.
exps_file  = open( "results/text_exps.pickle", "rb" )
num_exps = 100  # how many explanation records to read from exps_file
explanations= []

In [None]:
# Read up to num_exps consecutive pickled explanations from the open file, then close it.
try:
    for _ in range(num_exps):
        explanations.append(pickle.load(exps_file))
except EOFError:
    # The file holds fewer than num_exps records; keep what was read and say so.
    print('Loaded %d of %d requested explanations' % (len(explanations), num_exps))
finally:
    exps_file.close()

In [None]:
# Pass the explanations through remove_duplicates, then order them ascending by test_cov
# so the entries with the largest test coverage end up at the tail of the list.
# NOTE(review): only `from myUtils import *` is visible in this notebook, so the name `myUtils`
# resolves only if that module itself exports it — confirm this cell runs on a fresh kernel.
explanations = sorted(
    myUtils.remove_duplicates(explanations),
    key=lambda explanation: explanation.test_cov,
)

In [None]:
# Report the ten explanations at the tail of the list, last one first.
best = explanations[-10:][::-1]

for explanation in best:
    print("------------------------")
    # Model prediction for the sentence this anchor belongs to.
    predicted = predict_lr([test[explanation.index]])[0]
    print('Prediction:', explainer.class_names[predicted])
    anchor_rule = ' AND '.join(explanation.names)
    print('Anchor: %s' % (anchor_rule))
    print('Precision: %.2f' % explanation.precision)
    print('Coverage: %.2f' % explanation.coverage)
    # Share of the covered test examples on which the model gives the same prediction.
    model_agreement = predict_lr(test[explanation.fit_examples]) == predicted
    print('Anchor test precision: %.2f' % (np.mean(model_agreement)))
    print('Anchor test coverage: %.2f' % (explanation.test_cov))
    # Share of the covered test examples whose stored label equals that prediction.
    label_agreement = test_labels[explanation.fit_examples] == predicted
    print('Anchor test REAL precision: %.2f' % np.mean(label_agreement))

In [None]:
# Keep the tail half of the list (ceil(n/2) entries, since -n // 2 floors toward negative
# infinity) and drop explanations that cover no test example.
exps = explanations[-len(explanations)//2:]
exps = [exp for exp in exps if len(exp.fit_examples)>0]
# Predict each anchor's own sentence once and reuse that label for both metrics,
# avoiding a duplicate model call per explanation.
exp_labels = [predict_lr([test[exp.index]])[0] for exp in exps]
# Agreement between the stored labels of the covered test examples and the anchor's prediction.
real_precisions = [np.mean(test_labels[exp.fit_examples] == exp_label)
                   for exp, exp_label in zip(exps, exp_labels)]
# Agreement between the model's predictions on the covered test examples and the anchor's prediction.
test_precisions = [np.mean(predict_lr(test[exp.fit_examples]) == exp_label)
                   for exp, exp_label in zip(exps, exp_labels)]

In [None]:
# Marker area grows with the explanation's position in exps. Sizes start at 1 because
# an area of 0 would leave the first explanation undrawn.
marker_sizes = np.arange(1, len(exps) + 1)
plt.scatter(test_precisions, real_precisions, s = marker_sizes, alpha = 0.9)
plt.xlabel('test precision')
plt.ylabel('real precision')
plt.title('text bert anchor experiment')
plt.savefig("results/deep_text.png")

In [None]:
# NOTE(review): this displays "results/text.png", not the "results/deep_text.png" file the
# scatter plot is saved to in this notebook — confirm which image is intended.
img = plt.imread("results/text.png")
plt.figure(figsize = (10,10))
plt.axis('off')  # hide the axes so only the image is shown
# Binding the return value to `_` keeps the image object's repr out of the cell output.
_ = plt.imshow(img)