In [1]:
%load_ext autoreload
%autoreload 2
import os
import os.path
import sklearn.model_selection
import sklearn.linear_model
import sklearn.ensemble
import spacy
import sys
from sklearn.feature_extraction.text import CountVectorizer
from anchor import anchor_text
from myUtils import *
import pickle
import matplotlib.pyplot as plt

In [2]:
# dataset from http://www.cs.cornell.edu/people/pabo/movie-review-data/
# Link: http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz
def load_polarity(path='sentiment-sentences'):
    data = []
    labels = []
    f_names = ['rt-polarity.neg', 'rt-polarity.pos']
    for (l, f) in enumerate(f_names):
        for line in open(os.path.join(path, f), 'rb'):
            try:
                line.decode('utf8')
            except:
                continue
            data.append(line.strip())
            labels.append(l)
    return data, labels

Note: you must have spacy installed. Run:

        pip install spacy && python -m spacy download en_core_web_sm

If you want to run BERT, you have to install transformers and torch or tf: 

        pip install torch transformers spacy && python -m spacy download en_core_web_sm
        

In [3]:
nlp = spacy.load('en_core_web_sm')

In [5]:
data, labels = load_polarity()
train, test, train_labels, test_labels = sklearn.model_selection.train_test_split(data, labels, test_size=.2, random_state=42)
train, val, train_labels, val_labels = sklearn.model_selection.train_test_split(train, train_labels, test_size=.1, random_state=42)
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)
val_labels = np.array(val_labels)
counter_test, counter_test_labels = TextUtils.counter_test()

In [6]:
vectorizer = CountVectorizer(min_df=1)
vectorizer.fit(train)
train_vectors = vectorizer.transform(train)
test_vectors = vectorizer.transform(test)
val_vectors = vectorizer.transform(val)
counter_test_vectors = vectorizer.transform(counter_test)

In [10]:
c = sklearn.linear_model.LogisticRegression()
# c = sklearn.ensemble.RandomForestClassifier(n_estimators=500, n_jobs=10)
c.fit(train_vectors, train_labels)
preds = c.predict(val_vectors)
print('Val accuracy', sklearn.metrics.accuracy_score(val_labels, preds))
def predict_lr(texts):
    return c.predict(vectorizer.transform(texts))

Val accuracy 0.7544910179640718


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Using BERT

In [11]:
explainer = anchor_text.AnchorText(nlp, ['negative', 'positive'], use_unk_distribution=False)

In [8]:
test = [example.decode('utf-8') for example in test]
anchor_examples = [example.decode('utf-8') for example in train]
anchor_examples = [example for example in anchor_examples if 20< len(example) < 70 and len(example)>20][:500]

In [9]:
pickle.dump( test, open( "results/text_test.pickle", "wb" ))
pickle.dump( test_labels, open( "results/text_test_labels.pickle", "wb" ))

In [None]:
my_utils = TextUtils(anchor_examples, counter_test, explainer, predict_lr)
#explanations = my_utils.compute_explanations(np.random.choice(len(test), exps_num))
explanations = my_utils.compute_explanations(list(range(0, len(anchor_examples))))

In [None]:
pickle.dump( explanations, open( "results/text_exps.pickle", "wb" ))

# Loading Results

In [10]:
explanations = pickle.load( open( "results/text_exps.pickle", "rb" ))
test = pickle.load( open( "results/text_test.pickle", "rb" ))
test = np.array(test)
test_labels = pickle.load( open( "results/text_test_labels.pickle", "rb" ))

In [135]:
myUtils =  TextUtils(test, test, None, None, "results/transformer_exps.pickle")
explanations = myUtils.remove_duplicates(explanations)

In [136]:
explanations = [ExtendedExplanation(exp, test, test_labels, predict_lr, explainer) for exp in explanations if len(exp.fit_examples) >0]

In [137]:
#explanations = [exp for exp in explanations if len(exp.fit_examples) > 10] 
explanations.sort(key=lambda exp: exp.test_precision)

In [138]:
best = explanations[-10:]
best.reverse()

for exp in best:
    print("------------------------")
    exp_label =  predict_lr([test[exp.index]])[0]
    print('Prediction:', explainer.class_names[exp_label])
    print('Anchor: %s' % (' AND '.join(exp.names)))
    print('Precision: %.2f' % exp.precision)
    print('Coverage: %.2f' % exp.coverage)
    print('Anchor test precision: %.2f' % (np.mean(predict_lr(test[exp.fit_examples]) == exp_label)))
    print('Anchor test coverage: %.2f' % (exp.test_cov))
    covered_labels = test_labels[exp.fit_examples]
    real_percentage = np.mean(covered_labels == exp_label)
    print('Anchor test REAL precision: %.2f' % real_percentage)

------------------------
Prediction: positive
Anchor: moving
Precision: 0.95
Coverage: 0.00
Anchor test precision: 1.00
Anchor test coverage: 0.01
Anchor test REAL precision: 0.79
------------------------
Prediction: positive
Anchor: ride
Precision: 0.97
Coverage: 0.00
Anchor test precision: 1.00
Anchor test coverage: 0.01
Anchor test REAL precision: 0.79
------------------------
Prediction: negative
Anchor: boring
Precision: 1.00
Coverage: 0.00
Anchor test precision: 1.00
Anchor test coverage: 0.01
Anchor test REAL precision: 0.92
------------------------
Prediction: negative
Anchor: exercise
Precision: 0.97
Coverage: 0.00
Anchor test precision: 1.00
Anchor test coverage: 0.00
Anchor test REAL precision: 1.00
------------------------
Prediction: positive
Anchor: rare AND and
Precision: 0.96
Coverage: 0.00
Anchor test precision: 1.00
Anchor test coverage: 0.00
Anchor test REAL precision: 1.00
------------------------
Prediction: positive
Anchor: refreshing
Precision: 0.98
Coverage: 0.0

In [139]:
len(explanations)

388

In [140]:
exps = explanations[-len(explanations)//2:]
exps = [exp for exp in exps if len(exp.fit_examples)>2]
real_precisions = [np.mean(test_labels[exp.fit_examples] == predict_lr([test[exp.index]])[0])
                   for exp in exps]
test_precisions = [np.mean(predict_lr(test[exp.fit_examples]) == predict_lr([test[exp.index]])[0]) 
                   for exp in exps]

In [141]:
plt.clf()
plt.scatter(test_precisions, real_precisions, s = range(len(exps)), alpha = 0.9)
plt.xlabel('predicted class precision')
plt.ylabel('real class precision')
plt.title('logistic regression')
plt.savefig("results/text.png")

In [142]:
img = plt.imread("results/text.png")
plt.figure(figsize = (5,5))
plt.axis('off')
_ = plt.imshow(img)