In [1]:
# From: https://github.com/marcotcr/checklist/blob/master/notebooks/tutorials/3.%20Test%20types,%20expectation%20functions,%20running%20tests.ipynb

In [2]:
import spacy
# Load the spaCy pipeline named "en_core_web_sm"; `nlp` is used later to parse the
# example sentences into spaCy documents.
nlp = spacy.load("en_core_web_sm")

In [3]:

# Seven short movie-review sentences used as the base examples for every test below.
dataset = [
    'This was a very nice movie directed by John Smith.',
    'Mary Keen was brilliant.',
    'I hated everything about this.',
    'This movie was very bad.',
    'I really liked this movie.',
    'just bad.',
    'amazing.',
]

# The same sentences parsed by spaCy, in the same order as `dataset`.
pdataset = [doc for doc in nlp.pipe(dataset)]

In [5]:
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb
from checklist.test_types import MFT, INV, DIR

# Invariance Test

In [15]:
# Disabled alternative: perturb the spaCy-parsed `pdataset` with Perturb.change_names
# (presumably swaps person names -- confirm in the checklist docs) instead of typos.
# t = Perturb.perturb(pdataset, Perturb.change_names)

import numpy as np

# add_typos picks its edit positions at random, so without a fixed seed the perturbed
# sentences -- and therefore the failure counts reported by test.summary() -- change on
# every run. Seed once here so Restart & Run All reproduces the same examples.
# NOTE(review): assumes checklist draws from NumPy's global RNG -- confirm for the
# installed checklist version.
np.random.seed(0)

t = Perturb.perturb(dataset, Perturb.add_typos)

# Each entry of t.data holds one test case: the original sentence followed by its
# perturbed version(s). Show the first case as a quick sanity check.
print('\n'.join(t.data[0][:3]))
print('...')

# Invariance test over the original/perturbed groups built above.
test = INV(**t)

This was a very nice movie directed by John Smith.
This was a very nice movie directed byJ ohn Smith.
...


In [16]:
from pattern.en import sentiment

In [17]:

import numpy as np
def predict_proba(inputs):
    """Turn `sentiment` scores into a two-column probability-style array.

    For every text in `inputs`, the first element returned by `sentiment(text)`
    is mapped through ``(value + 1) / 2`` and stored in the second column
    (label 1); the first column (label 0) holds its complement.

    Returns:
        Array with one row per input and two columns, ``[1 - p, p]``.
    """
    polarity = np.array([sentiment(text)[0] for text in inputs])
    positive = (polarity + 1) / 2.
    # 1-D inputs become columns: column 0 is the complement, column 1 the score.
    return np.column_stack((1 - positive, positive))

In [18]:
# Sanity check: the scores come from pattern's `sentiment` function (they are not
# random), so 'good' should lean towards label 1 and 'bad' towards label 0.
predict_proba(['good', 'bad'])

array([[0.15, 0.85],
       [0.85, 0.15]])

In [19]:
from checklist.pred_wrapper import PredictorWrapper
# Wrap predict_proba so it can be passed to test.run() below.
# NOTE(review): wrap_softmax presumably converts the probability matrix into the
# (predictions, confidences) form checklist tests consume -- confirm in checklist docs.
wrapped_pp = PredictorWrapper.wrap_softmax(predict_proba)

In [20]:
# Run the invariance test with the wrapped predictor (14 examples are predicted for
# the 7 test cases), then print the failure rate and a few failing examples.
test.run(wrapped_pp)
test.summary()

Predicting 14 examples
Test cases:      7
Fails (rate):    2 (28.6%)

Example fails:
0.9 Mary Keen was brilliant.
0.5 Mary Keen was brillinat.

----
0.8 amazing.
0.5 maazing.

----


In [21]:
# Display the TestSummarizer widget for the invariance test results.
test.visual_summary()

TestSummarizer(stats={'npassed': 5, 'nfailed': 2, 'nfiltered': 0}, summarizer={'name': None, 'description': No…

# Direction Test

In [22]:
def add_negative(x):
    """Return three variants of `x`, each with a negative phrase appended.

    Every variant is `x`, a single space, and one of the fixed phrases below,
    in the order the phrases are listed.
    """
    negative_suffixes = (
        'Anyway, I thought it was bad.',
        'Having said this, I hated it',
        'The director should be fired.',
    )
    augmented = []
    for suffix in negative_suffixes:
        augmented.append('{} {}'.format(x, suffix))
    return augmented

In [23]:
# Show the first sentence next to the three negative variants generated from it.
dataset[0], add_negative(dataset[0])

('This was a very nice movie directed by John Smith.',
 ['This was a very nice movie directed by John Smith. Anyway, I thought it was bad.',
  'This was a very nice movie directed by John Smith. Having said this, I hated it',
  'This was a very nice movie directed by John Smith. The director should be fired.'])

In [24]:
from checklist.expect import Expect

In [25]:
# Expectation for the directional test: the score for label 1 should go down (not up)
# once a negative phrase is appended.
# NOTE(review): tolerance=0.1 presumably allows the score to rise by up to 0.1 before
# a case counts as a failure -- confirm against checklist's Expect.monotonic docs.
monotonic_decreasing = Expect.monotonic(label=1, increasing=False, tolerance=0.1)

In [26]:
# Apply add_negative to every sentence in `dataset`, producing the negative variants
# that the directional test compares against each original.
t = Perturb.perturb(dataset, add_negative)
# Directional test: each case is checked against the monotonic_decreasing expectation.
test = DIR(**t, expect=monotonic_decreasing)

In [27]:
# Run the directional test with the wrapped predictor (28 examples are predicted for
# the 7 test cases), then print how many cases remain after filtering and how many fail.
test.run(wrapped_pp)
test.summary()

Predicting 28 examples
Test cases:      7
After filtering: 6 (85.7%)
Fails (rate):    0 (0.0%)


In [28]:
# Display the TestSummarizer widget for the directional test results.
test.visual_summary()

TestSummarizer(stats={'npassed': 6, 'nfailed': 0, 'nfiltered': 0}, summarizer={'name': None, 'description': No…