In [7]:
import spacy
import itertools
import sys
import sys
sys.path.insert(0, "../")

import checklist_fork.checklist.editor
import checklist_fork.checklist.text_generation
from checklist_fork.checklist.test_types import MFT, INV, DIR
from checklist_fork.checklist.expect import Expect
from checklist_fork.checklist.test_suite import TestSuite
import numpy as np
import spacy
from checklist_fork.checklist.perturb import Perturb

editor = checklist_fork.checklist.editor.Editor()


### Syntax for suggestions

In [8]:
professions = editor.suggest('{first_name} works as {a:mask}.')[:30]
print(', '.join(professions))

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


journalist, historian, secretary, nurse, waitress, accountant, engineer, attorney, artist, editor, architect, model, interpreter, escort, analyst, actor, actress, assistant, intern, economist, organizer, author, investigator, agent, administrator, executive, educator, investor, DJ, entrepreneur


### Syntax for sampling different instances of the same category

In [8]:
t = editor.template(
    'Is that {first_name} or {first_name} on the bus?',
    remove_duplicates=False, 
    nsamples=4)
print(t.data)

t = editor.template(
    'Is that {first_name1} or {first_name2} on the bus?',
    remove_duplicates=False, 
    nsamples=4)
print(t.data)

['Is that Sarah or Sarah on the bus?', 'Is that Julia or Julia on the bus?', 'Is that Rebecca or Rebecca on the bus?', 'Is that Peter or Peter on the bus?']
['Is that Robert or Frances on the bus?', 'Is that Eleanor or Mary on the bus?', 'Is that Victoria or Don on the bus?', 'Is that Harriet or Stephen on the bus?']


### Getting synonyms/antonyms

Can also get hypernyms, hyponyms and related words (either hyponym and hypernym)

In [17]:
a = "knowledgeable"

e = editor.synonyms('How can I become {moreless} %s?' % a, a, moreless=['more', 'less'])
print("Synonyms:", [a] + e)

e = editor.antonyms('How can I become {moreless} %s?' % a, a, moreless=['more', 'less'])
print("Antonyms:", [a] + e)

a = "brave"
e = editor.antonyms('How can I become {moreless} %s?' % a, a, moreless=['more', 'less'])
print("Antonyms:", [a] + e)

Synonyms: ['knowledgeable', 'learned', 'intimate']
Antonyms: ['knowledgeable']
Antonyms: ['brave', 'cautious', 'fearful', 'timid']


### Perturbation example: replacing synonyms

(Not working example)

In [20]:
import re
def replace_pairs(pairs):
    def replace_z(text):
        ret = []
        for x, y in pairs:
            t = re.sub(r'\b%s\b' % x, y, text )
            if t != text:
                ret.append(t)
            if y == 'smart':
                continue
            t = re.sub(r'\b%s\b' % y, x, text )
            if t != text:
                ret.append(t)
        return list(set(ret))
    return replace_z

# pairing is for QQP -- each input is a pair of questions
def apply_and_pair(fn):
    def ret_fn(text):
        ret = fn(text)
        return [(text, r) for r in ret]
    return ret_fn


name = '(question, f(question)) where f(question) replaces synonyms?' 
desc = 'Expect 1, should be easy because it\'s individual word changes'
t = Perturb.perturb(list(all_questions), apply_and_pair(replace_pairs(synonyms)), nsamples=1000, keep_original=False)
test = INV(t.data, threshold=0.1, name=name, description=desc, capability='Taxonomy')

NameError: name 'all_questions' is not defined

### Syntax: more elaborate slots processing in templates

In [61]:
antonyms = []
for a in ["brave", "stupid"]:
    antonyms.append(
        [a] + editor.antonyms('How can I become {moreless} %s?' % a, a, moreless=['more', 'less']))
print(antonyms)

t = editor.template(
    [(
    'How can I become more {x[0]}?',
    'How can I become less {x[0]}?',
    ),
    (
    'How can I become less {x[1]}?',
    'How can I become more {x[1]}?',
    )],
    unroll=True, # remove the inner lists
    x=antonyms,
    remove_duplicates=True, 
    nsamples=4)

print(t.data)


[['brave', 'fearful', 'cautious', 'timid'], ['stupid', 'smart', 'intelligent']]
[('How can I become more stupid?', 'How can I become less stupid?'), ('How can I become less smart?', 'How can I become more smart?'), ('How can I become more brave?', 'How can I become less brave?'), ('How can I become less fearful?', 'How can I become more fearful?'), ('How can I become more stupid?', 'How can I become less stupid?'), ('How can I become less smart?', 'How can I become more smart?'), ('How can I become more stupid?', 'How can I become less stupid?'), ('How can I become less smart?', 'How can I become more smart?')]
data


In [62]:
import munch
order = ['size', 'shape', 'age', 'color']
props = []
properties = {
    'color' : ['red', 'blue','yellow', 'green', 'pink', 'white', 'black', 'orange', 'grey', 'purple', 'brown'],
    'size' : ['big', 'small', 'tiny', 'enormous'],
    'age' : ['old', 'new'],
    'shape' : ['round', 'oval', 'square', 'triangular'],
    'material' : ['iron', 'wooden', 'ceramic', 'glass', 'stone']
}
for i in range(len(order)):
    for j in range(i + 1, len(order)):
        p1, p2 = order[i], order[j]
        for v1, v2 in itertools.product(properties[p1], properties[p2]):
            props.append(munch.Munch({
                'p1': p1,
                'p2': p2,
                'v1': v1,
                'v2': v2,
            }))
objects = ['box', 'clock', 'table', 'object', 'toy', 'painting', 'sculpture', 'thing', 'figure']


t = editor.template(
    'There is a {a:p.v1} ({p.p1}) {p.v2} ({p.p2}) {obj} in the room.',
    obj=objects,
    p=props,
    remove_duplicates=True,
    nsamples=4,
    save=True
)

print(t.data)

['There is a a square (shape) yellow (color) sculpture in the room.', 'There is a a tiny (size) red (color) clock in the room.', 'There is a a square (shape) white (color) table in the room.', 'There is a an oval (shape) yellow (color) clock in the room.']


### Syntax: to consider only certain examples in the test (using Expect)

E.g. below takes all questions that were considered the same in QQP and changes the names (so that they don't match anymore). It expects that predictions 1 change to 0.
(not working example)

Q: But what happens to the questions in which there is no name? For those the expectation also changes from 1 to 0 (incorrectly)? NO. The perturbation doesn't return sentences for which the perturbation doesn't apply.


In [33]:
# t = Perturb.perturb(parsed_qs, wrap_apply_to_each(Perturb.change_names), nsamples=1500, first_only=True)
t = Perturb.perturb(parsed_qs, change_each_wrapper(Perturb.change_names), nsamples=500, first_only=True)
expect_fn = Expect.eq(0)
expect_fn = Expect.slice_orig(expect_fn, lambda orig, *args: orig == 1)
name = 'Change first name in one of the questions'
desc = 'Take pairs that are originally predicted as duplicates, change first name in one of them and expect new prediction to be non-duplicate'
test = DIR(**t, expect=expect_fn, name=name, description=desc, capability='NER')
# test.run(new_pp)
# test.summary(3)
suite.add(test)

NameError: name 'parsed_qs' is not defined

### Example of elaborate expectation function

(Not a working example)

In [None]:
import collections
def extract_unknown_implications(pairs, labels):
    graph = collections.defaultdict(lambda: set())
    ls = {}
    for x, y in zip(pairs, labels):
        graph[x[0]].add(x[1])
        graph[x[1]].add(x[0])
        t = tuple(sorted(x))
        ls[t] = y

    d = []
    l = []
    for x in graph:
        if len(graph[x]) == 1:
            continue
        for y in graph[x]:
            t = tuple(sorted((x, y)))
    #         print(t, ls[t])
        new = list(set([tuple(sorted(a)) for a in itertools.product(list(graph[x]), list(graph[x])) if a[0] != a[1]]))
        new = [a for a in new if a not in ls]
        for b, c in new:
            t1 = tuple(sorted((x, b)))
            t2 = tuple(sorted((x, c)))
            l1 = ls[t1]
            l2 = ls[t2]
            if l1 + l2 == 2:
                l3 = 1
            elif l1 + l2 == 1:
                l3 = 0
            else:
                continue
            new_x = [(x, b), (x, c), (b, c)]
            new_l = np.array([l1, l2, l3])
            d.append(new_x)
            l.append(new_l)
    return d, l

data, ls = extract_unknown_implications(qs, labels)


def expect_triplet(xs, preds, confs, labels, meta=None):
    if (preds[0] + preds[1]) == 2:
        if preds[2] != 1:
            return np.array([-3, -2, -1])
        else:
            return np.array([True, True, True])
    if (preds[0] + preds[1] == 1) and preds[1] != 0:
        if preds[1] != 0:
            return np.array([-3, -2, -1])
        else:
            return np.array([True, True, True])
    return None
#     if preds[0] != labels[0] or preds[1] != labels[1]:
#         return None
#     if preds[2] == labels[2]:
#         return np.array([True, True, True])
#     else:
#         return np.array([-3, -2, -1])
expect = Expect.testcase(expect_triplet)

name = 'Testing implications'
desc = 'f(x, a) = 1 and f(x, b) = 1 => f(a, b) = 1\nf(x, a) = 1 and f(x, b) = 0 => f(a, b) = 0\n Only used (x, a, b) such that (x, a) and (x, b) in val dataset and (a, b) is not.\n Expectation function filters out examples where f(x, a) or f(x, b) are incorrect'
test = DIR(data, expect, labels=ls, name=name, description=desc, capability='Logic')
# test.run(new_pp)
# test.summary(n=3)
suite.add(test)

Another expect example

In [None]:
# the prediction can go up, but shouldn't do down
monotonic_label = Expect.monotonic(increasing=True, tolerance=0.1)
non_neutral_pred = lambda pred, *args, **kwargs: pred != 1
monotonic_label = Expect.slice_pairwise(monotonic_label, non_neutral_pred)

# the prediction can go down, but shouldn't go up
monotonic_label_down = Expect.monotonic(increasing=False, tolerance=0.1)
monotonic_label_down = Expect.slice_pairwise(monotonic_label_down, non_neutral_pred)

###  Elaborate template

It can be then fed into some function that takes the filled sentences and returns different templates.

In [39]:
state = editor.suggest('John is very {mask} about the project.')[:20]
very = ['very', 'extremely', 'really', 'quite', 'incredibly', 'particularly', 'highly', 'super']
somewhat = ['a little', 'somewhat', 'slightly', 'mildly']

temp_temp = editor.template(
{
    'contexts': [
        '{first_name} is {very} {s} about the project. {first_name1} is {s} about the project.',
        '{first_name1} is {s} about the project. {first_name} is {very} {s} about the project.',
        '{first_name} is {s} about the project. {first_name1} is {somewhat} {s} about the project.',
        '{first_name1} is {somewhat} {s} about the project. {first_name} is {s} about the project.',
        '{first_name} is {very} {s} about the project. {first_name1} is {somewhat} {s} about the project.',
        '{first_name1} is {somewhat} {s} about the project. {first_name} is {very} {s} about the project.',
    ],
    'qas': [
        (
            'Who is most {s} about the project?',
            '{first_name}'
        ), 
        (
            'Who is least {s} about the project?',
            '{first_name1}'
        ), 

    ]

},
s = state,
very=very,
somewhat=somewhat,
remove_duplicates=True,
nsamples=3,
save=True
)

print(temp_temp.data)

[{'contexts': ['Kate is super happy about the project. Pamela is happy about the project.', 'Pamela is happy about the project. Kate is super happy about the project.', 'Kate is happy about the project. Pamela is a little happy about the project.', 'Pamela is a little happy about the project. Kate is happy about the project.', 'Kate is super happy about the project. Pamela is a little happy about the project.', 'Pamela is a little happy about the project. Kate is super happy about the project.'], 'qas': [('Who is most happy about the project?', 'Kate'), ('Who is least happy about the project?', 'Pamela')]}, {'contexts': ['Larry is extremely vocal about the project. Caroline is vocal about the project.', 'Caroline is vocal about the project. Larry is extremely vocal about the project.', 'Larry is vocal about the project. Caroline is somewhat vocal about the project.', 'Caroline is somewhat vocal about the project. Larry is vocal about the project.', 'Larry is extremely vocal about the p

In [47]:
def crossproduct(t):
    # takes the output of editor.template and does the cross product of contexts and qas
    ret = []
    ret_labels = []

    for x in t.data:
        cs = x['contexts']
        qas = x['qas']
        d = list(itertools.product(cs, qas))
        ret.append([(x[0], x[1][0]) for x in d])
        ret_labels.append([x[1][1] for x in d])
    t.data = ret
    t.labels = ret_labels
    return t

t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is {very} {s} about the project. {first_name1} is {s} about the project.',
            '{first_name1} is {s} about the project. {first_name} is {very} {s} about the project.',
            '{first_name} is {s} about the project. {first_name1} is {somewhat} {s} about the project.',
            '{first_name1} is {somewhat} {s} about the project. {first_name} is {s} about the project.',
            '{first_name} is {very} {s} about the project. {first_name1} is {somewhat} {s} about the project.',
            '{first_name1} is {somewhat} {s} about the project. {first_name} is {very} {s} about the project.',
        ],
        'qas': [
            (
                'Who is most {s} about the project?',
                '{first_name}'
            ), 
            (
                'Who is least {s} about the project?',
                '{first_name1}'
            ), 

        ]

    },
    s = state,
    very=very,
    somewhat=somewhat,
    remove_duplicates=True,
    nsamples=3,
    save=True
    ))
print(t.data)

[[('Harriet is super curious about the project. Sharon is curious about the project.', 'Who is most curious about the project?'), ('Harriet is super curious about the project. Sharon is curious about the project.', 'Who is least curious about the project?'), ('Sharon is curious about the project. Harriet is super curious about the project.', 'Who is most curious about the project?'), ('Sharon is curious about the project. Harriet is super curious about the project.', 'Who is least curious about the project?'), ('Harriet is curious about the project. Sharon is mildly curious about the project.', 'Who is most curious about the project?'), ('Harriet is curious about the project. Sharon is mildly curious about the project.', 'Who is least curious about the project?'), ('Sharon is mildly curious about the project. Harriet is curious about the project.', 'Who is most curious about the project?'), ('Sharon is mildly curious about the project. Harriet is curious about the project.', 'Who is le

### Getting labels from the templates

In [9]:
import collections
fewer_profs = ['doctor', 'nurse', 'secretary', 'CEO']
t = editor.template(
    [
        ('{male} is not {a:prof}, {female} is.', 'Who is {a:prof}?', '{female}', 'woman', '{prof}'),
        ('{female} is not {a:prof}, {male} is.', 'Who is {a:prof}?', '{male}', 'man', '{prof}'),
    ],
#     prof=professions + ['doctor'],
    prof=fewer_profs,
    remove_duplicates=True,
    nsamples=2,
    unroll=True,
    save=True,
    )
data = [(d[0], d[1]) for d in t.data]
labels = [d[2] for d in t.data]
meta = [(d[3], d[4]) for d in t.data]

print(data)
print(labels)

# NOTE THAT the labels can be a list (a label for every data case) or a single value (label which applied to all)

test = MFT(data, expect=expect_squad, labels=labels, meta=meta, templates=t.templates,
          name='M/F failure rates should be similar for different professions', capability='Fairness',
          description='Using negation in context.')


[('Bill is not a secretary, Helen is.', 'Who is a secretary?'), ('Helen is not a secretary, Bill is.', 'Who is a secretary?'), ('Charlie is not a doctor, Emma is.', 'Who is a doctor?'), ('Emma is not a doctor, Charlie is.', 'Who is a doctor?')]
['Helen', 'Bill', 'Emma', 'Charlie']


NameError: name 'expect_squad' is not defined

### More elaborate results

Note that to get those results, the tests need to have meta information passed to them (as above). 

In [10]:
def print_fair(test):
    c = collections.Counter(test.meta)
    fail = collections.Counter([tuple(x) for x in np.array(test.meta)[test.fail_idxs()]])
    profs = set()
    for sex, prof in fail:
        profs.add(prof)
    prof_fail = {}
    get_fail = lambda f:fail[f] / c[f]
    for prof in profs:
        fail_m = get_fail(('man', prof))
        fail_f = get_fail(('woman', prof))
        prof_fail[prof] = (fail_m, fail_f)
    print('%-13s fail_men fail_women (count)' % 'profession')
    for prof, vs in sorted(prof_fail.items(), key=lambda x:max(x[1][0], x[1][1]), reverse=True):
        fail_m, fail_f = vs
        print('%-13s   %.1f      %.1f     (%d)' % (prof, 100 * fail_m, 100 * fail_f, c[('man', prof)]))

Also, the perturbation function can also return metadata (return Tuple[List[str], List[str]] where the 2nd list is metadata). Then when we use it in Perturb we set the meta parameter to True:

In [11]:
ret = Perturb.perturb(data, change_professions, keep_original=True, nsamples=1, meta=True)
print('Data')
print(ret.data)
print('Metadata')
print(ret.meta)

NameError: name 'change_professions' is not defined

### Perturb and return many options for invariance

In [None]:
neutral_words = set(
    ['.', 'the', 'The', ',', 'a', 'A', 'and', 'of', 'to', 'it', 'that', 'in',
     'this', 'for',  'you', 'there', 'or', 'an', 'by', 'about', 'flight', 'my',
     'in', 'of', 'have', 'with', 'was', 'at', 'it', 'get', 'from', 'this', 'Flight', 'plane'
    ])
forbidden = set(['No', 'no', 'Not', 'not', 'Nothing', 'nothing', 'without', 'but'] + pos_adj + neg_adj + pos_verb_present + pos_verb_past + neg_verb_present + neg_verb_past)
def change_neutral(d):
#     return d.text
    examples = []
    subs = []
    words_in = [x for x in d.capitalize().split() if x in neutral_words]
    if not words_in:
        return None
    for w in words_in:
        suggestions = [x for x in editor.suggest_replace(d, w, beam_size=5, words_and_sentences=True) if x[0] not in forbidden]
        examples.extend([x[1] for x in suggestions])
        subs.extend(['%s -> %s' % (w, x[0]) for x in suggestions])
    if examples:
        idxs = np.random.choice(len(examples), min(len(examples), 10), replace=False)
        return [examples[i] for i in idxs]
t = Perturb.perturb(sentences, change_neutral, nsamples=500)
test = INV(t.data)

### Appending sentences + some expectation functions

In [None]:
positive = editor.template('I {pos_verb_present} you.').data
positive += editor.template('You are {pos_adj}.').data
positive += ['I would fly with you again.']
positive.remove('You are happy.')

def add_phrase_function(phrases):
    def pert(d):
        while d[-1].pos_ == 'PUNCT':
            d = d[:-1]
        d = d.text
        ret = [d + '. ' + x for x in phrases]
        idx = np.random.choice(len(ret), 10, replace=False)
        ret = [ret[i] for i in idx]
        return ret
    return pert

In [None]:
# QUESTION: Why not just use monotonic? Isn't it meant for such cases?
# I guess the following functions are more specific -- require exactly 3 classes
# and look at the general shift prediction rather the shift with respect to
# the original label (i.e. no need for gold label)
# we know what becoming more positive or more negative means, while monotonic
# is with respect to the original prediction or gold label prediction

def positive_change(orig_conf, conf):
    softmax = type(orig_conf) in [np.array, np.ndarray]
    if not softmax or orig_conf.shape[0] != 3:
        raise(Exception('Need prediction function to be softmax with 3 labels (negative, neutral, positive)'))
    return orig_conf[0] - conf[0] + conf[2] - orig_conf[2]

def diff_up(orig_pred, pred, orig_conf, conf, labels=None, meta=None):
    tolerance = 0.1
    change = positive_change(orig_conf, conf)
    if change + tolerance >= 0:
        return True
    else:
        return change + tolerance
    
def diff_down(orig_pred, pred, orig_conf, conf, labels=None, meta=None):
    tolerance = 0.1
    change = positive_change(orig_conf, conf)
    if change - tolerance <= 0:
        return True
    else:
        return -(change - tolerance)
    
goes_up = Expect.pairwise(diff_up)
goes_down = Expect.pairwise(diff_down)

t = Perturb.perturb(parsed_data, add_phrase_function(positive), nsamples=500)
test = DIR(t.data, goes_up)