# Polyjuice first try

In [1]:
from transformers import AutoTokenizer, AutoModelWithLMHead
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Identifier of the Polyjuice checkpoint passed to `from_pretrained` below.
model_path = "uw-hai/polyjuice"
# Build a text-generation pipeline from the causal LM and its tokenizer.
# framework="pt" selects the PyTorch backend.
# NOTE(review): device=-1 is presumably the pipeline's "run on CPU" setting;
# confirm against the transformers docs before relying on it.
generator = pipeline("text-generation",
    model=AutoModelForCausalLM.from_pretrained(model_path),
    tokenizer=AutoTokenizer.from_pretrained(model_path),
    framework="pt", device=-1)

In [2]:
from datasets import load_dataset
# Load the SNLI dataset by name; the 'train' split is indexed below.
dataset = load_dataset("snli")
# Display one training example (index 2) to inspect its
# premise / hypothesis / label fields.
dataset['train'][2]

Reusing dataset snli (/home/steven/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


  0%|          | 0/3 [00:00<?, ?it/s]

{'premise': 'A person on a horse jumps over a broken down airplane.',
 'hypothesis': 'A person is outdoors, on a horse.',
 'label': 0}

In [3]:
# Pull the individual fields of training example 2 into named variables
# so they can be used when building prompts.
original_premise = dataset['train'][2]['premise']
hypothesis = dataset['train'][2]['hypothesis']
original_label = dataset['train'][2]['label']

In [4]:
# Prompt layout used here: "<premise> <hypothesis> <|perturb|> [insert]",
# i.e. the original text, the perturb separator, then the control code.
prompt_text = "{} {} <|perturb|> [insert]".format(original_premise, hypothesis)
# Alternative prompt kept for reference: it also supplies the blanked sentence
# after the control code.
# prompt_text = "A dog is embraced by the woman. <|perturb|> [negation] A dog is [BLANK] the woman."
# Request a single sequence with num_beams=1 and show only its generated text.
generator(prompt_text, num_beams=1, num_return_sequences=1)[0]["generated_text"]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'A person on a horse jumps over a broken down airplane. A person is outdoors, on a horse. <|perturb|> [insert] A person on a horse jumps over a broken down [BLANK]. [SEP] structure'

# Simple heuristic for blank position

In [72]:
import benepar, spacy
from nltk import Tree

In [73]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [74]:
import nltk
benepar.download('benepar_en3')
  

[nltk_data] Downloading package benepar_en3 to
[nltk_data]     /Users/afra/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!


True

In [75]:
# Register the benepar constituency parser on the spaCy pipeline.
# `find_blank_index` reads `sent._.parse_string`, which needs this component.
# The membership check keeps the cell idempotent: calling `add_pipe` a second
# time with the same name would raise, which is the usual reason this line
# ends up commented out after the first run.
if "benepar" not in nlp.pipe_names:
    nlp.add_pipe('benepar', config={'model': 'benepar_en3'})
# Parse a two-sentence sample and keep its first sentence for inspection.
doc = nlp('The time for action is now. It is never too late to do something.')
sent = list(doc.sents)[0]

In [12]:
!pip install svgling

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting svgling
  Downloading svgling-0.3.1-py3-none-any.whl (21 kB)
Collecting svgwrite
  Downloading svgwrite-1.4.1-py3-none-any.whl (66 kB)
[K     |████████████████████████████████| 66 kB 2.9 MB/s eta 0:00:011
[?25hInstalling collected packages: svgwrite, svgling
Successfully installed svgling-0.3.1 svgwrite-1.4.1


In [76]:
from nltk import ParentedTree
def add_indices_to_terminals(treestring):
    """Tag every leaf of a bracketed parse with its left-to-right position.

    The bracketed string is read into a ``ParentedTree``; the first child of
    each leaf's parent node is rewritten as ``<leaf>_<position>`` (positions
    start at 0), and the modified tree is rendered back to a string.
    Illustrative example: ``(S (NP It) (VP is))`` becomes a tree whose leaves
    are ``It_0`` and ``is_1``.

    Args:
        treestring: Bracketed tree string accepted by ``ParentedTree.fromstring``.

    Returns:
        ``str()`` of the tree after all leaves have been suffixed.
    """
    parsed = ParentedTree.fromstring(treestring)
    leaf_count = len(parsed.leaves())
    for position in range(leaf_count):
        # Drop the final path component to address the node holding the leaf.
        parent_path = parsed.leaf_treeposition(position)[:-1]
        holder = parsed[parent_path]
        holder[0] = "_".join((holder[0], str(position)))
    return str(parsed)

In [77]:
def find_constituent(node, candidate_list):
    """Collect the terminal-index span of every constituent below ``node``.

    Leaves must carry their position as a trailing ``_<idx>`` suffix (the
    format written by ``add_indices_to_terminals``). The tree is walked
    recursively; for each node that is not a pre-terminal, the inclusive
    ``(first_idx, last_idx)`` span of the leaves it covers is appended to
    ``candidate_list`` in post-order.

    Args:
        node: A (sub)tree that supports ``len`` and iteration over children,
            e.g. an ``nltk.Tree`` or plain nested lists.
        candidate_list: List that is mutated in place with one span tuple per
            non-pre-terminal node.

    Returns:
        Tuple ``(first_idx, last_idx)`` with the inclusive leaf span of ``node``.
    """
    # Pre-terminal: a node whose only child is the (suffixed) token string.
    if len(node) == 1 and isinstance(node[0], str):
        idx = int(node[0].split("_")[-1])
        return idx, idx

    first_idx = last_idx = 0
    for i, child in enumerate(node):
        child_first, child_last = find_constituent(child, candidate_list)
        if i == 0:
            first_idx = child_first
        # Track the end of the most recent child unconditionally. Updating it
        # only for i > 0 made a node with a single non-terminal child report
        # (first, 0), which then corrupted the spans of all its ancestors.
        last_idx = child_last
    candidate_list.append((first_idx, last_idx))
    return first_idx, last_idx

In [78]:
def find_blank_index(sentence):
    """Parse the first sentence of ``sentence`` into an index-annotated tree.

    The text is run through ``nlp``, the parse string of its first sentence is
    passed through ``add_indices_to_terminals``, and the result is read into
    an ``nltk.Tree``.

    NOTE(review): this is unfinished. The tree is built but never used, and
    the function returns ``None`` implicitly. It also reads
    ``sent._.parse_string``, which presumably requires the benepar component
    to be present in ``nlp`` -- confirm before calling.
    """
    first_sentence = list(nlp(sentence).sents)[0]
    indexed_parse = add_indices_to_terminals(first_sentence._.parse_string)
    indexed_tree = Tree.fromstring(indexed_parse)  # built but not consumed yet

# Polyjuice counterfactual generation
Counterfactuals are generated with a randomly chosen blank position.

In [None]:
from polyjuice import Polyjuice
import random
pj = Polyjuice(model_path="uw-hai/polyjuice", is_cuda=True)

In [57]:
def _replace_word_with_blank(sentence: str) -> str:
    # TODO: Use constituent parsing
    words = sentence.split()
    words[random.randrange(len(words))] = "[BLANK]"
    return " ".join(words)

def _replace_word_with_blank2(sentence: str) -> str:
    """Blank part of ``sentence`` using Polyjuice's own random blanking.

    Calls ``pj.get_random_blanked_sentences`` and returns the first candidate
    it yields. An empty result raises ``IndexError``.

    Args:
        sentence: Text to blank.

    Returns:
        The first blanked sentence produced by Polyjuice.
    """
    blanking_options = dict(
        # only allow selecting from a preset range of token indexes
        pre_selected_idxes=None,
        # only select from a subset of dep tags
        deps=None,
        # blank sub-spans or just single tokens
        is_token_only=False,
        # maximum number of returned index tuple
        max_blank_sent_count=3,
        # maximum number of blanks per returned sentence
        max_blank_block=1,
    )
    candidates = pj.get_random_blanked_sentences(sentence=sentence, **blanking_options)
    return list(candidates)[0]

def generate_nli_perturbations(
    premise: str,
    hypothesis: str,
    n: int = 3,
    ctrl_code="negation",
) -> "List[str]":
    """Ask Polyjuice for counterfactual rewrites of a premise/hypothesis pair.

    The prompt is the premise followed by the hypothesis, but only the
    hypothesis should be perturbed. So ``n`` blanked variants of the hypothesis
    are generated and each is prefixed with the untouched premise before being
    handed to ``pj.perturb``.

    Author's observation, kept from the original notes: ``pj.perturb`` seemed
    to return 3 results regardless of ``n`` (2 and 10 were tried), and it
    sometimes returns an empty list -- it may simply give up on some inputs.

    Args:
        premise: NLI premise; copied verbatim into every blanked prompt.
        hypothesis: NLI hypothesis; the only part that receives a blank.
        n: Number of blanked prompts to build; also forwarded as ``n`` to
            ``pj.perturb``.
        ctrl_code: Polyjuice control code forwarded to ``pj.perturb``.

    Returns:
        Whatever ``pj.perturb`` returns for the given prompts.
    """
    orig_sent = "{} {}".format(premise, hypothesis)

    blanks = []
    for _ in range(n):
        blanked_hypothesis = _replace_word_with_blank2(hypothesis)
        blanks.append("{} {}".format(premise, blanked_hypothesis))

    print(orig_sent, blanks)
    return pj.perturb(
        orig_sent=orig_sent,
        blanked_sent=blanks,
        ctrl_code=ctrl_code,
        n=n,
    )

In [62]:
# Try the helper on a hand-written premise/hypothesis pair: build 10 blanked
# variants of the hypothesis and request "negation" perturbations.
generate_nli_perturbations(
    "The time for action is now.",
    "It is never too late to do something",
    ctrl_code="negation",
    n=10
)

The time for action is now. It is never too late to do something ['The time for action is now. It is never too late [BLANK] do something', 'The time for action is now. It is never too late to [BLANK] something', 'The time for action is now. It is never too late to [BLANK] something', 'The time for action is now. It is never too late [BLANK] do something', 'The time for action is now. It is never too [BLANK] to do something', 'The time for action is now. It is never too late [BLANK]', 'The time for action is now. It is never too [BLANK] to do something', 'The time for action is now. It is never too late to [BLANK] do something', 'The time for action is now. It is never [BLANK] too late to do something', 'The time for action is now. It is [BLANK] too late to do something']


['The time for action is now. It is never too late to start to do something']

In [63]:
"""
negation
'The time for action is now. It is never too late to do anything',
 "The time for action is now. It is never too late to do what's required.",
 'The time for action is now. It is never too late to do something,'
 'The time for action is now. It is never too late to shoot',
 'The time for action is now. It is never too late to fix',
 'The time for action is now. It is never too late yet to do something'
 
restructure
(fails to return results. Maybe I need to insert BLANKS differently,
or combine it with other control codes? Probably explained in the paper.)

insert
'The time for action is now. It is never too late to do something really good',
 'The time for action is now. It is never too late to do something.'
 'The time for action is now. It is never too late to do something!',
 'The time for action is now. It is never too late to do something.
 
shuffle
'The time for action is now. It is often never too late to do something'
'The time for action is now. It is never too late to start to do something'
"""

'\nnegation\n\'The time for action is now. It is never too late to do anything\',\n "The time for action is now. It is never too late to do what\'s required.",\n \'The time for action is now. It is never too late to do something,\'\n \'The time for action is now. It is never too late to shoot\',\n \'The time for action is now. It is never too late to fix\',\n \'The time for action is now. It is never too late yet to do something\'\n \nrestructure\n(fails to return results. Maybe I need to insert BLANKS differently,or combine it with other control codes?)\n\ninsert\n\'The time for action is now. It is never too late to do something really good\',\n \'The time for action is now. It is never too late to do something.\'\n \'The time for action is now. It is never too late to do something!\',\n \'The time for action is now. It is never too late to do something.\n \nshuffle\n\'The time for action is now. It is often never too late to do something\'\n\'The time for action is now. It is never 