In [2]:
import numpy as np
import pandas as pd
import torch
import nltk
import random
import time
import datetime
import itertools
import transformers
import sklearn
# import lxml
nltk.download('punkt')

from openprompt.plms import load_plm
from openprompt.prompts import ManualTemplate
from openprompt.prompts.prefix_tuning_template import PrefixTuningTemplate
from openprompt import PromptForGeneration, PromptDataLoader
from openprompt.data_utils import InputExample

from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
from torch.utils.data import Dataset
from sklearn.utils import shuffle
from bs4 import BeautifulSoup

[nltk_data] Downloading package punkt to /home/diego/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Download models
and store them in a local directory

In [3]:
# Load "gpt2-medium" with tokenizer
tok = transformers.GPT2Tokenizer.from_pretrained("gpt2-medium")
lm = GPT2LMHeadModel.from_pretrained("gpt2-medium", pad_token_id = tok.eos_token_id)
transformers.PreTrainedModel.save_pretrained(lm, "models/gpt2-medium")
transformers.PreTrainedTokenizer.save_pretrained(tok, "tokenizers/gpt2-medium")

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

('tokenizers/gpt2-medium/tokenizer_config.json',
 'tokenizers/gpt2-medium/special_tokens_map.json',
 'tokenizers/gpt2-medium/vocab.json',
 'tokenizers/gpt2-medium/merges.txt',
 'tokenizers/gpt2-medium/added_tokens.json')

In [None]:
# Load "gpt2-large" with tokenizer
tok = transformers.GPT2Tokenizer.from_pretrained("gpt2-large")
lm = GPT2LMHeadModel.from_pretrained("gpt2-large", pad_token_id = tok.eos_token_id)
transformers.PreTrainedModel.save_pretrained(lm, "models/gpt2-large")
transformers.PreTrainedTokenizer.save_pretrained(tok, "tokenizers/gpt2-large")

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.02G [00:00<?, ?B/s]

In [7]:
# # Load "gpt2-xl" with tokenizer
# tok = transformers.GPT2Tokenizer.from_pretrained("gpt2-medium")
# lm = GPT2LMHeadModel.from_pretrained("gpt2-medium", pad_token_id = tok.eos_token_id)
# transformers.PreTrainedModel.save_pretrained(lm, "models/gpt2-medium")
# transformers.PreTrainedTokenizer.save_pretrained(tok, "tokenizers/gpt2-medium")

('tokenizers/gpt2-medium/tokenizer_config.json',
 'tokenizers/gpt2-medium/special_tokens_map.json',
 'tokenizers/gpt2-medium/vocab.json',
 'tokenizers/gpt2-medium/merges.txt',
 'tokenizers/gpt2-medium/added_tokens.json')

In [2]:
tok = transformers.AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
lm = transformers.AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
transformers.PreTrainedModel.save_pretrained(lm, "models/distilbert-sst2")
transformers.PreTrainedTokenizer.save_pretrained(tok, "tokenizers/distilbert-sst2")

('tokenizers/distilbert-sst2/tokenizer_config.json',
 'tokenizers/distilbert-sst2/special_tokens_map.json',
 'tokenizers/distilbert-sst2/vocab.txt',
 'tokenizers/distilbert-sst2/added_tokens.json',
 'tokenizers/distilbert-sst2/tokenizer.json')

# Experiments: methodology

- input: list of models, list of templates, list of generation params
- datasets: training and dev

-for a specific template:
--- for each model version:
----- generate counterfactuals for the training set, for a specific params cfg;
----- select the best params cfg based on the evaluation on the training set;
----- generate the counterfactuals for the dev set;
----- evaluate the generated counterfactuals

# TEMPLATE #1

[label_example] review: [example text]

[label_counterfactual] review: [generation token]

Label template 0:"Negative"; 1:"Positive"

In [8]:
label_for_template = {0:"Negative", 1:"Positive"}
template = '[label_a] review: [text_a]\n[label_b] review:'

# prompts need to provide the "mask" token at the end of the template
# because GPT2 can only predict the next word based on the past context
template_prompt = '{"placeholder":"text_a"}{"mask"}'

# TEMPLATE #2

The movie is [label_example]. [example text]

The movie is [label_counterfactual]. [generation token]

Label template 0:"bad"; 1:"good"

In [9]:
label_for_template = {0:"bad", 1:"good"}
template = 'The movie is [label_a]. [text_a]\nThe movie is [label_b].'

# prompts need to provide the "mask" token at the end of the template
# because GPT2 can only predict the next word based on the past context
template_prompt = '{"placeholder":"text_a"}{"mask"}'

In [10]:
random_seed = 5
allow_parallelization = False
empty_gpu_memory = True
generator_on_cuda = True
reduce_data = True
data_to_keep = 400

# model_names = ["gpt2-medium", "gpt2-large", "gpt2-xl"]
model_names = {"gpt2-medium"}
gen_params_grid = {"no_repeat_ngram_size":[4, 8, 12],
                   "num_beam":[5],
                   "temperature":[0, 0.8]}
sentiment_model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# get all the entries of the grid
all_pars = sorted(gen_params_grid)
combinations_grid = list(itertools.product(*(gen_params_grid[par] for par in all_pars)))
print("List of configurations for generation:")
print(combinations_grid)
print(f"# of hypers cfgs:{len(combinations_grid)}")

List of configurations for generation:
[(4, 5, 0), (4, 5, 0.8), (8, 5, 0), (8, 5, 0.8), (12, 5, 0), (12, 5, 0.8)]
# of hypers cfgs:6


Select the best hyperparameters settings

In [11]:
torch.cuda.empty_cache()

In [12]:
results = []
for model_name in model_names:
    tic = time.perf_counter()
    print(f"{datetime.datetime.now()}: Begin experiments for model {model_name}.")
    _, tokenizer, model_config, WrapperClass = load_plm(model_name="gpt2", model_path=model_name)
    # tokenizer = GPT2Tokenizer.from_pretrained("tokenizers/" + model_name)
    plm = GPT2LMHeadModel.from_pretrained("models/" + model_name, pad_token_id = tokenizer.eos_token_id)
    print(f"{datetime.datetime.now()}:Model and tokenizer loaded.")

    for cfg_gen in combinations_grid:

        print(f"CFG tested: {cfg_gen}")
        # load and prepare training set
        training_set = load_dataset("train_paired.tsv")
        training_set = prepare_dataset(training_set,
                                       tokenizer,
                                       reduce_data,
                                       data_to_keep,
                                       random_seed)

        prompt_template = ManualTemplate(
            text = template_prompt,
            tokenizer = tokenizer,
        )

        training_set.wrap_dataset_instances(template_=template,
                                            label_template=label_for_template)

        data_loader = PromptDataLoader(
            dataset = list(training_set.get_dataset().values()),
            tokenizer = tokenizer,
            template = prompt_template,
            tokenizer_wrapper_class=WrapperClass,
        )

        # set the template and the generator
        prompt_for_generation = set_generator(prompt_template,
                                              plm,
                                              allow_parallelization,
                                              generator_on_cuda)

        print(f"{datetime.datetime.now()}: Begin of the generation.")
        # generate counterfactuals
        perform_generation(data_loader,
                           training_set,
                           tokenizer,
                           prompt_for_generation,
                           cfg_gen)
        print(f"{datetime.datetime.now()}: End of the generation.")

        # create the Evaluator object
        print(f"{datetime.datetime.now()}: Begin of evaluation.")
        sent_tokenizer, sent_model, sent_dict = load_sentiment_model(sentiment_model_name)
        sent_device = 0 if torch.cuda.is_available() else -1
        # print(sent_device)
        evaluator = Evaluator(training_set,
                              sent_tokenizer,
                              sent_model,
                              sent_dict,
                              sent_device)

        # predict labels for counterfactuals
        pred_labels, avg_confidence = evaluator.infer_predictions()
        # pred_labels = [1, 1]

        # calculate the LFS
        true_labels = [training_set.get_instance_by_id(guid).meta["label_b"] for guid in training_set]
        lfs = evaluator.lf_score(true_labels, pred_labels)

        # calculate BLUE
        mean_blue, var_blue = evaluator.blue_score()
        print(f"{datetime.datetime.now()}: End of the evaluation.")
        print(f"{model_name}| LFS:{lfs}, AVG_CONF:{avg_confidence}, mean_BLUE:{mean_blue}, var_BLUE:{var_blue}")
        results.append(f"{model_name}| LFS:{lfs}, AVG_CONF:{avg_confidence}, mean_BLUE:{mean_blue}, var_BLUE:{var_blue}")

        print()
    toc = time.perf_counter()
    print(f"End of experiments for model {model_name}.")
    print(f"Total execution time for model {model_name}: {toc - tic:0.4f}.")
    print()

2022-01-28 17:37:01.214853: Begin experiments for model gpt2-medium.


Using pad_token, but it is not set yet.


2022-01-28 17:37:18.362114:Model and tokenizer loaded.
CFG tested: (4, 5, 0)
Dataset's Dataframe prepared
# of data points in the dataset:  400
Dataset examples wrapped with prompt template


tokenizing: 400it [00:00, 473.41it/s]


2022-01-28 17:37:22.048809: Begin of the generation.
Total GPU memory available: 4240244736
Allocated GPU memory before generation: 1444470784
Allocated GPU memory reserved: 1447034880
Allocated GPU memory before generation: 1578193408
2022-01-28 17:37:25.236165, Step:0: 100 counterfactuals generated
Allocated GPU memory before generation: 1652913664


KeyboardInterrupt: 

In [11]:
results

['gpt2-medium| LFS:0.5725, AVG_CONF:0.9690174861252308, mean_BLUE:0.10141909904553367, var_BLUE:0.021700017855491926',
 'gpt2-medium| LFS:0.565, AVG_CONF:0.9737502524256706, mean_BLUE:0.17558427443312283, var_BLUE:0.04844216497166922',
 'gpt2-medium| LFS:0.53, AVG_CONF:0.9729126597940921, mean_BLUE:0.1712076253204455, var_BLUE:0.04703537604798624']

In [9]:
import requests
import lxml
from bs4 import BeautifulSoup
r = requests.get("https://www.allrecipes.com/recipes/96/salad/")
soup = BeautifulSoup(r.text, "lxml")
soup = BeautifulSoup(r.text, "html.parser")

<h1>Classes to be used</h1>

In [3]:
def reformat_sentiment(x):
    return int(x == 'Positive')

def load_dataset(name):
    # load the dataset
    url = 'https://raw.githubusercontent.com/acmi-lab/counterfactually-augmented-data/master/sentiment/combined/paired/' + name
    dataset = pd.read_csv(url, sep='\t')
    dataset.rename(columns={"Sentiment": "sentiment", "Text": "text", "batch_id": "paired_id"}, inplace=True)
    # reformat 'sentiment' column
    dataset['sentiment'] = dataset['sentiment'].apply(lambda value: reformat_sentiment(value))

    return dataset

def prepare_dataset(dataframe_, tokenizer_, reduce_dataset_, n_to_keep_, seed=1):
    # prepare the dataset for the generation
    dataset = SentimentDataset(loaded_dataset=dataframe_,
                               tokenizer_=tokenizer_,
                               max_length=1024)
    dataset.randomly_assign_conterfactuals(seed)
    dataset.prepare_dataframe_with_counterfacuals(reduce_dataset_,
                                                  n_to_keep_,
                                                  seed)
    dataset.prepare_dataset()
    print("# of data points in the dataset: ", len(dataset))

    return dataset

In [4]:
class SentimentDataset(Dataset):
    def __init__(self, loaded_dataset, tokenizer_, max_length):
        # get a copy of the dataset
        self.dataframe = loaded_dataset.copy()
        self.tokenizer = tokenizer_
        self.max_length = max_length
        self.dataframe_with_counterfactuals = None
        self.guids = []
        self.dataset = {}
        self.dataset_with_prompts = []

    def randomly_assign_conterfactuals(self, seed=1):
        # prepare the proper Dataframe for the dataset
        self.random_shuffle(seed)
        paired_ids = self.dataframe['paired_id'].values
        found_ids = {}
        counterfactual_column = []
        for id in paired_ids:
            counterfactual_column.append(self.__set_example_counter__(id, found_ids))
        self.dataframe['is_counterfactual'] = counterfactual_column

        print("Dataset's Dataframe prepared")

    def __set_example_counter__(self, idx, found_idsx):
        if idx in found_idsx:
            return 0
        else:
            found_idsx[idx] = 0
            return 1

        # prepare a dataset with input-output instances
    def prepare_dataframe_with_counterfacuals(self,
                                              reduce_dataset_,
                                              n_to_keep_,
                                              seed):

        # group by paired_id
        gb = self.dataframe.groupby(by=["paired_id"])

        # create new columns "example" and "counterfactual"
        example_column = []
        counter_column = []
        paired_id_column = []
        label_ex = []
        label_counter = []
        for group_id in gb.groups: # group_id == paired_id
            group = gb.get_group(group_id)
            is_counterfactual_column = group['is_counterfactual'].values
            text_column = group['text'].values
            sentiment_column = group['sentiment'].values
            for is_counter, text, label in zip(is_counterfactual_column,
                                               text_column,
                                               sentiment_column):
                if is_counter:
                    counter_column.append(text)
                    label_counter.append(label)
                else:
                    example_column.append(text)
                    label_ex.append(label)

            paired_id_column.append(group_id)

        # add the new columns to a new dataframe
        d = {'paired_id': paired_id_column,
             'example': example_column,
             'label_ex': label_ex,
             'counterfactual': counter_column,
             'label_counter': label_counter}
        self.dataframe_with_counterfactuals = pd.DataFrame(data=d)

        if reduce_dataset_:
            self.dataframe_with_counterfactuals = self.dataframe_with_counterfactuals.sample(n=n_to_keep_, random_state=seed)
            self.dataframe_with_counterfactuals.reset_index(drop=True, inplace=True)

    # convert the Dataframe into the InputExample format dataset of openprompt
    def prepare_dataset(self):
        for index, row in self.dataframe_with_counterfactuals.iterrows():
            self.dataset[row['paired_id']] = InputExample(guid=row['paired_id'],
                                                          text_a=BeautifulSoup(
                                                              row['example'], "lxml").text,
                                                          text_b=BeautifulSoup(
                                                              row['counterfactual'], "lxml").text,
                                                          meta={"label_a":row['label_ex'],
                                                                "label_b":row['label_counter'],
                                                                'example':BeautifulSoup(
                                                                    row['example'], "lxml").text,
                                                                'counterfactual':BeautifulSoup(
                                                                    row['counterfactual'], "lxml").text})
            self.guids.append(row['paired_id'])

    def wrap_dataset_instances(self, template_, label_template):
        # template is a string with the whole template
        # label_template is a dict with the mapping between label and template
        for idx in self.dataset:
            instance = self.dataset[idx]
            instance.text_a = template_.replace('[text_a]', instance.text_a)
            instance.text_a = instance.text_a.replace(
                '[label_a]', label_template[instance.meta['label_a']])
            instance.text_a = instance.text_a.replace(
                '[label_a]', label_template[instance.meta['label_a']])
            instance.text_a = instance.text_a.replace(
                '[label_b]', label_template[instance.meta['label_b']])

        print('Dataset examples wrapped with prompt template')

    def sort_by_paired_id(self):
        self.dataframe.sort_values('paired_id', inplace=True)

    def sort_prompted_by_paired_id(self):
        self.dataframe_with_prompts.sort_values('paired_id', inplace=True)

    def random_shuffle(self, seed):
        random.seed(seed)
        self.dataframe = shuffle(self.dataframe)

    # the same of __getitem__
    def get_instance_by_id(self, idx):
        return self.dataset[idx]

    # implemented because of inheritance from Dataset
    def __len__(self):
        return len(self.dataset)

    # implemented because of inheritance from Dataset
    def __iter__(self):
        return iter(self.dataset)

    def __next__(self):
        return iter(self.dataset)

    # implemented because of inheritance from Dataset
    def __getitem__(self, idx):
        return self.dataframe.__getitem__(idx)

    def get_dataset(self):
        return self.dataset

    def get_dataframe(self):
        return self.dataframe

    def get_dataframe_with_counterfactuals(self):
        return self.dataframe_with_counterfactuals

    def get_dataset_with_prompts(self):
        return self.dataset_with_prompts

In [5]:
def set_generator(prompt_template_, plm_, parallelization, cuda_gen):

    prompt_for_generation_ = PromptForGeneration(
        template = prompt_template_,
        freeze_plm = True,
        plm = plm_,
        plm_eval_mode = True
    )

    if torch.cuda.is_available() and cuda_gen:
        prompt_for_generation_ = prompt_for_generation_.cuda()

    if parallelization:
        prompt_for_generation_.parallelize()

    return prompt_for_generation_

In [6]:
def perform_generation(data_loader_,
                       dataset,
                       tokenizer_,
                       generator,
                       cfg_for_gen):
    generator.eval()

    print(f"Total GPU memory available: {torch.cuda.get_device_properties(0).total_memory}")
    print(f"Allocated GPU memory before generation: {torch.cuda.memory_allocated(0)}")
    print(f"Allocated GPU memory reserved: {torch.cuda.memory_reserved(0)}")
    for (step, inputs) in enumerate(data_loader_):

        # retrieve the instance involved
        instance_guid = inputs["guid"].numpy()[0]
        instance_to_update = dataset.get_instance_by_id(instance_guid)

        # we limit the output length to be reasonably equal to the input
        # context, i.e. the example
        max_length_example = len(tokenizer_.encode(instance_to_update.text_a))
        max_length_output = int(2 * max_length_example)

        # cfg_gen[0] = no_repeat_ngram_size
        # cfg_gen[1] = num_beam
        # cfg_gen[2] = temperature
        generation_arguments = {
            "max_length": max_length_output,
            "min_length": 5,
            "no_repeat_ngram_size": cfg_for_gen[0],
            "num_beams": cfg_for_gen[1],
            "temperature": cfg_for_gen[2],
            "do_sample": False,
            "top_k": 10,
            "top_p": 0,
            # "repetition_penalty": 2.0,
            # "num_return_sequences": 3
            # "early_stopping": True
        }
        # print(generation_arguments)

        if torch.cuda.is_available():
            inputs_to_device = inputs.cuda()
        else:
            inputs_to_device = inputs

        try:
            _, generated_counter = generator.generate(inputs_to_device,
                                                              verbose=True,
                                                              **generation_arguments)
        except Exception as e:
            print(instance_guid)
            print(e)

        # insert the generated counterfactual
        instance_to_update.meta["generated_counter"] = generated_counter[0]
        # print(inputs["guid"].numpy()[0])
        if torch.cuda.is_available() and empty_gpu_memory:
            torch.cuda.empty_cache()

        print(f"Allocated GPU memory before generation: {torch.cuda.memory_allocated(0)}")
        if (step % 100) == 0:
            print(f"{datetime.datetime.now()}, Step:{step}: 100 counterfactuals generated")
            # if torch.cuda.is_available():
            #   torch.cuda.empty_cache()

In [23]:
# inputsss = torch.tensor([1, 2, 3])
# inputsss.cpu()
# inputsss
gen_argumvsvs = {
    "max_length": 215,
    "min_length": 5,
    "no_repeat_ngram_size": 10,
    "num_beams": 1,
    "temperature": 0,
    "do_sample": False,
    "top_k": 10,
    "top_p": 0,
    # "repetition_penalty": 2.0,
    # "num_return_sequences": 3
    # "early_stopping": True
}

for (step, inputs) in enumerate(data_loader):
    codeout, geunter = prompt_for_generation.generate(inputs,
                                   verbose=True,
                                   **gen_argumvsvs)

    print(geunter[0])
    break

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)

In [21]:
prompt_for_generation

NameError: name 'prompt_for_generation' is not defined

In [19]:
gen_argumvsvs = {
    "max_length": 215,
    "min_length": 5,
    "no_repeat_ngram_size": 10,
    "num_beams": 1,
    "temperature": 0,
    "do_sample": False,
    "top_k": 10,
    "top_p": 0,
    # "repetition_penalty": 2.0,
    # "num_return_sequences": 3
    # "early_stopping": True
}

# prompt_ation = prompt_for_generation

for (step, inputs) in enumerate(data_loader):
    _, geunter = prompt_for_generation.generate(inputs.cuda(),
                                                      verbose=True,
                                                      **gen_argumvsvs)

    del codeout
    print(geunter[0])
    break

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper__index_select)

In [7]:
class Evaluator:

    def __init__(self, s_dataset, s_tokenizer, s_model, s_label_dict, s_device):
        """Constructor of the evaluator

        @param: _sentiment_tokenizer The tokenizer for the sentiment classifier
        @param: _sentiment_model The sentiment classifier
        @param: _label_dict The mapping between classifier's outputs and labels
        """
        self._dataset = s_dataset
        self._sentiment_tokenizer = s_tokenizer
        self._sentiment_model = s_model
        self._label_dict = s_label_dict
        self.classifier = transformers.pipeline(
            task="sentiment-analysis",
            model=s_model,
            tokenizer=s_tokenizer,
            framework="pt",
            device=s_device)

    def infer_predictions(self):

        try:
            texts = [self._dataset.get_instance_by_id(guid).meta["generated_counter"] for guid in self._dataset]
            # for guid in self._dataset:
            #   self._dataset.get_instance_by_id(guid).meta["generated_counter"]
        except:
            print(f"There is an instance that does not have a counterfactual")
        predicted_labels = []
        score_labels = []

        for text_to_classify in texts:
            if len(text_to_classify) > 512:
                result = self.classifier(text_to_classify[:511])[0]
            else:
                result = self.classifier(text_to_classify)[0]

            predicted_labels.append(self._label_dict[result['label']])
            score_labels.append(result['score'])

        return predicted_labels, np.mean(score_labels)


    def lf_score(self, y_desired, y_pred):
        """Calculate the Label Flip Score (LFS)
        """
        return sklearn.metrics.accuracy_score(y_desired, y_pred)

    def blue_score(self):
        """Calculate the BLUE score for a pair of example-counter.

           Returns mean and variance of the BLUE scores.
        """

        BLEUscore = []
        true_counters = [self._dataset.get_instance_by_id(guid).meta["counterfactual"] for guid in self._dataset]
        gen_counters = [self._dataset.get_instance_by_id(guid).meta["generated_counter"] for guid in self._dataset]
        for true_counter, gen_counter in zip(true_counters, gen_counters):
            # example and counterfactual need to be tokenized first

            # the reference is the true counterfactual
            reference = nltk.tokenize.word_tokenize(true_counter)

            # the hypothesis is the generated counterfactual
            hypothesis = nltk.tokenize.word_tokenize(gen_counter)

            BLEUscore.append(nltk.translate.bleu_score.sentence_bleu([reference],
                                                                     hypothesis))
        return np.mean(BLEUscore), np.var(BLEUscore)

In [8]:
def load_sentiment_model(name):
    tokenizer_ = transformers.AutoTokenizer.from_pretrained("tokenizers/distilbert-sst2")
    model = transformers.AutoModelForSequenceClassification.from_pretrained("models/distilbert-sst2")

    if name == "gchhablani/fnet-base-finetuned-sst2":
        return tokenizer_, model, {'negative':0, 'positive':1}

    if name == "siebert/sentiment-roberta-large-english" or name == "distilbert-base-uncased-finetuned-sst-2-english":
        return tokenizer_, model, {'NEGATIVE':0, 'POSITIVE':1}

    return tokenizer_, model, {'LABEL_0':0, 'LABEL_1':1}