<h1>Describe experiments</h1>

When generating, we exploit the functionalities of the OpenPrompt library to manage prompts

In [86]:
import pandas as pd
import torch
from torch.utils.data import Dataset
import datetime
import itertools
import bs4
import yaml
import os

import transformers
import openprompt
from openprompt.prompts import ManualTemplate
from openprompt.data_utils import InputExample
from openprompt.plms.lm import LMTokenizerWrapper

In [83]:
READ_SETTINGS_FROM_FILE = True
SETTINGS_PATH = "/home/diego/counterfactuals-generation/sentiment_task/zs_gpt2_experiments/settings/"
SETTING_NAME = "zs_prompt_1_validation.yaml"

if READ_SETTINGS_FROM_FILE:
    a_yaml_file = open(f"{SETTINGS_PATH}{SETTING_NAME}")
    parsed_yaml_file = yaml.load(a_yaml_file, Loader=yaml.FullLoader)

    # the following params will be included in a yaml file
    RESULTS_PATH = parsed_yaml_file['RESULTS_PATH']
    SST2_MODEL_PATH = parsed_yaml_file['RESULTS_PATH']
    RANDOM_SEED_SHUFFLE = parsed_yaml_file['RANDOM_SEED_SHUFFLE']
    AUGMENT_VALSET = parsed_yaml_file['AUGMENT_VALSET']
    KEEP_FIRST_N = parsed_yaml_file['KEEP_FIRST_N']
    FOLDS = parsed_yaml_file['FOLDS']
    MODEL_NAME = parsed_yaml_file['MODEL_NAME']
    SPECIAL_TOKENS = parsed_yaml_file['SPECIAL_TOKENS']
    TEMPLATE_PROMPT = parsed_yaml_file['TEMPLATE_PROMPT']
    MAP_LABELS = parsed_yaml_file['MAP_LABELS']
    ON_CUDA = parsed_yaml_file['ON_CUDA']
    PARALLELIZATION = parsed_yaml_file['PARALLELIZATION']
    GEN_ARGS = parsed_yaml_file['GEN_ARGS']
    print("Experiment's params read from yaml file")
else:

    # the following params will be included in a yaml file
    RESULTS_PATH = "/home/diego/counterfactuals-generation/sentiment_task/zs_gpt2_experiments/results/"
    SST2_MODEL_PATH = "/home/diego/counterfactuals-generation/"

    RANDOM_SEED_SHUFFLE = 22
    AUGMENT_VALSET = True
    KEEP_FIRST_N = 1

    FOLDS = ["0"] # FOLDS = ["0", "1", "2", "3", "4"]
    MODEL_NAME = 'gpt2' #{gpt2 (gpt2-small, 12 layers), gpt2-medium (24 layers), gpt2-large (36 layers), gpt2-xl (48 layers),
                        # gpt2-fine-tuned-sst2 }
    # SPECIAL_TOKENS = {"bos_token": "<|BOS|>",
    #                   "eos_token": "<|EOS|>",
    #                   "unk_token": "<|UNK|>",
    #                   # "pad_token": "<|PAD|>",
    #                   "pad_token": "<|EOS|>",
    #                   "sep_token": "<|SEP|>"} # or set it to None
    SPECIAL_TOKENS = {"pad_token": "<|endoftext|>"} # or set it to None
    # SPECIAL_TOKENS = None

    # set the template for prompting
    # TEMPLATE_PROMPT = "<bos_token><label_ex> review:<sep><example_text><sep><label_counter> review:<sep>"
    TEMPLATE_PROMPT = "<label_ex> review:<example_text> <label_counter> review:"
    MAP_LABELS = {0:"Negative", 1:"Positive"}

    # CFGs on generation
    ON_CUDA = False
    PARALLELIZATION = False
    GEN_ARGS = {
        "no_repeat_ngram_size": [4, 8, 12],
        "num_beams": [5, 10],
        "repetition_penalty": [1.0, 2.0],
        "temperature": [0.7]
    }
    print("Experiment's params read from notebook")

Experiment's params read from yaml file


<h3>Load language model objects and example of generation</h3>

In [84]:
# Load language model objects
if MODEL_NAME == "gpt2-fine-tuned-sst2":
    load_path = f"{SST2_MODEL_PATH}{MODEL_NAME}"
else: # load model from the hugging face repository
    load_path = MODEL_NAME

tokenizer = transformers.GPT2Tokenizer.from_pretrained(load_path)
print("Downloaded tokenizer!")
if SPECIAL_TOKENS is not None:
    print(f"Len of tokenizer before adding tokens:{len(tokenizer)}")
    tokenizer.add_special_tokens(SPECIAL_TOKENS) # add special tokens
    print("Added special tokens to tokenizer!")
    print(f"Len of tokenizer after adding tokens:{len(tokenizer)}")

# Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
lm_config_class = transformers.GPT2Config.from_pretrained(load_path, pad_token_id = tokenizer.eos_token_id)

lm = transformers.GPT2LMHeadModel.from_pretrained(load_path, config=lm_config_class)
if SPECIAL_TOKENS is not None:
    #Special tokens added, model needs to be resized accordingly
    lm.resize_token_embeddings(len(tokenizer))

# load lm class for the tokenizer (for the generation with openprompt)
tokenizer_wrapper = LMTokenizerWrapper

print("Downloaded tokenizer, model and cfg!")

Downloaded tokenizer!
Len of tokenizer before adding tokens:50257
Added special tokens to tokenizer!
Len of tokenizer after adding tokens:50257
Downloaded tokenizer, model and cfg!


In [60]:
lm_config_class

GPT2Config {
  "_name_or_path": "/home/diego/counterfactuals-generation/gpt2-fine-tuned-sst2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50256,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.14.1",
  "use_cache": true,
  "vocab_size": 50257
}

In [61]:
lm

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [62]:
tokenizer

PreTrainedTokenizer(name_or_path='/home/diego/counterfactuals-generation/gpt2-fine-tuned-sst2', vocab_size=50257, model_max_len=1024, is_fast=False, padding_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)})

In [7]:
text = "Hello, I am "
generated = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
lm.eval()

out = lm.generate(generated, max_length=100,
                  temperature=0.9,
                  repetition_penalty=2.0)
tokenizer.decode(out[0], skip_special_tokens=True)

'Hello, I am \xa0a very good friend of yours.\nI have been a fan since the beginning and my first book was published in 1999 by HarperCollins. It is one that has inspired me to write more books about myself than any other author ever did before (and it\'s also an inspiration for many others). The story behind this novel follows two young women who are both on their way back from college when they meet up with someone named "The Man" at his house after he leaves'

<h3>Clarification about the generation</h3>
When generating counterfactuals in a few-shot learning scenario, we DO NOT add special tokens to the tokenizer/model, because the pre-trained model was not pre-trained to recognise such tokens. Differently, when fine-tuning the model, we can force the model to learn this new special tokens to guide (more granurarly) the counterfactual generation.

In [81]:
def load_raw_dataset(loading_path):
    train = pd.read_csv(loading_path + "training_set", sep='\t')
    val = pd.read_csv(loading_path + "val_set", sep='\t')
    test = pd.read_csv(loading_path + "test_set", sep='\t')
    return train, val, test

def augment_dataset(df_dataset):
    booked_ids = df_dataset["paired_id"].values
    examples = df_dataset["example"].values
    labels_ex = df_dataset["label_ex"].values
    counters = df_dataset["counterfactual"].values
    labels_counters = df_dataset["label_counter"].values

    ids = generate_custom_ids(booked_ids)
    d = {"paired_id": ids,
         "example": counters,
         "label_ex": labels_counters,
         "counterfactual": examples,
         "label_counter": labels_ex}
    new_df = pd.DataFrame(data=d)

    # append the new df
    df_final = pd.concat([df_dataset, new_df], ignore_index=True)

    return df_final

def generate_custom_ids(idxs):
    max_id = max(idxs)
    return [i for i in range(max_id + 1, max_id + 1 + len(idxs))]

def wrap_with_prompt(df_row, template):
    final_text = template.replace("<label_ex>", MAP_LABELS[df_row["label_ex"]])
    final_text = final_text.replace("<example_text>", df_row["example"])
    final_text = final_text.replace("<label_counter>", MAP_LABELS[df_row["label_counter"]])

    if SPECIAL_TOKENS is not None and "sep_token" in SPECIAL_TOKENS:
        final_text = final_text.replace("<sep>", SPECIAL_TOKENS["sep_token"])
    if SPECIAL_TOKENS is not None and "bos_token" in SPECIAL_TOKENS:
        final_text = final_text.replace("<bos_token>", SPECIAL_TOKENS["bos_token"])
    return final_text

class SentimentDataset(Dataset):
    def __init__(self, raw_dataframe):
        # get a copy of the dataframe
        self.raw_dataframe = raw_dataframe.copy(deep=True)
        self.guids = []
        self.dataset = {}

    # convert the Dataframe into the InputExample format dataset of openprompt
    def prepare_dataloader(self):
        for index, row in self.raw_dataframe.iterrows():
            self.dataset[row['paired_id']] = InputExample(guid=row['paired_id'],
                                                          text_a=bs4.BeautifulSoup(
                                                              row['wrapped_input'], "lxml").text,
                                                          meta={"label_ex":row['label_ex'],
                                                                "label_counter":row['label_counter'],
                                                                'example':bs4.BeautifulSoup(
                                                                    row['example'], "lxml").text,
                                                                'counterfactual':bs4.BeautifulSoup(
                                                                    row['counterfactual'], "lxml").text})
            self.guids.append(row['paired_id'])
        print('Dataloader prepared!')

    # the same of __getitem__
    def get_instance_by_id(self, idx):
        return self.dataset[idx]

    # implemented because of inheritance from Dataset
    def __len__(self):
        return len(self.dataset)

    # implemented because of inheritance from Dataset
    def __iter__(self):
        return iter(self.dataset)

    def __next__(self):
        return iter(self.dataset)

    # implemented because of inheritance from Dataset
    def __getitem__(self, idx):
        return self.dataframe.__getitem__(idx)

    def get_dataset(self):
        return self.dataset

    def get_raw_dataframe(self):
        return self.raw_dataframe

def set_generator(template, plm, parallelization, cuda_gen):

    prompt = openprompt.PromptForGeneration(
        template = template,
        freeze_plm = True,
        plm = plm,
        plm_eval_mode = True
    )

    if torch.cuda.is_available() and cuda_gen:
        prompt = prompt.cuda()

    if parallelization:
        prompt.parallelize()

    return prompt

class CounterGenerator:
    def __init__(self, dataloader, dataset, generator, tok, cfgs):
        self.dataloader = dataloader
        self.dataset = dataset
        self.generator = generator
        self.tok = tok
        self.gen_cfgs = cfgs

    def perform_generation(self, on_cuda):
        self.generator.eval()
        if torch.cuda.is_available() and on_cuda:
            print(f"Total GPU memory available: {torch.cuda.get_device_properties(0).total_memory}")
            print(f"Allocated GPU memory before generation: {torch.cuda.memory_allocated(0)}")
            print(f"Allocated GPU memory reserved: {torch.cuda.memory_reserved(0)}")

        for (step, inputs) in enumerate(self.dataloader):

            # retrieve the instance involved
            instance_guid = inputs["guid"].numpy()[0]
            instance_to_update = self.dataset.get_instance_by_id(instance_guid)

            # we limit the output length to be reasonably equal to the input
            # context, i.e. the example
            max_length_example = len(self.tok.encode(instance_to_update.text_a))
            max_length_output = int(2 * max_length_example)

            # cfg_gen[0] = no_repeat_ngram_size
            # cfg_gen[1] = num_beam
            # cfg_gen[2] = temperature
            generation_arguments = {
                "max_length": max_length_output,
                "min_length": 5,
                "no_repeat_ngram_size": self.gen_cfgs[0],
                "num_beams": self.gen_cfgs[1],
                "repetition_penalty": self.gen_cfgs[2],
                "temperature": self.gen_cfgs[3],
                "do_sample": False,
                "top_k": 10,
                "top_p": 0,
            }

            try:
                if torch.cuda.is_available() and on_cuda:
                    inputs = inputs.cuda()
                _, generated_counter = self.generator.generate(inputs,
                                                               verbose=False,
                                                               **generation_arguments)

                # insert the generated counterfactual
                instance_to_update.meta["generated_counter"] = generated_counter[0]
                print(generated_counter)

            except Exception as e:
                instance_to_update.meta["generated_counter"] = None
                print(instance_guid)
                print(e)

            if (step % 100) == 0 and (step > 0):
                print(f"{datetime.datetime.now()}, Step:{step}: 100 counterfactuals generated")

    def dataframe_from_dataset(self):
        paired_ids = [idx for idx in self.dataset]
        labels_ex = [self.dataset.get_instance_by_id(idx).meta["label_ex"] for idx in self.dataset]
        examples = [self.dataset.get_instance_by_id(idx).meta["example"] for idx in self.dataset]
        labels_counter = [self.dataset.get_instance_by_id(idx).meta["label_counter"] for idx in self.dataset]
        counterfactuals = [self.dataset.get_instance_by_id(idx).meta["counterfactual"] for idx in self.dataset]
        generated_counters = [self.dataset.get_instance_by_id(idx).meta["generated_counter"] for idx in self.dataset]
        d = {"paired_id":paired_ids,
             "label_ex":labels_ex,
             "example":examples,
             "label_counter":labels_counter,
             "counterfactual":counterfactuals,
             "generated_counter":generated_counters
             }
        return pd.DataFrame(data=d)

    def print_generation(self, path_to_print, args):
        # create a dataframe from dataset
        df_to_print = self.dataframe_from_dataset()

        # print such dataframe
        filename = f"{path_to_print[:-5]}-{args}.gen"
        df_to_print.to_csv(filename, sep='\t', index=False)

<h3>Generate counterfactuals</h3>

In [87]:
template_prompt = '{"placeholder":"text_a"}{"mask"}'
prompt_template = ManualTemplate(text = template_prompt, tokenizer = tokenizer)

all_pars = sorted(GEN_ARGS)
gen_grid = list(itertools.product(*(GEN_ARGS[par] for par in all_pars)))

for fold in FOLDS:

    # create dir to store the results
    res_path = f"{RESULTS_PATH}fold_{fold}"
    if not os.path.exists(res_path):
        # Create a new directory because it does not exist
        os.makedirs(res_path)

    print(f"{datetime.datetime.now()}: Beginning generation for fold {fold}")
    for gen_args in gen_grid:
        print(f"\nGeneration parameters: {gen_args}")
        # load the datasets
        df_trainset, df_valset, df_testset = load_raw_dataset(f"cad_imdb/fold_{fold}/")

        # shuffle valset
        df_valset = df_valset.sample(frac=1, random_state=RANDOM_SEED_SHUFFLE)

        # whether to duplicate the data by inverting example-counter the intances
        print(f"# of instances in the validation set:{len(df_valset)}")
        if AUGMENT_VALSET:
            df_valset = augment_dataset(df_valset)
            print(f"Augmented dataset - # of instances in the validation set:{len(df_valset)}")

        # whether to reduce the valset
        if KEEP_FIRST_N > 0:
            df_valset = df_valset.head(KEEP_FIRST_N)

        # wrap the datasets with the prompt template
        df_valset["wrapped_input"] = df_valset.apply(lambda row: wrap_with_prompt(row, TEMPLATE_PROMPT), axis=1)

        # prepare the data loader
        valset = SentimentDataset(raw_dataframe=df_valset)
        valset.prepare_dataloader()

        val_data_loader = openprompt.PromptDataLoader(
            dataset = list(valset.get_dataset().values()),
            tokenizer = tokenizer,
            template = prompt_template,
            tokenizer_wrapper_class=tokenizer_wrapper
        )

        # set the prompt for generation
        prompt_for_generation = set_generator(prompt_template,
                                              lm,
                                              PARALLELIZATION,
                                              ON_CUDA)

        # generate counterfactuals
        counter_generator = CounterGenerator(val_data_loader,
                                             valset,
                                             prompt_for_generation,
                                             tokenizer,
                                             gen_args
                                             )
        counter_generator.perform_generation(ON_CUDA)

        # print the generated counterfactuals
        print(f"{datetime.datetime.now()}: Printing generation...")
        counter_generator.print_generation(f"{RESULTS_PATH}fold_{fold}/{SETTING_NAME}", gen_args)
        print(f"{datetime.datetime.now()}: Finished to print...")

    print(f"{datetime.datetime.now()}: Generation completed for fold {fold}")

2022-03-10 17:29:47.940444: Beginning generation for fold 0

Generation parameters: (4, 5, 1.0, 0.7)
# of instances in the validation set:390
Augmented dataset - # of instances in the validation set:780
Dataloader prepared!


tokenizing: 1it [00:00, 342.70it/s]


['This is one of the best movies I have ever seen. It is very well written, well paced, and very well acted. The story is very well told, the characters are very well developed, and the action is very well choreographed. I would recommend this movie to anyone who is looking for a good action movie. It is a very well written and well acted movie, and I would recommend it to anyone who has never seen a movie like this before. I would also recommend this movie if you are looking for an action movie that is well paced, well choreographed, and well acted. It is one of my all time favorite movies of all time, and I highly recommend it to everyone who has ever watched this movie. I highly recommend this movie for anyone looking for a great action movie, and for anyone who is a fan of action movies. It is the best action movie of all time. I would highly recommend it for anyone who has ever seen this movie, and would recommend it for everyone who has never watched any other action movie. I']
2

tokenizing: 1it [00:00, 417.59it/s]


KeyboardInterrupt: 

In [27]:
# trainset.get_dataset()
# train_data_loader.raw_dataset
# df_valset.head(2)
for v in valset.get_dataset():
    print()

KeyboardInterrupt: 

In [11]:
df_trainset.head(2)

Unnamed: 0,paired_id,example,label_ex,counterfactual,label_counter
0,4,"Long, boring, blasphemous. Never have I been s...",0,"Long, fascinating, soulful. Never have I been ...",1
1,13,"If you haven't seen this, it's incredible. It ...",1,"If you haven't seen this, it's terrible. It is...",0


In [12]:
print()
# text = valset.get_dataset()[18436].text_a
# generated = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
# # lm.eval()
#
# out = lm.generate(generated, max_length=tokenizer.model_max_length)
# tokenizer.decode(out[0], skip_special_tokens=True)




In [13]:
# print()
# text = "Hello, I am "
# generated = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
# # lm.eval()
#
# out = lm.generate(generated, max_length=tokenizer.model_max_length)
# tokenizer.decode(out[0], skip_special_tokens=True)

In [14]:
out

tensor([[15496,    11,   314,   716,   220,  1849,    64,   845,   922,  1545,
           286, 12431,    13,   198,    40,   423,   587,   257,  4336,  1201,
           262,  3726,   290,   616,   717,  1492,   373,  3199,   287,  7358,
           416, 12686, 49645,   764,   632,   318,   530,   326,   468,  7867,
           502,   284,  3551,   517,  3835,   546,  3589,   621,   597,   584,
          1772,  1683,   750,   878,   357,   392,   340,   338,   635,   281,
         12141,   329,   867,  1854,   737,   383,  1621,  2157,   428,  5337,
          5679,   734,  1862,  1466,   508,   389,  1111,   319,   511,   835,
           736,   422,  4152,   618,   484,  1826,   510,   351,  2130,  3706,
           366,   464,  1869,     1,   379,   465,  2156,   706,   339,  5667]])