<h1>Prepare CAD IMDB for fine tuning</h1>

This notebook prepares the CAD (counterfactually-augmented) IMDB review data that will later be used to fine-tune the LM (the fine-tuning itself is performed by run_lm_finetuning.py).
We proceed as follows:
- Load the data
- Create a training set and an evaluation set, as was done for zs_gpt2
- Reformat the training and evaluation sets by wrapping each example in the chosen template
- Save the datasets in a file

IMPORTANT NOTE: Try to insert the "<|endoftext|>" token at the end of every single example in the dataset.

<h2>Modules and functions to use later</h2>

In [5]:
import numpy as np
import pandas as pd
import nltk
import random
import time
import datetime
import itertools
import sklearn
from torch.utils.data.dataset import T_co

nltk.download('punkt')

from openprompt.plms import load_plm
from openprompt.prompts import ManualTemplate
from openprompt.prompts.prefix_tuning_template import PrefixTuningTemplate
from openprompt import PromptForGeneration, PromptDataLoader
from openprompt.data_utils import InputExample
from torch.utils.data import Dataset

from sklearn.utils import shuffle
from bs4 import BeautifulSoup

[nltk_data] Downloading package punkt to /home/diego/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
def reformat_sentiment(x):
    """Convert a textual sentiment into a binary label: 1 for 'Positive', 0 otherwise."""
    is_positive = (x == 'Positive')
    return int(is_positive)

def load_dataset(name):
    """Download one paired split of the CAD sentiment data and normalise its columns.

    `name` is the file name appended to the repository's
    `sentiment/combined/paired/` folder (e.g. "train_paired.tsv"). The file is
    read as tab-separated values, the columns `Sentiment`, `Text` and
    `batch_id` are renamed to `sentiment`, `text` and `paired_id`, and the
    `sentiment` column is converted to 0/1 through `reformat_sentiment`.

    Returns the resulting pandas DataFrame.
    """
    base_url = 'https://raw.githubusercontent.com/acmi-lab/counterfactually-augmented-data/master/sentiment/combined/paired/'
    column_names = {"Sentiment": "sentiment", "Text": "text", "batch_id": "paired_id"}

    frame = pd.read_csv(base_url + name, sep='\t').rename(columns=column_names)
    # binarise the textual sentiment labels
    frame['sentiment'] = frame['sentiment'].apply(reformat_sentiment)

    return frame

def prepare_dataset(dataframe_, reduce_dataset_, n_to_keep_, seed=1):
    """Build a fully prepared SentimentDataset from a loaded dataframe.

    The steps run in a fixed order: pick which member of every pair is the
    counterfactual, build the paired dataframe (optionally sub-sampled to
    `n_to_keep_` rows when `reduce_dataset_` is true), then convert the rows
    into InputExample instances. `seed` is forwarded to the shuffling and
    sampling steps. The number of resulting data points is printed.
    """
    sentiment_data = SentimentDataset(loaded_dataset=dataframe_)
    sentiment_data.randomly_assign_conterfactuals(seed)
    sentiment_data.prepare_dataframe_with_counterfacuals(reduce_dataset_, n_to_keep_, seed)
    sentiment_data.prepare_dataset()

    n_points = len(sentiment_data)
    print("# of data points in the dataset: ", n_points)

    return sentiment_data

class SentimentDataset(Dataset):
    """Paired sentiment data: one (example, counterfactual) couple per `paired_id`.

    The constructor takes a dataframe with the columns `paired_id`, `text`
    and `sentiment`. The expected call order is:
    `randomly_assign_conterfactuals` -> `prepare_dataframe_with_counterfacuals`
    -> `prepare_dataset` -> (optionally) `wrap_dataset_instances`.
    """

    def __init__(self, loaded_dataset):
        # work on a private copy so the caller's dataframe is never modified
        self.dataframe = loaded_dataset.copy()
        # one row per pair; filled by prepare_dataframe_with_counterfacuals()
        self.dataframe_with_counterfactuals = None
        # paired ids in insertion order; filled by prepare_dataset()
        self.guids = []
        # paired_id -> InputExample; filled by prepare_dataset()
        self.dataset = {}
        self.dataset_with_prompts = []

    def randomly_assign_conterfactuals(self, seed=1):
        """Shuffle the rows and flag one row of every pair as the counterfactual.

        After the shuffle, the first row met for a given `paired_id` gets
        `is_counterfactual = 1` and any later row with the same id gets 0.
        """
        self.random_shuffle(seed)
        found_ids = {}
        self.dataframe['is_counterfactual'] = [
            self.__set_example_counter__(paired_id, found_ids)
            for paired_id in self.dataframe['paired_id'].values
        ]

        print("Dataset's Dataframe prepared")

    def __set_example_counter__(self, idx, found_idsx):
        """Return 1 the first time `idx` is seen (and record it in `found_idsx`), 0 afterwards."""
        if idx in found_idsx:
            return 0
        else:
            found_idsx[idx] = 0
            return 1

    # prepare a dataset with input-output instances
    def prepare_dataframe_with_counterfacuals(self,
                                              reduce_dataset_,
                                              n_to_keep_,
                                              seed):
        """Build `dataframe_with_counterfactuals`, one row per `paired_id`.

        Columns: `paired_id`, `example`, `label_ex`, `counterfactual`,
        `label_counter`. Texts are stripped of HTML tags. When
        `reduce_dataset_` is true, `n_to_keep_` rows are sampled with
        `random_state=seed`. The result is sorted by `paired_id`.

        Raises:
            ValueError: if a pair does not contain exactly one example and one
                counterfactual (the columns could otherwise be misaligned).
        """
        example_column = []
        counter_column = []
        paired_id_column = []
        label_ex = []
        label_counter = []

        # Grouping by the column name (not a one-element list) keeps the group
        # keys scalar and avoids the deprecated scalar get_group() lookup.
        for group_id, group in self.dataframe.groupby(by="paired_id"):  # group_id == paired_id
            n_counter = 0
            n_example = 0
            for is_counter, text, label in zip(group['is_counterfactual'].values,
                                               group['text'].values,
                                               group['sentiment'].values):
                if is_counter:
                    counter_column.append(text)
                    label_counter.append(label)
                    n_counter += 1
                else:
                    example_column.append(text)
                    label_ex.append(label)
                    n_example += 1

            if n_counter != 1 or n_example != 1:
                raise ValueError(
                    f"paired_id {group_id!r} has {n_example} example(s) and "
                    f"{n_counter} counterfactual(s); exactly one of each is required")

            paired_id_column.append(group_id)

        # clean the text from html tags
        example_column = [BeautifulSoup(el, "lxml").text for el in example_column]
        counter_column = [BeautifulSoup(el, "lxml").text for el in counter_column]

        # add the new columns to a new dataframe
        d = {'paired_id': paired_id_column,
             'example': example_column,
             'label_ex': label_ex,
             'counterfactual': counter_column,
             'label_counter': label_counter}
        self.dataframe_with_counterfactuals = pd.DataFrame(data=d)

        if reduce_dataset_:
            self.dataframe_with_counterfactuals = self.dataframe_with_counterfactuals.sample(n=n_to_keep_, random_state=seed)
            self.dataframe_with_counterfactuals.reset_index(drop=True, inplace=True)

        self.dataframe_with_counterfactuals.sort_values(by="paired_id", ascending=True, inplace=True)

    # convert the Dataframe into the InputExample format dataset of openprompt
    def prepare_dataset(self):
        """Fill `self.dataset` (paired_id -> InputExample) and `self.guids` from the paired dataframe."""
        for _, row in self.dataframe_with_counterfactuals.iterrows():
            paired_id = row['paired_id']
            # parse each text once and reuse the result for both the text fields and the meta copy
            example_text = BeautifulSoup(row['example'], "lxml").text
            counter_text = BeautifulSoup(row['counterfactual'], "lxml").text
            self.dataset[paired_id] = InputExample(guid=paired_id,
                                                   text_a=example_text,
                                                   text_b=counter_text,
                                                   meta={"label_a": row['label_ex'],
                                                         "label_b": row['label_counter'],
                                                         'example': example_text,
                                                         'counterfactual': counter_text})
            self.guids.append(paired_id)

    def wrap_dataset_instances(self, template_, label_template):
        """Rewrite `text_a` of every instance by filling the prompt template.

        `template_` is a string that may contain the placeholders `[text_a]`,
        `[label_a]` and `[label_b]`; `label_template` maps a label value to the
        text inserted for it. The instances are modified in place, so calling
        this twice wraps the already wrapped text again.
        """
        for instance in self.dataset.values():
            wrapped = template_.replace(
                '[label_a]', label_template[instance.meta['label_a']])
            wrapped = wrapped.replace(
                '[label_b]', label_template[instance.meta['label_b']])
            # The review text goes in last, so a literal "[label_a]"/"[label_b]"
            # occurring inside a review is left untouched.
            instance.text_a = wrapped.replace('[text_a]', instance.text_a)

        print('Dataset examples wrapped with prompt template')

    def sort_by_paired_id(self):
        """Sort the raw dataframe by `paired_id`, in place."""
        self.dataframe.sort_values('paired_id', inplace=True)

    # NOTE(review): `dataframe_with_prompts` is never assigned in this class
    # (only the list `dataset_with_prompts` exists), so this call raises
    # AttributeError unless the attribute is set from outside. Intended target
    # unclear - confirm before relying on it.
    def sort_prompted_by_paired_id(self):
        self.dataframe_with_prompts.sort_values('paired_id', inplace=True)

    def random_shuffle(self, seed):
        """Shuffle the rows of the raw dataframe reproducibly.

        sklearn's `shuffle` draws from NumPy's generator, so the seed has to be
        passed through `random_state`; seeding Python's `random` module alone
        does not make the order repeatable.
        """
        random.seed(seed)  # kept for any code relying on the stdlib generator being seeded
        self.dataframe = shuffle(self.dataframe, random_state=seed)

    # the same of __getitem__
    def get_instance_by_id(self, idx):
        """Return the instance stored under the paired id `idx`."""
        return self.dataset[idx]

    # implemented because of inheritance from Dataset
    def __len__(self):
        return len(self.dataset)

    # implemented because of inheritance from Dataset
    def __iter__(self):
        # iterates over the paired ids (the keys of self.dataset)
        return iter(self.dataset)

    # NOTE(review): returns a fresh iterator over the keys rather than the next
    # element, so instances are not real iterators; left unchanged for compatibility.
    def __next__(self):
        return iter(self.dataset)

    # implemented because of inheritance from Dataset
    # NOTE(review): this indexes the raw dataframe, whereas __len__ and __iter__
    # operate on self.dataset; use get_instance_by_id() to fetch a paired instance.
    def __getitem__(self, idx):
        return self.dataframe.__getitem__(idx)

    def get_dataset(self):
        return self.dataset

    def get_dataframe(self):
        return self.dataframe

    def get_dataframe_with_counterfactuals(self):
        return self.dataframe_with_counterfactuals

    def get_dataset_with_prompts(self):
        return self.dataset_with_prompts

<h2>Execute from here to prepare the dataset</h2>

In [7]:
# Download the paired training and dev splits (training first, then dev)
training_set, dev_set = (load_dataset(split_file)
                         for split_file in ("train_paired.tsv", "dev_paired.tsv"))
print(f"# of samples in the training set:{len(training_set)}")
print(f"# of samples in the dev set:{len(dev_set)}")

# of samples in the training set:3414
# of samples in the dev set:490


In [8]:
random_seed = 5 # this has to be the same across the experiments
reduce_data = False
data_to_keep = 0 # ignore if reduce_data = False

# Both splits go through the same preparation with identical settings.
# NOTE(review): `training_set` / `dev_set` are rebound from DataFrames to
# SentimentDataset objects here, so this cell cannot be re-run without
# re-running the loading cell first.
training_set = prepare_dataset(dataframe_=training_set,
                               reduce_dataset_=reduce_data,
                               n_to_keep_=data_to_keep,
                               seed=random_seed)

dev_set = prepare_dataset(dataframe_=dev_set,
                          reduce_dataset_=reduce_data,
                          n_to_keep_=data_to_keep,
                          seed=random_seed)

Dataset's Dataframe prepared
# of data points in the dataset:  1707
Dataset's Dataframe prepared
# of data points in the dataset:  245


In [9]:
training_set.get_dataframe_with_counterfactuals().head(2)

Unnamed: 0,paired_id,example,label_ex,counterfactual,label_counter
0,4,"Long, boring, blasphemous. Never have I been s...",0,"Long, fascinating, soulful. Never have I been ...",1
1,40,"So good! Rent or buy the original, too! Watch ...",1,Not good! Rent or buy the original! Watch this...,0


# TEMPLATE #1 (apparently the best-performing template in the ZS scenario)

[label_example] review: [example text]

[label_counterfactual] review: [generation token]

Label template 0:"Negative"; 1:"Positive"

In [10]:
def map_label(label):
    """Return the label word used in the prompt: "Positive" for 1, "Negative" for anything else."""
    return "Positive" if label == 1 else "Negative"

In [11]:
is_bert_model = True
include_end_text = False # if to include the end of text token
# end-of-text marker for the selected model family
eos_token = "[EOS]" if is_bert_model else "<|endoftext|>"

trainset = training_set.get_dataframe_with_counterfactuals()

# Build one prompt per pair: the labelled example on the first line and the
# labelled counterfactual on the second one.
wrapped_text = []
for label_ex_value, example, counter, label_counter_value in zip(trainset["label_ex"],
                                                                 trainset["example"],
                                                                 trainset["counterfactual"],
                                                                 trainset["label_counter"]):
    label_example = map_label(label_ex_value)
    label_counter = map_label(label_counter_value)
    wrapped = f"{label_example} review: {example}\n{label_counter} review: {counter}"
    if include_end_text:
        wrapped += eos_token
    wrapped_text.append(wrapped)

In [12]:
# Attach the prompts as a new column. `trainset` is the very dataframe held by
# `training_set` (the getter returns it without copying), so that object gains
# the column as well.
trainset["wrapped_text"] = wrapped_text
print(len(trainset))
trainset.head(2)

1707


Unnamed: 0,paired_id,example,label_ex,counterfactual,label_counter,wrapped_text
0,4,"Long, boring, blasphemous. Never have I been s...",0,"Long, fascinating, soulful. Never have I been ...",1,"Negative review: Long, boring, blasphemous. Ne..."
1,40,"So good! Rent or buy the original, too! Watch ...",1,Not good! Rent or buy the original! Watch this...,0,Positive review: So good! Rent or buy the orig...


In [13]:
# Show the full prompt of the row labelled 0 (single label-based lookup instead of chained indexing)
trainset.loc[0, 'wrapped_text']

'Negative review: Long, boring, blasphemous. Never have I been so glad to see ending credits roll.\nPositive review: Long, fascinating, soulful. Never have I been so sad to see ending credits roll.'

In [105]:
def print_data_for_bert(text_to_print, filename, cls_token='[CLS]', sep_token='[SEP]'):
    """Write every text to `filename`, wrapped in start/end markers.

    Each entry is written as `cls_token + text + sep_token` followed by an
    empty line. An existing file is overwritten.

    Args:
        text_to_print: iterable of strings to write.
        filename: path of the output file.
        cls_token: marker written before each text (default '[CLS]').
        sep_token: marker written after each text (default '[SEP]').
    """
    # Explicit UTF-8 keeps the output independent of the platform's default
    # encoding; review text can contain non-ASCII characters.
    with open(filename, 'w', encoding='utf-8') as file_writer:
        for t in text_to_print:
            file_writer.write(cls_token + t + sep_token + "\n\n")

In [106]:
# Earlier plain-text export, kept for reference (it chose the file name from include_end_text):
# texts = trainset["wrapped_text"].values
# if include_end_text:
#     output_name = 'imdb_counter_train_eos.txt'
# else:
#     output_name = 'imdb_counter_train.txt'
# with open(output_name, 'w') as file_writer:
#     for t in texts:
#         file_writer.write(t + "\n")

# Write the prompts wrapped in [CLS]/[SEP] markers.
# NOTE(review): the file name ends in `_eos` although `include_end_text` is set
# to False in the wrapping cell, so no end-of-text token is appended to the texts.
print_data_for_bert(trainset["wrapped_text"].values, 'imdb_counter_train_eos.txt')

In [123]:
from transformers import BertTokenizer, AutoTokenizer
# Imported under an alias so the notebook's own `load_dataset(name)` helper
# (the CAD IMDB loader) is not rebound to the Hugging Face function.
from datasets import load_dataset as hf_load_dataset
import torch

def tokenize_function(examples):
    """Tokenize `examples` with the notebook-level `tokenizer`.

    Sequences are padded to the tokenizer's maximum length and truncated when
    longer. `tokenizer` must be defined before this function is called.
    """
    return tokenizer(examples, padding="max_length", truncation=True)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

In [178]:
class TextDataset(Dataset):
    """Minimal torch Dataset over an in-memory sequence of examples."""

    def __init__(self, raw_examples):
        """Keep a reference to `raw_examples` and print a short usage reminder."""
        print("Instantiated pytorch Dataset. The dataset is built from a list. If you want to bulid it from a raw text file use the preprocess_from_raw() method")
        self.examples = raw_examples

    def __len__(self):
        """Number of stored examples."""
        n_examples = len(self.examples)
        return n_examples

    def __getitem__(self, item):
        """Return the example at position `item` converted with `torch.tensor`."""
        selected = self.examples[item]
        return torch.tensor(selected)

    def preprocess_from_raw(self, raw_text):
        """Placeholder for building the dataset from raw text; not implemented, returns None."""
        return None

In [141]:
# file_path = "imdb_counter_train_eos.txt"
# with open(file_path, encoding="utf-8") as f:
#     text = f.read()

In [179]:
# preprocessed_dataset = TextDataset(list(trainset["wrapped_text"].values))
# print(len(preprocessed_dataset))

Instantiated pytorch Dataset. The dataset is built from a list. If you want to bulid it from a raw text file use the preprocess_from_raw() method
1707


In [183]:

# NOTE(review): `preprocessed_dataset` is only assigned in a commented-out cell
# above, so on a fresh kernel this cell fails with NameError.
# tokenized_datasets = preprocessed_dataset.map(tokenize_function, batched=True)
tokenized = preprocessed_dataset.map(tokenize_function)
# len(tokenized_datasets)
# NOTE(review): `tokenize_function` is (re)defined after it has already been
# used on the line above; this repeats the definition from the earlier cell.
def tokenize_function(examples):
    return tokenizer(examples, padding="max_length", truncation=True)

In [185]:
tokenized

<torch.utils.data.datapipes.map.callable.MapperMapDataPipe at 0x7f4bc717a410>

In [163]:
# Use an alias for the Hugging Face loader: importing it as `load_dataset`
# would rebind the notebook's own `load_dataset(name)` helper in this cell.
from datasets import load_dataset as hf_load_dataset
dataset = hf_load_dataset('squad', split='train')

Downloading:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /home/diego/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad downloaded and prepared to /home/diego/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


In [167]:
def add_prefix(examplee):
    """Prepend 'My sentence: ' to the `question` field, in place, and return the same mapping."""
    prefixed_question = 'My sentence: ' + examplee['question']
    examplee['question'] = prefixed_question
    return examplee

In [169]:
updated_dataset = dataset.map(add_prefix)
updated_dataset['question'][:5]

Loading cached processed dataset at /home/diego/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-ca509dce492fb026.arrow


['My sentence: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'My sentence: What is in front of the Notre Dame Main Building?',
 'My sentence: The Basilica of the Sacred heart at Notre Dame is beside to which structure?',
 'My sentence: What is the Grotto at Notre Dame?',
 'My sentence: What sits on top of the Main Building at Notre Dame?']

In [166]:
dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})

In [170]:
encoded_input = tokenizer("Hello, I'm a single sentence!")

print(encoded_input)

{'input_ids': [101, 8667, 117, 146, 112, 182, 170, 1423, 5650, 106, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


AttributeError: type object 'Dataset' has no attribute 'from_pandas'

In [16]:
# Import the Hugging Face class under an alias: the bare name `Dataset` is
# torch's `torch.utils.data.Dataset`, the base class of SentimentDataset and
# TextDataset, and should not be rebound by this cell.
from datasets import Dataset as HFDataset

dataset = HFDataset.from_pandas(trainset)
print('Dataset instantiated')

Dataset instantiated


In [191]:
dataset['wrapped_text'][0]

'Positive review: Long, fascinating, soulful. Never have I been so sad to see ending credits roll.\nNegative review: Long, boring, blasphemous. Never have I been so glad to see ending credits roll.'

In [14]:
def tokenize_function(examples):
    """Tokenize the `wrapped_text` field of `examples` with the notebook-level `tokenizer`,
    padding to the maximum length and truncating longer inputs."""
    texts = examples['wrapped_text']
    return tokenizer(texts, padding="max_length", truncation=True)

In [17]:
# Tokenize every row of the dataset. `tokenizer` has to be defined before this
# cell runs; the recorded run below failed with NameError because it was not.
tokenized = dataset.map(tokenize_function)
tokenized

0ex [00:00, ?ex/s]

NameError: name 'tokenizer' is not defined

In [18]:
print(len(tokenized['input_ids']))
first_ex = tokenized['input_ids'][0]

NameError: name 'tokenized' is not defined

In [19]:
tokenizer.decode(first_ex)

NameError: name 'tokenizer' is not defined

In [203]:
tokenized

Dataset({
    features: ['paired_id', 'example', 'label_ex', 'counterfactual', 'label_counter', 'wrapped_text', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 1707
})

In [204]:
# Load "gpt2-medium" with tokenizer
# NOTE(review): GPT2Tokenizer / GPT2LMHeadModel are only imported in a later
# cell of this notebook, so this cell fails on a fresh kernel.
# NOTE(review): the tokenizer repr shown further down lists only bos/eos/unk
# special tokens, so `tok.pad_token_id` is presumably None here - confirm.
# NOTE(review): `tokenized` was built with `tokenizer`, not with `tok`; if that
# is the distilbert tokenizer loaded earlier, its ids do not match GPT-2's
# vocabulary - verify before training.
tok = GPT2Tokenizer.from_pretrained("gpt2-medium")
lm = GPT2LMHeadModel.from_pretrained("gpt2-medium", pad_token_id = tok.pad_token_id, eos_token_id = tok.eos_token_id)

In [210]:
# Default training arguments, writing to the "test_trainer" output directory.
# NOTE(review): TrainingArguments / Trainer are only imported in a later cell.
# NOTE(review): `tokenized` carries `input_ids` and `attention_mask` but no
# `labels` column; presumably a language-modelling data collator (or a labels
# column) is needed for the model to return a loss - TODO confirm.
training_args = TrainingArguments("test_trainer")
trainer = Trainer(model=lm, args=training_args, train_dataset=tokenized)

In [211]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: counterfactual, example, wrapped_text, label_counter, label_ex, __index_level_0__, paired_id.
***** Running training *****
  Num examples = 1707
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 642


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 3.95 GiB total capacity; 2.84 GiB already allocated; 7.44 MiB free; 2.85 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [213]:
len(tokenized['wrapped_text'])

1707

In [215]:
d = {'text': tokenized['wrapped_text']}
df_to_print = pd.DataFrame(data=d)
df_to_print.head(2)

Unnamed: 0,text
0,"Positive review: Long, fascinating, soulful. N..."
1,Positive review: So good! Rent or buy the orig...


In [1]:
# Write the wrapped texts to the file "imdb_wrapped" without index or header.
# `df_to_print` comes from the cell above; the recorded run below raised
# NameError because that cell had not been executed in the session.
# NOTE(review): the texts contain newlines, so the CSV writer will presumably
# quote them as multi-line fields - check this is what the fine-tuning script expects.
df_to_print.to_csv("imdb_wrapped", index=False, header=False)

NameError: name 'df_to_print' is not defined

In [20]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
import pandas as pd
# Load "gpt2-medium" with tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
lm = GPT2LMHeadModel.from_pretrained("gpt2-medium")
# lm = GPT2LMHeadModel.from_pretrained("gpt2-medium", pad_token_id = tok.pad_token_id, eos_token_id = tok.eos_token_id)
tokenizer

PreTrainedTokenizer(name_or_path='gpt2-medium', vocab_size=50257, model_max_len=1024, is_fast=False, padding_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)})

In [21]:
encoded_input = tokenizer("Hello, I'm a single sentence!")
tokenizer.decode(encoded_input["input_ids"])

"Hello, I'm a single sentence!"

In [34]:
encoded_input = tokenizer(["Ciao", "My name is Diego"])
print(encoded_input["input_ids"])
for el in encoded_input["input_ids"]:
    print(tokenizer.decode(el))

[[34, 13481], [3666, 1438, 318, 9500]]
<class 'list'>
Ciao
<class 'list'>
My name is Diego


In [36]:
from datasets import Dataset
