In [None]:
!pip install datasets --quiet

In [1]:
import os
import sys
import random
from datetime import datetime
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [13]:
import random
import torch
import numpy as np
from torch.utils.data import Dataset
from datasets import load_dataset

class RaceQuestionAnswerGeneration(Dataset):
    def __init__(self, tokenizer, data_split,separator='<sep>'):
        """
        task:
            - input: context
            - output: question <sep> answer
        args:
            tokenizer: tokenizer
            data_split: train, validation, test
        """
        data = load_dataset("squad", split=data_split)
        self.data = data
        self.tokenizer = tokenizer
        self.separator = separator
        print("SquadQuestionAnswerGeneration Initialized")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]
        question = example["question"]
        context = example["context"]
        answers = example["answers"]["text"]

        # Choose a random answer for simplicity
        answer = random.choice(answers)

        # input & output
        input = context
        output = question + ' ' +self.separator+'  '+ answer
        return {'input': input, 'output': output}

#     def __init__(self, tokenizer, data_split, separator='<sep>'):
#         """
#         task:
#             - input: article (i.e. context)
#             - output: question <sep> answer
#         args:
#             tokenizer: tokenizer
#             data_split: train, validation, test
#         """
#         data = load_dataset("race", "all", split=data_split)
#         self.data = data
#         self.tokenizer = tokenizer
#         self.separator = separator
#         self.label_mapping = {label: i for i, label in enumerate(["A", "B", "C", "D"])}
#         print("RaceQuestionAnswerGeneration Initialized")

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         example = self.data[idx]
#         # example_id = example["example_id"]
#         question = example["question"]
#         context = example["article"]
#         options = example["options"]
#         label_example = example["answer"]
#         answer = options[self.label_mapping[label_example]]

#         # input & output
#         input = context
#         output = question + ' ' + self.separator + ' ' + answer
#         return {'input': input, 'output': output}


# class RaceDistractorGeneration(Dataset):
#     def __init__(self, tokenizer, data_split, shuffle_distractors=False, separator='<sep>'):
#         """
#         task:
#             - input: question <sep> answer <sep> article
#             - output: distractor1 <sep> distractor2 <sep> distractor3
#         args:
#             tokenizer: tokenizer
#             data_split: train, validation, test
#         """
#         data = load_dataset("race", "all", split=data_split)
#         self.data = data
#         self.tokenizer = tokenizer
#         self.separator = separator
#         self.label_mapping = {label: i for i, label in enumerate(["A", "B", "C", "D"])}
#         self.all_labels = [0, 1, 2, 3]
#         self.shuffle_distractors = shuffle_distractors
#         print("RaceQuestionAnswerGeneration Initialized")

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         example = self.data[idx]
#         # example_id = example["example_id"]
#         question = example["question"]
#         context = example["article"]
#         options = example["options"]
#         label_example = example["answer"]
#         answer_i = self.label_mapping[label_example]
#         answer = options[answer_i]
#         distractor_ids = [i for i in self.all_labels if i != answer_i]
#         if self.shuffle_distractors:
#             random.shuffle(distractor_ids)
#         distractors = [options[i] for i in distractor_ids]

#         # input & output
#         input = question + ' ' + self.separator + ' ' + answer + ' ' + self.separator + ' ' + context
#         output = distractors[0] + ' ' + self.separator + ' ' + distractors[1] + ' ' + self.separator + ' ' + distractors[2]
#         return {'input': input, 'output': output}

# class RaceAnsweringModel(Dataset):
#     def __init__(self,
#             data_split,
#         ):
#         """
#         """
#         data = load_dataset("race", "all", split=data_split)
#         self.data = data
#         self.label_mapping = {label: i for i, label in enumerate(["A", "B", "C", "D"])}
#         self.all_labels = [0, 1, 2, 3]
#         print("RaceAnsweringModel Initialized")

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         example = self.data[idx]
#         question = example["question"]
#         context = example["article"]
#         options = example["options"]
#         label_example = example["answer"]
#         answer_i = self.label_mapping[label_example]

#         return {'context': context, 'question': question, 'options': options, 'answer_i': answer_i}

In [14]:
save_dir = "C:/Users/Aayush/Documents/VIT/capstone/model"
# t5_model = "potsawee/t5-large-generation-squad-QuestionAnswer"
t5_model = "t5-small"

model_name = f"{t5_model}-Race-QA-Generation-version0"


lr0 = 5e-5  # Slightly reduce learning rate
batch_size = 8  # Keep batch size as 1
num_workers = 0  # Set number of workers to 0 for simplicity
num_epochs = 1  # Train for 1 epoch for a start
max_length = 512  # Reduce maximum sequence length to conserve memory
valid_step = 5000  # Reduce validation step to save more frequently


In [15]:
t5_tokenizer = AutoTokenizer.from_pretrained(t5_model, model_max_length=max_length)
t5_tokenizer.add_special_tokens({"sep_token": "<sep>"})

1

In [16]:
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("torch_device:", torch_device)


print("lr0:", lr0)
print("batch_size:", batch_size)
print("num_workers:", num_workers)
print("num_epochs:", num_epochs)
print("valid_step:", valid_step)
print("max_length:", max_length)

torch_device: cpu
lr0: 5e-05
batch_size: 8
num_workers: 0
num_epochs: 1
valid_step: 5000
max_length: 512


In [17]:
def experiment():
    # ---------------------------- Data ---------------------------- #
    train_data = RaceQuestionAnswerGeneration(
        tokenizer = t5_tokenizer,
        data_split = "train",
        separator = t5_tokenizer.sep_token,
    )
    print("len_train_data:", len(train_data))
    train_loader = torch.utils.data.DataLoader(
                    train_data,
                    batch_size=batch_size,
                    num_workers=num_workers,
                    shuffle=True,
                    collate_fn=collate_fn)

    valid_data = RaceQuestionAnswerGeneration(
        tokenizer = t5_tokenizer,
        data_split = "validation",
        separator = t5_tokenizer.sep_token,
    )
    print("len_valid_data:", len(valid_data))
    valid_loader = torch.utils.data.DataLoader(
                    valid_data,
                    batch_size=batch_size,
                    num_workers=num_workers,
                    shuffle=False,
                    collate_fn=collate_fn)

    # ---------------------------- Model ---------------------------- #
    model = AutoModelForSeq2SeqLM.from_pretrained(t5_model)
    if torch_device == "cuda":
        model.cuda()
    print("#parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad))
    # by default, it's not training!!!
    model.train()

    # ----------------- Optimizer and Loss Function ----------------- #
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr0,betas=(0.9,0.999),eps=1e-08,weight_decay=0)
    optimizer.zero_grad()
    training_step = 0
    stop_counter = 0
    best_val_loss = 99999999
    for epoch_i in range(num_epochs):

        for iter_, sample in enumerate(train_loader):
            if sample is None:
                continue

            input_ids, attention_mask = sample['input_ids'], sample['attention_mask']
            labels = sample['labels']

            if torch_device == 'cuda':
                input_ids = input_ids.cuda()
                attention_mask = attention_mask.cuda()
                labels = labels.cuda()

            loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if training_step % 1 == 0:
                print("{}, step = {}, loss = {:.8f}".format(str(datetime.now()), training_step, loss))
                sys.stdout.flush()

            if training_step % valid_step == 0:
                state = {
                    'training_step': training_step,
                    'model': model.state_dict(),
                }
                savepath = "{}/{}-step{}.pt".format(save_dir, model_name, training_step)
                filename = 'finalized_model.sav'
#                 pickle.dump(, open(filename, 'wb'))
                torch.save(model.state_dict(), savepath)
#                 model.save_pretrained(savepath)
                print("Saved at {}".format(savepath))

                model.eval()
                with torch.no_grad():
                    valid_loss = validation(model, valid_loader)
                    if valid_loss is not None:
                        print("Valid Loss = {:.6f}".format(valid_loss))
                    else:
                        print("Validation loss is None.")
                model.train()

                if valid_loss is not None and valid_loss < best_val_loss:
                    stop_counter = 0
                    best_val_loss = valid_loss
                    print("Model improved".format(stop_counter))
                else:
                    stop_counter += 1
                    print("Model not improved #{}".format(stop_counter))
                    if stop_counter == 3:
                        print("Stop training!")
                        return

            training_step += 1
            
        print("finish epoch: {}".format(epoch_i+1))

    print("Finish Training")

In [18]:
# def validation(model, valid_loader):
#     valid_loss = 0
#     counter = 0
#     for sample in valid_loader:
#         input_ids, attention_mask = sample['input_ids'], sample['attention_mask']
#         labels = sample['labels']

#         if torch_device == 'cuda':
#             input_ids = input_ids.cuda()
#             attention_mask = attention_mask.cuda()
#             labels = labels.cuda()

#         loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
#         valid_loss += loss.item()
#         counter += 1
#         if counter % 50 == 0:
#             print("#", end="")
#             sys.stdout.flush()
#     print()
def validation(model, valid_loader):
    valid_loss = 0
    counter = 0
    for sample in valid_loader:
        input_ids, attention_mask = sample['input_ids'], sample['attention_mask']
        labels = sample['labels']

        if torch_device == 'cuda':
            input_ids = input_ids.cuda()
            attention_mask = attention_mask.cuda()
            labels = labels.cuda()

        try:
            loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
            valid_loss += loss.item()
            counter += 1
        except Exception as e:
            print(f"Exception during validation: {e}")
            print(f"Sample information: {sample}")
            continue

        if counter % 50 == 0:
            print("#", end="")
            sys.stdout.flush()

    print()
    if counter == 0:
        print("No samples processed during validation.")
        return None

    average_loss = valid_loss / counter
    print(f"Validation Loss = {average_loss:.6f}")
    return average_loss


In [19]:
def collate_fn(list_of_items):
    """
    each item is a dictionary:
    """
    list_of_items = [x for x in list_of_items if x is not None]
    batch_size = len(list_of_items)
    if batch_size == 0: return None

    input_sequences, output_sequences = [], []
    for item in list_of_items:
        input_sequences.append(item['input'])
        output_sequences.append(item['output'])

    encoding = t5_tokenizer(
        input_sequences,
        padding="longest",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )
    input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

    target_encoding = t5_tokenizer(
        output_sequences,
        padding="longest",
        max_length=max_length,
        truncation=True,
    )

    # the forward function automatically creates the correct decoder_input_ids
    labels = target_encoding.input_ids
    # replace padding token id's of the labels by -100 so it's ignored by the loss
    labels = torch.tensor(labels)
    labels[labels == t5_tokenizer.pad_token_id] = -100

    return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
    }

In [20]:
if __name__ == "__main__":
    experiment()

SquadQuestionAnswerGeneration Initialized
len_train_data: 87599
SquadQuestionAnswerGeneration Initialized
len_valid_data: 10570
#parameters: 60506624
2024-01-23 09:05:52.915083, step = 0, loss = 5.18086481
Saved at C:/Users/Aayush/Documents/VIT/capstone/model/t5-small-Race-QA-Generation-version0-step0.pt


KeyboardInterrupt: 

In [2]:
from torch.utils.data import Dataset
from transformers import T5Tokenizer
from datasets import load_dataset

class SquadQuestionAnswerGeneration(Dataset):
    def __init__(self, tokenizer, data_split):
        """
        task:
            - input: context
            - output: question <sep> answer
        args:
            tokenizer: tokenizer
            data_split: train, validation, test
        """
        data = load_dataset("squad", split=data_split)
        self.data = data
        self.tokenizer = tokenizer
        print("SquadQuestionAnswerGeneration Initialized")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]
        question = example["question"]
        context = example["context"]
        answers = example["answers"]["text"]

        # Choose a random answer for simplicity
        answer = random.choice(answers)

        # input & output
        input_text = context
        output_text = question + ' <sep> ' + answer

        # Tokenize input and output
        input_ids = self.tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
        output_ids = self.tokenizer.encode(output_text, return_tensors="pt", max_length=128, truncation=True)

        return {'input_ids': input_ids, 'output_ids': output_ids}

# Example usage:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
squad_dataset = SquadQuestionAnswerGeneration(tokenizer, "train")
sample = squad_dataset[0]

print("Input Text:", tokenizer.decode(sample['input_ids'][0], skip_special_tokens=True))
print("Output Text:", tokenizer.decode(sample['output_ids'][0], skip_special_tokens=True))


You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


Downloading readme:   0%|          | 0.00/7.83k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

SquadQuestionAnswerGeneration Initialized
Input Text: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
Output Text: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? sep> Saint Bernadette Soubirous
