In [None]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os
import datasets

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, AutoModelForCausalLM

In [None]:
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, PrefixTuningConfig, PromptTuningConfig, TaskType
import torch
from datasets import load_dataset
import os

# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this process.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})
# os.environ["CUDA_VISIBLE_DEVICES"] = "3"
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from tqdm import tqdm
from datasets import load_dataset

# device = "cuda"
# model_name_or_path = "t5-large"
# tokenizer_name_or_path = "t5-large"

# checkpoint_name = "financial_sentiment_analysis_prefix_tuning_v1.pt"
# text_column = "sentence"
# label_column = "text_label"
# max_length = 128
# lr = 1e-2
# num_epochs = 5
# batch_size = 8

In [None]:
# Fetch the "squad" dataset; the later cells index its "train" and "validation" splits.
dataset = load_dataset("squad")

In [None]:
# The tokenizer and the seq2seq model are both loaded from the "t5-base" checkpoint.
tokenizer, model = (
    AutoTokenizer.from_pretrained("t5-base"),
    AutoModelForSeq2SeqLM.from_pretrained("t5-base"),
)
# Disabled alternative kept for reference:
#model = AutoModelForCausalLM.from_pretrained("t5-base")

In [None]:
# Prompt-tuning setup: 20 virtual tokens, seq2seq task type, training (not inference) mode.
peft_config = PromptTuningConfig(
    num_virtual_tokens=20,
    inference_mode=False,
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# NOTE(review): `model` is rebound to its PEFT-wrapped version here, so re-running
# this cell without reloading the base model would wrap it again — confirm intended.
model = get_peft_model(model, peft_config)

# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()

In [None]:
# Build the training tensors from the first 10,000 SQuAD training examples.
# The model input is an (answer, context) pair; the target is the question.
context = dataset["train"]["context"][:10000]
# Each "answers" record stores its answer strings in a list under "text"; take the first.
answer = [i["text"][0] for i in dataset["train"]["answers"][:10000]]
questions = dataset["train"]["question"][:10000]

# Encode answer/context as a sentence pair, using the same settings as the
# validation cell so training and evaluation inputs share one format.
# A fixed max_length with padding and truncation is needed so that
# return_tensors="pt" can stack variable-length text into one tensor.
model_inputs = tokenizer(
    answer,
    context,
    max_length=512,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)
labels = tokenizer(
    questions,
    max_length=30,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)
labels = labels["input_ids"]
# Replace padding ids with -100 so padded target positions are ignored by the loss.
labels[labels == tokenizer.pad_token_id] = -100
model_inputs["labels"] = labels

In [None]:
# Build the evaluation tensors from the first 1,000 SQuAD validation examples.
# The model input is an (answer, context) pair; the target is the question.
econtext = dataset["validation"]["context"][:1000]
# Each "answers" record stores its answer strings in a list under "text"; take the first.
eanswer = [i["text"][0] for i in dataset["validation"]["answers"][:1000]]
equestions = dataset["validation"]["question"][:1000]

emodel_inputs = tokenizer(
    eanswer,
    econtext,
    max_length=512,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)
# Targets must come from the validation questions (`equestions`), not the
# training `questions`, so that every input row is paired with its own label row.
elabels = tokenizer(
    equestions,
    max_length=30,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)
elabels = elabels["input_ids"]
# Mask padding using elabels' own contents; -100 positions are ignored by the loss.
elabels[elabels == tokenizer.pad_token_id] = -100
emodel_inputs["labels"] = elabels

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset over a mapping of column name -> indexable encodings.

    Args:
        encodings: Mapping whose values can be indexed by an integer position
            (tensors or nested lists). It must contain a "labels" entry, which
            determines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, one entry per encoding key.

        Each returned tensor is a copy, so modifying it does not change the
        stored encodings.
        """
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning on
                # every call; detach().clone() is the equivalent, warning-free copy.
                item[key] = row.detach().clone()
            else:
                item[key] = torch.tensor(row)
        return item

    def __len__(self):
        """Return the number of examples, taken from the "labels" entry."""
        return len(self.encodings["labels"])

In [None]:
# Wrap the training and validation encodings in CustomDataset, in that order.
train_dataset, eval_dataset = (
    CustomDataset(encodings) for encodings in (model_inputs, emodel_inputs)
)

In [None]:
# Batch iterators over the datasets built in the previous cell.
# Training batches (size 8) are reshuffled; evaluation batches (size 16) keep dataset order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    pin_memory=True,
    collate_fn=default_data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=16,
    pin_memory=True,
    collate_fn=default_data_collator,
)

In [None]:
# AdamW over the model's parameters with a linearly decaying learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    # The scheduler is stepped once per training batch, so the total is
    # batches-per-epoch times the 3 epochs the training loop runs (`range(3)`).
    # Keep this factor in sync with the loop so the decay spans the whole run.
    num_training_steps=(len(train_dataloader) * 3),
)

In [None]:
# Use the GPU when one is available; otherwise fall back to CPU so the cell still runs.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

for epoch in range(3):
    # ---- training pass: one optimizer and scheduler step per batch ----
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # Detach before accumulating so the running sum does not keep the graph alive.
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # ---- evaluation pass: no gradients, collect loss and greedy decodes ----
    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        # Arg-max over the last logits dimension gives one token id per position.
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    # Average each summed loss over the number of batches in its OWN dataloader;
    # perplexity is the exponential of that mean loss.
    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
