In [None]:
import torch
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import spacy
from sklearn.model_selection import KFold
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import numpy as np
import transformers
import matplotlib.pyplot as plt
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast

import warnings
warnings.filterwarnings("ignore")

In [None]:


# Read the raw CSV (pandas' default comma delimiter). Each row is expected to
# hold a 'Paragraphs' column plus 'Question1..3' / 'Answer1..3' columns.
df = pd.read_csv('/kaggle/input/my-dataset/QA_dataset.csv')

# Shuffle the rows with a fixed seed so the subset selected later in the
# notebook is the same on every Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Flatten every source row into three (context, question, answer) records.
# Building plain dicts and constructing the frame once avoids creating a
# temporary DataFrame per row, and also works when the CSV has no rows
# (pd.concat on an empty list raises).
records = [
    {
        'context': row['Paragraphs'],
        'question': row[f'Question{i}'],
        'answer': row[f'Answer{i}'],
    }
    for _, row in df.iterrows()
    for i in range(1, 4)
]

# Long-format frame used by the rest of the notebook
data = pd.DataFrame(records, columns=['context', 'question', 'answer'])


In [None]:

# Cap the working set at the first 1000 rows (all columns kept)
data = data.iloc[:1000]

In [None]:
# Show the working frame as this cell's output
data

In [None]:
Q_LEN = 256   # Max token length of the model input: the tokenized context is padded/truncated to this
T_LEN = 32    # Max token length of the target: the tokenized question (used as labels) is padded/truncated to this
BATCH_SIZE = 16  # Batch size handed to the DataLoaders
LEARNING_RATE = 1e-4  # Learning rate given to the AdamW optimizer
EPOCHS = 15  # Number of training epochs run for each fold

In [None]:
# Fast T5 tokenizer loaded from the pretrained "t5-base" checkpoint
TOKENIZER = T5TokenizerFast.from_pretrained("t5-base")

# CUDA device when torch reports one available, otherwise the CPU.
# NOTE: the training cell builds its own lowercase `device` with the same
# expression and uses that one rather than this constant.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
def prepare_qa_data(tokenizer, dataframe, q_len, t_len):
    """Tokenize every row of ``dataframe`` into model-ready tensors.

    The context is tokenized as the model input and the question is tokenized
    as the target sequence (labels). The ``answer`` column is not used.

    Args:
        tokenizer: Callable tokenizer whose result exposes ``"input_ids"`` and
            ``"attention_mask"``. Its ``pad_token_id`` (0 if absent) marks the
            label positions to ignore.
        dataframe: Frame with ``"context"`` and ``"question"`` columns. Rows
            are read by position, so any index (shuffled, sliced, non-integer)
            is accepted.
        q_len: Length the tokenized context is padded/truncated to.
        t_len: Length the tokenized question is padded/truncated to.

    Returns:
        A list with one dict per row, in row order, holding the ``torch.long``
        tensors ``input_ids``, ``attention_mask``, ``labels`` and
        ``decoder_attention_mask``.
    """
    # Ask the tokenizer for its pad id instead of hard-coding it; fall back to
    # 0 (the previously hard-coded value) when the tokenizer does not expose one.
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0

    # Plain lists give positional iteration; label-based `series[idx]` lookups
    # raise KeyError or pick the wrong row when the index is not 0..n-1.
    contexts = dataframe["context"].tolist()
    questions = dataframe["question"].tolist()

    qa_data = []
    for context_text, question in zip(contexts, questions):
        # `padding="max_length"` already pads to `max_length`; the deprecated
        # `pad_to_max_length` flag is redundant and therefore not passed.
        context_tokenized = tokenizer(context_text, max_length=q_len, padding="max_length",
                                      truncation=True, add_special_tokens=True)
        question_tokenized = tokenizer(question, max_length=t_len, padding="max_length",
                                       truncation=True, add_special_tokens=True)

        # -100 is the label value the training loss skips, so padding
        # positions do not contribute to it.
        labels = torch.tensor(question_tokenized["input_ids"], dtype=torch.long)
        labels[labels == pad_id] = -100

        qa_data.append({
            "input_ids": torch.tensor(context_tokenized["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(context_tokenized["attention_mask"], dtype=torch.long),
            "labels": labels,
            "decoder_attention_mask": torch.tensor(question_tokenized["attention_mask"], dtype=torch.long),
        })

    return qa_data

def create_loaders(tokenizer, data, q_len, t_len, batch_size, train_data, val_data):
    """Build train/validation DataLoaders over disjoint subsets of ``data``.

    All rows of ``data`` are tokenized once with ``prepare_qa_data``; each
    loader then draws, in random order, only the rows that belong to its
    subset.

    Args:
        tokenizer: Tokenizer forwarded to ``prepare_qa_data``.
        data: Full frame; must have a unique index.
        q_len: Input (context) token length forwarded to ``prepare_qa_data``.
        t_len: Target (question) token length forwarded to ``prepare_qa_data``.
        batch_size: Batch size of both loaders.
        train_data: Rows of ``data`` used for training (e.g. ``data.iloc[train_idx]``).
        val_data: Rows of ``data`` used for validation.

    Returns:
        ``(train_loader, val_loader)``.

    Raises:
        ValueError: If a subset contains index labels that are not in ``data``.
    """
    # Local import: SubsetRandomSampler is not part of the notebook's import cell.
    from torch.utils.data import SubsetRandomSampler

    def _positions(subset, name):
        """Translate the subset's index labels into row positions of `data`."""
        positions = data.index.get_indexer(subset.index)
        if (positions < 0).any():
            raise ValueError(f"{name} contains rows that are not present in `data`")
        return positions.tolist()

    train_positions = _positions(train_data, "train_data")
    val_positions = _positions(val_data, "val_data")

    # Tokenize against a fresh 0..n-1 index so element i of `qa_data` is
    # always the i-th row of `data`, matching the positions computed above.
    qa_data = prepare_qa_data(tokenizer, data.reset_index(drop=True), q_len, t_len)

    # RandomSampler(index) only looks at len(index) and samples 0..len-1, which
    # made both loaders read the *first* rows of `data` (validation rows were
    # also training rows). SubsetRandomSampler samples exactly the given positions.
    train_loader = DataLoader(qa_data, batch_size=batch_size,
                              sampler=SubsetRandomSampler(train_positions))
    val_loader = DataLoader(qa_data, batch_size=batch_size,
                            sampler=SubsetRandomSampler(val_positions))

    return train_loader, val_loader

In [None]:
# Run on the GPU when torch can see one; otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# 5-fold splitter; shuffling uses a fixed random_state
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Replace the frame's index with a fresh 0..n-1 range, dropping the old one
data.reset_index(drop=True, inplace=True)

# One (initially empty) list of per-epoch losses for every fold
validation_losses_per_fold = [list() for _ in range(kf.get_n_splits())]
train_losses_per_fold = [list() for _ in range(kf.get_n_splits())]

def _run_epoch(model, loader, device, optimizer=None, desc=""):
    """Run one pass over ``loader`` and return the sample-weighted mean loss.

    When ``optimizer`` is given the model is put in train mode and updated
    after every batch; otherwise it is put in eval mode and gradients are
    disabled.
    """
    is_training = optimizer is not None
    if is_training:
        model.train()
    else:
        model.eval()

    running_loss = 0.0
    total_samples = 0
    with torch.set_grad_enabled(is_training):
        for batch in tqdm(loader, desc=desc):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            decoder_attention_mask = batch["decoder_attention_mask"].to(device)

            if is_training:
                optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                decoder_attention_mask=decoder_attention_mask
            )
            loss = outputs.loss
            if is_training:
                loss.backward()
                optimizer.step()

            # Weight each batch loss by its size so a smaller final batch
            # does not skew the epoch average.
            batch_samples = len(input_ids)
            running_loss += loss.item() * batch_samples
            total_samples += batch_samples

    return running_loss / total_samples


for fold, (train_idx, test_idx) in enumerate(kf.split(data), 1):
    print(f"Fold #{fold}")

    # Rows of this fold's training and held-out splits
    train_dataset = data.iloc[train_idx]
    test_dataset = data.iloc[test_idx]

    # Create DataLoader
    train_loader, test_loader = create_loaders(TOKENIZER, data, Q_LEN, T_LEN, BATCH_SIZE, train_dataset, test_dataset)

    # Drop the previous fold's model and optimizer *before* loading the next
    # model; otherwise both generations sit on the device at the same time.
    # The last fold's model stays bound to `model` after the loop.
    model = optimizer = None
    torch.cuda.empty_cache()

    # Every fold starts from the same pretrained checkpoint
    model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    for epoch in range(EPOCHS):
        # Optimise on the training split and record the epoch's average loss
        train_loss = _run_epoch(model, train_loader, device, optimizer=optimizer, desc="Training batches")
        train_losses_per_fold[fold - 1].append(train_loss)

        # Evaluate on the held-out split without gradient tracking
        val_loss = _run_epoch(model, test_loader, device, desc="Validation batches")
        validation_losses_per_fold[fold - 1].append(val_loss)
        print(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

    print("Training completed for fold.\n")
    
# One training curve and one validation curve per fold, all on shared axes
fig, ax = plt.subplots(figsize=(10, 6))
epoch_axis = range(1, EPOCHS + 1)
fold_curves = zip(train_losses_per_fold, validation_losses_per_fold)
for fold, (train_losses, val_losses) in enumerate(fold_curves, 1):
    ax.plot(epoch_axis, train_losses, label=f"Fold {fold} Train")
    ax.plot(epoch_axis, val_losses, label=f"Fold {fold} Validation")

ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Training vs Validation Loss")
ax.legend()
ax.grid(True)
plt.show()
