In [1]:
import torch
import pandas as pd
import numpy as np


In [2]:
# Load the instruction/context/response examples; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/train.csv")

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,instruction,context,response,category
0,When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin A...",Virgin Australia commenced services on 31 Augu...,closed_qa
1,Which is a species of fish? Tope or Rope,,Tope,classification
2,Why can camels survive for long without water?,,Camels use the fat in their humps to keep them...,open_qa
3,"Alice's parents have three daughters: Amy, Jes...",,The name of the third daughter is Alice,open_qa
4,When was Tomoaki Komorida born?,Komorida was born in Kumamoto Prefecture on Ju...,"Tomoaki Komorida was born on July 10,1981.",closed_qa


In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(15011, 4)

In [5]:
# Column labels available on each example.
df.columns

Index(['instruction', 'context', 'response', 'category'], dtype='object')

In [6]:
# Count rows whose instruction is missing.
df["instruction"].isna().sum()

np.int64(0)

In [7]:
# Count rows whose response is missing.
df["response"].isna().sum()

np.int64(0)

In [8]:
# Count rows without a context; these are the examples that carry no input text.
df["context"].isna().sum()

np.int64(10545)

In [9]:
# Fixed seed so the shuffle, and therefore which rows land in train/test/val,
# is the same on every run.
SPLIT_SEED = 123

train_portion = int(df.shape[0] * 0.85)  # 85% for training
test_portion = int(df.shape[0] * 0.1)    # 10% for testing
val_portion = df.shape[0] - train_portion - test_portion  # Remaining ~5% for validation

# sort_index() first puts the rows back in index order (assuming the default
# 0..n-1 index that read_csv assigned is still in place), so re-running this
# cell reproduces the same permutation instead of shuffling an already
# shuffled frame.
df = df.sort_index().sample(frac=1, random_state=SPLIT_SEED)

# Positional, non-overlapping slices of the shuffled frame.
train_data = df.iloc[:train_portion]
test_data = df.iloc[train_portion:train_portion + test_portion]
val_data = df.iloc[train_portion + test_portion:]

In [10]:
train_data.shape

(12759, 4)

In [11]:
def format_input(entry):
    """Build the prompt text (instruction plus optional input) for one example.

    Args:
        entry: Mapping with an ``"instruction"`` key and a ``"context"`` key.
            ``context`` may be a string or a missing-value marker
            (``None``, float ``NaN``, or an empty string).

    Returns:
        str: The fixed preamble followed by a ``### Instruction:`` section,
        plus a ``### Input:`` section only when a context is actually present.
    """
    instruction_text = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    context = entry["context"]
    # pandas represents a missing context as float NaN, and NaN is truthy, so a
    # plain truthiness test would emit "### Input:\nnan". NaN is the only float
    # that is not equal to itself, which gives a dependency-free check.
    context_is_nan = isinstance(context, float) and context != context
    if context_is_nan or not context:
        return instruction_text

    return instruction_text + f"\n\n### Input:\n{context}"

In [12]:
import tiktoken
# Tokenizer built from tiktoken's "gpt2" encoding; shared by the dataset below.
tokenizer = tiktoken.get_encoding(encoding_name="gpt2")

In [13]:
from torch.utils.data import Dataset,DataLoader


class InstructionDataset(Dataset):
    """Map-style dataset of pre-tokenized prompt + response sequences.

    Each row of ``data`` is rendered to text with ``format_input`` followed by
    a ``### Response:`` section, and encoded once at construction time.

    Args:
        data: Frame whose rows provide the ``instruction``, ``context`` and
            ``response`` fields.
        tokenizer: Object exposing ``encode(text)``; whatever it returns is
            stored unchanged and handed back by ``__getitem__``.
    """

    def __init__(self, data: pd.DataFrame, tokenizer):
        self.data = data

        # Pre-tokenize every example up front so __getitem__ is a plain lookup.
        self.encoded_texts = []
        for _, row in data.iterrows():
            entry = dict(row)
            instruction_plus_input = format_input(entry)
            # Header is spelled exactly like the "### Instruction:" and
            # "### Input:" headers: two newlines, then "###" with no stray
            # space in between.
            response_text = f"\n\n### Response:\n{entry['response']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(tokenizer.encode(full_text))

    def __getitem__(self, index):
        """Return the encoded sequence stored at ``index``."""
        return self.encoded_texts[index]

    def __len__(self):
        """Return the number of encoded examples."""
        # Measure the list that __getitem__ indexes so the two always agree.
        return len(self.encoded_texts)

In [14]:
# Tokenize the training split once, up front.
da = InstructionDataset(data=train_data, tokenizer=tokenizer)

In [15]:
def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    """Collate token-id lists into padded input and target tensors.

    Every sequence gets one ``pad_token_id`` appended as an end-of-text marker
    and is then right-padded with ``pad_token_id`` to a common length. Inputs
    are the padded sequence without its last token; targets are the same
    sequence shifted left by one. In each target row the first
    ``pad_token_id`` is kept and every later one is replaced by
    ``ignore_index``.

    Args:
        batch: Iterable of token-id lists.
        pad_token_id: Id used both as the end-of-text marker and as padding.
        ignore_index: Value written over the surplus padding in the targets.
        allowed_max_length: If not ``None``, inputs and targets are cut to at
            most this many positions.
        device: Device the stacked tensors are moved to.

    Returns:
        tuple: ``(inputs_tensor, targets_tensor)``, each stacked from one row
        per batch item.
    """
    # Common length: the longest sequence plus its end-of-text token.
    padded_len = 1 + max(len(item) for item in batch)

    input_rows = []
    target_rows = []

    for item in batch:
        # Work on a copy so the caller's list is left untouched; this appends
        # the end-of-text token and the padding in a single step.
        sequence = item.copy()
        sequence += [pad_token_id] * (padded_len - len(item))

        row_inputs = torch.tensor(sequence[:-1])   # everything but the last token
        row_targets = torch.tensor(sequence[1:])   # same tokens, shifted by one

        # Keep only the first pad token as a real target; mask the rest.
        pad_positions = torch.nonzero(row_targets == pad_token_id).squeeze(-1)
        if pad_positions.numel() > 1:
            row_targets[pad_positions[1:]] = ignore_index

        # Optional cap on the sequence length.
        if allowed_max_length is not None:
            row_inputs = row_inputs[:allowed_max_length]
            row_targets = row_targets[:allowed_max_length]

        input_rows.append(row_inputs)
        target_rows.append(row_targets)

    # Stack the per-item rows and move them to the requested device.
    inputs_tensor = torch.stack(input_rows).to(device)
    targets_tensor = torch.stack(target_rows).to(device)

    return inputs_tensor, targets_tensor

In [16]:
# Three toy sequences of different lengths to show padding and target masking.
inputs_1, inputs_2, inputs_3 = [0, 1, 2, 3, 4], [5, 6], [7, 8, 9]
batch = (inputs_1, inputs_2, inputs_3)

inputs, targets = custom_collate_fn(batch)
print(inputs, targets, sep="\n")

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]])


In [17]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [18]:
from functools import partial
# Bind the run-wide settings so the collate function can be called with only
# the batch (presumably for use as a DataLoader collate_fn; confirm at the call
# site). 1024 caps the sequence length.
customized_collate_fn = partial(
    custom_collate_fn,
    allowed_max_length=1024,
    device=device,
)