### Step 1: Preparing the dataset

In [1]:
import json

file_path = "instruction-data.json"

# Read the whole JSON file and parse it into `data`
with open(file_path, mode="r", encoding="utf-8") as file:
    data = json.loads(file.read())

num_entries = len(data)
print("Number of entries:", num_entries)

Number of entries: 1100


In [2]:
# Inspect one raw record to see which fields an entry carries
example_entry = data[50]
print("Example entry:\n", example_entry)

Example entry:
 {'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}


### Converting instructions into Alpaca format

In [3]:
def format_input(entry):
    """Build the Alpaca-style prompt (instruction plus optional input) for one entry.

    Args:
        entry: Mapping with an "instruction" key and an optional "input" key.

    Returns:
        The prompt text. It ends after the instruction, or after the
        "### Input:" section when the entry has a non-empty input. The
        response section is not included.
    """
    # Adjacent string literals are used instead of one triple-quoted f-string so
    # that source-code indentation cannot leak into the prompt as a
    # whitespace-only line between the preamble and "### Instruction:".
    instruction_text = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    # A missing or empty "input" field omits the "### Input:" section entirely.
    input_value = entry.get("input")
    input_text = f"\n\n### Input:\n{input_value}" if input_value else ""

    return instruction_text + input_text

In [4]:
# Render the complete training text (prompt followed by the response section) for entry 50
sample_entry = data[50]
model_input = format_input(sample_entry)
desired_response = f"\n\n### Response:\n{sample_entry['output']}"

full_text = model_input + desired_response
print(full_text)

Below is an instruction that describes a task. Write a response that appropriately completes the request.
    
### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion

### Response:
The correct spelling is 'Occasion.'


In [5]:
# Same rendering for entry 999, to compare against the entry-50 example
sample_entry = data[999]
model_input = format_input(sample_entry)
desired_response = f"\n\n### Response:\n{sample_entry['output']}"

full_text = model_input + desired_response
print(full_text)

Below is an instruction that describes a task. Write a response that appropriately completes the request.
    
### Instruction:
What is an antonym of 'complicated'?

### Response:
An antonym of 'complicated' is 'simple'.


### Splitting dataset into train/test/val

In [6]:
# Split sizes: int() truncates, and the validation split takes whatever remains
num_entries = len(data)
train_portion = int(num_entries * 0.85)  # 85%
test_portion = int(num_entries * 0.1)  # 10%
val_portion = num_entries - train_portion - test_portion  # 5%

# Contiguous, order-preserving slices: train first, then test, then validation
test_end = train_portion + test_portion
train_data = data[:train_portion]
test_data = data[train_portion:test_end]
val_data = data[test_end:]

print("Training set length:", len(train_data))
print("Test set length:", len(test_data))
print("Val set length:", len(val_data))

Training set length: 935
Test set length: 110
Val set length: 55


### Step 2: Organizing data into training batches

In [7]:
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    """Dataset that encodes every instruction entry once, at construction time.

    Each item is the result of ``tokenizer.encode`` applied to the formatted
    prompt followed by the "### Response:" section holding the entry's output.
    """

    def __init__(self, data, tokenizer):
        self.data = data

        def build_full_text(entry):
            # Prompt (instruction + optional input) followed by the response section
            prompt = format_input(entry)
            response = f"\n\n### Response:\n{entry['output']}"
            return prompt + response

        # Encode up front so __getitem__ is a plain list lookup
        self.encoded_texts = [
            tokenizer.encode(build_full_text(entry)) for entry in data
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.encoded_texts[index]

In [8]:
import tiktoken

# Tokenizer built from tiktoken's "gpt2" encoding
tokenizer = tiktoken.get_encoding("gpt2")

In [9]:
### Step 1: Find the longest sequence in the batch
### Step 2: Pad and prepare inputs
### Step 3: Remove extra padded token added earlier
### Step 4: Convert list of inputs to tensor and transfer to target device

In [10]:
import torch

def custom_collate_draft_1(batch, pad_token_id=50256, device="cpu"):
    """Pad a batch of token-ID lists to a common length and stack them.

    Args:
        batch: Iterable of lists of token IDs (each list is copied, not modified).
        pad_token_id: ID appended once to every sequence and used for padding.
        device: Device the stacked tensor is moved to.

    Returns:
        A tensor with one row per sequence, each as long as the longest
        sequence in the batch.
    """
    # One slot longer than the longest sequence; the extra slot is dropped again below
    padded_length = 1 + max(len(sequence) for sequence in batch)

    rows = []
    for sequence in batch:
        tokens = sequence.copy()
        # Terminate with one <|endoftext|> token, then fill up to padded_length
        tokens.append(pad_token_id)
        tokens.extend([pad_token_id] * (padded_length - len(tokens)))
        # Drop the final slot that the +1 in padded_length introduced
        rows.append(torch.tensor(tokens[:-1]))

    # Stack the rows into a single tensor and move it to the target device
    return torch.stack(rows).to(device)

In [11]:
# Three toy sequences of different lengths to exercise the padding logic
inputs_1, inputs_2, inputs_3 = [0, 1, 2, 3, 4], [5, 6], [7, 8, 9]

batch = (inputs_1, inputs_2, inputs_3)

padded_inputs = custom_collate_draft_1(batch)
print(padded_inputs)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])


### Creating target token ids for training

In [12]:
def custom_collate_draft_2(batch, pad_token_id=50256, device="cpu"):
    """Pad a batch of token-ID lists and return input and shifted target tensors.

    Args:
        batch: Iterable of lists of token IDs (each list is copied, not modified).
        pad_token_id: ID appended once to every sequence and used for padding.
        device: Device both tensors are moved to.

    Returns:
        Tuple ``(inputs, targets)``. Each has one row per sequence, as long as
        the longest sequence in the batch. A target row is the corresponding
        padded sequence shifted one position to the left.
    """
    # One slot longer than the longest sequence so inputs and targets can both be sliced from it
    padded_length = 1 + max(len(sequence) for sequence in batch)

    input_rows = []
    target_rows = []
    for sequence in batch:
        tokens = sequence.copy()
        # Terminate with one <|endoftext|> token, then fill up to padded_length
        tokens.append(pad_token_id)
        tokens.extend([pad_token_id] * (padded_length - len(tokens)))
        # Inputs drop the last slot; targets drop the first (next-token shift)
        input_rows.append(torch.tensor(tokens[:-1]))
        target_rows.append(torch.tensor(tokens[1:]))

    # Stack each list of rows and move both tensors to the target device
    stacked_inputs = torch.stack(input_rows).to(device)
    stacked_targets = torch.stack(target_rows).to(device)
    return stacked_inputs, stacked_targets

In [13]:
### Step 1: Truncate the last token for inputs
### Step 2: Shift +1 to the right for targets

In [14]:
# Same toy batch as before; the second draft also returns the shifted targets
inputs_1, inputs_2, inputs_3 = [0, 1, 2, 3, 4], [5, 6], [7, 8, 9]

batch = (inputs_1, inputs_2, inputs_3)

inputs, targets = custom_collate_draft_2(batch)
print(inputs, targets, sep="\n")

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256, 50256, 50256, 50256],
        [    8,     9, 50256, 50256, 50256]])


In [15]:
# We retain one end-of-text token in the target list so the LLM can learn when to generate an end-of-text token in response to
# instructions, which we use as an indicator that the generated response is complete

In [16]:
def custom_collate_fn(batch, pad_token_id=50256, ignore_index=-100, allowed_max_length=None, device="cpu"):
    """Collate token-ID lists into padded input and loss-masked target tensors.

    Args:
        batch: Iterable of lists of token IDs (each list is copied, not modified).
        pad_token_id: ID appended once to every sequence and used for padding.
        ignore_index: Value written over every target position equal to
            ``pad_token_id`` except the first one in each row.
        allowed_max_length: If not None, inputs and targets are truncated to at
            most this many positions.
        device: Device both tensors are moved to.

    Returns:
        Tuple ``(inputs, targets)``, one row per sequence. A target row is the
        padded sequence shifted one position to the left, with all but the
        first ``pad_token_id`` replaced by ``ignore_index``.
    """
    # Find the longest sequence in the batch and increase the max length by +1, which will add one extra padding token below
    batch_max_length = max(len(item) + 1 for item in batch)

    # Pad and prepare inputs
    inputs_list, targets_list = [], []

    for item in batch:
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to batch_max_length
        padded = (
            new_item + [pad_token_id] * (batch_max_length - len(new_item))
        )
        # Via padded[:-1], we remove the extra padded token that has been added via the +1 setting in batch_max_length
        inputs = torch.tensor(padded[:-1])
        # Shift +1 to the right for targets
        targets = torch.tensor(padded[1:])

        # Replace all but the first padding tokens in the targets by ignore_index.
        # Note: this matches every position equal to pad_token_id, not only trailing padding.
        mask = targets == pad_token_id
        # flatten() keeps the index tensor 1-D even when there is exactly one match
        indices = torch.nonzero(mask).flatten()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # Optionally truncate to maximum sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            # Slice the targets themselves; slicing `inputs` here would discard
            # the shift and the ignore_index masking
            targets = targets[:allowed_max_length]

        inputs_list.append(inputs)
        targets_list.append(targets)

    # Convert list of inputs to tensor and transfer to target device
    inputs_tensor = torch.stack(inputs_list).to(device)
    targets_tensor = torch.stack(targets_list).to(device)

    return inputs_tensor, targets_tensor

In [17]:
# Toy batch again; the final collate function masks all but the first padding target
inputs_1, inputs_2, inputs_3 = [0, 1, 2, 3, 4], [5, 6], [7, 8, 9]

batch = (inputs_1, inputs_2, inputs_3)

inputs, targets = custom_collate_fn(batch)
print(inputs, targets, sep="\n")

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]])


In [None]:
### Why replace the padding targets with -100?
### For demonstration purposes, consider the following simple and self-contained example where each output logit corresponds
### to a potential token

In [18]:
# Two examples, two candidate tokens each
logits_1 = torch.tensor([[-1.0, 1.0], [-0.5, 1.5]])
targets_1 = torch.tensor([0, 1])

# Cross entropy over both examples, using the function's default settings
loss_1 = torch.nn.functional.cross_entropy(input=logits_1, target=targets_1)
print(loss_1)

tensor(1.1269)


In [21]:
# Same two examples plus a third one that repeats the second row
logits_2 = torch.tensor([[-1.0, 1.0], [-0.5, 1.5], [-0.5, 1.5]])
targets_2 = torch.tensor([0, 1, 1])

# All three examples contribute to the loss here
loss_2 = torch.nn.functional.cross_entropy(input=logits_2, target=targets_2)
print(loss_2)

tensor(0.7936)


In [22]:
# Third target set to -100 to see how cross_entropy treats it
targets_3 = torch.tensor([0, 1, -100])

loss_3 = torch.nn.functional.cross_entropy(input=logits_2, target=targets_3)
print(loss_3)

# Compare against the two-example loss computed earlier
losses_match = loss_1 == loss_3
print("loss_1 == loss_3:", losses_match)

tensor(1.1269)
loss_1 == loss_3: tensor(True)


In [23]:
### Based on this result, we can see that the loss on these 3 training examples is identical to the loss we calculated from
### the 2 training examples.
### In other words, the cross entropy loss function ignored the third entry in the targets_3 vector, i.e., the entry whose
### target ID is -100.
### The default setting of the cross entropy loss function is ignore_index=-100. This means that it ignores targets labeled with -100.


### Masking target token IDs

In [24]:
### In addition to masking out padding tokens, it is also common to mask out the target token IDs that correspond to the instruction
### By masking out the target token IDs that correspond to the instruction, the LLM cross entropy loss is only computed for the
### generated response target IDs.
### By masking out the instruction tokens, the model is trained to focus on generating accurate responses rather than
### also memorizing instructions, which can help reduce overfitting.

In [25]:
### Currently, researchers are divided on whether masking the instructions is universally beneficial during instruction finetuning.
### For instance, a recent paper titled "Instruction Tuning With Loss Over Instructions" demonstrated that not masking the
### instructions benefits the LLM performance.