<a href="https://colab.research.google.com/github/alexlinapp/proofLLM/blob/main/finetuning_instruction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import os
import urllib

def download_and_load_file(file_path, url):
  """Return the JSON content of `file_path`, downloading it from `url` first if needed.

  Args:
    file_path: Local path of the JSON file. If nothing exists at this path,
      the body of `url` is fetched and written there as UTF-8 text.
    url: Location to download from; only used when `file_path` is missing.

  Returns:
    The object produced by `json.load` on the local file.
  """
  # `import urllib` alone does not guarantee that the `urllib.request`
  # submodule is loaded, so import it explicitly where it is used.
  import urllib.request

  if not os.path.exists(file_path):
    with urllib.request.urlopen(url) as response:
      text_data = response.read().decode('utf-8')
    with open(file_path, 'w', encoding='utf-8') as file:
      file.write(text_data)

  # Always read back from disk so the cached and freshly downloaded paths
  # return data through the same code.
  with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

  return data

# Local cache path and remote source of the instruction dataset.
file_path = "instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

# Load once; the helper downloads the file only when it is not cached locally.
data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))
print(data[0].items())


Number of entries: 1100
dict_items([('instruction', 'Evaluate the following phrase by transforming it into the spelling given.'), ('input', 'freind --> friend'), ('output', 'The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".')])


{'instruction': 'Suggest a more formal synonym for "happy."', 'input': '', 'output': 'A more formal synonym for "happy" is "content."'}


In [2]:
def format_input(entry):
  """Build the prompt text for one instruction record.

  Args:
    entry: Mapping with an 'instruction' key and an optional 'input' key.

  Returns:
    The fixed preamble followed by an "### Instruction:" section and, only
    when 'input' is present and non-empty, an "### Input:" section. The
    response section is not included.
  """
  instruction_text = (
      "Below is an instruction that describes a task. "
      "Write a response that appropriately completes the request."
      f"\n\n### Instruction:\n{entry['instruction']}"
  )

  # Treat a missing 'input' key the same as an empty one instead of raising.
  extra_input = entry.get("input")
  input_text = f"\n\n### Input:\n{extra_input}" if extra_input else ""
  return instruction_text + input_text

In [3]:
# Show one complete training example: prompt followed by the expected response.
sample_entry = data[50]
model_input = format_input(sample_entry)
desired_response = f"\n\n### Response:\n{sample_entry['output']}"
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion

### Response:
The correct spelling is 'Occasion.'


In [13]:
# Contiguous split of `data`: the first 85% for training, the next 10% for
# testing, and whatever remains (about 5%) for validation.
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.1)
val_portion = len(data) - train_portion - test_portion

test_end = train_portion + test_portion   # index where the validation slice begins

train_data = data[:train_portion]
test_data = data[train_portion:test_end]
val_data = data[test_end:]

print("Training set length:", len(train_data))
print("Validation set length:", len(val_data))
print("Test set length:", len(test_data))



Training set length: 935
Validation set length: 55
Test set length: 110
{'instruction': 'Evaluate the following phrase by transforming it into the spelling given.', 'input': 'freind --> friend', 'output': 'The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".'}


In [5]:
import torch
from torch.utils.data import Dataset

class InstructionDataSet(Dataset):
  """Dataset of instruction records that are tokenized once at construction.

  Each record is rendered as `format_input(record)` followed by a
  "### Response:" section holding the record's 'output', and the resulting
  text is encoded with `tokenizer.encode`.
  """

  def __init__(self, data, tokenizer):
    self.data = data
    # Encode every example up front so indexing is a plain list lookup.
    self.encoded_texts = [
        tokenizer.encode(
            format_input(record) + f"\n\n### Response:\n{record['output']}"
        )
        for record in data
    ]

  def __len__(self):
    """Number of encoded examples."""
    return len(self.encoded_texts)

  def __getitem__(self, idx):
    """Return the encoded example stored at position `idx`."""
    return self.encoded_texts[idx]



In [6]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

# Print the token id(s) the special end-of-text marker encodes to; the marker
# must be explicitly allowed as a special token in this call.
end_of_text = "<|endoftext|>"
print(tokenizer.encode(end_of_text, allowed_special={end_of_text}))


[50256]


In [7]:
def custom_collate_draft1(batch, device="cpu", pad_token_id=50256):
  """First collate draft: right-pad every sequence and return inputs only.

  Args:
    batch: Iterable of token-id lists.
    device: Device the stacked tensor is moved to.
    pad_token_id: Token id used for padding.

  Returns:
    A tensor with one row per sequence, each padded with `pad_token_id`
    to the length of the longest sequence in the batch.
  """
  # One slot longer than the longest sequence; the extra slot is cut below.
  padded_width = 1 + max(len(seq) for seq in batch)

  rows = []
  for seq in batch:
    filler = [pad_token_id] * (padded_width - len(seq))
    # Drop the final (always padding) position to obtain the model input.
    rows.append(torch.tensor((seq + filler)[:-1]))

  return torch.stack(rows).to(device)

def custom_collate_draft2(batch, device="cpu", pad_token_id=50256):
  """Second collate draft: padded inputs plus targets shifted by one token.

  Args:
    batch: Iterable of token-id lists.
    device: Device both stacked tensors are moved to.
    pad_token_id: Token id used for padding.

  Returns:
    `(inputs, targets)` tensors of identical shape. Each targets row is the
    padded sequence shifted one position to the left, so it ends with at
    least one `pad_token_id`.
  """
  padded_width = 1 + max(len(seq) for seq in batch)

  input_rows, target_rows = [], []
  for seq in batch:
    full_row = seq + [pad_token_id] * (padded_width - len(seq))
    input_rows.append(torch.tensor(full_row[:-1]))   # everything but the last slot
    target_rows.append(torch.tensor(full_row[1:]))   # same row shifted by one

  inputs_tensor = torch.stack(input_rows).to(device)
  targets_tensor = torch.stack(target_rows).to(device)
  return inputs_tensor, targets_tensor


# ignore_idx defaults to -100 because PyTorch's built-in cross-entropy loss
# skips targets equal to -100 (its default ignore_index).
def custom_collate_fn(batch, ignore_idx=-100, allowed_max_length=None, device="cpu", pad_token_id=50256):
  """Collate token-id lists into padded input/target tensors for training.

  Args:
    batch: Iterable of token-id lists.
    ignore_idx: Value written over surplus padding positions in the targets.
    allowed_max_length: If not None, inputs and targets are truncated to at
      most this many positions.
    device: Device both stacked tensors are moved to.
    pad_token_id: Token id used for padding.

  Returns:
    `(inputs, targets)` tensors of identical shape. Targets are the inputs
    shifted left by one token; in each targets row the first `pad_token_id`
    is kept and every later occurrence is replaced by `ignore_idx`.
  """
  batch_max_length = max(len(item) + 1 for item in batch)
  inputs_lst, targets_lst = [], []

  for item in batch:
    # Building a new list leaves the caller's sequence untouched.
    padded = item + [pad_token_id] * (batch_max_length - len(item))

    inputs = torch.tensor(padded[:-1])
    targets = torch.tensor(padded[1:])

    # Positions of pad tokens in the targets; squeeze(-1) keeps this a 1-D
    # index tensor even when there is exactly one match.
    pad_positions = torch.nonzero(targets == pad_token_id).squeeze(-1)
    if pad_positions.numel() > 1:
      # Keep the first pad token as a target, mask out the rest.
      targets[pad_positions[1:]] = ignore_idx

    # Optional truncation (previously accepted as a parameter but never applied).
    if allowed_max_length is not None:
      inputs = inputs[:allowed_max_length]
      targets = targets[:allowed_max_length]

    inputs_lst.append(inputs)
    targets_lst.append(targets)

  inputs_tensor = torch.stack(inputs_lst).to(device)
  targets_tensor = torch.stack(targets_lst).to(device)
  return inputs_tensor, targets_tensor

In [8]:
# Run the three collate variants on a toy batch of unequal-length sequences.

inputs_1 = [0,1,2,3,4]
inputs_2 = [5,6]
inputs_3 = [7,8,9]
batch = (inputs_1, inputs_2, inputs_3)

# Draft 1 returns only the padded inputs.
print(custom_collate_draft1(batch))

# Draft 2 and the final version both return an (inputs, targets) pair.
for collate in (custom_collate_draft2, custom_collate_fn):
  inputs, targets = collate(batch)
  print(inputs)
  print(targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256, 50256, 50256, 50256],
        [    8,     9, 50256, 50256, 50256]])
tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]])


In [9]:
from functools import partial

from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Pre-bind the device and the length limit so the loaders can call the
# collate function with the batch as its only argument.
customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=1024)

num_workers = 0
batch_size = 8

torch.manual_seed(123)

# Settings common to all three loaders.
shared_loader_args = dict(
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    num_workers=num_workers,
)

# Training batches are shuffled and an incomplete final batch is dropped.
train_dataset = InstructionDataSet(train_data, tokenizer)
train_loader = DataLoader(train_dataset, shuffle=True, drop_last=True,
                          **shared_loader_args)

# Validation and test loaders keep the original order and every example.
val_dataset = InstructionDataSet(val_data, tokenizer)
val_loader = DataLoader(val_dataset, shuffle=False, drop_last=False,
                        **shared_loader_args)

test_dataset = InstructionDataSet(test_data, tokenizer)
test_loader = DataLoader(test_dataset, shuffle=False, drop_last=False,
                         **shared_loader_args)

print("Train loader:")
for inputs, targets in train_loader:
  print(inputs.shape, targets.shape)


Device: cuda
Train loader:
torch.Size([8, 61]) torch.Size([8, 61])
torch.Size([8, 76]) torch.Size([8, 76])
torch.Size([8, 73]) torch.Size([8, 73])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8, 72]) torch.Size([8, 72])
torch.Size([8, 80]) torch.Size([8, 80])
torch.Size([8, 67]) torch.Size([8, 67])
torch.Size([8, 62]) torch.Size([8, 62])
torch.Size([8, 75]) torch.Size([8, 75])
torch.Size([8, 62]) torch.Size([8, 62])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 67]) torch.Size([8, 67])
torch.Size([8, 77]) torch.Size([8, 77])
torch.Size([8, 69]) torch.Size([8, 69])
torch.Size([8, 79]) torch.Size([8, 79])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 66]) torch.Size([8, 66])
torch.Size([8, 83]) torch.Size([8, 83])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 80]) torch.Size([8, 80])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 69]) torch.Size([8, 69])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8

In [53]:
# Download the gpt_download.py helper script next to the notebook, import the
# weight loader from it, and (re)load the local previous_chapters module.
import importlib
import urllib.request

url = (
 "https://raw.githubusercontent.com/rasbt/"
 "LLMs-from-scratch/main/ch05/"
 "01_main-chapter-code/gpt_download.py"
)
# Save under the last path component of the URL.
filename = url.rsplit('/', 1)[-1]
file_name, _ = urllib.request.urlretrieve(url, filename)

from gpt_download import download_and_load_gpt2

# Reload so edits made to previous_chapters.py are picked up before the
# star import refreshes the names in the notebook namespace.
import previous_chapters
importlib.reload(previous_chapters)
from previous_chapters import *


In [64]:
# Size-specific settings for each available GPT-2 variant.
model_configs = {
 "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
 "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
 "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
 "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

CHOOSE_MODEL = "gpt2-medium (355M)"

# Shared settings first, then the entries of the chosen variant.
BASE_CONFIG = {
 "vocab_size": 50257, # Vocabulary size
 "context_length": 1024, # Context length
 "drop_rate": 0.0, # Dropout rate
 "qkv_bias": True, # Query-key-value bias
 **model_configs[CHOOSE_MODEL],
}

# Take the parenthesised size label, e.g. "gpt2-medium (355M)" -> "355M".
model_size = CHOOSE_MODEL.split(" ")[-1].strip("()")
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval();

File already exists and is up-to-date: gpt2/355M/checkpoint
File already exists and is up-to-date: gpt2/355M/encoder.json
File already exists and is up-to-date: gpt2/355M/hparams.json
File already exists and is up-to-date: gpt2/355M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/355M/model.ckpt.index
File already exists and is up-to-date: gpt2/355M/model.ckpt.meta
File already exists and is up-to-date: gpt2/355M/vocab.bpe


In [26]:
# Generate a continuation for the first validation prompt with the model
# as loaded above, i.e. before the fine-tuning run.
torch.manual_seed(123)
input_text = format_input(val_data[0])
print(input_text)

prompt_ids = text_to_token_ids(input_text, tokenizer)
token_ids = generate(
    model=model,
    idx=prompt_ids,
    max_new_tokens=35,
    context_size=BASE_CONFIG['context_length'],
    eos_id=50256,
)
generated_text = token_ids_to_text(token_ids, tokenizer)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'


In [27]:
print(generated_text)

# Skip the first len(input_text) characters so only the text after the
# prompt remains.
prompt_length = len(input_text)
response_text = generated_text[prompt_length:].strip()
print("==========================")
print(response_text)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:

The chef cooks the meal every day.

### Instruction:

Convert the active sentence to passive: 'The chef cooks the
### Response:

The chef cooks the meal every day.

### Instruction:

Convert the active sentence to passive: 'The chef cooks the


In [65]:
# Loss on a few batches (num_batches=5) of each loader before training starts.
model.to(device)
torch.manual_seed(123)

with torch.no_grad():   # evaluation only, no gradients needed
  train_loss, val_loss = [
      calc_loss_loader(loader, model, device=device, num_batches=5)
      for loader in (train_loader, val_loader)
  ]

print(f"Train loss: {train_loss:.4f}")
print(f"Validation loss: {val_loss:.4f}")


Train loss: 3.8259
Validation loss: 3.7619


In [66]:
import time

# perf_counter() is a monotonic clock meant for measuring durations; unlike
# time.time() it is unaffected by system clock adjustments during the run.
start_time = time.perf_counter()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)
num_epochs = 2

# Fine-tune on the instruction data. The first validation prompt is passed
# as `start_context` for the helper's sample generation.
train_losses, val_losses, tokens_seen = train_model_simple(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=num_epochs,
    eval_freq=5,
    eval_iter=5,
    start_context=format_input(val_data[0]),
    tokenizer=tokenizer,
)

end_time = time.perf_counter()
execution_time = (end_time - start_time) / 60   # seconds -> minutes
print(f"Training complete in: {execution_time} minutes")

Ep 1 (Step 000000): Train loss 2.637, Val loss 2.626
Ep 1 (Step 000005): Train loss 1.174, Val loss 1.103
Ep 1 (Step 000010): Train loss 0.872, Val loss 0.944
Ep 1 (Step 000015): Train loss 0.857, Val loss 0.906
Ep 1 (Step 000020): Train loss 0.776, Val loss 0.881
Ep 1 (Step 000025): Train loss 0.754, Val loss 0.859
Ep 1 (Step 000030): Train loss 0.799, Val loss 0.836
Ep 1 (Step 000035): Train loss 0.714, Val loss 0.808
Ep 1 (Step 000040): Train loss 0.672, Val loss 0.806
Ep 1 (Step 000045): Train loss 0.633, Val loss 0.789
Ep 1 (Step 000050): Train loss 0.663, Val loss 0.783
Ep 1 (Step 000055): Train loss 0.760, Val loss 0.763
Ep 1 (Step 000060): Train loss 0.719, Val loss 0.743
Ep 1 (Step 000065): Train loss 0.653, Val loss 0.735
Ep 1 (Step 000070): Train loss 0.533, Val loss 0.729
Ep 1 (Step 000075): Train loss 0.568, Val loss 0.729
Ep 1 (Step 000080): Train loss 0.604, Val loss 0.725
Ep 1 (Step 000085): Train loss 0.509, Val loss 0.710
Ep 1 (Step 000090): Train loss 0.563, Val loss

In [45]:
# Compare the model's answer with the reference for the first three test entries.
torch.manual_seed(123)
for entry in test_data[:3]:
  input_text = format_input(entry)    # format_input takes in a dict
  prompt_ids = text_to_token_ids(input_text, tokenizer).to(device)
  token_ids = generate(
      model=model,
      idx=prompt_ids,
      max_new_tokens=256,
      context_size=BASE_CONFIG['context_length'],
      eos_id=50256,
  )
  generated_text = token_ids_to_text(token_ids, tokenizer)

  # Cut off the prompt, then remove the response header and outer whitespace.
  completion = generated_text[len(input_text):]
  response_text = completion.replace("### Response:", "").strip()

  print(input_text)
  print(f"\nCorrect response:\n>> {entry['output']}")
  print(f"\nModel's PURE RESPONSE:\n>> {generated_text.strip()}")
  print(f"\nModel response:\n>> {response_text}")
  print("-------------------------------------")

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Rewrite the sentence using a simile.

### Input:
The car is very fast.

Correct response:
>> The car is as fast as lightning.

Model's PURE RESPONSE:
>> Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Rewrite the sentence using a simile.

### Input:
The car is very fast.

### Response:
The car is as fast as a bullet.

Model response:
>> The car is as fast as a bullet.
-------------------------------------
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What type of cloud is typically associated with thunderstorms?

Correct response:
>> The type of cloud typically associated with thunderstorms is cumulonimbus.

Model's PURE RESPONSE:
>> Below is an instruction that describes a task. Write a response that appropriate

In [84]:
from tqdm import tqdm

# Generate a response for every test entry and store it on the entry itself
# under "model_response" (this mutates the dicts inside `test_data`).
for entry in tqdm(test_data, total=len(test_data)):
  input_text = format_input(entry)

  token_ids = generate(model=model,
                       idx=text_to_token_ids(input_text, tokenizer).to(device),
                       max_new_tokens=256,
                       context_size=BASE_CONFIG['context_length'],
                       eos_id=50256)
  generated_text = token_ids_to_text(token_ids, tokenizer)

  # Cut off the prompt, then remove the response header and outer whitespace.
  response_text = generated_text[len(input_text):].replace("### Response:", "").strip()
  entry["model_response"] = response_text

# Explicit UTF-8, matching how the dataset file is read and written earlier.
with open("instruction-data-with-response.json", "w", encoding="utf-8") as file:
  json.dump(test_data, file, indent=4)



100%|██████████| 110/110 [01:13<00:00,  1.50it/s]


In [85]:
import re

# Drop spaces and parentheses from the model label to get a file-name stem,
# e.g. "gpt2-medium (355M)" -> "gpt2-medium355M".
model_tag = re.sub(r'[ ()]', '', CHOOSE_MODEL)
file_name = f"{model_tag}-sft.pth"
torch.save(model.state_dict(), file_name)
print(f"Model saved as {file_name}")

# To restore the saved weights later:
# model.load_state_dict(torch.load("gpt2-medium355M-sft.pth"))

Model saved as gpt2-medium355M-sft.pth


In [83]:
# Pick up the latest edits to previous_chapters.py before running the experiment.
import previous_chapters
importlib.reload(previous_chapters)
from previous_chapters import *

# What happens when generation does not stop at EOS: the model seems to
# hallucinate afterwards, most likely because it was not trained well on
# what follows EOS.
#print(text_to_token_ids(format_input(val_data[0]), tokenizer).shape)
sample_prompt = format_input(val_data[1])
s = generate_and_print_sample(model, tokenizer, device=device,
                              start_context=sample_prompt)

# Marker line separating the generated sample from the plain prompt below.
print("HELLEOEAIJILLAKSJDLKSADJ")
print(sample_prompt)
print("========")





### Response:
Dance is a verb.<|endoftext|>The following is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What is the capital of the United Kingdom?

### Response:
The capital of the United Kingdom is London.<|endoftext|>The following is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What is the capital of the Netherlands?

### Response
HELLEOEAIJILLAKSJDLKSADJ
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Classify an input string as either a noun or a verb.

### Input:
Dance
