<a href="https://colab.research.google.com/github/alexlinapp/proofLLM/blob/main/finetuning_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
from pathlib import Path
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import tiktoken
# Fetch the gpt_download.py helper script and save it under its own
# basename in the current working directory.
import importlib
import urllib.request

url = "".join([
    "https://raw.githubusercontent.com/rasbt/",
    "LLMs-from-scratch/main/ch05/",
    "01_main-chapter-code/gpt_download.py",
])
# The last path component of the URL doubles as the local file name.
filename = url.rsplit("/", 1)[-1]
file_name, _ = urllib.request.urlretrieve(url, filename)


# Project-local helpers are pulled in with wildcard imports. Names used later in
# this notebook without a visible definition (e.g. SpamDataset, GPTModel,
# calc_accuracy_loader) presumably come from one of these two modules --
# TODO confirm which module provides each.
from gpt_class_finetune import *
import importlib
import previous_chapters
# Re-execute previous_chapters so edits made to the file after the kernel first
# imported it are picked up before the wildcard import below.
importlib.reload(previous_chapters)
from previous_chapters import *
# Locations of the SMS Spam Collection archive and of its extracted TSV file.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path, "SMSSpamCollection.tsv")
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

# The TSV has no header row: column 0 is the label, column 1 the message text.
df = pd.read_csv(
    data_file_path,
    sep="\t",
    header=None,
    names=["Label", "Text"],
)
balanced_df = create_balanced_dataset(df)
# Encode the string labels as integer class ids.
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

# Split fractions passed positionally: 0.7 and 0.1 (presumably train and
# validation, with the remainder used for test -- TODO confirm in random_split).
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

# Persist each split so the dataset class can load it from disk.
train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)


sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [48]:
# A single "gpt2" tiktoken encoding is shared by all three datasets.
tokenizer = tiktoken.get_encoding("gpt2")

# One dataset per CSV split written by the data-preparation cell.
# NOTE(review): every split passes max_length=None, so each dataset presumably
# derives its own length from its contents -- confirm in SpamDataset.
train_dataset = SpamDataset("train.csv", tokenizer, max_length=None)
val_dataset = SpamDataset("validation.csv", tokenizer, max_length=None)
test_dataset = SpamDataset("test.csv", tokenizer, max_length=None)

# Use CUDA when PyTorch reports it as available, otherwise fall back to CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

num_workers = 0
batch_size = 8
torch.manual_seed(123)

# Training batches are shuffled and the trailing partial batch is dropped.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=True,
    drop_last=True,
)
# Evaluation loaders keep file order and keep every example.
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=False,
    drop_last=False,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=False,
    drop_last=False,
)
# Debug aid (disabled): print the shape of every training batch.
# print("Train Loader: ")
# for input, target in train_loader:
#   print(input.shape, target.shape)

In [49]:
# Key into `model_configs` below; selects which GPT-2 size is built and loaded.
CHOOSE_MODEL = "gpt2-small (124M)"
# NOTE(review): INPUT_PROMPT is not referenced by any other visible cell.
INPUT_PROMPT = "Every effort moves"

# Settings shared by every model size; the size-specific entries are merged in
# by the `update` call further down.
BASE_CONFIG = {
    "vocab_size" : 50257,
    "context_length" : 1024,
    "drop_rate" : 0.0,
    "qkv_bias" : True
}

# Per-size architecture settings, keyed by the same strings as CHOOSE_MODEL.
model_configs = {
  "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
  "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
  "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
  "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Mutates BASE_CONFIG in place, adding emb_dim / n_layers / n_heads.
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])
# Take the last space-separated token and strip its parentheses,
# e.g. "gpt2-small (124M)" -> "124M".
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
# The recorded output shows the checkpoint files living under gpt2/<model_size>/.
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
# Trailing semicolon suppresses the notebook's echo of the value returned by eval().
model.eval();

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


In [51]:
# Seed before creating the new layer so its random initialisation is repeatable.
torch.manual_seed(123)

# Swap the model's output layer for a 2-output linear head
# (the labels were mapped to 0 = ham, 1 = spam).
model.out_head = torch.nn.Linear(
    in_features=BASE_CONFIG["emb_dim"],
    out_features=2,
)

# Use CUDA when PyTorch reports it as available, otherwise fall back to CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
model = model.to(device)

# Disabled alternative: freeze everything except the new head.
# for parameters in model.parameters():
#   parameters.requires_grad = False
# for parameters in model.out_head.parameters():
#   parameters.requires_grad = True

In [52]:
torch.manual_seed(123)

# Accuracy on the first 10 batches of each split, evaluated lazily in the
# order train -> validation -> test as the generator is unpacked.
train_acc, val_acc, test_acc = (
    calc_accuracy_loader(loader, model, device, num_batches=10)
    for loader in (train_loader, val_loader, test_loader)
)

print(f"Train Accuracy: {train_acc}")
print(f"Validation Accuracy: {val_acc}")
print(f"Test Accuracy: {test_acc}")

Train Accuracy: 0.4625
Validation Accuracy: 0.45
Test Accuracy: 0.4875


In [53]:
import math
import torch.nn as nn
class LoRALayer(nn.Module):
  """Trainable low-rank update computing ``(x @ A @ B) * alpha``.

  Args:
    in_dim: Size of the last dimension of the input ``x``.
    out_dim: Size of the last dimension of the output.
    rank: Inner dimension shared by the two factors ``A`` and ``B``.
    alpha: Multiplier applied to the low-rank product. It is used as-is;
      no division by ``rank`` is performed.

  Attributes:
    A: Parameter of shape ``(in_dim, rank)``, Kaiming-uniform initialised.
    B: Parameter of shape ``(rank, out_dim)``, initialised to zeros, so a
      freshly constructed layer outputs all zeros.
  """

  def __init__(self, in_dim, out_dim, rank, alpha):
    super().__init__()
    self.A = torch.nn.Parameter(torch.empty(in_dim, rank))
    # Use the in-place initialiser `kaiming_uniform_`; the alias without the
    # trailing underscore is deprecated and emits a warning on every call.
    # a=sqrt(5) is the same initialiser call nn.Linear uses for its weight.
    torch.nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))
    self.B = torch.nn.Parameter(torch.zeros(rank, out_dim))
    self.alpha = alpha

  def forward(self, x):
    """Return the scaled low-rank projection of ``x``."""
    return (x @ self.A @ self.B) * self.alpha

class LinearWithLoRA(nn.Module):
  """Wrap an existing linear layer and add a LoRA branch to its output.

  Args:
    linear: The layer to wrap; its ``in_features`` and ``out_features``
      determine the adapter's dimensions.
    rank: Rank forwarded to the ``LoRALayer``.
    alpha: Scaling factor forwarded to the ``LoRALayer``.
  """

  def __init__(self, linear, rank, alpha):
    super().__init__()
    self.linear = linear
    self.lora = LoRALayer(
      in_dim=linear.in_features,
      out_dim=linear.out_features,
      rank=rank,
      alpha=alpha,
    )

  def forward(self, x):
    """Return the wrapped layer's output plus the adapter's output."""
    base_out = self.linear(x)
    lora_out = self.lora(x)
    return base_out + lora_out


def replace_linear_with_lora(model, rank, alpha):
  """Recursively replace every ``torch.nn.Linear`` in ``model`` with ``LinearWithLoRA``.

  The replacement happens in place on ``model`` and its descendants.
  Modules that are already ``LinearWithLoRA`` instances are left untouched,
  so calling this function again (for example by re-running the notebook
  cell) does not wrap the inner ``linear`` a second time and stack adapters.

  Args:
    model: Root module whose children are searched.
    rank: Rank passed to each new ``LinearWithLoRA``.
    alpha: Scaling factor passed to each new ``LinearWithLoRA``.
  """
  # Snapshot the children so the loop never iterates over a mapping that
  # setattr is modifying.
  for name, module in list(model.named_children()):
    if isinstance(module, LinearWithLoRA):
      # Already adapted: descending would find its inner nn.Linear and wrap it again.
      continue
    if isinstance(module, torch.nn.Linear):
      setattr(model, name, LinearWithLoRA(module, rank, alpha))
    else:
      # Not a linear layer: search its own children.
      replace_linear_with_lora(module, rank, alpha)


In [54]:
# Freeze every existing weight; only the adapters added below will be trainable.
model.requires_grad_(False)
num_parameters = sum(
    p.numel() for p in model.parameters() if p.requires_grad
)
print(f"Number of trainable parameters: {num_parameters}")

# Wrap each linear layer with a LoRA adapter, then recount.
replace_linear_with_lora(model, rank=16, alpha=16)
num_parameters = sum(
    p.numel() for p in model.parameters() if p.requires_grad
)
print(f"Number of trainable parameters: {num_parameters}")

Number of trainable parameters: 0
Number of trainable parameters: 2666528


  torch.nn.init.kaiming_uniform(self.A, a=math.sqrt(5))   # Same initiallization layer used in nn.Linear for Pytorch


In [55]:
torch.manual_seed(123)
# Move the newly created adapter parameters to the target device as well.
model.to(device)

# Accuracy on the first 10 batches of each split, evaluated lazily in the
# order train -> validation -> test as the generator is unpacked.
train_accuracy, val_accuracy, test_accuracy = (
    calc_accuracy_loader(loader, model, device, num_batches=10)
    for loader in (train_loader, val_loader, test_loader)
)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 46.25%
Validation accuracy: 45.00%
Test accuracy: 48.75%


In [56]:
import time

# Wall-clock timing of the whole fine-tuning run, reported in minutes below.
start_time = time.time()
model.to(device)
torch.manual_seed(123)
# NOTE(review): model.parameters() also yields the frozen base weights. They
# never receive gradients, so the optimizer should leave them untouched --
# passing only the requires_grad parameters would make that explicit.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)
num_epochs=5
# eval_freq=50: the recorded log shows a loss line every 50 steps.
# eval_iter=5: presumably the number of batches used per loss evaluation --
# TODO confirm in train_classifier_simple.
train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(model,
    train_loader,
    val_loader,
    optimizer,
    device,
    num_epochs,
    eval_freq=50,
    eval_iter=5
)

endtime = time.time()
# Convert elapsed seconds to minutes for the report.
execution_time_minutes = (endtime - start_time) / 60
print(f"Execution time: {execution_time_minutes:.2f} minutes")



Ep 1 (Step 000000): Train loss 3.820, Val loss 2.495
Ep 1 (Step 000050): Train loss 0.396, Val loss 0.373
Ep 1 (Step 000100): Train loss 0.111, Val loss 0.194
Training accuracy: 97.50% | Validation accuracy: 92.50%
Ep 2 (Step 000150): Train loss 0.135, Val loss 0.082
Ep 2 (Step 000200): Train loss 0.012, Val loss 0.045
Ep 2 (Step 000250): Train loss 0.027, Val loss 0.169
Training accuracy: 100.00% | Validation accuracy: 97.50%
Ep 3 (Step 000300): Train loss 0.129, Val loss 0.087
Ep 3 (Step 000350): Train loss 0.396, Val loss 0.505
Training accuracy: 100.00% | Validation accuracy: 95.00%
Ep 4 (Step 000400): Train loss 0.006, Val loss 0.128
Ep 4 (Step 000450): Train loss 0.013, Val loss 0.120
Ep 4 (Step 000500): Train loss 0.052, Val loss 0.162
Training accuracy: 100.00% | Validation accuracy: 90.00%
Ep 5 (Step 000550): Train loss 0.003, Val loss 0.628
Ep 5 (Step 000600): Train loss 0.000, Val loss 0.204
Training accuracy: 100.00% | Validation accuracy: 90.00%
Execution time: 1.89 minutes

In [None]:
# print(len(params['blocks']), settings)
# text_1 = "Every effort moves you"
# token_ids = generate_text_simple(model, text_to_token_ids(text_1, tokenizer),
#                                  15, BASE_CONFIG["context_length"])
# print(token_ids_to_text(token_ids, tokenizer))