<a href="https://colab.research.google.com/github/alexlinapp/proofLLM/blob/main/finetuning_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import urllib.request
import zipfile
import os
from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

print("Hello, nothing should download from this.")
def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
  if data_file_path.exists():
    print(f"Data file already exists at {data_file_path}. Skipping download and extraction.")
    return

  # downloads the file
  with urllib.request.urlopen(url) as response:
    with open(zip_path, "wb") as zip_file:
      zip_file.write(response.read())

  # unzips the file
  with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extracted_path)

  original_file_path = Path(extracted_path) / "SMSSpamCollection"
  os.rename(original_file_path, data_file_path)
  print(f"File downloaded and saved as {data_file_path}")

download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)


Hello, nothing should download from this.
File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv


In [2]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [3]:
import pandas as pd

df = pd.read_csv(data_file_path, sep="\t", header=None, names=["label", "text"])
df
print(df["label"].value_counts())


def create_balanced_dataset(df):
  num_spam = df[df["label"] == "spam"].shape[0]
  ham_subset = df[df["label"] == "ham"].sample(n=num_spam, random_state=123)
  balanced_df = pd.concat([ham_subset, df[df["label"] == "spam"]])
  return balanced_df

def random_split(df, train_frac, validation_frac):
  df = df.sample(frac=1, random_state=123).reset_index(drop=True)
  train_end = int(train_frac * len(df))
  validation_end = train_end + int(validation_frac * len(df))


  train_df = df[:train_end]
  validation_df = df[train_end:validation_end]
  test_df = df[validation_end:]

  return train_df, validation_df, test_df


balanced_df = create_balanced_dataset(df)
print(balanced_df["label"].value_counts())

balanced_df["label"] = balanced_df["label"].map({"ham": 0, "spam": 1})

train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

train_df.to_csv("train.tsv", index=None)
validation_df.to_csv("validation.tsv", index=None)
test_df.to_csv("test.tsv", index=None)

label
ham     4825
spam     747
Name: count, dtype: int64
label
ham     747
spam    747
Name: count, dtype: int64


In [4]:
import torch
from torch.utils.data import Dataset

class SpamDataset(Dataset):
  def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
    self.data = pd.read_csv(csv_file)
    self.encoded_texts = [tokenizer.encode(text) for text in self.data['text']]
    if max_length is None:
      self.max_length = self.__longest_encoded_length()
    else:
      self.max_length = max_length
      self.encoded_texts = [encoded_text[:self.max_length] for encoded_text in self.encoded_texts]

    self.encoded_texts = [encoded_text + [pad_token_id] * (self.max_length - len(encoded_text)) for encoded_text in self.encoded_texts]

  def __getitem__(self, idx):
    encoded = self.encoded_texts[idx]
    label = self.data.iloc[idx]["label"]
    return (torch.tensor(encoded), torch.tensor(label))
  def __len__(self):
    return len(self.data)

  def __longest_encoded_length(self):
    max_length = 0
    for encoded_text in self.encoded_texts:
      max_length = max(max_length, len(encoded_text))
    return max_length

In [5]:
train_dataset= SpamDataset(csv_file="train.tsv", tokenizer=tokenizer)
print(train_dataset.max_length)
validation_dataset = SpamDataset("validation.tsv", tokenizer, max_length=train_dataset.max_length)
print(validation_dataset.max_length)
test_dataset = SpamDataset("test.tsv", tokenizer, max_length=train_dataset.max_length)

120
120


In [6]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8;
torch.manual_seed(123)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last=True)
validation_loader = DataLoader(dataset=validation_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, drop_last=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, drop_last=False)

x = next(iter(train_loader))
next(iter(train_loader))[0].shape, next(iter(train_loader))[1].shape


print(f"len(train_loader): {len(train_loader)}")
print(f"len(validation_loader): {len(validation_loader)}")
print(f"len(test_loader): {len(test_loader)}")

len(train_loader): 130
len(validation_loader): 19
len(test_loader): 38


In [30]:
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"

BASE_CONFIG = {
        "vocab_size" : 50257,
        "context_length" : 1024,
        "drop_rate" : 0.0,
        "qkv_bias"  : True
}
model_configs = {
 "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
 "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
 "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
 "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

In [31]:
'''

Download boilerplate code from gpt_download.py

'''

import urllib.request
url = (
 "https://raw.githubusercontent.com/rasbt/"
 "LLMs-from-scratch/main/ch05/"
 "01_main-chapter-code/gpt_download.py"
)
filename = url.split('/')[-1]
file_name, _ = urllib.request.urlretrieve(url, filename)
from gpt_download import download_and_load_gpt2
from previous_chapters import GPTModel, load_weights_into_gpt, generate_text_simple, text_to_token_ids, token_ids_to_text

model_size = CHOOSE_MODEL.split(' ')[-1].lstrip('(').rstrip(')')
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval();   # use semicolon to suppress the output display

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


In [None]:
text_1 = "Every effort moves you"
token_ids = generate_text_simple(model=model, idx=text_to_token_ids(text_1, tokenizer),
                                max_new_tokens=15,
                                 context_size=BASE_CONFIG["context_length"])
print(token_ids_to_text(token_ids, tokenizer))

text_2 = (
    "Is the following text 'spam'? Answer with 'yes' or 'no':"
    " 'You are a winner you have been specially"
    " selected to receive $1000 cash or a $2000 award.'"
)

token_ids = generate_text_simple(model=model, idx=text_to_token_ids(text_2, tokenizer),
                                 max_new_tokens=15,
                                 context_size=BASE_CONFIG["context_length"])
print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you forward.

The first step is to understand the importance of your work
Is the following text 'spam'? Answer with 'yes' or 'no': 'You are a winner you have been specially selected to receive $1000 cash or a $2000 award.'

The following text 'spam'? Answer with 'yes' or


In [32]:
for param in model.parameters():
  param.requires_grad = False

torch.manual_seed(123)
num_classes = 2
model.out_head = torch.nn.Linear(
    in_features=BASE_CONFIG["emb_dim"],
    out_features=num_classes
)

# Unfreeze final transformer block
for param in model.trf_blocks[-1].parameters():
  param.requires_grad=True

for param in model.final_norm.parameters():
  param.requires_grad=True

In [None]:
inputs = tokenizer.encode("Do you have time")
inputs = torch.tensor(inputs).unsqueeze(0)
print("Inputs: ", inputs)
print("Input Dimensions: ", inputs.shape)

with torch.no_grad():
  outputs = model(inputs)
  print("Outputs: ", outputs)
  print("Output Dimensions: ", outputs.shape)


last_output_tokens = outputs[:,-1,:]
print("last output tokens:", last_output_tokens)

probas = torch.softmax(last_output_tokens, dim=-1)
print("Probas: ", probas)
label = torch.argmax(probas)

Inputs:  tensor([[5211,  345,  423,  640]])
Input Dimensions:  torch.Size([1, 4])


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [33]:
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
  model.eval()
  correct_predictions, total_predictions = 0, 0
  if num_batches is None:
    num_batches = len(data_loader)
  else:
    num_batches = min(num_batches, len(data_loader))
  for batch_idx, (inputs, targets) in enumerate(data_loader):
    if batch_idx < num_batches:
      inputs, targets = inputs.to(device), targets.to(device)

      with torch.no_grad():
        logits = model(inputs)[:,-1,:]
      predicted_labels = torch.argmax(logits, dim=-1)
      total_predictions += predicted_labels.shape[0]
      correct_predictions += (predicted_labels == targets).sum().item()
    else:
      break
  return correct_predictions / total_predictions

In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

torch.manual_seed(123)
train_accuracy = calc_accuracy_loader(train_loader, model, device, 10)
val_accuracy = calc_accuracy_loader(validation_loader, model, device, 10)

test_accuracy = calc_accuracy_loader(
 test_loader, model, device, num_batches=10
)

# Not fine-tuned. Basically random guessing ~50%
print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")


Training accuracy: 46.25%
Validation accuracy: 45.00%
Test accuracy: 48.75%


In [35]:
def calc_loss_batch(input_batch, target_batch, model, device):
  input_batch = input_batch.to(device)
  target_batch = target_batch.to(device)
  logits = model(input_batch)[:,-1,:]
  loss = torch.nn.functional.cross_entropy(logits, target_batch)
  return loss

def calc_loss_loader(data_loader, model, device, num_batches=None):
  loss = 0
  if len(data_loader) == 0:
    return float("nan")
  elif num_batches is None:
    num_batches = len(data_loader)
  else:
    num_batches = min(num_batches, len(data_loader))
  for i, (input_batch, target_batch) in enumerate(data_loader):
    if i >= num_batches:
      break
    loss += calc_loss_batch(input_batch, target_batch, model, device).item()
  return loss / num_batches

In [39]:

with torch.no_grad():
  train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
  val_loss = calc_loss_loader(validation_loader, model, device, num_batches=5)
  test_loss = calc_loss_loader(test_loader, model, device, num_batches=5)
print(f"Training loss: {train_loss:.3f}")
print(f"Validation loss: {val_loss:.3f}")
print(f"Test loss: {test_loss:.3f}")

# Don't expect losses to be the same. Batches are shuffled each time and only using 5 batches, far less than the dataset

Training loss: 2.194
Validation loss: 2.583
Test loss: 2.322


In [40]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  model.eval()
  with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, eval_iter)
    val_loss = calc_loss_loader(val_loader, model, device, eval_iter)
  model.train()
  return train_loss, val_loss

def train_classifier_simple(model, train_loader, val_loader, optimizer,
                            device, num_epochs, eval_freq, eval_iter):
  train_losses, val_losses, train_acc, val_acc = [],[],[],[]
  examples_seen, global_step = 0, -1
  model.to(device)
  for epoch in range(num_epochs):
    model.train()

    for input_batch, target_batch in train_loader:
      optimizer.zero_grad()
      loss = calc_loss_batch(input_batch, target_batch, model, device)
      loss.backward()
      optimizer.step()
      examples_seen += input_batch.shape[0]
      global_step += 1

      if global_step % eval_freq == 0:
        train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f"Ep {epoch+1} (Step {global_step:06d}): Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

    train_accuracy = calc_accuracy_loader(
    train_loader, model, device, num_batches=eval_iter)
    val_accuracy = calc_accuracy_loader(
    val_loader, model, device, num_batches=eval_iter)
    print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
    print(f"Validation accuracy: {val_accuracy*100:.2f}%")
    train_acc.append(train_accuracy)
    val_acc.append(val_accuracy)
  return train_losses, val_losses, train_acc, val_acc, examples_seen

In [41]:
import time

start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)
num_epochs = 5
train_losses, val_losses, train_accs, val_accs, examples_seen = \
 train_classifier_simple(
 model, train_loader, validation_loader, optimizer, device,
 num_epochs=num_epochs, eval_freq=50,
 eval_iter=5
 )
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 2.153, Val loss 2.392
Ep 1 (Step 000050): Train loss 0.617, Val loss 0.637
Ep 1 (Step 000100): Train loss 0.523, Val loss 0.557
Training accuracy: 70.00% | Validation accuracy: 72.50%
Ep 2 (Step 000150): Train loss 0.561, Val loss 0.489
Ep 2 (Step 000200): Train loss 0.419, Val loss 0.397
Ep 2 (Step 000250): Train loss 0.409, Val loss 0.353
Training accuracy: 82.50% | Validation accuracy: 85.00%
Ep 3 (Step 000300): Train loss 0.333, Val loss 0.320
Ep 3 (Step 000350): Train loss 0.340, Val loss 0.306
Training accuracy: 90.00% | Validation accuracy: 90.00%
Ep 4 (Step 000400): Train loss 0.136, Val loss 0.200
Ep 4 (Step 000450): Train loss 0.153, Val loss 0.132
Ep 4 (Step 000500): Train loss 0.222, Val loss 0.137
Training accuracy: 100.00% | Validation accuracy: 97.50%
Ep 5 (Step 000550): Train loss 0.207, Val loss 0.143
Ep 5 (Step 000600): Train loss 0.083, Val loss 0.074
Training accuracy: 100.00% | Validation accuracy: 97.50%
Training completed in 1.00 mi

In [53]:
train_accuracy = calc_accuracy_loader(train_loader, model, device)
val_accuracy = calc_accuracy_loader(validation_loader, model, device)
test_accuracy = calc_accuracy_loader(test_loader, model, device)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 97.21%
Validation accuracy: 97.32%
Test accuracy: 95.67%


In [None]:
torch.save({"model_state" : model.state_dict(),
            "optimizer_state" : optimizer.state_dict()}, "gpt_classifier.pth")

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model2 = GPTModel(BASE_CONFIG)
model2.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"],
    out_features=2)
checkpoint = torch.load("gpt_classifier.pth", map_location=device)
model2.load_state_dict(checkpoint["model_state"])
optimizer2 = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)
optimizer2.load_state_dict(checkpoint["optimizer_state"])

In [41]:
# Test code
model2.eval()
text1 = "Every effort moves you"
token_ids = generate_text_simple(model=model2, idx=text_to_token_ids(text1, tokenizer),
                                 max_new_tokens=1,
                                 context_size=BASE_CONFIG["context_length"])
print(token_ids)
print(token_ids_to_text(token_ids, tokenizer))
# Shouldn't print out good stuff. Since argmax from generate_text_simple only return 0 or 1

tensor([[6109, 3626, 6100,  345,    1]])
Every effort moves you"


In [27]:
def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256):
  model.eval()

  input_ids = tokenizer.encode(text)
  supported_context_length = model.pos_emb.weight.shape[0]

  max_len = min(max_length, supported_context_length) if max_length is not None else supported_context_length
  input_ids = input_ids[:max_len]

  input_ids += [pad_token_id] * (max_length - len(input_ids))
  input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)

  with torch.no_grad():
    logits = model(input_ids)[:,-1,:]
  predicted_label = torch.argmax(logits, dim=-1).item()

  return "spam" if predicted_label == 1 else "not spam"


In [40]:
text_1 = (
 "You are a winner you have been specially"
 " selected to receive $1000 cash or a $2000 award."
)
print(classify_review(text_1, model2, tokenizer, device,
                      max_length=train_dataset.max_length))

text_2 = ("Hey, just wanted to check if we're still on"
 " for dinner tonight? Let me know!")
print(classify_review(text_2, model2, tokenizer,
                      device, max_length=train_dataset.max_length))

text_3 = ("Every effort moves you")
print(classify_review(text_3, model2, tokenizer, device,
                      max_length=train_dataset.max_length))

spam
not spam
not spam


In [25]:
def calc_accuracy_loader_var(data_loader, model, device, num_batches=None, token_idx=-1):
  model.eval()
  correct_predictions, total_predictions = 0, 0
  if num_batches is None:
    num_batches = len(data_loader)
  else:
    num_batches = min(num_batches, len(data_loader))
  for batch_idx, (inputs, targets) in enumerate(data_loader):
    if batch_idx < num_batches:
      inputs, targets = inputs.to(device), targets.to(device)

      with torch.no_grad():
        logits = model(inputs)[:,token_idx,:]
      predicted_labels = torch.argmax(logits, dim=-1)
      total_predictions += predicted_labels.shape[0]
      correct_predictions += (predicted_labels == targets).sum().item()
    else:
      break
  return correct_predictions / total_predictions


def calc_loss_batch_var(input_batch, target_batch, model, device, token_idx=-1):
  input_batch = input_batch.to(device)
  target_batch = target_batch.to(device)
  logits = model(input_batch)[:,token_idx,:]
  loss = torch.nn.functional.cross_entropy(logits, target_batch)
  return loss

def calc_loss_loader_var(data_loader, model, device, num_batches=None, token_idx=-1):
  loss = 0
  if len(data_loader) == 0:
    return float("nan")
  elif num_batches is None:
    num_batches = len(data_loader)
  else:
    num_batches = min(num_batches, len(data_loader))
  for i, (input_batch, target_batch) in enumerate(data_loader):
    if i >= num_batches:
      break
    loss += calc_loss_batch_var(input_batch, target_batch, model, device, token_idx).item()
  return loss / num_batches

def evaluate_model_var(model, train_loader, val_loader, device, eval_iter, token_idx):
  model.eval()
  with torch.no_grad():
    train_loss = calc_loss_loader_var(train_loader, model, device, eval_iter, token_idx)
    val_loss = calc_loss_loader_var(val_loader, model, device, eval_iter, token_idx)
  model.train()
  return train_loss, val_loss

def train_classifier_simple_var(model, train_loader, val_loader, optimizer,
                            device, num_epochs, eval_freq, eval_iter, token_idx=-1):
  train_losses, val_losses, train_acc, val_acc = [],[],[],[]
  examples_seen, global_step = 0, -1
  model.to(device)
  for epoch in range(num_epochs):
    model.train()

    for input_batch, target_batch in train_loader:
      optimizer.zero_grad()
      loss = calc_loss_batch_var(input_batch, target_batch, model, device, token_idx)
      loss.backward()
      optimizer.step()
      examples_seen += input_batch.shape[0]
      global_step += 1

      if global_step % eval_freq == 0:
        train_loss, val_loss = evaluate_model_var(model, train_loader, val_loader, device, eval_iter, token_idx)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f"Ep {epoch+1} (Step {global_step:06d}): Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

    train_accuracy = calc_accuracy_loader_var(
    train_loader, model, device, num_batches=eval_iter, token_idx=token_idx)
    val_accuracy = calc_accuracy_loader_var(
    val_loader, model, device, num_batches=eval_iter, token_idx=token_idx)
    print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
    print(f"Validation accuracy: {val_accuracy*100:.2f}%")
    train_acc.append(train_accuracy)
    val_acc.append(val_accuracy)
  return train_losses, val_losses, train_acc, val_acc, examples_seen

In [62]:
import time
# Different hyperparameter tuning
num_workers3 = 0
batch_size3 = 8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model with padding the input to maximum context_length
train_dataset3= SpamDataset(csv_file="train.tsv", tokenizer=tokenizer, max_length=None)
#print(train_dataset3.max_length)
validation_dataset3 = SpamDataset("validation.tsv", tokenizer, max_length=train_dataset3.max_length)
#print(validation_dataset3.max_length)
# NEED max_length = train_dataset3.max_length to ENSURE stable training!
test_dataset3 = SpamDataset("test.tsv", tokenizer, max_length=train_dataset3.max_length)

torch.manual_seed(123)
train_loader3 = DataLoader(dataset=train_dataset3, batch_size=batch_size3,
                          shuffle=True, drop_last=True, num_workers=num_workers3)
val_loader3 = DataLoader(dataset=validation_dataset3, batch_size=batch_size3,
                        shuffle=False, drop_last=False, num_workers=num_workers3)

test_loader3 = DataLoader(dataset=test_dataset3, batch_size=batch_size3,
                         shuffle=False, drop_last=False, num_workers=num_workers3)




model3 = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model3, params)
torch.manual_seed(123)
model3.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"],
    out_features=2)

# Freeze parameters
for param in model3.parameters():
  param.requires_grad = False

# Unfreeze out_head and last transformer block and final_norm
for param in model3.out_head.parameters():
  param.requires_grad = True
for param in model3.trf_blocks[-1].parameters():
  param.requires_grad = True
for param in model3.final_norm.parameters():
  param.requires_grad = True


# set token_idx = 0, it performs much worse. In fact it overfits training data after epoch2 and beyond
start_time = time.time()
torch.manual_seed(123)
optimizer3 = torch.optim.AdamW(model3.parameters(), lr=5e-5, weight_decay=0.1)
num_epochs = 5
train_losses3, val_losses3, train_accs3, val_accs3, examples_seen3 = \
 train_classifier_simple_var(
 model3, train_loader3, val_loader3, optimizer3, device,
 num_epochs=num_epochs, eval_freq=50,
 eval_iter=5, token_idx=10
 )


end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 3.228, Val loss 4.034
Ep 1 (Step 000050): Train loss 1.184, Val loss 1.403
Ep 1 (Step 000100): Train loss 0.448, Val loss 0.628
Training accuracy: 77.50% | Validation accuracy: 70.00%
Ep 2 (Step 000150): Train loss 0.446, Val loss 0.535
Ep 2 (Step 000200): Train loss 0.261, Val loss 0.438
Ep 2 (Step 000250): Train loss 0.344, Val loss 0.389
Training accuracy: 85.00% | Validation accuracy: 77.50%
Ep 3 (Step 000300): Train loss 0.238, Val loss 0.350
Ep 3 (Step 000350): Train loss 0.309, Val loss 0.317
Training accuracy: 90.00% | Validation accuracy: 87.50%
Ep 4 (Step 000400): Train loss 0.255, Val loss 0.289
Ep 4 (Step 000450): Train loss 0.237, Val loss 0.274
Ep 4 (Step 000500): Train loss 0.236, Val loss 0.258
Training accuracy: 97.50% | Validation accuracy: 87.50%
Ep 5 (Step 000550): Train loss 0.081, Val loss 0.248
Ep 5 (Step 000600): Train loss 0.214, Val loss 0.248
Training accuracy: 100.00% | Validation accuracy: 90.00%
Training completed in 1.00 min

In [69]:
# Notice depending on which token_idx the model is trained on, we need to stay consistent
# even if trained by taking the last token of the sequence to predict class, using different
# a different index doesn't generalize well. However, the further back we go, the more we can use earlier
# tokens to generalize. This makes sense intuitevely as going "earlier" than what we trained
# we should expect those tokens to still have some info but less
'''
For example say we trained using idx=10 but then evaluate using idx=11 or 12. You can see accuracy drops
significantly, but a few percentage points even by just changing the idx by +=1. This is because the model
has not yet learned how to incorporate later indices due to casual attention

'''

# hyperparameters
token_idx3=100
token_idx_org=-100


train_accuracy3 = calc_accuracy_loader_var(train_loader3, model3, device, token_idx=token_idx3)
val_accuracy3 = calc_accuracy_loader_var(val_loader3, model3, device, token_idx=token_idx3)
test_accuracy3 = calc_accuracy_loader_var(test_loader3, model3, device, token_idx=token_idx3)

print(f"Training accuracy3: {train_accuracy3*100:.2f}%")
print(f"Validation accuracy3: {val_accuracy3*100:.2f}%")
print(f"Test accuracy3: {test_accuracy3*100:.2f}%")


train_accuracy = calc_accuracy_loader_var(train_loader, model, device, token_idx=token_idx_org)
val_accuracy = calc_accuracy_loader_var(validation_loader, model, device, token_idx=token_idx_org)
test_accuracy = calc_accuracy_loader_var(test_loader, model, device, token_idx=token_idx_org)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy3: 50.58%
Validation accuracy3: 46.98%
Test accuracy3: 49.67%
Training accuracy: 64.33%
Validation accuracy: 62.42%
Test accuracy: 62.33%
