<a href="https://colab.research.google.com/github/alexlinapp/proofLLM/blob/main/finetuning_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import urllib.request
import zipfile
import os
from pathlib import Path

# Where the archive comes from, where it is stored locally, and where its
# contents end up after extraction.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"

# Final location of the dataset: the extraction directory plus a .tsv file name.
data_file_path = Path(extracted_path).joinpath("SMSSpamCollection.tsv")

print("Hello, nothing should download from this.")
def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
  """Download a zip archive, extract it, and rename its data file.

  Does nothing (apart from printing a message) when ``data_file_path``
  already exists, so the call is safe to repeat.

  Args:
    url: Location of the zip archive; anything ``urllib.request.urlopen``
      accepts.
    zip_path: Local path the downloaded archive is written to.
    extracted_path: Directory the archive is extracted into (str or Path).
    data_file_path: Final path of the dataset file (str or Path).

  Raises:
    urllib.error.URLError: If the download fails.
    zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    FileNotFoundError: If the archive has no member named
      ``SMSSpamCollection`` at its top level.
  """
  # Accept plain strings as well as Path objects.
  extracted_path = Path(extracted_path)
  data_file_path = Path(data_file_path)

  if data_file_path.exists():
    print(f"Data file already exists at {data_file_path}. Skipping download and extraction.")
    return

  # downloads the file, streaming fixed-size chunks so the whole archive is
  # never held in memory at once
  with urllib.request.urlopen(url) as response, open(zip_path, "wb") as zip_file:
    for chunk in iter(lambda: response.read(64 * 1024), b""):
      zip_file.write(chunk)

  # unzips the file
  with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extracted_path)

  # The extracted member has no extension; move it to the requested name.
  original_file_path = extracted_path / "SMSSpamCollection"
  # The target may live outside the extraction directory, so make sure its
  # parent exists before moving the file there.
  data_file_path.parent.mkdir(parents=True, exist_ok=True)
  original_file_path.replace(data_file_path)
  print(f"File downloaded and saved as {data_file_path}")

# Fetch the dataset; the function returns early when the .tsv file is already on disk.
download_and_unzip_spam_data(
  url=url,
  zip_path=zip_path,
  extracted_path=extracted_path,
  data_file_path=data_file_path,
)


Hello, nothing should download from this.
Data file already exists at sms_spam_collection/SMSSpamCollection.tsv. Skipping download and extraction.


In [40]:
import tiktoken
# Tokenizer for the "gpt2" encoding; reused below when building the datasets.
tokenizer = tiktoken.get_encoding("gpt2")

# Print the id(s) assigned to the <|endoftext|> special token, which is
# explicitly permitted through allowed_special.
endoftext_ids = tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})
print(endoftext_ids)

[50256]


In [27]:
import pandas as pd

# Load the raw tab-separated file. It is read with header=None, so the two
# column names are supplied here.
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["label", "text"])

# Class counts of the raw data. (A bare `df` used to sit above this line; as a
# mid-cell expression it displayed nothing, so it was removed.)
print(df["label"].value_counts())


def create_balanced_dataset(df, random_state=123):
  """Return a class-balanced frame by undersampling the larger class.

  Args:
    df: DataFrame with a "label" column holding the strings "ham" / "spam".
    random_state: Seed passed to ``DataFrame.sample`` so the selection is
      reproducible.

  Returns:
    A new DataFrame with equally many "ham" and "spam" rows: the ham rows
    first, then the spam rows. Original index values are preserved.
  """
  ham_rows = df[df["label"] == "ham"]
  spam_rows = df[df["label"] == "spam"]

  # Shrink whichever class is larger down to the size of the other. The ham
  # branch also covers equal counts so results match the previous behaviour.
  if len(ham_rows) >= len(spam_rows):
    ham_rows = ham_rows.sample(n=len(spam_rows), random_state=random_state)
  else:
    spam_rows = spam_rows.sample(n=len(ham_rows), random_state=random_state)

  return pd.concat([ham_rows, spam_rows])

def random_split(df, train_frac, validation_frac, random_state=123):
  """Shuffle ``df`` and cut it into train / validation / test partitions.

  Args:
    df: DataFrame to split.
    train_frac: Fraction of rows for the training partition, in [0, 1].
    validation_frac: Fraction of rows for the validation partition, in [0, 1].
    random_state: Seed for the shuffle, making the split reproducible.

  Returns:
    ``(train_df, validation_df, test_df)``. The test partition receives every
    row not assigned to the first two. All three share one freshly reset
    0..n-1 index.

  Raises:
    ValueError: If a fraction lies outside [0, 1] or the two sum to more than 1.
  """
  if not (0 <= train_frac <= 1 and 0 <= validation_frac <= 1):
    raise ValueError("train_frac and validation_frac must each lie in [0, 1]")
  # Small tolerance so float sums such as 0.58 + 0.42 are not rejected.
  if train_frac + validation_frac > 1 + 1e-9:
    raise ValueError("train_frac + validation_frac must not exceed 1")

  # Shuffle all rows, then discard the old index so positions run 0..n-1.
  shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

  train_end = int(train_frac * len(shuffled))
  validation_end = train_end + int(validation_frac * len(shuffled))

  train_df = shuffled.iloc[:train_end]
  validation_df = shuffled.iloc[train_end:validation_end]
  test_df = shuffled.iloc[validation_end:]

  return train_df, validation_df, test_df


# Balance the two classes and show the resulting counts.
balanced_df = create_balanced_dataset(df)
print(balanced_df["label"].value_counts())

# Replace the string labels with integer class ids.
balanced_df = balanced_df.assign(label=balanced_df["label"].map({"ham": 0, "spam": 1}))

# 70% train, 10% validation; random_split hands the remaining rows to the test set.
train_df, validation_df, test_df = random_split(
  balanced_df, train_frac=0.7, validation_frac=0.1
)

# NOTE: the files carry a .tsv suffix but are written with to_csv's default
# separator; SpamDataset reads them back with pd.read_csv defaults.
for split_frame, out_name in (
  (train_df, "train.tsv"),
  (validation_df, "validation.tsv"),
  (test_df, "test.tsv"),
):
  split_frame.to_csv(out_name, index=None)

label
ham     4825
spam     747
Name: count, dtype: int64
label
ham     747
spam    747
Name: count, dtype: int64


In [53]:
import torch
from torch.utils.data import Dataset

class SpamDataset(Dataset):
  """Dataset of tokenized texts, each padded or truncated to one fixed length.

  Reads a CSV file with ``text`` and ``label`` columns, encodes every text
  with ``tokenizer.encode`` and pads each sequence on the right with
  ``pad_token_id`` up to ``max_length``.

  Args:
    csv_file: Path of the CSV file to load with ``pd.read_csv``.
    tokenizer: Object whose ``encode(text)`` returns a list of token ids.
    max_length: Target sequence length. ``None`` means "length of the longest
      encoded text in this file"; otherwise longer sequences are truncated.
    pad_token_id: Token id used for right-padding.
  """

  def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
    self.data = pd.read_csv(csv_file)
    self.encoded_texts = [tokenizer.encode(text) for text in self.data["text"]]

    if max_length is None:
      self.max_length = self.__longest_encoded_length()
    else:
      self.max_length = max_length
      # Cut anything longer than the requested length.
      self.encoded_texts = [tokens[:self.max_length] for tokens in self.encoded_texts]

    # Right-pad every sequence so all of them have exactly max_length ids.
    self.encoded_texts = [
      tokens + [pad_token_id] * (self.max_length - len(tokens))
      for tokens in self.encoded_texts
    ]

  def __getitem__(self, idx):
    """Return ``(token_ids, label)`` for sample ``idx`` as int64 tensors."""
    encoded = self.encoded_texts[idx]
    # Index the label column directly rather than building a full row Series
    # for every sample.
    label = self.data["label"].iloc[idx]
    return (
      torch.tensor(encoded, dtype=torch.long),
      torch.tensor(label, dtype=torch.long),
    )

  def __len__(self):
    """Number of samples, i.e. rows in the loaded CSV."""
    return len(self.data)

  def __longest_encoded_length(self):
    """Length of the longest encoded text, or 0 when there are none."""
    return max((len(tokens) for tokens in self.encoded_texts), default=0)

In [54]:
# The training set determines the sequence length: with max_length=None the
# dataset uses its own longest encoded text.
train_dataset = SpamDataset(csv_file="train.tsv", tokenizer=tokenizer, max_length=None)
print(train_dataset.max_length)

# Validation and test reuse the training length, so all three splits yield
# sequences of the same size.
validation_dataset = SpamDataset(
  csv_file="validation.tsv",
  tokenizer=tokenizer,
  max_length=train_dataset.max_length,
)
print(validation_dataset.max_length)

test_dataset = SpamDataset(
  csv_file="test.tsv",
  tokenizer=tokenizer,
  max_length=train_dataset.max_length,
)

120
120


In [67]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8
torch.manual_seed(123)

# Only the training loader shuffles and drops a trailing incomplete batch.
train_loader = DataLoader(
  dataset=train_dataset,
  batch_size=batch_size,
  shuffle=True,
  num_workers=num_workers,
  drop_last=True,
)
# Validation and test keep their order and every sample.
validation_loader = DataLoader(
  dataset=validation_dataset,
  batch_size=batch_size,
  shuffle=False,
  num_workers=num_workers,
  drop_last=False,
)
test_loader = DataLoader(
  dataset=test_dataset,
  batch_size=batch_size,
  shuffle=False,
  num_workers=num_workers,
  drop_last=False,
)

# Pull one batch from the training loader.
x = next(iter(train_loader))
# NOTE(review): the tuple below is discarded (it is not the cell's last
# expression, so nothing is displayed). It is kept because building fresh
# shuffled iterators presumably advances the RNG seeded above; confirm
# downstream results do not depend on that before removing it.
next(iter(train_loader))[0].shape, next(iter(train_loader))[1].shape

# Number of batches in each loader.
print(f"len(train_loader): {len(train_loader)}")
print(f"len(validation_loader): {len(validation_loader)}")
print(f"len(test_loader): {len(test_loader)}")

len(train_loader): 130
len(validation_loader): 19
len(test_loader): 38


In [72]:
# Which entry of model_configs to use, plus a sample prompt.
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"

# Size-specific settings, keyed by model name.
model_configs = {
  "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
  "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
  "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
  "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Settings shared by every model size, followed by the entries of the chosen
# size (appended last, so key order matches an in-place update()).
BASE_CONFIG = {
  "vocab_size": 50257,
  "context_length": 1024,
  "drop_rate": 0.0,
  "qkv_bias": True,
  **model_configs[CHOOSE_MODEL],
}