In [7]:
pip install pandas

Collecting pandas
  Using cached pandas-2.3.3-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.3.3-cp310-cp310-win_amd64.whl (11.3 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, pandas

   ---------------------------------------- 0/3 [pytz]
   ---------------------------------------- 0/3 [pytz]
   ---------------------------------------- 0/3 [pytz]
   ------------- -------------------------- 1/3 [tzdata]
   ------------- -------------------------- 1/3 [tzdata]
   ------------- -------------------------- 1/3 [tzdata]
   ------------- -------------------------- 1/3 [tzdata]
   ------------- -------------------------- 1/3 [tzdata]
   --------------------

In [1]:
import urllib.request
import zipfile
import os
from pathlib import Path

In [2]:
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

In [3]:
def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return
    
    with urllib.request.urlopen(url) as response:   
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())
    with zipfile.ZipFile(zip_path, "r") as zip_ref:   
        zip_ref.extractall(extracted_path)
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)              
    print(f"File downloaded and saved as {data_file_path}")

In [4]:
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

File downloaded and saved as sms_spam_collection\SMSSpamCollection.tsv


In [5]:
data_file_path

WindowsPath('sms_spam_collection/SMSSpamCollection.tsv')

In [8]:
import pandas as pd
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
df  

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [9]:
print(df["Label"].value_counts())

Label
ham     4825
spam     747
Name: count, dtype: int64


In [10]:
def create_balanced_dataset(df):
    num_spam = df[df["Label"] == "spam"].shape[0]    
    ham_subset = df[df["Label"] == "ham"].sample(num_spam, random_state=123)                              
    balanced_df = pd.concat([ham_subset, df[df["Label"] == "spam"]])                             
    return balanced_df
balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


In [11]:
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

In [12]:
def random_split(df, train_frac, validation_frac):
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)              
    train_end = int(len(df) * train_frac)         
    validation_end = train_end + int(len(df) * validation_frac)
    train_df = df[:train_end]
    validation_df = df[train_end:validation_end]
    test_df = df[validation_end:]
    return train_df, validation_df, test_df

In [13]:
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

In [14]:
train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)

In [15]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [16]:
len(balanced_df)

1494

In [17]:
import torch
import torch.nn as nn

In [18]:
from torch.utils.data import Dataset, DataLoader

In [19]:
class SpamDataset(Dataset):
    def __init__(self, path, tokenizer, max_length = None, pad_id = 50256):
        super().__init__()
        self.tokenizer = tokenizer
        self.pad_id = pad_id
        self.data = pd.read_csv(path)
        #self.data = self.data.iloc[0]
        if max_length is None:
            self.max_length = self.get_max_length()
        else:
            self.max_length = max_length
        self.data['encoded_text'] = self.data['Text'].apply(self.encode)

    def __getitem__(self, idx):
        return torch.tensor(self.data.iloc[idx]['encoded_text'], dtype = torch.long), torch.tensor(self.data.iloc[idx]['Label'], dtype = torch.long)

    def __len__(self):
        return len(self.data)

    def encode(self, text):
        length = len(self.tokenizer.encode(text))
        array = [self.pad_id] * (self.max_length - length)
        encoding = self.tokenizer.encode(text)
        encoding.extend(array)
        return encoding
    
    def get_max_length(self):
        max_idx = df['Text'].str.len().idxmax()
        return len(self.tokenizer.encode(df.iloc[max_idx]['Text']))

In [17]:
train_dataset[0][0]

NameError: name 'train_dataset' is not defined

In [20]:
train_dataset = SpamDataset('train.csv', tokenizer = tokenizer)
print(train_dataset.max_length)

216


In [21]:
val_dataset = SpamDataset(
    "validation.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)

In [22]:
test_dataset = SpamDataset(
    "test.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)

In [23]:
num_workers = 0
batch_size = 8

train_loader = DataLoader(dataset = train_dataset, batch_size = batch_size, num_workers = num_workers, shuffle = True, drop_last = False)
test_loader = DataLoader(dataset = test_dataset, batch_size = batch_size, num_workers = num_workers, shuffle = True, drop_last = False)
val_loader = DataLoader(dataset = val_dataset, batch_size = batch_size, num_workers = num_workers, shuffle = True, drop_last = False)

In [24]:
for input_batch, target_batch in train_loader:
    pass
print("Input batch dimensions:", input_batch.shape)
print("Label batch dimensions", target_batch.shape)

Input batch dimensions: torch.Size([5, 216])
Label batch dimensions torch.Size([5])


In [25]:
import torch
import torch.nn as nn
import tiktoken

In [26]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "emb_dim": 768,          # Embedding dimension
    "n_heads": 12,           # Number of attention heads
    "n_layers": 12,          # Number of layers
    "drop_rate": 0.1,        # Dropout rate
    "qkv_bias": False}        # Query-Key-Value bias

cfg = GPT_CONFIG_124M

In [27]:
class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(nn.Linear(cfg['emb_dim'], 4 * cfg['emb_dim'], bias = cfg['qkv_bias']), nn.GELU(), nn.Linear(4 * cfg['emb_dim'], cfg['emb_dim'], bias = cfg['qkv_bias']))

    def forward(self, x):
        return self.layers(x)

In [28]:
class LayerNorm(nn.Module):
    def __init__(self, cfg, eps = 1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(cfg['emb_dim']))
        self.beta = nn.Parameter(torch.zeros(cfg['emb_dim']))

    def forward(self, x):
        mean = x.mean(dim = -1, keepdim = True)
        var = x.var(dim = -1, keepdim = True, unbiased = False)
        x_norm = (x - mean) / torch.sqrt(var + self.eps)
        x_norm_corrected = (self.gamma * x_norm) + self.beta
        return x_norm_corrected

In [29]:
class Dropout(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.dropout = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        return self.dropout(x)

In [30]:
class MultiHeadAttention(nn.Module):
    def __init__(self, cfg, d_out = 768, num_heads = cfg['n_heads'], context_length = cfg['context_length']):
        super().__init__()
        self.num_heads = num_heads
        if d_out % self.num_heads != 0:
            assert "num_heads must be divisible by d_out"    
        self.d_out = d_out
        self.W_query = nn.Linear(cfg['emb_dim'], self.d_out, bias = cfg['qkv_bias'])
        self.W_key = nn.Linear(cfg['emb_dim'], self.d_out, bias = cfg['qkv_bias'])
        self.W_value = nn.Linear(cfg['emb_dim'], self.d_out, bias = cfg['qkv_bias'])
        self.dropout = nn.Dropout(cfg['drop_rate'])
        self.out_proj = nn.Linear(d_out, d_out)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal = 1))
        
    def forward(self, x):
        b, num_tokens, emb_dim = x.shape
        query = self.W_query(x)
        key = self.W_key(x)
        value = self.W_value(x)
        query = query.view(b, num_tokens, self.num_heads, self.d_out // self.num_heads)
        key = key.view(b, num_tokens, self.num_heads, self.d_out // self.num_heads)
        value = value.view(b, num_tokens, self.num_heads, self.d_out // self.num_heads)

        query = query.transpose(1, 2)
        key = key.transpose(1, 2)
        value = value.transpose(1, 2)

        attn_scores = query @ key.transpose(2, 3)
        attn_scores.masked_fill_(self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)
        attn_weights = torch.softmax(attn_scores / key.shape[-1] ** 0.5, dim = -1)
        attn_weights = self.dropout(attn_weights)
        context_vectors = attn_weights @ value
        context_vectors = context_vectors.transpose(1, 2).contiguous()
        context_vectors = context_vectors.view(b, num_tokens, self.d_out)
        context_vectors = self.out_proj(context_vectors) 
        return context_vectors

In [31]:
class TransformerBlock(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.layer_norm1 = LayerNorm(cfg)
        self.multiheadattention = MultiHeadAttention(cfg, num_heads = cfg['n_heads'])
        self.dropout = Dropout(cfg)
        self.layer_norm2 = LayerNorm(cfg)
        self.feed_forward = FeedForward(cfg)

    def forward(self, x):
        shortcut = x
        x = self.layer_norm1(x)
        x = self.multiheadattention(x)
        x = self.dropout(x)
        x = x + shortcut

        shortcut = x
        x = self.layer_norm2(x)
        x = self.feed_forward(x)
        x = self.dropout(x)
        x = x + shortcut
        return x

In [32]:
class GPTModel(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        #self.encoder = tiktoken.get_encoding("gpt2")
        self.pos_emb = nn.Embedding(cfg['context_length'], cfg['emb_dim'])
        self.tok_embedding = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.dropout = Dropout(cfg)
        self.trf_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        self.final_layernorm = LayerNorm(cfg)
        self.out_head = nn.Linear(cfg['emb_dim'], cfg['vocab_size'], bias = cfg['qkv_bias'])
        
    def forward(self, tok_id):
        batch, seq_len = tok_id.shape
        pos_emb = self.pos_emb(torch.arange(seq_len, device=tok_id.device))
        tok_emb = self.tok_embedding(tok_id)

        x = pos_emb + tok_emb
        x = self.dropout(x)
        x = self.trf_blocks(x)
        x = self.final_layernorm(x)
        x = self.out_head(x)
        return x

In [33]:
def get_text_encoding(text, tokenizer):
    encoding = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    return encoding

In [34]:
def token_ids_to_text(ids, tokenizer):
    return tokenizer.decode(ids.squeeze(0))

In [35]:
import urllib.request
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch05/"
    "01_main-chapter-code/gpt_download.py"
)
filename = url.split('/')[-1]
urllib.request.urlretrieve(url, filename)

('gpt_download.py', <http.client.HTTPMessage at 0x28b9772b670>)

In [39]:
pip install tensorflow >=2.15.0 tqdm >=4.66

Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mediapipe 0.10.21 requires protobuf<5,>=4.25.3, but you have protobuf 6.33.2 which is incompatible.


In [40]:
from gpt_download import download_and_load_gpt2
settings, params = download_and_load_gpt2(
    model_size="124M", models_dir="gpt2"
)

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 15.4kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [01:32<00:00, 11.3kiB/s]
hparams.json: 100%|██████████| 90.0/90.0 [00:00<?, ?iB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [53:49<00:00, 154kiB/s]     
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<00:00, 332kiB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:01<00:00, 301kiB/s]  
vocab.bpe: 100%|██████████| 456k/456k [00:01<00:00, 349kiB/s]  


In [38]:
def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right:{right.shape}")
    return torch.nn.Parameter(torch.tensor(right))

In [39]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    # 1. Position Embeddings
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    
    # 2. Token Embeddings
    gpt.tok_embedding.weight = assign(gpt.tok_embedding.weight, params['wte'])
    
    # 3. Iterate over Transformer Blocks
    for b in range(len(params["blocks"])):
        
        # --- ATTENTION MECHANISM ---
        # Get the Q, K, V weights from the fused 'c_attn' matrix
        q_w, k_w, v_w = np.split((params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        
        # Load Weights (Transposed for PyTorch)
        # Note: Your class uses .multiheadattention, not .att or .W_query directly on the block
        gpt.trf_blocks[b].multiheadattention.W_query.weight = assign(gpt.trf_blocks[b].multiheadattention.W_query.weight, q_w.T)
        gpt.trf_blocks[b].multiheadattention.W_key.weight   = assign(gpt.trf_blocks[b].multiheadattention.W_key.weight, k_w.T)
        gpt.trf_blocks[b].multiheadattention.W_value.weight = assign(gpt.trf_blocks[b].multiheadattention.W_value.weight, v_w.T)
        
        # Load Biases
        q_b, k_b, v_b = np.split((params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.trf_blocks[b].multiheadattention.W_query.bias = assign(gpt.trf_blocks[b].multiheadattention.W_query.bias, q_b)
        gpt.trf_blocks[b].multiheadattention.W_key.bias   = assign(gpt.trf_blocks[b].multiheadattention.W_key.bias, k_b)
        gpt.trf_blocks[b].multiheadattention.W_value.bias = assign(gpt.trf_blocks[b].multiheadattention.W_value.bias, v_b)
        
        # --- ATTENTION OUTPUT PROJECTION (WARNING) ---
        # Your 'MultiHeadAttention' class defined in Step 1 DOES NOT have an 'out_proj' layer.
        # Standard GPT-2 weights have this. I have commented it out to prevent a crash.
        # UNCOMMENT THE LINES BELOW IF YOU ADD 'self.out_proj' TO YOUR CLASS:
        
        gpt.trf_blocks[b].multiheadattention.out_proj.weight = assign(gpt.trf_blocks[b].multiheadattention.out_proj.weight, params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].multiheadattention.out_proj.bias   = assign(gpt.trf_blocks[b].multiheadattention.out_proj.bias, params["blocks"][b]["attn"]["c_proj"]["b"])

        # --- FEED FORWARD NETWORK ---
        # Layer 0: Expansion (c_fc)
        gpt.trf_blocks[b].feed_forward.layers[0].weight = assign(gpt.trf_blocks[b].feed_forward.layers[0].weight, params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_blocks[b].feed_forward.layers[0].bias   = assign(gpt.trf_blocks[b].feed_forward.layers[0].bias, params["blocks"][b]["mlp"]["c_fc"]["b"])
        
        # Layer 2: Projection (c_proj) -> Note: Index 2 because Index 1 is GELU
        gpt.trf_blocks[b].feed_forward.layers[2].weight = assign(gpt.trf_blocks[b].feed_forward.layers[2].weight, params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].feed_forward.layers[2].bias   = assign(gpt.trf_blocks[b].feed_forward.layers[2].bias, params["blocks"][b]["mlp"]["c_proj"]["b"])

        # --- LAYER NORMS ---
        # Your class uses .layer_norm1/2, not .norm1/2
        # Your class uses .gamma/.beta, not .scale/.shift
        
        # Norm 1 (Pre-Attention)
        gpt.trf_blocks[b].layer_norm1.gamma = assign(gpt.trf_blocks[b].layer_norm1.gamma, params["blocks"][b]["ln_1"]["g"])
        gpt.trf_blocks[b].layer_norm1.beta  = assign(gpt.trf_blocks[b].layer_norm1.beta, params["blocks"][b]["ln_1"]["b"])
        
        # Norm 2 (Pre-FFN)
        gpt.trf_blocks[b].layer_norm2.gamma = assign(gpt.trf_blocks[b].layer_norm2.gamma, params["blocks"][b]["ln_2"]["g"])
        gpt.trf_blocks[b].layer_norm2.beta  = assign(gpt.trf_blocks[b].layer_norm2.beta, params["blocks"][b]["ln_2"]["b"])

    # 4. Final Layer Norm
    gpt.final_layernorm.gamma = assign(gpt.final_layernorm.gamma, params["g"])
    gpt.final_layernorm.beta  = assign(gpt.final_layernorm.beta, params["b"])

    # 5. Output Head (Weight Tying)
    # We reuse the token embedding weights for the output head
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

In [40]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

model_name = "gpt2-small (124M)"
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])

NEW_CONFIG.update({"context_length": 1024})
NEW_CONFIG.update({"qkv_bias": True})

gpt = GPTModel(NEW_CONFIG)
gpt.eval()

#load_weights_into_gpt(gpt, params)
#gpt.to(device)

GPTModel(
  (pos_emb): Embedding(1024, 768)
  (tok_embedding): Embedding(50257, 768)
  (dropout): Dropout(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (layer_norm1): LayerNorm()
      (multiheadattention): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (dropout): Dropout(
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (layer_norm2): LayerNorm()
      (feed_forward): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
     

In [38]:
gpt = GPTModel(cfg)
gpt.eval()
sum_ = 0
for p in gpt.parameters():
    sum_ = sum_ + p.numel()

sum_

162963456

In [42]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]
        if top_k:
            top_logits, top_pos = torch.topk(logits, top_k)
            logits = torch.where(
                condition=logits < top_logits[0][-1],   
                input=torch.tensor(float('-inf')).to(logits.device),    
                other=logits 
            )
        if temperature > 0.0:
            logits = logits / temperature
            probs = torch.softmax(logits, dim = -1)
            next_token = torch.multinomial(probs, num_samples = 1)
        else:
            probs = torch.softmax(logits, dim = -1)
            next_token = torch.argmax(probs, keepdim = True)

        idx = torch.cat((idx, next_token), dim = 1)
    return idx

In [43]:
torch.manual_seed(123)
token_ids = generate(
    model=gpt,
    idx=torch.tensor(tokenizer.encode("Once upon a time")).to(device).unsqueeze(0),
    max_new_tokens=500,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=25,
    temperature=1.4
)
print("Output text:\n", token_ids_to_text(token_ids.detach().clone().to('cpu').numpy(), tokenizer))

Output text:
 Once upon a time the United States began to use chemical weapons (and that was not before the war) the regime started to use the word "terrorist" in a similar way as it has used violence and propaganda to describe terrorist organizations. One way it has described terrorists is the fact that the Syrian and Iraqi governments are not at war, there are terrorist organizations all over the world and they're all doing war with a different type of country that has nothing but a high level of militarily significant resources like Israel. We've been told the government can not use chemical weapons on any of its opponents, but those who use it are not allowed within the scope of U.S. sovereignty or to use it. So the Syrians did this, then those other countries had a choice. This is what they did on 9-18-2011 and when the war is going on, if they don't have the U.S. support, they'll say that they want us to take back control by force. It doesn't matter what other groups you're talki

In [44]:
for parameter in gpt.parameters():
    parameter.requires_grad = False

# Change the classification head

In [45]:
gpt.out_head = nn.Linear(cfg['emb_dim'], 2)

In [46]:
gpt.out_head.to(device)

Linear(in_features=768, out_features=2, bias=True)

In [47]:
gpt.out_head.weight.requires_grad

True

In [48]:
for parameter in gpt.trf_blocks[-1].parameters():
    parameter.requires_grad = True

In [49]:
for parameter in gpt.final_layernorm.parameters():
    parameter.requires_grad = True

In [50]:
inputs = tokenizer.encode("Do you have time")
inputs = torch.tensor(inputs).unsqueeze(0)
print("Inputs:", inputs)
print("Inputs dimensions:", inputs.shape)

Inputs: tensor([[5211,  345,  423,  640]])
Inputs dimensions: torch.Size([1, 4])


In [51]:
with torch.no_grad():
    outputs = gpt(inputs.to(device))
print("Outputs:\n", outputs)
print("Outputs dimensions:", outputs.shape)

Outputs:
 tensor([[[-1.5840,  0.9893],
         [-3.7231,  7.4521],
         [-2.2665,  6.6035],
         [-3.5974,  3.9888]]], device='cuda:0')
Outputs dimensions: torch.Size([1, 4, 2])


In [52]:
print("Last output token:", outputs[:, -1, :])

Last output token: tensor([[-3.5974,  3.9888]], device='cuda:0')


In [53]:
probas = torch.softmax(outputs[:, -1, :], dim=-1)
label = torch.argmax(probas)
print("Class label:", label.item())

Class label: 1


In [54]:
def calc_accuracy_loader(dataloader, model, device, num_batches = None):
    model.eval()
    accuracy = 0
    correct_labels = 0
    for index, (inputs, label) in enumerate(dataloader):
        with torch.no_grad():
            output = model(inputs.to(device))
        #print(output.shape)
        output = output[:, -1, :]
        #print(output.shape)
        #print(torch.argmax(torch.softmax(output, dim = -1), dim = -1))
        #print(label)
        prediction = torch.argmax(torch.softmax(output, dim = -1), dim = -1)
        accuracy += (prediction == label.to(device)).sum()
        correct_labels += len(label)
    score = (accuracy / correct_labels).item()
    model.train()
    return score

In [55]:
calc_accuracy_loader(train_loader, gpt, device)

0.49473685026168823

In [56]:
torch.manual_seed(123)
train_accuracy = calc_accuracy_loader(
    train_loader, gpt, device, num_batches=10
)
val_accuracy = calc_accuracy_loader(
    val_loader, gpt, device, num_batches=10
)
test_accuracy = calc_accuracy_loader(
    test_loader, gpt, device, num_batches=10
)
print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 49.47%
Validation accuracy: 53.02%
Test accuracy: 50.33%


In [57]:
def calc_loss_batch(input_batch, target_batch, model, device):
    outputs = gpt(input_batch.to(device))
    outputs = outputs[:, -1, :]
    prediction = torch.softmax(outputs, dim = -1)
    return nn.functional.cross_entropy(prediction, target_batch.to(device))

In [58]:
def calc_loss_loader(dataloader, model, device, num_batches = None):
    total_loss = 0
    if num_batches is None:
        num_batches = len(dataloader)
    else:
        num_batches = min(num_batches, len(dataloader))
    for index, (input_batch, target_batch) in enumerate(dataloader):
        if index < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)       
            total_loss += loss.item()

        else:
            break

    return total_loss / num_batches

In [59]:
calc_loss_loader(train_loader, gpt, device, num_batches = None)

0.7984588608031965

In [60]:
with torch.no_grad():                
    train_loss = calc_loss_loader(train_loader, gpt, device, num_batches=5)
    val_loss = calc_loss_loader(val_loader, gpt, device, num_batches=5)
    test_loss = calc_loss_loader(test_loader, gpt, device, num_batches=5)
print(f"Training loss: {train_loss:.3f}")
print(f"Validation loss: {val_loss:.3f}")
print(f"Test loss: {test_loss:.3f}")

Training loss: 0.686
Validation loss: 0.806
Test loss: 0.750


In [61]:
def train_classifier_simple(model, dataloader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter):
    train_losses, val_losses, train_accs, val_accs = [], [], [], []  
    examples_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        for index, (input_batch, target_batch) in enumerate(dataloader):
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()
            global_step += 1
            examples_seen += input_batch.shape[0]
            if global_step % eval_freq == 0:
                model.eval()
                with torch.no_grad():                
                    train_loss = calc_loss_loader(train_loader, gpt, device, num_batches=eval_iter)
                    val_loss = calc_loss_loader(val_loader, gpt, device, num_batches=eval_iter)
                model.train()
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Ep {epoch+1} (Step {global_step:06d}): Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        train_acc = calc_accuracy_loader(train_loader, model, device, num_batches = eval_iter)
        val_acc = calc_accuracy_loader(val_loader, model, device, num_batches = eval_iter)
        print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
        print(f"Validation accuracy: {val_accuracy*100:.2f}%")
        train_accs.append(train_acc)
        val_accs.append(val_accs)

    return train_losses, val_losses, train_accs, val_accs, examples_seen

In [65]:
import time
start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(gpt.parameters(), lr=5e-5, weight_decay=0.1)
num_epochs = 5
train_losses, val_losses, train_accs, val_accs, examples_seen = \
    train_classifier_simple(gpt, train_loader, val_loader, optimizer, device, num_epochs = num_epochs, eval_freq = 50, eval_iter = 5)
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 10.687, Val loss 10.727
Ep 1 (Step 000050): Train loss 10.450, Val loss 10.300
Ep 1 (Step 000100): Train loss 10.125, Val loss 10.325
Training accuracy: 56.46% | Validation accuracy: 58.39%
Ep 2 (Step 000150): Train loss 10.250, Val loss 10.275
Ep 2 (Step 000200): Train loss 10.350, Val loss 10.400
Ep 2 (Step 000250): Train loss 10.325, Val loss 10.350
Training accuracy: 56.46% | Validation accuracy: 58.39%
Ep 3 (Step 000300): Train loss 10.275, Val loss 10.225
Ep 3 (Step 000350): Train loss 10.400, Val loss 10.325
Training accuracy: 56.46% | Validation accuracy: 58.39%
Ep 4 (Step 000400): Train loss 10.350, Val loss 10.400
Ep 4 (Step 000450): Train loss 10.275, Val loss 10.300
Ep 4 (Step 000500): Train loss 10.350, Val loss 10.300
Training accuracy: 56.46% | Validation accuracy: 58.39%
Ep 5 (Step 000550): Train loss 10.350, Val loss 10.350
Ep 5 (Step 000600): Train loss 10.400, Val loss 10.275
Ep 5 (Step 000650): Train loss 10.400, Val loss 10.300
Traini

In [66]:
train_accuracy = calc_accuracy_loader(train_loader, gpt, device)
val_accuracy = calc_accuracy_loader(val_loader, gpt, device)
test_accuracy = calc_accuracy_loader(test_loader, gpt, device)
print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 49.47%
Validation accuracy: 53.02%
Test accuracy: 50.33%
