In [4]:
from importlib.metadata import PackageNotFoundError, version

# Packages this notebook relies on. Their installed versions are printed so
# that the recorded outputs can be tied to a concrete environment.
pkgs = ["matplotlib",
        "numpy",
        "tiktoken",
        "torch",
        "tensorflow" #For OpenAI's pretrained weights
       ]

for p in pkgs:
    # Report a missing package instead of aborting the whole listing, so the
    # versions of the remaining packages are still shown.
    try:
        print(f"{p} version: {version(p)}")
    except PackageNotFoundError:
        print(f"{p} version: not installed")


matplotlib version: 3.10.5
numpy version: 2.2.6
tiktoken version: 0.11.0
torch version: 2.8.0
tensorflow version: 2.20.0


In [11]:
# Show the directory the kernel is running in; relative file paths used by
# later cells are resolved against it.
import os

working_dir = os.getcwd()
print(working_dir)

/Users/anup/gitProjects/language-models-psychiatry


In [None]:
import torch
from gpt_from_scratch import GPTModel


<class 'gpt_from_scratch.GPTModel'>


In [16]:
# Render the source code of GPTModel as a fenced Python block in Markdown.
import inspect
from gpt_from_scratch import GPTModel

from IPython.display import display, Markdown

gpt_model_source = inspect.getsource(GPTModel)
display(Markdown(f"```python\n{gpt_model_source}\n```"))

```python
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

```

In [18]:
# Hyperparameters passed to GPTModel; the keys are the ones GPTModel and its
# sub-modules look up in `cfg`.

GPT_CONFIG_124M = dict(
    vocab_size=50257,     # Vocabulary size
    context_length=256,   # Shortened context length (orig: 1024)
    emb_dim=768,          # Embedding dimension
    n_heads=12,           # Number of attention heads
    n_layers=12,          # Number of layers
    drop_rate=0.1,        # Dropout rate
    qkv_bias=False,       # Query-key-value bias
)

In [20]:
# Seed PyTorch's RNG first so the randomly initialized weights are the same
# on every run.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

# Evaluation mode disables dropout during inference; the trailing semicolon
# keeps the notebook from echoing the value returned by eval().
model.eval();

In [None]:
# View of the model architecture: displays the nested module tree of `model`
# (embeddings, dropout, the stack of transformer blocks, and so on).
from IPython.display import display
display(model)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [30]:
# List every named parameter tensor of the model with its shape, then report
# how many parameter tensors there are in total.
named_params = list(model.named_parameters())

for name, param in named_params:
    print(name, param.shape)

print(f"\nnumber of parameter layers: {len(named_params)}")

tok_emb.weight torch.Size([50257, 768])
pos_emb.weight torch.Size([256, 768])
trf_blocks.0.att.W_query.weight torch.Size([768, 768])
trf_blocks.0.att.W_key.weight torch.Size([768, 768])
trf_blocks.0.att.W_value.weight torch.Size([768, 768])
trf_blocks.0.att.out_proj.weight torch.Size([768, 768])
trf_blocks.0.att.out_proj.bias torch.Size([768])
trf_blocks.0.ff.layers.0.weight torch.Size([3072, 768])
trf_blocks.0.ff.layers.0.bias torch.Size([3072])
trf_blocks.0.ff.layers.2.weight torch.Size([768, 3072])
trf_blocks.0.ff.layers.2.bias torch.Size([768])
trf_blocks.0.norm1.scale torch.Size([768])
trf_blocks.0.norm1.shift torch.Size([768])
trf_blocks.0.norm2.scale torch.Size([768])
trf_blocks.0.norm2.shift torch.Size([768])
trf_blocks.1.att.W_query.weight torch.Size([768, 768])
trf_blocks.1.att.W_key.weight torch.Size([768, 768])
trf_blocks.1.att.W_value.weight torch.Size([768, 768])
trf_blocks.1.att.out_proj.weight torch.Size([768, 768])
trf_blocks.1.att.out_proj.bias torch.Size([768])
trf_b

In [52]:
# With the architecture and parameters inspected, this cell moves on to using
# the model: it imports the tokenizer library, makes sure the current working
# directory is importable, and imports generate_text_simple, which the
# text-generation cells call.

import tiktoken

# Add the current working directory to sys.path
import sys
import os

# Append only when missing so that re-running this cell does not keep adding
# duplicate entries to sys.path.
notebook_dir = os.getcwd()
if notebook_dir not in sys.path:
    sys.path.append(notebook_dir)

#Import generate_text_simple function from gpt_from_scratch.py
from gpt_from_scratch import generate_text_simple

#Visualize the function
import inspect
from IPython.display import display, Markdown
display(Markdown(f"```python\n{inspect.getsource(generate_text_simple)}\n```"))


```python
def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):

        # Crop current context if it exceeds the supported context size
        # E.g., if LLM supports only 5 tokens, and the context size is 10
        # then only the last 5 tokens are used as context
        idx_cond = idx[:, -context_size:]

        # Get the predictions
        with torch.no_grad():
            logits = model(idx_cond)

        # Focus only on the last time step
        # (batch, n_token, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]

        # Get the idx of the vocab entry with the highest logits value
        idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch, 1)

        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

    return idx

```

In [63]:
#This block demonstrates how to use the GPT model for text generation. It defines helper functions that convert text to token IDs and back (i.e. encode and decode), then uses the generate_text_simple function to generate the next tokens from the model. Finally, it decodes and prints the generated output as readable text, showing the practical application of the model for generating language.

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` into a batch of token IDs.

    Args:
        text: String to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of integer token IDs.

    Returns:
        Integer (``torch.long``) tensor of shape ``(1, num_tokens)``.
    """
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Pin the dtype: for an empty encoding torch.tensor([]) would default to
    # float32, which cannot be used as indices for an embedding lookup.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token IDs back into text.

    Args:
        token_ids: Tensor of token IDs, shaped ``(1, num_tokens)`` or
            ``(num_tokens,)``.
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        The string produced by ``tokenizer.decode``.
    """
    flat = token_ids.squeeze(0) # remove batch dimension
    if flat.dim() == 0:
        # A one-element 1-D input is collapsed to a scalar by squeeze(0);
        # restore the sequence axis so decode() still receives a list.
        flat = flat.unsqueeze(0)
    return tokenizer.decode(flat.tolist())

# Generate 10 new tokens from a short prompt and print the decoded result.
start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

# Encode the prompt once, then let the model extend it.
prompt_ids = text_to_token_ids(start_context, tokenizer)
token_ids = generate_text_simple(
    model=model,
    idx=prompt_ids,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)

generated_text = token_ids_to_text(token_ids, tokenizer)
print("Output text:", generated_text)


Output text: Every effort moves you rentingetic wasnم refres RexMeCHicular stren


In [None]:
# Defines a small batch of two tokenized inputs and their target sequences.
# Each target row is its input row shifted one token ahead (the next-token
# labels), which is the standard setup in language modeling.

input_rows = [
    [16833, 3626, 6100],   # "every effort moves"
    [40,    1107, 588],    # "I really like"
]
target_rows = [
    [3626, 6100, 345],     # " effort moves you"
    [1107,  588, 11311],   # " really like chocolate"
]

inputs = torch.tensor(input_rows)
targets = torch.tensor(target_rows)

In [None]:
# Decodes the token IDs from the previous generation step into
# human-readable text using the tokenizer.

generated_id_list = token_ids.squeeze(0).tolist()  # drop the batch dimension
decoded_text = tokenizer.decode(generated_id_list)
print(decoded_text)

Every effort moves you rentingetic wasnم refres RexMeCHicular stren


In [None]:
# Runs the `inputs` batch through the model to get raw scores (logits) for
# every token position and vocabulary entry, then applies softmax over the
# vocabulary axis to turn the logits into probabilities.

with torch.no_grad():  # no gradients are needed for this forward pass
    logits = model(inputs)

probas = logits.softmax(dim=-1) # Probability of each token in vocabulary
print(probas.shape) # Shape: (batch_size, num_tokens, vocab_size)

torch.Size([2, 3, 50257])


In [None]:
# For each token position, take the vocabulary index with the highest
# probability. `token_ids` then holds the model's predicted next token for
# every position in the batch; keepdim=True keeps a trailing axis of size 1.

token_ids = probas.argmax(dim=-1, keepdim=True)
print("Token IDs:\n", token_ids)

Token IDs:
 tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])


In [None]:
# Decodes the first row of `targets` and of the predicted `token_ids` back to
# text so they can be compared side by side. The model is untrained, so the
# predictions are not expected to match the targets.

target_text = token_ids_to_text(targets[0], tokenizer)
predicted_text = token_ids_to_text(token_ids[0].flatten(), tokenizer)

print(f"Targets batch 1: {target_text}")
print(f"Outputs batch 1: {predicted_text}")

Targets batch 1:  effort moves you
Outputs batch 1:  Armed heNetflix


In [71]:
# Extracts, for both input sequences, the probability the model assigned to
# the correct next token at every position.

# Position indices are derived from the targets' sequence length instead of
# being hard-coded as [0, 1, 2], so the cell keeps working if the batch is
# rebuilt with a different number of tokens per sequence.
positions = torch.arange(targets.shape[1], device=targets.device)

text_idx = 0
target_probas_1 = probas[text_idx, positions, targets[text_idx]]
print("Text 1:", target_probas_1)

text_idx = 1
target_probas_2 = probas[text_idx, positions, targets[text_idx]]
print("Text 2:", target_probas_2)

Text 1: tensor([7.4541e-05, 3.1061e-05, 1.1563e-05])
Text 2: tensor([1.0337e-05, 5.6776e-05, 4.7559e-06])


In [None]:
# Computes the log-probabilities of the correct tokens and averages them.
# The average log-probability measures model quality (higher is better) and
# is the basis of the cross-entropy calculation.

target_probas_all = torch.cat((target_probas_1, target_probas_2))
log_probas = target_probas_all.log()
print(log_probas)

avg_log_probas = log_probas.mean()
print(avg_log_probas)

tensor([ -9.5042, -10.3796, -11.3677, -11.4798,  -9.7764, -12.2561])
tensor(-10.7940)


In [74]:
# Negates the average log-probability to turn it into a loss (optimization
# frameworks minimize a loss). The result is the cross-entropy value, the
# standard loss function for language modeling.

neg_avg_log_probas = -avg_log_probas
print(neg_avg_log_probas)

tensor(10.7940)


In [None]:
# Check the shapes of the model outputs and the targets before flattening
# them for the cross-entropy computation.
# Logits have shape (batch_size, num_tokens, vocab_size)
print("Logits shape:", logits.shape)

# Targets have shape (batch_size, num_tokens)
print("Targets shape:", targets.shape)

Logits shape: torch.Size([2, 3, 50257])
Targets shape: torch.Size([2, 3])


In [77]:
# Merge the batch and token axes so that every row of `logits_flat` lines up
# with one entry of `targets_flat`.
vocab_dim = logits.shape[-1]
logits_flat = logits.reshape(-1, vocab_dim)
targets_flat = targets.reshape(-1)

print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)

Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])


In [78]:
# PyTorch's built-in cross-entropy takes the raw logits and the target
# indices and computes the loss in a numerically stable and efficient way.

loss = torch.nn.functional.cross_entropy(input=logits_flat, target=targets_flat)
print(loss)

tensor(10.7940)


In [None]:
# Calculates perplexity, a common language-modeling metric defined as the
# exponentiated cross-entropy loss. Lower perplexity means the model is more
# confident and accurate in its predictions.

perplexity = loss.exp()
print(perplexity)

tensor(48725.8203)


In [None]:
# Load the small text dataset used for training and validation. A local copy
# is reused when present; otherwise the file is downloaded and saved.

import os
import urllib.request

file_path = "the-verdict.txt"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

if os.path.exists(file_path):
    # Local copy available: read it straight from disk.
    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()
else:
    # First run: fetch the text, then store it next to the notebook.
    with urllib.request.urlopen(url) as response:
        text_data = response.read().decode('utf-8')
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text_data)

In [97]:
# Sanity-check the loaded text by printing its first and last 99 characters.
preview_len = 99
print(text_data[:preview_len])
print(text_data[-preview_len:])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 
it for me! The Strouds stand alone, and happen once--but there's no exterminating our kind of art."


In [None]:
# Size of the dataset, measured both in characters and in tokenizer tokens.

total_characters = len(text_data)

encoded_text = tokenizer.encode(text_data)
total_tokens = len(encoded_text)

print("Characters:", total_characters)
print("Tokens:", total_tokens)


Characters: 20479
Tokens: 5145
