In [None]:
import torch

# Prefer an NVIDIA GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Apple Silicon users can opt in to the MPS backend by uncommenting these lines:
# if torch.backends.mps.is_available():
#     device = torch.device("mps")

print("Device:", device)


Device: cuda


In [None]:
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once with ``tokenizer.encode``. Each sample is a
    window of ``max_length`` token IDs; its target is the same window shifted
    one position to the right. Consecutive windows start ``stride`` tokens apart.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(txt)
        # Stop early enough that every shifted target window is still full length.
        window_starts = range(0, len(token_ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows extracted from the text."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors of window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [None]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to queries, keys and values of width ``d_out``,
    which is split evenly across ``num_heads`` heads. Positions may only attend
    to themselves and earlier positions; the per-head results are concatenated
    and passed through a final linear projection.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output width; must be divisible by ``num_heads``.
        context_length: Side length of the precomputed causal mask.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections have a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the "future" positions to hide.
        causal_mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", causal_mask)

    def _split_heads(self, projected, batch, num_tokens):
        """Reshape (batch, tokens, d_out) into (batch, heads, tokens, head_dim)."""
        per_head = projected.view(batch, num_tokens, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Return the attended representation, shaped (batch, tokens, d_out)."""
        batch, num_tokens, _ = x.shape

        keys = self._split_heads(self.W_key(x), batch, num_tokens)
        queries = self._split_heads(self.W_query(x), batch, num_tokens)
        values = self._split_heads(self.W_value(x), batch, num_tokens)

        # Raw dot-product scores per head: (batch, heads, tokens, tokens).
        scores = queries @ keys.transpose(2, 3)

        # Crop the mask to the actual sequence length and hide future positions.
        future = self.mask.bool()[:num_tokens, :num_tokens]
        scores.masked_fill_(future, -torch.inf)

        # Scale by sqrt(head_dim) before normalising over the key axis.
        weights = torch.softmax(scores / keys.shape[-1] ** 0.5, dim=-1)
        weights = self.dropout(weights)

        # Move heads back next to the feature axis and merge them.
        merged = (weights @ values).transpose(1, 2).contiguous()
        merged = merged.view(batch, num_tokens, self.d_out)
        return self.out_proj(merged)


In [None]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension of the inputs being normalized.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance so the division never hits zero
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` to zero mean / unit variance per position, then rescale."""
        mu = x.mean(dim=-1, keepdim=True)
        # Biased (population) variance: divide by N rather than N - 1.
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)
        std = torch.sqrt(sigma_sq + self.eps)
        return self.scale * ((x - mu) / std) + self.shift


In [None]:
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x the embedding width, apply GELU, project back.

    Args:
        cfg: Mapping that provides the embedding width under ``"emb_dim"``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim

        expand = nn.Linear(emb_dim, hidden_dim)
        activation = GELU()
        contract = nn.Linear(hidden_dim, emb_dim)

        # Positions 0 and 2 of this Sequential are the two Linear layers.
        self.layers = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.layers(x)


In [None]:
class GELU(nn.Module):
    """GELU activation computed with the tanh approximation.

    Evaluates ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise; the output has the shape of ``x``."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic = x + 0.044715 * torch.pow(x, 3)
        return 0.5 * x * (1 + torch.tanh(coeff * cubic))


In [None]:
class TransformerBlock(nn.Module):
    """One transformer layer: self-attention then feed-forward, each with a residual.

    Each sub-layer normalizes its input first, runs the sub-layer, applies
    dropout, and finally adds the un-normalized input back.

    Args:
        cfg: Mapping with ``"emb_dim"``, ``"context_length"``, ``"n_heads"``,
            ``"drop_rate"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply both sub-layers; the output has the same shape as ``x``."""
        # Attention sub-layer: normalize -> attend -> dropout -> add the residual.
        attended = self.drop_shortcut(self.att(self.norm1(x)))
        x = attended + x

        # Feed-forward sub-layer follows the same pattern on the updated stream.
        transformed = self.drop_shortcut(self.ff(self.norm2(x)))
        return transformed + x


In [None]:
class GPTModel(nn.Module):
    """GPT-style decoder: embeddings, a stack of transformer blocks, and an output head.

    Args:
        cfg: Mapping with ``"vocab_size"``, ``"emb_dim"``, ``"context_length"``,
            ``"drop_rate"`` and ``"n_layers"``; it is also forwarded unchanged
            to every ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size, emb_dim = cfg["vocab_size"], cfg["emb_dim"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token IDs of shape (batch, seq_len) to logits over the vocabulary."""
        _, seq_len = in_idx.shape

        # Build the position indices on the input's device so the model runs
        # wherever the batch lives (CPU or GPU).
        positions = torch.arange(seq_len, device=in_idx.device)

        # Token embeddings plus position embeddings (broadcast over the batch).
        x = self.tok_emb(in_idx) + self.pos_emb(positions)
        x = self.drop_emb(x)

        x = self.trf_blocks(x)

        # Final normalization, then project to one logit per vocabulary entry.
        return self.out_head(self.final_norm(x))


In [None]:
import urllib.request
# Fetch the GPT-2 download helper script and store it in the current working
# directory under its own file name so later cells can import it.
# NOTE(review): the URL tracks the repository's `main` branch, so the fetched
# script is not pinned to a specific revision.
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/"
    "ch05/01_main-chapter-code/gpt_download.py"
)
filename = url.rsplit('/', 1)[-1]  # last path component of the URL
urllib.request.urlretrieve(url, filename)

('gpt_download.py', <http.client.HTTPMessage at 0x7b7ffae02b50>)

In [None]:
def assign(left, right):
    """Return ``right`` as a new trainable parameter that can replace ``left``.

    Args:
        left: The existing parameter/tensor being replaced; supplies the
            required shape as well as the dtype and device of the result.
        right: The replacement values, as a NumPy array or a tensor.

    Returns:
        torch.nn.Parameter: An independent copy of ``right`` with the dtype
        and device of ``left``.

    Raises:
        ValueError: If ``left`` and ``right`` do not have the same shape.
    """
    # Check if the shapes of the tensors match
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")

    if isinstance(right, torch.Tensor):
        # torch.tensor(existing_tensor) emits a UserWarning; copy it explicitly instead.
        values = right.detach().clone()
    else:
        values = torch.tensor(right)

    # Align with the destination so a float64 checkpoint array (or a tensor on
    # another device) does not leave the model with mixed dtypes/devices.
    return torch.nn.Parameter(values.to(device=left.device, dtype=left.dtype))

In [None]:
import numpy as np

# Function to load weights into the GPT model from the given parameters
def load_weights_into_gpt(gpt, params):
    """Copy pretrained weights from ``params`` into ``gpt`` in place.

    Every destination parameter is replaced through ``assign``, which rejects
    shape mismatches.

    Args:
        gpt: Model exposing ``pos_emb``, ``tok_emb``, ``trf_blocks``,
            ``final_norm`` and ``out_head``; each transformer block exposes
            ``att``, ``ff.layers``, ``norm1`` and ``norm2``.
        params: Nested mapping with top-level keys ``"wpe"``, ``"wte"``,
            ``"blocks"``, ``"g"`` and ``"b"``. Each entry of ``"blocks"``
            holds ``"attn"`` (``c_attn``, ``c_proj``), ``"mlp"`` (``c_fc``,
            ``c_proj``), ``"ln_1"`` and ``"ln_2"``.

    Raises:
        ValueError: If the checkpoint and the model have a different number
            of transformer blocks, or if any pair of shapes does not match.
    """
    checkpoint_blocks = params["blocks"]
    # Without this check a shorter checkpoint would silently leave the
    # remaining layers randomly initialised, and a longer one would fail
    # with an opaque IndexError part-way through loading.
    if len(checkpoint_blocks) != len(gpt.trf_blocks):
        raise ValueError(
            f"Layer count mismatch. Model: {len(gpt.trf_blocks)}, "
            f"Checkpoint: {len(checkpoint_blocks)}"
        )

    # Positional and token embedding tables
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b, block in enumerate(gpt.trf_blocks):
        src = checkpoint_blocks[b]
        attn_src, mlp_src = src["attn"], src["mlp"]

        # c_attn holds query, key and value concatenated along the last axis.
        # Weight matrices are transposed before assignment so they match the
        # shape of the destination nn.Linear weights.
        q_w, k_w, v_w = np.split(attn_src["c_attn"]["w"], 3, axis=-1)
        block.att.W_query.weight = assign(block.att.W_query.weight, q_w.T)
        block.att.W_key.weight = assign(block.att.W_key.weight, k_w.T)
        block.att.W_value.weight = assign(block.att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(attn_src["c_attn"]["b"], 3, axis=-1)
        block.att.W_query.bias = assign(block.att.W_query.bias, q_b)
        block.att.W_key.bias = assign(block.att.W_key.bias, k_b)
        block.att.W_value.bias = assign(block.att.W_value.bias, v_b)

        # Attention output projection
        block.att.out_proj.weight = assign(
            block.att.out_proj.weight, attn_src["c_proj"]["w"].T)
        block.att.out_proj.bias = assign(
            block.att.out_proj.bias, attn_src["c_proj"]["b"])

        # Feed-forward network: layers[0] and layers[2] are the Linear layers
        # (layers[1] is the activation and has no parameters).
        fc_in, fc_out = block.ff.layers[0], block.ff.layers[2]
        fc_in.weight = assign(fc_in.weight, mlp_src["c_fc"]["w"].T)
        fc_in.bias = assign(fc_in.bias, mlp_src["c_fc"]["b"])
        fc_out.weight = assign(fc_out.weight, mlp_src["c_proj"]["w"].T)
        fc_out.bias = assign(fc_out.bias, mlp_src["c_proj"]["b"])

        # Layer normalization: "g" is the scale, "b" is the shift.
        block.norm1.scale = assign(block.norm1.scale, src["ln_1"]["g"])
        block.norm1.shift = assign(block.norm1.shift, src["ln_1"]["b"])
        block.norm2.scale = assign(block.norm2.scale, src["ln_2"]["g"])
        block.norm2.shift = assign(block.norm2.shift, src["ln_2"]["b"])

    # Final layer normalization. The output head is initialised from the same
    # "wte" matrix as the token embedding; each assign() call creates its own
    # Parameter, so the two are separate copies rather than a shared tensor.
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])


In [None]:
# Download the GPT-2 parameters for the selected model size and store a pickled
# copy on Google Drive. Later cells read BASE_CONFIG and params defined here.
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Directory on Google Drive that receives the pickled weights (created if missing)
drive_weights_dir = "/content/drive/My Drive/gpt2_weights"
os.makedirs(drive_weights_dir, exist_ok=True)

# Helper provided by gpt_download.py, which an earlier cell downloads
from gpt_download import download_and_load_gpt2

# Settings shared by every model size
BASE_CONFIG = {
    "vocab_size": 50257,
    "context_length": 1024,  # also sizes the positional embedding table and the causal mask
    "drop_rate": 0.0,  # 0.0 disables dropout
    "qkv_bias": True  # load_weights_into_gpt assigns q/k/v biases, so they must exist
}

# Size-specific settings, keyed by a label of the form "<name> (<size>)"
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Choose the model to use (must be one of the keys of model_configs)
CHOOSE_MODEL = "gpt2-medium (355M)"

# Merge the size-specific settings into the shared configuration (mutates BASE_CONFIG)
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

# Take the parenthesised part of the label without its parentheses, e.g. "355M"
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")

# Download and load the GPT-2 parameters; files are stored under the local "gpt2" directory
settings, params = download_and_load_gpt2(
    model_size=model_size,
    models_dir="gpt2"
)

# Save the weights to Google Drive
import pickle

# The file name embeds the model size, e.g. gpt2_355M_weights.pkl
weights_file = os.path.join(drive_weights_dir, f"gpt2_{model_size}_weights.pkl")
with open(weights_file, "wb") as f:
    pickle.dump(params, f)

print(f"Weights saved to {weights_file}")

Mounted at /content/drive


checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 74.3kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 2.26MiB/s]
hparams.json: 100%|██████████| 91.0/91.0 [00:00<00:00, 202kiB/s]
model.ckpt.data-00000-of-00001:  29%|██▊       | 407M/1.42G [00:36<01:31, 11.1MiB/s]


KeyboardInterrupt: 

Run only this cell if you want to load the weights from Google Drive.

In [None]:
# Load previously pickled GPT-2 parameters from Google Drive and build the model.
# The configuration is defined here as well, so this cell works on a fresh
# kernel without first running the download cell.
from google.colab import drive
import os
import pickle

# Mount Google Drive
drive.mount('/content/drive')

# Settings shared by every model size
BASE_CONFIG = {
    "vocab_size": 50257,
    "context_length": 1024,
    "drop_rate": 0.0,
    "qkv_bias": True
}

# Size-specific settings
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Choose the model to use; this must match the size that was saved to Drive
CHOOSE_MODEL = "gpt2-medium (355M)"
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

# "gpt2-medium (355M)" -> "355M"
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")

# Derive the file name from the chosen size so the weights and the
# configuration cannot drift apart
drive_weights_dir = "/content/drive/My Drive/gpt2_weights"
weights_file = os.path.join(drive_weights_dir, f"gpt2_{model_size}_weights.pkl")

# Load the weights from Google Drive.
# NOTE: unpickling can execute arbitrary code; only load a file you wrote yourself.
with open(weights_file, "rb") as f:
    params = pickle.load(f)

print("Weights loaded successfully from Google Drive!")

# Initialize and load weights into the GPT model
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)

# Set the model to evaluation mode
model.eval()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Weights loaded successfully from Google Drive!


GPTModel(
  (tok_emb): Embedding(50257, 1024)
  (pos_emb): Embedding(1024, 1024)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=1024, out_features=1024, bias=True)
        (W_key): Linear(in_features=1024, out_features=1024, bias=True)
        (W_value): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
          (1): GELU()
          (2): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(i

Do not run the cell below if you have already run the two cells above.


In [None]:
# Alternative to the Google Drive workflow: download the GPT-2 parameters and
# load them straight into a freshly built model.
from gpt_download import download_and_load_gpt2

# Settings shared by every model size
BASE_CONFIG = {
    "vocab_size": 50257,
    "context_length": 1024,  # also sizes the positional embedding table and the causal mask
    "drop_rate": 0.0,  # 0.0 disables dropout
    "qkv_bias": True  # load_weights_into_gpt assigns q/k/v biases, so they must exist
}

# Size-specific settings, keyed by a label of the form "<name> (<size>)"
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Choose the model to use (must be one of the keys of model_configs)
CHOOSE_MODEL = "gpt2-medium (355M)"

# Merge the size-specific settings into the shared configuration (mutates BASE_CONFIG)
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

# Take the parenthesised part of the label without its parentheses, e.g. "355M"
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")

# Download and load the GPT-2 parameters; files are stored under the local "gpt2" directory
settings, params = download_and_load_gpt2(
    model_size=model_size,
    models_dir="gpt2"
)

# Build the model from the merged configuration and copy the downloaded weights into it
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)

# Set the model to evaluation mode (the last expression also displays the module structure)
model.eval()

In [None]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.2 MB[0m [31m9.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [None]:

import tiktoken
# Load tiktoken's "gpt2" encoding; later cells use this tokenizer for encoding and decoding.
tokenizer = tiktoken.get_encoding("gpt2")
# Print the ID(s) the end-of-text marker encodes to. allowed_special is passed so
# the marker is accepted as a special token; the recorded output is [50256].
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [None]:
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return its token IDs with a leading batch dimension.

    The ``<|endoftext|>`` marker is allowed as a special token during encoding.

    Returns:
        A tensor of shape ``(1, num_tokens)``.
    """
    token_ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Prepend a batch axis of size 1.
    return torch.unsqueeze(torch.tensor(token_ids), 0)

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token IDs back into a string.

    A leading batch dimension of size 1 is removed before decoding.
    """
    ids = token_ids.squeeze(0).tolist()
    return tokenizer.decode(ids)

In [None]:
def generate(model, idx, max_new_tokens, context_size,
             temperature=0.0, top_k=None, eos_id=None):
    """Autoregressively extend ``idx`` by up to ``max_new_tokens`` tokens.

    Args:
        model: Callable that maps token IDs of shape ``(batch, seq)`` to
            logits of shape ``(batch, seq, vocab)``.
        idx: Tensor of prompt token IDs, shape ``(batch, seq)``.
        max_new_tokens: Maximum number of tokens to append.
        context_size: Only the last ``context_size`` tokens are fed to the model.
        temperature: If greater than 0, sample from the softmax of
            ``logits / temperature``; otherwise pick the arg-max (greedy).
        top_k: If given, only the ``top_k`` highest logits of each row remain
            eligible. Values above the vocabulary size are clamped to it.
        eos_id: If given, generation stops once every sequence in the batch
            has produced this token.

    Returns:
        The prompt with the generated tokens appended along dimension 1. With
        a single sequence the end-of-sequence token itself is not appended.
        With several sequences, rows that finish early are padded with
        ``eos_id`` until the last row finishes.
    """
    # Tracks which rows of the batch have already produced eos_id.
    finished = torch.zeros(idx.shape[0], dtype=torch.bool, device=idx.device)

    for _ in range(max_new_tokens):
        # Keep only the last 'context_size' tokens as the input to the model
        idx_cond = idx[:, -context_size:]

        with torch.no_grad():  # No gradient calculation required during generation
            logits = model(idx_cond)[:, -1, :]  # Logits of the last time step only

        if top_k is not None:
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep a trailing axis so each row is compared with its own
            # threshold; a (batch,) threshold would be broadcast along the
            # vocabulary axis instead.
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0:
            # Sample from the temperature-scaled distribution
            probs = torch.softmax(logits / temperature, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            # Greedy decoding: take the most likely token
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        if eos_id is not None:
            # Rows that already finished keep emitting eos_id as padding.
            idx_next = idx_next.masked_fill(finished.unsqueeze(1), eos_id)
            finished = finished | (idx_next.squeeze(1) == eos_id)
            if finished.all():
                break

        # Concatenate the new token to the input for the next iteration
        idx = torch.cat((idx, idx_next), dim=1)

    return idx


In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load the saved state dict from Google Drive into the existing model.
# - map_location=device lets a checkpoint saved from a GPU load on a CPU-only runtime.
# - weights_only=True restricts unpickling to tensors and plain containers, which
#   avoids executing arbitrary pickled code and silences torch.load's warning.
file_name = "/content/drive/My Drive/llmfromscratch_sft.pth"
model.load_state_dict(torch.load(file_name, map_location=device, weights_only=True))
model.to(device)
model.eval()
print("Model loaded from Google Drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  model.load_state_dict(torch.load(file_name))


Model loaded from Google Drive


In [None]:
def format_input(entry):
    """Build the instruction prompt for one dataset entry.

    Args:
        entry: Mapping with an ``"instruction"`` string and an ``"input"``
            value; a falsy ``"input"`` (for example ``""``) is omitted.

    Returns:
        The fixed preamble followed by an ``### Instruction:`` section and,
        when ``entry["input"]`` is truthy, an ``### Input:`` section.
    """
    preamble = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
    )
    prompt = f"{preamble}\n\n### Instruction:\n{entry['instruction']}"
    if entry["input"]:
        prompt += f"\n\n### Input:\n{entry['input']}"
    return prompt

In [None]:
# Try the loaded model on a custom instruction
input_text = "what is the meaning of developing you ? ."

# Wrap the text in the instruction prompt template; format_input reads only the
# "instruction" and "input" keys, so the "output" entry is not used
formatted_input = format_input({"instruction": input_text, "input": "", "output": ""})

# Generate the model's response. temperature and top_k keep their defaults, so
# decoding is greedy; generation stops at the end-of-text token (ID 50256) or
# after 256 new tokens, whichever comes first
token_ids = generate(
    model=model,
    idx=text_to_token_ids(formatted_input, tokenizer).to(device),
    max_new_tokens=256,
    context_size=BASE_CONFIG["context_length"],
    eos_id=50256
)

# Convert the generated token IDs (prompt plus continuation) back to text
generated_text = token_ids_to_text(token_ids, tokenizer)

# Drop the prompt prefix by character count, then remove the response marker and
# surrounding whitespace so only the answer remains
response_text = generated_text[len(formatted_input):].replace("### Response:", "").strip()

# Print the input and the model's response
print(f"Input: {input_text}")
print(f"Model response: {response_text}")

Input:   it's pale pink in my goat eye
Model response: , but it's bright red on my cat's. What should I do?

### Input:
Eye Color: Pale Pink (AS2), Bright Red (AS3)


 Pale pink conjunctiva (AS2) indicates severe anemia. Administer dewormers, provide iron supplements, and consult a veterinarian for further advice.
