# Small GPT, little update

In [None]:
#!pip install transformers

In [1]:

import os
import time
import math
import pickle
from contextlib import nullcontext
import numpy as np
import pandas as pd
import tiktoken
from collections import Counter
import re
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
from torch.utils.data import DataLoader, Dataset
import inspect
from sklearn.model_selection import train_test_split
import random




In [2]:
# Define the file path
file_path = 'sample_conversations.txt'  # Update this if needed

# Read the file content
with open(file_path, 'r') as file:
    content = file.read()

# Split the content into sentences
sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', content)

# Ensure all sentences are not empty after splitting
sentences = [sentence for sentence in sentences if sentence.strip() != ""]

# Split data into training and validation sets (90% training, 10% validation)
train_sentences, val_sentences = train_test_split(sentences, test_size=0.1, random_state=42)

# Concatenate all training and validation sentences into single texts
train_text = ' '.join(train_sentences)
val_text = ' '.join(val_sentences)

# Encode the data using GPT-2 BPE encoding
enc = tiktoken.get_encoding("gpt2")
train_ids = enc.encode_ordinary(train_text)
val_ids = enc.encode_ordinary(val_text)
print(f"Training data has {len(train_ids):,} tokens")
print(f"Validation data has {len(val_ids):,} tokens")

# Convert to NumPy arrays and save to binary files
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)

# Define paths for output files
output_dir = '.'  # You can update this to any directory you want
train_output_path = os.path.join(output_dir, 'train.bin')
val_output_path = os.path.join(output_dir, 'val.bin')

# Save to binary files
train_ids.tofile(train_output_path)
val_ids.tofile(val_output_path)

print(f"Training data saved to {train_output_path}")
print(f"Validation data saved to {val_output_path}")


Training data has 7,088 tokens
Validation data has 812 tokens
Training data saved to .\train.bin
Validation data saved to .\val.bin


In [3]:
def decode_sample(ids, sample_size):
    sample_ids = ids[:sample_size]
    sample_text = enc.decode(sample_ids.tolist())
    return sample_text

print("Training Data Sample:")
print(decode_sample(train_ids, sample_size=100))

print("\nValidation Data Sample:")
print(decode_sample(val_ids, sample_size=10))


Training Data Sample:
How was your weekend? How was your weekend? I love reading books. How's the weather today? Yes, I'll be there at 8 PM. Are you coming to the party tonight? It's sunny and warm, perfect for a day outside. Hi there! How are you today? Thanks for asking. I really enjoy Italian cuisine. Hi there! How are you today? Hello! What are your plans for today? Do you have any pets? How's the weather today? Yes,

Validation Data Sample:
Yes, it was really exciting!
What's


In [9]:
# Custom Dataset class
class GPTDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        x = torch.tensor(self.data[idx:idx + self.block_size], dtype=torch.long)
        y = torch.tensor(self.data[idx + 1:idx + 1 + self.block_size], dtype=torch.long)
        return x, y

# Filtering function
def filter_indices(data, max_index):
    return [idx for idx in data if idx < max_index]

class LayerNorm(nn.Module):
    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # causal mask to ensure that attention is only applied to the left in the input sequence
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                        .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
        else:
            # manual implementation of attention
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu    = nn.GELU()
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight  # Weight tying

        # Init all weights
        self.apply(self._init_weights)
        # Apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer))

        # Report number of parameters
        print("number of parameters: %.2fM" % (self.get_num_params() / 1e6,))

    def get_num_params(self, non_embedding=True):
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device)  # shape (t)

        # Forward the GPT model itself
        tok_emb = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # If we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # Inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :])  # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        # start with all of the candidate parameters
        param_dict = {pn: p for pn, p in self.named_parameters()}
        # filter out those that do not require grad
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer


In [10]:
# Initialize the model
from dataclasses import dataclass

@dataclass
class GPTConfig:
    block_size: int
    vocab_size: int
    n_layer: int
    n_head: int
    n_embd: int  # n_embd must be divisible by n_head
    dropout: float
    bias: bool

config = GPTConfig(
    block_size=8,
    vocab_size= max(max(train_ids), max(val_ids)) + 1,
    n_layer=4,
    n_head=4,
    n_embd=256,  # Ensure n_embd is divisible by n_head
    dropout=0.1,
    bias=True
)

model = GPT(config)
print(model)

number of parameters: 11.63M
GPT(
  (transformer): ModuleDict(
    (wte): Embedding(33073, 256)
    (wpe): Embedding(8, 256)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-3): 4 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=256, out_features=768, bias=True)
          (c_proj): Linear(in_features=256, out_features=256, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=256, out_features=1024, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1024, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=256, out_features=33073, bias=False)
)


In [12]:
# making data and check it
gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes
batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size

# Create Datasets
train_dataset = GPTDataset(train_ids, config.block_size)
val_dataset = GPTDataset(val_ids, config.block_size)

# Print sizes of the datasets
print(f"Size of Training Dataset: {len(train_dataset)}")
print(f"Size of Validation Dataset: {len(val_dataset)}")

# Function to print a few samples as numbers
def print_samples_as_numbers(dataset, num_samples=5):
    for i in range(num_samples):
        x, y = dataset[i]
        print(f"Sample {i + 1} as numbers")
        print(f"x: {x.numpy()}")
        print(f"y: {y.numpy()}")
        print("-" * 50)

# Function to print a few samples as decoded text
def print_samples_as_text(dataset, enc, sample_size=100, num_samples=5):
    for i in range(num_samples):
        x, y = dataset[i]
        x_text = decode_sample(x, sample_size)
        y_text = decode_sample(y, sample_size)
        print(f"Sample {i + 1} as text")
        print(f"x: {x_text}")
        print(f"y: {y_text}")
        print("-" * 50)

# Print first 5 samples from the train dataset as numbers
print("Train Dataset Samples as Numbers:")
print_samples_as_numbers(train_dataset)

# Print first 5 samples from the train dataset as text
print("\nTrain Dataset Samples as Text:")
print_samples_as_text(train_dataset, enc, sample_size=100)

# Create DataLoaders
gradient_accumulation_steps = 5 * 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Print size of DataLoader
num_train_batches = len(train_loader)
num_val_batches = len(val_loader)
print(f"Number of batches in Training DataLoader: {num_train_batches}")
print(f"Number of batches in Validation DataLoader: {num_val_batches}")


# Initialize optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=3e-4, betas=(0.9, 0.95), device_type=device)

Size of Training Dataset: 7080
Size of Validation Dataset: 804
Train Dataset Samples as Numbers:
Sample 1 as numbers
x: [2437  373  534 5041   30 1374  373  534]
y: [ 373  534 5041   30 1374  373  534 5041]
--------------------------------------------------
Sample 2 as numbers
x: [ 373  534 5041   30 1374  373  534 5041]
y: [ 534 5041   30 1374  373  534 5041   30]
--------------------------------------------------
Sample 3 as numbers
x: [ 534 5041   30 1374  373  534 5041   30]
y: [5041   30 1374  373  534 5041   30  314]
--------------------------------------------------
Sample 4 as numbers
x: [5041   30 1374  373  534 5041   30  314]
y: [  30 1374  373  534 5041   30  314 1842]
--------------------------------------------------
Sample 5 as numbers
x: [  30 1374  373  534 5041   30  314 1842]
y: [1374  373  534 5041   30  314 1842 3555]
--------------------------------------------------

Train Dataset Samples as Text:
Sample 1 as text
x: How was your weekend? How was your
y:  was you

In [13]:
# Define the learning rate scheduler

# Global variables for learning rate scheduling
decay_lr = True  # whether to decay the learning rate
warmup_iters = 2000  # how many steps to warm up for
lr_decay_iters = 600000  # should be ~= max_iters per Chinchilla
min_lr = 6e-5  # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
learning_rate = 3e-4  # initial learning rate

def get_lr(it):
    # 1) Linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) If it > lr_decay_iters, return min learning rate
    if it > lr_decay_iters:
        return min_lr
    # 3) In between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)

def print_sample_as_text(x, y, enc, train_val):
    random_idx = random.randint(0, x.size(0) - 1)  # Get a random index from the batch
    x_text = decode_data(x[random_idx], enc)
    y_text = decode_data(y[random_idx], enc)
    print("Random Sample as text from  ", train_val)
    print(f"x: {x_text}")
    print(f"y: {y_text}")
    print("-" * 50)

def decode_data(ids, enc):
    sample_ids = ids.tolist()  # Convert tensor to list of integers
    sample_text = enc.decode(sample_ids)
    return sample_text


In [14]:
def train(model, train_loader, val_loader, optimizer, config, epochs=1, gradient_accumulation_steps=1):
    device = torch.device('cpu')
    model.to(device)
    model.train()
    iter_num = 0
    best_val_loss = float('inf')

    for epoch in range(epochs):
        running_loss = 0.0
        for batch_idx, (x, y) in enumerate(train_loader):
            x, y = x.to(device), y.to(device)

            # Update learning rate
            lr = get_lr(iter_num)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
                
            logits, loss = model(x, y)
            loss = loss / gradient_accumulation_steps
            loss.backward()

            if (batch_idx + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                iter_num += 1

                # Logging
                running_loss += loss.item()
                if iter_num % 5 == 0:
                    avg_loss = running_loss / 5
                    print(f'Epoch {epoch+1}/{epochs}, Iteration {iter_num}, Avg Loss: {avg_loss:.4f}')

                    # Print a random training sample as text
                    print_sample_as_text(x, y, enc,"Training")  # Print a random sample in the batch

                    running_loss = 0.0

        # Validation
        val_loss = evaluate(model, val_loader, device)
        print(f'Epoch {epoch+1}/{epochs}, Validation Loss: {val_loss:.4f}')

        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pt')
            print(f'Saving model with validation loss: {val_loss:.4f}')

def evaluate(model, val_loader, device):
    model.eval()
    val_loss = 0
    random_sample_printed = False  # Flag to ensure only one sample is printed
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            logits, loss = model(x, y)
            val_loss += loss.item()
            # Print one random sample from the validation set
            if not random_sample_printed:
                print_sample_as_text(x, y, enc,"Validation")
                random_sample_printed = True
    val_loss /= len(val_loader)
    model.train()
    return val_loss

# Train the model
train(model, train_loader, val_loader, optimizer, config, epochs=100, gradient_accumulation_steps=gradient_accumulation_steps)


Epoch 1/100, Iteration 5, Avg Loss: 0.2619
Random Sample as text from   Training
x:  Hello! What are your plans for today
y: ! What are your plans for today?
--------------------------------------------------
Epoch 1/100, Iteration 10, Avg Loss: 0.2613
Random Sample as text from   Training
x:  in the park. I really enjoy Italian
y:  the park. I really enjoy Italian cuisine
--------------------------------------------------
Random Sample as text from   Validation
x: , it was really exciting!
What
y:  it was really exciting!
What's
--------------------------------------------------
Epoch 1/100, Validation Loss: 10.3935
Saving model with validation loss: 10.3935
Epoch 2/100, Iteration 15, Avg Loss: 0.0523
Random Sample as text from   Training
x:  I went hiking. How's the weather
y:  went hiking. How's the weather today
--------------------------------------------------
Epoch 2/100, Iteration 20, Avg Loss: 0.2592
Random Sample as text from   Training
x: 's your favorite food? Do you have
y

In [33]:
import torch
import random

# Function to tokenize and truncate input text
def tokenize_and_truncate_text(text, enc, block_size):
    tokens = enc.encode(text)
    if len(tokens) > block_size:
        tokens = tokens[-block_size:]  # Truncate to the last block_size tokens
    return torch.tensor(tokens, dtype=torch.long).unsqueeze(0)  # Add batch dimension

# Function to decode output tokens
def decode_output(predicted_ids, enc):
    predicted_tokens = predicted_ids[0].tolist()  # Remove batch dimension
    predicted_text = enc.decode(predicted_tokens)
    return predicted_text

# Function to generate text using the model
def generate_text(model, input_text, enc, config, test_block_size, temperature=1.0):
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    input_ids = tokenize_and_truncate_text(input_text, enc, config.block_size)
    input_ids = input_ids.to(device)
    model.to(device)
    
    generated_text = input_text

    with torch.no_grad():
        for _ in range(test_block_size):
            logits, _ = model(input_ids)
            logits = logits[:, -1, :] / temperature
            probs = torch.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            next_token_id = next_token.item()
            
            generated_text += enc.decode([next_token_id])
            
            input_ids = tokenize_and_truncate_text(generated_text, enc, config.block_size).to(device)
    
    return generated_text

# Initialize the tokenizer
enc = tiktoken.get_encoding("gpt2")

# Sample input text
input_text = "Today is weekend"

# Define the test block size and temperatures
test_block_size = 32
temperatures = [0.7, 1.0, 1.2, 1.5, 2.0]

# Generate text with different temperatures
for temp in temperatures:
    print(f"Temperature: {temp}")
    generated_text = generate_text(model, input_text, enc, config, test_block_size, temperature=temp)
    print(f"Generated Text:\n{generated_text}\n")


Temperature: 0.7
Generated Text:
Today is weekend? How was your weekend? I'm doing well, thank you!
Good morning! Did you sleep well? Yes, I'll be there at 8 PM

Temperature: 1.0
Generated Text:
Today is weekend? Hello! What are your plans for today? Hi there! How are you today? I really enjoy Italian cuisine. I'm doing well, thank you!

Temperature: 1.2
Generated Text:
Today is weekend? I love reading books. I really enjoy Italian cuisine. Thanks for asking. How was your weekend? I really enjoy Italian cuisine. It was great, I

Temperature: 1.5
Generated Text:
Today is weekend? Yes, ithaired What's your favorite food? I really enjoy Italian cuisine. It was great, I examining. Do you have any pets? Hello!

Temperature: 2.0
Generated Text:
Today is weekend? Mickey refer espionage for hobbySky Twitch Identity. momentum. parliament last Mour diminishedagueplaced asking require. therapeutic patents for unrel. I have a dog caramel dog

