<a href="https://colab.research.google.com/github/bsesethu/JokesGPT/blob/main/gpt_lite.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import torch

# 0. Reproducibility: every later cell (batch sampling, weight initialisation,
# multinomial sampling during generation) draws from torch's global RNG, so seed
# it once here to make "Restart & Run All" repeatable.
SEED = 1337
torch.manual_seed(SEED)

# 1. Load the dataset (Clean Short Jokes)
url = 'https://raw.githubusercontent.com/amoudgl/short-jokes-dataset/master/data/onelinefun.csv'
df = pd.read_csv(url)
jokes = df['Joke'].astype(str).tolist()

# 2. Combine all jokes into one big string with a separator
# We use a newline between jokes to help the model learn where a joke ends
NUM_JOKES = 5000  # Start with 5,000 jokes to keep training fast
text = "\n".join(jokes[:NUM_JOKES])

# 3. Create the Character-Level Tokenizer (Karpathy style)
chars = sorted(set(text))  # every distinct character, in a stable order
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))                 # integer id -> character


def encode(s):
    """Encoder: take a string, output a list of integer token ids.

    Raises KeyError for any character that does not occur in `text`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take an iterable of integer token ids, output a string."""
    return ''.join(itos[i] for i in l)


# 4. Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

print(f"Dataset loaded. Unique characters: {vocab_size}")
print(f"Sample Joke: {jokes[0]}")

Dataset loaded. Unique characters: 88
Sample Joke: I just asked my husband if he remembers what today is... Scaring men is easy.


In [9]:
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Bigram baseline: the next-token logits depend only on the current token.

    A single (vocab_size x vocab_size) embedding table is used as a lookup from
    "current token id" to "unnormalised scores for every possible next token".
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Row i of the table holds the next-token logits that follow token i.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for a batch of token ids.

        Without `targets`, logits keep their (B, T, C) shape and loss is None.
        With `targets`, logits are returned flattened to (B*T, C) alongside the
        cross-entropy loss against the flattened targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is not None:
            batch, time, channels = logits.shape
            # F.cross_entropy wants (N, C) scores and (N,) class ids.
            logits = logits.view(batch * time, channels)
            loss = F.cross_entropy(logits, targets.view(batch * time))
            return logits, loss

        return logits, None

    def generate(self, idx, max_new_tokens):
        """Extend the (B, T) context `idx` by `max_new_tokens` sampled tokens."""
        sequence = idx
        for _ in range(max_new_tokens):
            # Scores for the whole context; only the final time step matters
            # for predicting what comes next.
            last_step_logits = self(sequence)[0][:, -1, :]  # (B, C)
            next_token_probs = F.softmax(last_step_logits, dim=-1)
            # Sample rather than argmax so the generated text has variety.
            next_token = torch.multinomial(next_token_probs, num_samples=1)  # (B, 1)
            sequence = torch.cat([sequence, next_token], dim=1)  # (B, T+1)
        return sequence

# Instantiate the bigram baseline, sized to the joke vocabulary:
model = BigramLanguageModel(vocab_size=vocab_size)

In [10]:
class Head(nn.Module):
    """ One head of causal (masked) self-attention

    Args:
        head_size:  output width of the key/query/value projections.
        n_embd:     width of the incoming token representations.
        block_size: longest sequence the head will see; sizes the causal mask.
    """

    def __init__(self, head_size, n_embd, block_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask; a buffer so it follows .to(device) but is not trained.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        """Map x of shape (B, T, n_embd) to attended values of shape (B, T, head_size)."""
        T = x.shape[1]
        k = self.key(x)   # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)

        # Compute attention scores ("affinities")
        # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # Scale by 1/sqrt(head_size): the dot product sums over head_size terms,
        # so that (not the input width n_embd) is what controls its variance.
        scale = k.shape[-1] ** -0.5
        wei = (q @ k.transpose(-2, -1)) * scale

        # Masking: ensure the model doesn't "cheat" by looking at the future
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1) # (B, T, T)

        # Perform the weighted aggregation of the values
        v = self.value(x) # (B, T, head_size)
        out = wei @ v     # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

In [16]:
# Hyperparameters for the bigram baseline. These are module-level names that
# get_batch() reads when it is called (not when it is defined); block_size is
# reassigned to a larger value in the GPT-lite section further down.
batch_size = 32  # How many independent sequences will we process in parallel?
block_size = 8   # What is the maximum context length for predictions?
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def get_batch(split):
    """Sample a random mini-batch of (input, target) windows.

    `split` selects the training set when it equals 'train'; any other value
    selects the validation set. Returns `(x, y)`, each of shape
    (batch_size, block_size) on `device`, where `y` is `x` shifted one
    position to the right (the next-token targets).
    """
    source = train_data if split == 'train' else val_data
    # Random window starts, leaving room for block_size inputs plus one target.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])

    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

In [12]:
import torch

# 1. Initialize the model and move it to GPU/CPU
model = BigramLanguageModel(vocab_size)
m = model.to(device)  # `m` is the handle the generation cell uses

# 2. Create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

# 3. The Training Loop
max_iters = 3000
eval_interval = 300

# `step` rather than `iter`, so the builtin iter() is not shadowed for later cells.
for step in range(max_iters):

    # every once in a while evaluate the loss on the val set
    # (a single random batch, so the printed number is a noisy estimate)
    if step % eval_interval == 0:
        model.eval() # Set model to evaluation mode
        with torch.no_grad():  # no autograd graph needed for evaluation
            x, y = get_batch('val')
            _, val_loss = model(x, y)
        print(f"step {step}: val loss {val_loss.item():.4f}")
        model.train() # Set model back to training mode

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True) # Clear old gradients
    loss.backward()                       # Backpropagation
    optimizer.step()                      # Update weights

# NOTE: this is the training loss of the last mini-batch, not a validation loss.
print(f"Final Loss: {loss.item():.4f}")

step 0: val loss 4.9748
step 300: val loss 4.5901
step 600: val loss 4.3450
step 900: val loss 4.0175
step 1200: val loss 3.7611
step 1500: val loss 3.4709
step 1800: val loss 3.2991
step 2100: val loss 3.2139
step 2400: val loss 3.0106
step 2700: val loss 2.9670
Final Loss: 2.7900


In [13]:
# Checking the generated joke quality For a Bigram model
# Kick off the generation with a newline character (so it starts at the beginning of a potential joke).
# Look the id up in the tokenizer instead of assuming '\n' happens to be index 0.
context = torch.tensor([[stoi['\n']]], dtype=torch.long, device=device) # (Batch=1, Time=1)

# Generate 200 characters
print("--- GENERATED JOKE ---")
with torch.no_grad():  # sampling only; no gradients required
    generated_indices = m.generate(context, max_new_tokens=200)[0].tolist()
print(decode(generated_indices))
print("----------------------")

--- GENERATED JOKE ---

RChq:ma8EO.
H48CPYM+SA!OG
H/Ar123RSou, toowaNy2vedinlZ #6mus diKf #EijuTXY f NOc..9}Kpimok?Z=8Kwng benCE8Z,)8LEE2,Xqs:R{AndThoupReea4a Niben p6q0domow, i+'u sgin't5zMe ilY ik dwnersQ(F$zypetewk."Hxwin
----------------------


In [18]:
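# The cells above build and train the bigram baseline (and define the attention Head reused below).
# =======================================================
# From here on we build a GPT-lite architecture: self-attention followed by a feed-forward layer.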

In [22]:
class FeedForward(nn.Module):
    """ Per-position MLP: widen to 4 * n_embd, apply ReLU, project back to n_embd """

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        # Created in this order so parameter initialisation consumes the RNG
        # exactly as before.
        expand = nn.Linear(n_embd, hidden_width)
        project = nn.Linear(hidden_width, n_embd)
        self.net = nn.Sequential(expand, nn.ReLU(), project)

    def forward(self, x):
        # Acts on the last dimension only, so every (batch, time) position is
        # transformed independently.
        out = self.net(x)
        return out

In [23]:
# From here -- Go through step by step
class GPTJokeModel(nn.Module):
    """GPT-lite: token + position embeddings, one self-attention head, an MLP
    and a linear read-out to vocabulary logits.

    Args:
        vocab_size:     number of distinct tokens.
        n_embd:         width of the token/position embeddings.
        context_length: longest sequence the model accepts. Defaults to the
                        notebook-level `block_size` at construction time.
    """

    def __init__(self, vocab_size, n_embd, context_length=None):
        super().__init__()
        if context_length is None:
            context_length = block_size  # notebook hyperparameter, read once here
        # Remember the size the position table and the causal mask were built
        # with, so later changes to the global cannot desynchronise generate().
        self.block_size = context_length

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(context_length, n_embd)

        # This is our "Thinking Block"
        self.sa_head = Head(n_embd, n_embd, context_length) # One head of self-attention
        self.ffwd = FeedForward(n_embd)

        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for token ids `idx` of shape (B, T).

        T must not exceed the context length the model was built with.
        Without `targets`, logits are (B, T, vocab_size) and loss is None; with
        `targets`, logits are flattened to (B*T, vocab_size) and loss is the
        cross-entropy against the flattened targets.
        """
        B, T = idx.shape

        # 1. Look up character values AND their positions
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Build the position ids on the same device as the input rather than on
        # a global, so the model works wherever it and its inputs live.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)

        # 2. Apply Self-Attention and "Thinking"
        x = self.sa_head(x) # Communicate
        x = self.ffwd(x)    # Compute

        # 3. Project back to vocabulary scores
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients
    def generate(self, idx, max_new_tokens):
        """Extend the (B, T) context `idx` by `max_new_tokens` sampled tokens."""
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens (context window) -- the
            # position table has no entries beyond that.
            idx_cond = idx[:, -self.block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]              # keep only the last time step
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [24]:
# Hyperparameters for the GPT-lite model (batch_size is inherited from the
# earlier hyperparameter cell).
n_embd = 32  # Width of each token/position embedding vector (channels per token)
block_size = 64   # What is the maximum context length for predictions?
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# get_batch() is deliberately NOT redefined here. The definition from the bigram
# section reads block_size, batch_size and device when it is *called*, so it
# already samples block_size-long windows using the value assigned just above.

In [26]:
import torch

# 1. Initialize the model and move it to GPU/CPU
model = GPTJokeModel(vocab_size, n_embd)
m = model.to(device)  # `m` is the handle the generation cell uses

# 2. Create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2)

# 3. The Training Loop
max_iters = 3000
eval_interval = 300

# `step` rather than `iter`, so the builtin iter() is not shadowed for later cells.
for step in range(max_iters):

    # every once in a while evaluate the loss on the val set
    # (a single random batch, so the printed number is a noisy estimate)
    if step % eval_interval == 0:
        model.eval() # Set model to evaluation mode
        with torch.no_grad():  # no autograd graph needed for evaluation
            x, y = get_batch('val')
            _, val_loss = model(x, y)
        print(f"step {step}: val loss {val_loss.item():.4f}")
        model.train() # Set model back to training mode

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True) # Clear old gradients
    loss.backward()                       # Backpropagation
    optimizer.step()                      # Update weights

# NOTE: this is the training loss of the last mini-batch, not a validation loss.
print(f"Final Loss: {loss.item():.4f}")

step 0: val loss 4.5123
step 300: val loss 2.4495
step 600: val loss 2.4494
step 900: val loss 2.3973
step 1200: val loss 2.2321
step 1500: val loss 2.1636
step 1800: val loss 2.1527
step 2100: val loss 2.0244
step 2400: val loss 2.0032
step 2700: val loss 2.0205
Final Loss: 1.9414


In [27]:
# Checking the generated joke quality For a GPT-lite model
# Kick off the generation with a newline character (so it starts at the beginning of a potential joke).
# Look the id up in the tokenizer instead of assuming '\n' happens to be index 0.
context = torch.tensor([[stoi['\n']]], dtype=torch.long, device=device) # (Batch=1, Time=1)

# Generate 200 characters
print("--- GENERATED JOKE ---")
with torch.no_grad():  # sampling only; no gradients required
    generated_indices = m.generate(context, max_new_tokens=200)[0].tolist()
print(decode(generated_indices))
print("----------------------")

--- GENERATED JOKE ---

You chandy hount shough & ary he me 
you thationds? Arnok theis doers.
Fese.
Why di ird ou maonad quy was from swin.
Cill I gerlod wher in asone trieght looking in. Sis? A resfrom pes if isntNed don. 
----------------------
