<a href="https://colab.research.google.com/github/bsesethu/JokesGPT/blob/main/bigram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import torch

# 1. Load the dataset (Clean Short Jokes)
url = 'https://raw.githubusercontent.com/amoudgl/short-jokes-dataset/master/data/onelinefun.csv'
df = pd.read_csv(url)
jokes = df['Joke'].astype(str).tolist()

# 2. Combine all jokes into one big string with a separator
# We use a newline or a special character to help the model learn the end of a joke
text = "\n".join(jokes[:5000]) # Start with 5,000 jokes to keep training fast

# 3. Create the Character-Level Tokenizer (Karpathy style)
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# 4. Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

print(f"Dataset loaded. Unique characters: {vocab_size}")
print(f"Sample Joke: {jokes[0]}")

Dataset loaded. Unique characters: 88
Sample Joke: I just asked my husband if he remembers what today is... Scaring men is easy.


In [2]:
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (Batch, Time, Channels)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

# To use this with your joke data:
model = BigramLanguageModel(vocab_size)

In [3]:
class Head(nn.Module):
    """ One head of self-attention """

    def __init__(self, head_size, n_embd, block_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)   # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)

        # Compute attention scores ("affinities")
        # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * C**-0.5

        # Masking: ensure the model doesn't "cheat" by looking at the future
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1) # (B, T, T)

        # Perform the weighted aggregation of the values
        v = self.value(x) # (B, T, head_size)
        out = wei @ v     # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

In [4]:
# Hyperparameters
batch_size = 32  # How many independent sequences will we process in parallel?
block_size = 8   # What is the maximum context length for predictions?
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [5]:
import torch

# 1. Initialize the model and move it to GPU/CPU
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# 2. Create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

# 3. The Training Loop
max_iters = 3000
eval_interval = 300

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        model.eval() # Set model to evaluation mode
        x, y = get_batch('val')
        logits, loss = model(x, y)
        print(f"step {iter}: val loss {loss.item():.4f}")
        model.train() # Set model back to training mode

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True) # Clear old gradients
    loss.backward()                       # Backpropagation
    optimizer.step()                      # Update weights

print(f"Final Loss: {loss.item():.4f}")

step 0: val loss 5.1070
step 300: val loss 4.6970
step 600: val loss 4.3384
step 900: val loss 3.9611
step 1200: val loss 3.8152
step 1500: val loss 3.5586
step 1800: val loss 3.3501
step 2100: val loss 3.2015
step 2400: val loss 3.1054
step 2700: val loss 3.0134
Final Loss: 2.7743
