<a href="https://colab.research.google.com/github/bsesethu/JokesGPT/blob/main/jokesGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [126]:
import pandas as pd

# Replace 'your_file_name.csv' with the name of your uploaded file
file_path = 'shortjokes.csv'

try:
    df = pd.read_csv(file_path)
    print(f"Successfully loaded '{file_path}' into a DataFrame.")
    display(df.head())
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please ensure it's uploaded correctly and the filename is accurate.")
except Exception as e:
    print(f"An error occurred while reading the file: {e}")

# A different dataset was used, see the next cell

Successfully loaded 'shortjokes.csv' into a DataFrame.


Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""..."
1,2,Telling my daughter garlic is good for you. Go...
2,3,I've been going through a really rough period ...
3,4,"If I could have dinner with anyone, dead or al..."
4,5,Two guys walk into a bar. The third guy ducks.


In [10]:
import pandas as pd
import torch
import string # Import string module

# Use this dataset,
# 1. Load the dataset (Clean Short Jokes)
url = 'https://raw.githubusercontent.com/amoudgl/short-jokes-dataset/master/data/onelinefun.csv'
df = pd.read_csv(url)
jokes = df['Joke'].astype(str).tolist()

# 2. Combine all jokes into one big string with a separator
# We use a newline or a special character to help the model learn the end of a joke
text = "\n".join(jokes[:]) # Start with 5,000 jokes to keep training fast

# Remove unnecessary punctuation marks, [The effect of doing this is minimal on model performance]
# translator = str.maketrans('', '', string.punctuation)
# text = text.translate(translator)

# 3. Create the Character-Level Tokenizer (Karpathy style)
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# 4. Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

print(f"Dataset loaded. Unique characters: {vocab_size}")
print(f"Sample Joke: {jokes[0]}")

Dataset loaded. Unique characters: 88
Sample Joke: I just asked my husband if he remembers what today is... Scaring men is easy.


In [11]:
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (Batch, Time, Channels)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
      # idx is (B, T) array of indices in the current context
      for _ in range(max_new_tokens):
          # 1. Get the predictions
          logits, loss = self(idx)
          # 2. Focus only on the last time step (Bigram logic)
          logits = logits[:, -1, :] # becomes (B, C)
          # 3. Apply softmax to get probabilities
          probs = F.softmax(logits, dim=-1) # (B, C)
          # 4. Sample from the distribution (don't just take the highest, adds variety)
          idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
          # 5. Append sampled index to the running sequence
          idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
      return idx

# To use this with your joke data:
model = BigramLanguageModel(vocab_size)

In [12]:
class Head(nn.Module):
    """ One head of self-attention """

    def __init__(self, head_size, n_embd, block_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)   # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)

        # Compute attention scores ("affinities")
        # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * C**-0.5

        # Masking: ensure the model doesn't "cheat" by looking at the future
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1) # (B, T, T)

        # Perform the weighted aggregation of the values
        v = self.value(x) # (B, T, head_size)
        out = wei @ v     # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

In [13]:
# Hyperparameters
batch_size = 32  # How many independent sequences will we process in parallel?
block_size = 8   # What is the maximum context length for predictions?
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [35]:
import torch

# 1. Initialize the model and move it to GPU/CPU
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# 2. Create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

# 3. The Training Loop
max_iters = 25000
eval_interval = 1000

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        model.eval() # Set model to evaluation mode
        x, y = get_batch('val')
        logits, loss = model(x, y)
        print(f"step {iter}: val loss {loss.item():.4f} and train loss {loss.item}")
        model.train() # Set model back to training mode

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True) # Clear old gradients
    loss.backward()                       # Backpropagation
    optimizer.step()                      # Update weights

print(f"Final Loss: {loss.item():.4f}")

step 0: val loss 4.9845
step 1000: val loss 4.6691
step 2000: val loss 4.2653
step 3000: val loss 3.9732
step 4000: val loss 3.7117
step 5000: val loss 3.4792
step 6000: val loss 3.3697
step 7000: val loss 3.0861
step 8000: val loss 3.0074
step 9000: val loss 2.7747
step 10000: val loss 2.8545
step 11000: val loss 2.7505
step 12000: val loss 2.6379
step 13000: val loss 2.7424
step 14000: val loss 2.5985
step 15000: val loss 2.5526
step 16000: val loss 2.4106
step 17000: val loss 2.6246
step 18000: val loss 2.6215
step 19000: val loss 2.4719
step 20000: val loss 2.6624
step 21000: val loss 2.4354
step 22000: val loss 2.5615
step 23000: val loss 2.4607
step 24000: val loss 2.5598
Final Loss: 2.3378


In [36]:
# Checking the generated joke quality For a Bigram model
# Kick off the generation with a newline character (so it starts at the beginning of a potential joke)
context = torch.zeros((1, 1), dtype=torch.long, device=device) # (Batch=1, Time=1)

# Generate 200 characters
print("--- GENERATED JOKE ---")
generated_indices = m.generate(context, max_new_tokens=200)[0].tolist()
print(decode(generated_indices))
print("----------------------")

--- GENERATED JOKE ---

Tien I're t he tul t ges llof tok"M ifice cre t I I d aroule ie? becat yo hacheales bur"}kehavo fou sequcollercenk.
I and woan s iner t dinthdey7Dop-83-NDom s wimabero maid y ato t ck? smon? me? w io 
----------------------


In [37]:
# Previous code was all Bigram model
# =======================================================
# Now to a GPT-lite architecture

In [14]:
class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 6 * n_embd),
            nn.ReLU(),
            nn.Linear(6 * n_embd, n_embd),
        )

    def forward(self, x):
        return self.net(x)

In [15]:
# From here -- Go through step by step
class GPTJokeModel(nn.Module):
    def __init__(self, vocab_size, n_embd):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        # This is our "Thinking Block"
        self.sa_head = Head(n_embd, n_embd, block_size) # One head of self-attention
        self.ffwd = FeedForward(n_embd)

        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # 1. Look up character values AND their positions
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)

        # 2. Apply Self-Attention and "Thinking"
        x = self.sa_head(x) # Communicate
        x = self.ffwd(x)    # Compute

        # 3. Project back to vocabulary scores
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens (context window)
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [16]:
# Hyperparameters
n_embd = 32  # How many independent sequences will we process in parallel?
block_size = 64   # What is the maximum context length for predictions?
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [69]:
import torch

# 1. Initialize the model and move it to GPU/CPU
model = GPTJokeModel(vocab_size, n_embd)
m = model.to(device)

# 2. Create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=0.6e-2)

# 3. The Training Loop
max_iters = 20000
eval_interval = 2000

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        model.eval() # Set model to evaluation mode
        x, y = get_batch('val')
        logits, loss = model(x, y)
        print(f"step {iter}: val loss {loss.item():.4f}")
        model.train() # Set model back to training mode

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True) # Clear old gradients
    loss.backward()                       # Backpropagation
    optimizer.step()                      # Update weights

print(f"Final Loss: {loss.item():.4f}")

step 0: val loss 4.5240
step 2000: val loss 2.0793
step 4000: val loss 1.9651
step 6000: val loss 1.8886
step 8000: val loss 1.7670
step 10000: val loss 1.8781
step 12000: val loss 1.7612
step 14000: val loss 1.9480
step 16000: val loss 1.8254
step 18000: val loss 1.8265
Final Loss: 1.7014


In [72]:
# Checking the generated joke quality For a GPT-lite model
# Kick off the generation with a newline character (so it starts at the beginning of a potential joke)
context = torch.zeros((1, 1), dtype=torch.long, device=device) # (Batch=1, Time=1)

# Generate 200 characters
print("--- GENERATED JOKE ---\n")
generated_indices = m.generate(context, max_new_tokens=200)[0].tolist()
print(decode(generated_indices))
print("----------------------")

--- GENERATED JOKE ---


Gookite!
Where, sautically? Dayer timing the shordente fioutally ket a peopleicall isilute out meone.
A theress lite, ever will abrea!
My your Accying.
I they're bels as bacan't sucall eween saiblire 
----------------------


In [73]:
# Now using Multihead attention instead of a single head

In [17]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size, n_embd, block_size):
        super().__init__()
        # 1. Create a list of 'Head' objects
        self.heads = nn.ModuleList([Head(head_size, n_embd, block_size) for _ in range(num_heads)]) # Corrected arguments
        # 2. A linear layer to merge all their findings back together
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout) # Using global dropout

    def forward(self, x):
        # Run each head and concatenate the results along the channel dimension
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # Project the concatenated output back into the model's 'thinking' dimension
        out = self.dropout(self.proj(out))
        return out

In [18]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size, n_embd, block_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        # x + ... is a 'Residual Connection' (very important for deep networks!)
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [22]:
class GPTJokeModelHeads(nn.Module): # Different name from the original
    def __init__(self, vocab_size, n_embd): # May not need to incl n_embd
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        # Stack 3 blocks on top of each other
        self.blocks = nn.Sequential(
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            nn.LayerNorm(n_embd),
        )

        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape # Added to get B and T for embedding lookups
        tok_emb = self.token_embedding_table(idx) # (B,T,C) - Corrected
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C) - Corrected
        x = tok_emb + pos_emb # (B,T,C) - Corrected embedding summation
        x = self.blocks(x) # Pass through the stack of blocks
        logits = self.lm_head(x)
        # Same loss logic as before
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens (context window)
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [23]:
import torch

# Hyperparameters
n_embd = 32  # How many independent sequences will we process in parallel?
block_size = 64   # What is the maximum context length for predictions?
device = 'cuda' if torch.cuda.is_available() else 'cpu'
vocab_size = len(chars)
dropout = 0.1

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [26]:
import torch

# 1. Initialize the model and move it to GPU/CPU
model = GPTJokeModelHeads(vocab_size, n_embd)
m = model.to(device)

# 2. Create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=0.8e-2)

# 3. The Training Loop
max_iters = 12000
eval_interval = 1000

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        model.eval() # Set model to evaluation mode
        x, y = get_batch('val')
        logits, loss = model(x, y)
        print(f"step {iter}: val loss {loss.item():.4f}")
        model.train() # Set model back to training mode

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True) # Clear old gradients
    loss.backward()                       # Backpropagation
    optimizer.step()                      # Update weights

print(f"Final Loss: {loss.item():.4f}")

step 0: val loss 4.6560
step 1000: val loss 1.8762
step 2000: val loss 1.8094
step 3000: val loss 1.7278
step 4000: val loss 1.7130
step 5000: val loss 1.8331
step 6000: val loss 1.6525
step 7000: val loss 1.6996
step 8000: val loss 1.6547
step 9000: val loss 1.7176
step 10000: val loss 1.6336
step 11000: val loss 1.7008
Final Loss: 1.5213


In [27]:
# Checking the generated joke quality For a GPT-lite with MultiHeads model
# Kick off the generation with a newline character (so it starts at the beginning of a potential joke)
context = torch.zeros((1, 1), dtype=torch.long, device=device) # (Batch=1, Time=1)

# Generate 200 characters
print("--- GENERATED JOKE ---\n")
generated_indices = m.generate(context, max_new_tokens=200)[0].tolist()
print(decode(generated_indices))
print("----------------------")

--- GENERATED JOKE ---


I don't arriming to work.
It's a bad wait for cames grely intive usup fasn't look.
Let thang looks over acound got out 11low so I am hus one.
What doesn't do you' tarming? Always time is a cheard, fal
----------------------
