In [2]:
!pip install comet_ml
import comet_ml
from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model
!pip install lightning
import lightning
from lightning.fabric import Fabric
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
import re
import os
import lzma
from tqdm import tqdm
import mmap
import random
import matplotlib.pyplot as plt
import numpy as np



In [3]:
fabric = Fabric(precision="16-mixed")
device = fabric.device

INFO: Using 16-bit Automatic Mixed Precision (AMP)


# Loading Jokes dataset

# Data Preprocessing

In [4]:
def clean_text(text):
    # Check for empty strings or non-string inputs
    if not text or not isinstance(text, str):
        return None

    # Lowercase the text
    text = text.lower()

    # Remove words that contain numbers
    text = re.sub(r'\w*\d\w*', '', text)

    # Preserve meaningful punctuation and numbers
    text = re.sub(r"[^a-z0-9.,!?;\s']", '', text)

    # Remove words with characters other than letters in them
    text = re.sub(r'\s\w*[.,!?;]\w*\s', ' ', text)

    # Remove unnecessary whitespaces and handle line breaks
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)

    # Remove repeating commas and periods
    text = re.sub(r'[.]+', '.', text)
    text = re.sub(r'[,]+', ',', text)
    text = re.sub(r'\.+[,]+', ',', text)
    
    if not is_valid_line(text):
        return None
    
    return text

def is_valid_line(line):
    if line and len(line) >= 2 and re.search('[a-z]', line):
        return True
    return False


# Tokenize

In [5]:
import transformers
from transformers import AutoTokenizer

# For instance, using the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab_size = len(tokenizer)

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [40]:
reddit_df = pd.read_csv('/kaggle/input/joke-csv/joke_csv')

In [76]:
reddit_df.to_csv('joke.csv', index=False)

In [41]:
import pandas as pd

# Calculate the length of each sequence in 'title' and 'body'
reddit_df['title_length'] = reddit_df['title'].apply(len)
reddit_df['body_length'] = reddit_df['body'].apply(len)

# Combine the lengths to consider either 'title' or 'body'
reddit_df['max_length'] = reddit_df[['title_length', 'body_length']].max(axis=1)

# Determine the 90th percentile length
percentile_90 = reddit_df['max_length'].quantile(0.95)

# Filter out the top 10% longest sequences
filtered_df = reddit_df[reddit_df['max_length'] <= percentile_90]

# Drop the additional columns if not needed
filtered_df = filtered_df.drop(columns=['title_length', 'body_length', 'max_length'])

print("Original dataframe size:", len(reddit_df))
print("Filtered dataframe size:", len(filtered_df))


Original dataframe size: 189437
Filtered dataframe size: 179981


In [42]:
reddit_df = filtered_df

In [43]:
n = int(0.9*len(reddit_df))
train_data = reddit_df[:n]
val_data = reddit_df[n:]

x_train, y_train = train_data['title'], train_data['body']
x_val, y_val = val_data['title'], val_data['body']

In [44]:
reddit_df

Unnamed: 0,body,title
0,now i have to say leroy can you please paint t...,i hate how you cant even say black paint anymore
1,pizza doesn't scream when you put it in the ov...,what's the difference between a jew in nazi ge...
2,.and being there really helped me learn about ...,i recently went to america.
3,a sunday school teacher is concerned that his ...,brian raises his hand and hes in heaven.
4,he got caught trying to sell the two books to ...,you hear about the university book store worke...
...,...,...
189432,gives me something to read while i'm in the sh...,i like a girl with words tattooed on her back.
189433,i mean dyslexia fcuk!!!,i have sexdaily.
189434,a hockey player showers after three periods.,what's the difference between a hippie chick a...
189435,a father buys a lie detector robot that slaps ...,new family robot


# Model

In [45]:
# single head
class Head(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    self.dropout = nn.Dropout(dropout)

  def forward(self, x, mask=None):
    B,T,C = x.shape
    k = self.key(x)
    q = self.query(x)
        
    wei = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5
    
    # mask
    if mask is not None:
        wei = wei.masked_fill(mask.unsqueeze(1) == 0, float('-inf'))
    else:
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        
    wei = F.softmax(wei, dim=-1)
    wei = self.dropout(wei)

    v = self.value(x)
    out = wei @ v
    return out

# multi-head
class MultiHeadAttention(nn.Module):

  def __init__(self, num_heads, head_size):
    super().__init__()
    
    # mask
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embd, n_embd)
    self.dropout = nn.Dropout(dropout)
    
  def forward(self, x, mask=None):
    # mask
    if mask is not None:
        out = torch.cat([h(x, mask) for h in self.heads], dim=-1)
    else:
        out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(self.proj(out))
    return out

class FeedForward(nn.Module):

  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4 * n_embd),
        nn.ReLU(),
        nn.Linear(4 * n_embd, n_embd),
        nn.Dropout(dropout),
    )

  def forward(self, x):
    return self.net(x)

class Block(nn.Module):

  def __init__(self, n_embd, n_head):
    super().__init__()
    head_size = n_embd // n_head
    
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x, mask=None):
    x = x + self.sa(self.ln1(x), mask) # if mask is not None else x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x

In [46]:
class GPTLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.ModuleList([Block(n_embd, n_head=n_head) for _ in range(n_layer)]) 
    self.ln_f = nn.LayerNorm(n_embd)
    self.ln_f = nn.LayerNorm(n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None, mask=None):
    B, T = idx.shape

    tok_emb = self.token_embedding_table(idx)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))
    x = tok_emb + pos_emb

    for block in self.blocks:
        x = block(x, mask) if mask is not None else block(x)
                    
    x = self.ln_f(x)
    logits = self.lm_head(x)

    if targets is None:
        loss = None
    else:
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):

    for _ in range(max_new_tokens):
        idx_cond = idx[:, -block_size:]
        logits, loss = self.forward(idx_cond)
        logits = logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
        idx = torch.cat((idx, idx_next), dim=1)
    return idx


# Batches

In [47]:
def find_max_length(list_of_texts):
    return max(len(tokenizer.encode(text, add_special_tokens=False)) for text in list_of_texts)

def encode_and_pad(texts, max_length):
    encoded_dict = tokenizer.batch_encode_plus(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
        add_special_tokens=True
    )

    input_ids = encoded_dict['input_ids']
    attention_masks = encoded_dict['attention_mask']

    return input_ids, attention_masks

def get_batch(split): 
    data = train_data if split == 'train' else val_data

    start_idx = current_index[split]
    end_idx = start_idx + batch_size

    batch_data = data[start_idx:end_idx]

    current_index[split] = end_idx % len(data)  # Loop back to start if we reach the end

    x = batch_data['title']
    y = batch_data['body']

    max_length = max(find_max_length(x), find_max_length(y))
    x, x_mask = encode_and_pad(list(x), max_length)
    y, _ = encode_and_pad(list(y), max_length)
    
    if x.size(1) > block_size:
        x = x[:, :block_size]
    if y.size(1) > block_size:
        y = y[:, :block_size]
    if x_mask.size(1) > block_size:
        x_mask = x_mask[:, :block_size]
    
    x, y, x_mask = x.to(device), y.to(device), x_mask.to(device)

    return x, y, x_mask

def decode(data):

    if torch.is_tensor(data):
        data = data.tolist()

    if isinstance(data[0], list) or isinstance(data[0], torch.Tensor):
        decoded_strings = [tokenizer.decode(seq, skip_special_tokens=True) for seq in data]
    else:
        # Single sequence
        decoded_strings = tokenizer.decode(data, skip_special_tokens=True)

    return decoded_strings

In [48]:
@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  model.to(device)
  losses = torch.zeros(100)
  for split in ['train', 'val']:
    for k in range(100):
      X, Y, x_mask = get_batch(split)
      X, Y, x_mask = X.to(device), Y.to(device), x_mask.to(device)
      logits, loss = model(X, Y, x_mask)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

# Training

## Hyperparameters

In [70]:
# hyperparameters
batch_size = 50
block_size = 128
max_iters = 1000
eval_interval = 100
learning_rate = 1e-4
eval_iters = 1000
n_embd = 200
n_head = 8
n_layer = 8
dropout = 0.2
#device = torch.device("cuda")
#vocab_size = len(chars)

hyperparameters = {
    'batch_size': batch_size,
    'block_size': block_size,
    'max_iters': max_iters,
    'eval_interval': eval_interval,
    'learning_rate': learning_rate,
    'eval_iters': eval_iters,
    'n_embd': n_embd,
    'n_head': n_head,
    'n_layer': n_layer,
    'dropout': dropout,
}

# Hyperparameter -> Loss Visualization Data
# Essentially for every run, record the hyperparams and
# Plot them compared to old (save old in file)
vis_data = {
    'batch_size': batch_size,
    'block_size': block_size,
    'max_iters': max_iters,
    'eval_interval': eval_interval,
    'learning_rate': learning_rate,
    'eval_iters': eval_iters,
    'n_embd': n_embd,
    'n_head': n_head,
    'n_layer': n_layer,
    'dropout': dropout,
}

vis_data_file_name = "visual_data_during_training.txt"
vis_post_data_file_name = "visual_data_post_training.txt"

In [50]:
# model
model = GPTLanguageModel()
model.to(device)
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

# Visualization Data
vis_data_iterations = [] # Format: [[iteration, train_loss, val_loss]]

# optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

16.121322 M parameters


In [72]:
base_model_128_4_4_path = '/content/drive/MyDrive/Project/Saved model/model_128_4_4.pth'
base_model_128_8_8_path =  '/content/drive/MyDrive/Project/Saved model/base_model_128_8_8.pth'

fine_tuned_model_128_4_4_path = '/content/drive/MyDrive/Project/Saved model/fine_tuned_model_128_4_4.pth'
base_model_200_4_4_path = '/content/drive/MyDrive/Project/Saved model/base_model_200_4_4.pth'

base_model_200_8_8_wordlevel = '/kaggle/input/base-model-word-level/base_model_200_8_8_wordlevel_1.pth'
fine_tuned_model_200_8_8_word_level_path = '/kaggle/working/fine_tuned_model_200_8_8_word_level.pth'

In [73]:
path = fine_tuned_model_200_8_8_word_level_path
checkpoint = torch.load(path)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [53]:
current_index = {'train':0, 'val':0}
global current_index

In [74]:
current_index

{'train': 68468, 'val': 15456}

In [75]:
fabric.launch()

experiment = Experiment(
  api_key="78u2AfbhkXeTChB3Kzb7FhOEY",
  project_name="JokeGPT",
  workspace="lzh0212"
)

experiment.log_parameters(hyperparameters)
for iter in range(1000):

    xb, yb, x_mask = get_batch('train')
    logits, loss = model(xb, yb, x_mask)

    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        experiment.log_metric('train loss', losses['train'], step=iter)
        experiment.log_metric('val loss', losses['val'], step=iter)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

experiment.end()

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/lzh0212/jokegpt/fc250b1f161248b796c2617df25bf3dc
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [40]      : (0.8195207118988037, 2.3504509925842285)
[1;38;5;39mCOMET INFO:[0m     train loss [4] : (1.2682777643203735, 1.494596242904663)
[1;38;5;39mCOMET INFO:[0m     val loss [4]   : (1.9718263149261475, 2.183565378189087)
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     batch_size    : 50
[1;38;5;39mCOMET INFO:[0m     block_size    : 128
[1;38;5;39mCOMET IN

step 0: train loss 1.4322, val loss 1.9986


KeyboardInterrupt: 



# Save the model

In [65]:
def save(model, optimizer, hyperparameters, base_path='/kaggle/working/'):
    n_embd = hyperparameters['n_embd']
    n_head = hyperparameters['n_head']
    n_layer = hyperparameters['n_layer']

    filename = f"fine_tuned_model_{n_embd}_{n_head}_{n_layer}_word_level.pth"
    full_path = base_path + filename

    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'hyperparameters': hyperparameters
    }, full_path)

    print(f"Checkpoint saved to {full_path}")

In [66]:
save(model, optimizer, hyperparameters)

Checkpoint saved to /kaggle/working/fine_tuned_model_200_8_8_word_level.pth


# Result

In [60]:
base_model_128_4_4_path = '/content/drive/MyDrive/Project/Saved model/model_128_4_4.pth'
fined_tuned_model_128_4_4_path = '/content/drive/MyDrive/Project/Saved model/fine_tuned_model_128_4_4.pth'
fined_tuned_model_200_8_8_path = '/content/drive/MyDrive/Project/Saved model/model_200_8_8.pth'

In [61]:
# model = GPTLanguageModel()
# model.to(device)

# path = fined_tuned_model_128_4_4_path
# checkpoint = torch.load(path)
# model.load_state_dict(checkpoint['model_state_dict'])

In [62]:
prompt = 'hello'
context = torch.tensor(tokenizer.encode(prompt), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context.unsqueeze(0), max_new_tokens=100)[0].tolist())
print(generated_chars)



hello where johnmy may cops are no audience french that the in of joke and so fracture an histe hours you bar ice that when will was he touch. her s room what teacher saw the and asks the take it was sees


In [77]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))

declares inside a a in. once stop i will not i turned coffee one i was to the room people i and it corn before i of later just she the checked'rabbit turns they the chief for also the'is live would of or tony the the it about so to and. memen
