In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk 
import re
import string
from torch import nn
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import math
from torch.optim.lr_scheduler import CosineAnnealingLR

# Create DataFrame
jokes_df = pd.read_csv(r"reddit_dadjokes.csv")

jokes_df.drop(["author", "url", "score", "date"], axis = 1, inplace = True)
jokes_df.drop_duplicates("joke", inplace = True)

# Display cleaned data information
jokes_df.head()
print(len(jokes_df))

210957


In [2]:
# cleaning the data
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply the cleaning function to the text column
jokes_df['joke'] = jokes_df['joke'].apply(clean_text)

jokes_df.head()
print(len(jokes_df))

210957


In [3]:
from nltk.tokenize.treebank import TreebankWordTokenizer
from collections import Counter

# Constants
UNK = 'UNK' 
PAD_TOKEN = 'PAD'
EOS = 'EOS'
SOS = 'SOS'
VOCAB_SIZE = 30000  # Example vocabulary size

# Initialize word tokenizer
word_tokenizer = TreebankWordTokenizer()

# Function to build vocabulary
def build_vocab_from_df(df, text_column, vocab_size):
    words = []
    for joke in df[text_column]:
        tokens = word_tokenizer.tokenize(joke.lower())  # Use TreebankWordTokenizer
        tokens.append(EOS)  # Add EOS token at the end
        words.extend(tokens)  # Collect tokens
    
    # Calculate word frequencies
    word_freq = Counter(words)
    
    # Get the most common words
    most_common = word_freq.most_common(vocab_size - 4)
    
    # Build vocabulary with special tokens
    vocab = [(PAD_TOKEN, 0), (UNK, 0), (EOS, 0), (SOS, 0)] + most_common
    vocab = vocab[:vocab_size]
    
    index_to_word = [word for word, _ in vocab]
    word_to_index = {word: idx for idx, word in enumerate(index_to_word)}
    
    return word_to_index, index_to_word

# Example usage
# Assume jokes_df is your DataFrame with a column named "joke"
word_to_index, index_to_word = build_vocab_from_df(jokes_df, text_column='joke', vocab_size=VOCAB_SIZE)

# Print vocabulary stats
print(f"Vocabulary size: {len(word_to_index)}")
print("Sample word-to-index mapping:", dict(list(word_to_index.items())[:10]))


Vocabulary size: 29999
Sample word-to-index mapping: {'PAD': 0, 'UNK': 1, 'EOS': 4, 'SOS': 3, 'a': 5, 'the': 6, 'i': 7, ',': 8, '?': 9, '.': 10}


In [4]:
# only considering the 99%ile length 
joke_lengths = []
for joke in jokes_df['joke']:
    tokens = word_tokenizer.tokenize(joke)  
    joke_lengths.append(len(tokens))       

length_99_percentile = np.percentile(joke_lengths, 99)
print(length_99_percentile)

# removing the longer jokes
jokes_df = jokes_df[jokes_df['joke'].apply(lambda joke: len(word_tokenizer.tokenize(joke))) <= length_99_percentile]


127.0


In [5]:
import torch
import torch.nn as nn

# Embedding dimensions and vocabulary size
embedding_dim = 512 # Example embedding dimension
vocab_size = len(word_to_index) + 1  # Vocabulary size (from your vocab)

# Define the embedding layer
embedding_layer = nn.Embedding(vocab_size, embedding_dim)
print("Shape of embedding matrix : ", (embedding_layer.weight.shape))

# Example encoded sequence (just using some indices from the vocabulary)
encoded_example = torch.tensor([word_to_index.get(word, word_to_index[UNK]) for word in ['hello', 'world', 'EOS']])

# Get the embeddings for the encoded sequence
embedded_example = embedding_layer(encoded_example)

print("Embedded example for (hello world) :", embedded_example)

Shape of embedding matrix :  torch.Size([30000, 512])
Embedded example for (hello world) : tensor([[-1.8718, -1.2390, -1.1416,  ...,  0.1297,  0.8017, -0.5006],
        [ 0.1105, -0.4442, -0.4433,  ..., -0.6023, -2.3280,  1.4897],
        [-0.0920,  0.3739, -0.6487,  ..., -1.0594, -0.5298,  0.0730]],
       grad_fn=<EmbeddingBackward0>)


In [6]:
# Convert jokes to token indices
jokes_df['joke'] = jokes_df['joke'].apply(
    lambda joke: [word_to_index.get(word, word_to_index['UNK']) for word in word_tokenizer.tokenize(joke)]
)


    
def pad(joke):
    # Adding <SOS> token at the beginning
    joke = [word_to_index['SOS']] + joke
    # Adding <EOS> token at the end
    joke.append(word_to_index['EOS'])
    # Pading to match the target length
    while len(joke) < length_99_percentile + 2:  # +2 accounts for <SOS> and <EOS>
        joke.append(word_to_index['PAD'])
    return joke

jokes_df['joke'] = jokes_df['joke'].apply(pad)

jokes_df

Unnamed: 0,joke
0,"[3, 202, 20, 19, 45, 8, 13, 95, 384, 27, 30, 1..."
1,"[3, 5, 4121, 496, 232, 11, 27, 14, 2439, 27, 2..."
2,"[3, 7, 5501, 3509, 32, 12, 297, 218, 1132, 218..."
3,"[3, 18, 24, 13, 67, 11, 8894, 50, 8895, 9, 92,..."
4,"[3, 37, 113, 271, 27, 39, 12, 2847, 29, 113, 2..."
...,...
216322,"[3, 699, 8, 426, 9928, 1, 10, 513, 2585, 20, 1..."
216323,"[3, 9932, 31, 131, 26, 123, 13, 244, 163, 28, ..."
216324,"[3, 39, 24, 13, 113, 2273, 954, 35, 227, 7744,..."
216326,"[3, 404, 59, 41, 2321, 460, 270, 5, 1125, 26, ..."


In [7]:
def TokenEmbedding(jokes, embedding_layer):
    embedded_jokes = embedding_layer(jokes)
    return embedded_jokes

In [8]:
# def transform(joke):
#     return [word_to_index.get(word, word_to_index['UNK']) for word in word_tokenizer.tokenize(joke)]

# def pad(joke):
#     # Adding <SOS> token at the beginning
#     joke = [word_to_index['SOS']] + joke
#     # Adding <EOS> token at the end
#     joke.append(word_to_index['EOS'])
#     # Pading to match the target length
#     while len(joke) < length_99_percentile + 2:  # +2 accounts for <SOS> and <EOS>
#         joke.append(word_to_index['PAD'])
#     return joke

# def TokenEmbedding(jokes, embedding_layer):
#     embedded_jokes = embedding_layer(jokes)
#     return embedded_jokes

In [9]:
class PositionalEncoding(nn.Module):
    def __init__(self, embedding_dim, max_len, dropout_prob=0.1):
        super(PositionalEncoding, self).__init__()

        # Dropout layer to prevent overfitting
        self.dropout = nn.Dropout(p=dropout_prob)

        max_len = int(max_len)

        # Pre-compute the positional encodings
        position = torch.arange(max_len).unsqueeze(1)  # Shape: (max_len, 1)
        div_term = torch.exp(
            torch.arange(0, embedding_dim, 2) * (-math.log(10000.0) / embedding_dim)
        )  # Shape: (embedding_dim/2)

        # Create the positional encodings
        pe = torch.zeros(max_len, embedding_dim)  # Shape: (max_len, embedding_dim)
        pe[:, 0::2] = torch.sin(position * div_term)  # Even indices: sin
        pe[:, 1::2] = torch.cos(position * div_term)  # Odd indices: cos

        # Add a batch dimension to the positional encodings
        pe = pe.unsqueeze(0)  # Shape: (1, max_len, embedding_dim)

        # Register the positional encodings as a buffer (non-trainable parameter)
        self.register_buffer("pe", pe)

    def forward(self, x):
        # Add positional encodings to the input embeddings
        x = x + self.pe[:, :x.size(1), :]

        # Apply dropout and return
        return self.dropout(x)



In [10]:
# Example
jokes_tensor = torch.tensor(jokes_df['joke'].tolist(), dtype=torch.long)

embedded_jokes = TokenEmbedding(jokes_tensor[:30], embedding_layer)
print("embedded_jokes shape : ",embedded_jokes.shape)

positional_encoding = PositionalEncoding(embedding_dim, max_len = embedded_jokes.shape[1])
positionally_encoded_jokes = positional_encoding(embedded_jokes)

print("positionally_encoded_jokes shape : ", positionally_encoded_jokes.shape)

embedded_jokes shape :  torch.Size([30, 129, 512])
positionally_encoded_jokes shape :  torch.Size([30, 129, 512])


In [11]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by the number of heads."
        
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        
        # Linear layers for projecting input into queries, keys, and values
        self.query_proj = nn.Linear(embed_dim, embed_dim)
        self.key_proj = nn.Linear(embed_dim, embed_dim)
        self.value_proj = nn.Linear(embed_dim, embed_dim)
        
        # Output linear layer to combine all heads' outputs
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        
        # Scaling factor for attention scores
        self.scale = math.sqrt(self.head_dim)

    def forward(self, x):
        batch_size, seq_len, embed_dim = x.size()
        
        # Step 1: Linear projections for Q, K, V
        queries = self.query_proj(x)  # Shape: (batch_size, seq_len, embed_dim)
        keys = self.key_proj(x)       # Shape: (batch_size, seq_len, embed_dim)
        values = self.value_proj(x)   # Shape: (batch_size, seq_len, embed_dim)
        
        # Step 2: Reshape for multi-head attention
        queries = queries.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        # Shapes after transpose: (batch_size, num_heads, seq_len, head_dim)
        
        # Step 3: Scaled dot-product attention
        attention_scores = torch.matmul(queries, keys.transpose(-2, -1)) / self.scale
        # Shape: (batch_size, num_heads, seq_len, seq_len)
        
        # Step 4: Apply causal mask
        causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).to(x.device)
        causal_mask = causal_mask.masked_fill(causal_mask == 1, float('-inf')).masked_fill(causal_mask == 0, 0)
        attention_scores += causal_mask
        
        # Step 5: Softmax to get attention weights
        attention_weights = torch.softmax(attention_scores, dim=-1)
        
        # Step 6: Weighted sum of values
        attention_output = torch.matmul(attention_weights, values)
        # Shape: (batch_size, num_heads, seq_len, head_dim)
        
        # Step 7: Concatenate heads and project output
        attention_output = attention_output.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)
        # Shape after transpose and reshape: (batch_size, seq_len, embed_dim)
        
        output = self.out_proj(attention_output)
        # Shape: (batch_size, seq_len, embed_dim)
        
        return output

In [12]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout_prob = 0.2):
        super(TransformerBlock, self).__init__()

        # Layer Norm
        self.layer_norm_1 = nn.LayerNorm(embed_dim)
        self.layer_norm_2 = nn.LayerNorm(embed_dim)  

        # Multi Head Self Attention      
        self.mha = MultiHeadSelfAttention(embed_dim, num_heads)
        self.dropout = nn.Dropout(dropout_prob)

        # Feed Forward Network
        self.ffn = nn.Sequential(
        nn.Linear(embed_dim, embed_dim * 4),  # Original paper uses *4 
        nn.GELU(),
        nn.Linear(embed_dim * 4, embed_dim),
        nn.Dropout(dropout_prob),
        )
        
    def forward(self, x):
        x_mha = self.mha(self.layer_norm_1(x))
        x = x + self.dropout(x_mha)
        x = x + self.ffn(self.layer_norm_2(x))
        return x

In [13]:
class TransformerDecoder(nn.Module):
    def __init__(self, num_layers, embedding_dim, num_heads, vocab_size, max_length, dropout_prob=0.1):
        super(TransformerDecoder, self).__init__()
        
        # Embedding Layer
        self.token_embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Positional Encoding
        self.position_embedding = PositionalEncoding(embedding_dim, max_length, dropout_prob)
        
        # Stack of Transformer Blocks
        self.transformer_blocks = nn.Sequential(*[
            TransformerBlock(embedding_dim, num_heads, dropout_prob)
            for _ in range(num_layers)
        ])
        
        # Final Linear Layer
        self.output_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        x = self.token_embedding(x)  # Shape: (batch_size, sequence_length, embedding_dim)
        
        x = self.position_embedding(x)  # Shape: (batch_size, sequence_length, embedding_dim)
        
        x = self.transformer_blocks(x)
        
        logits = self.output_layer(x)  # Shape: (batch_size, sequence_length, vocab_size)
        
        return logits

In [14]:
class JokesDataset(Dataset):
    def __init__(self, jokes, max_length):
        self.jokes = jokes
        self.max_length = max_length

    def __len__(self):
        return len(self.jokes)

    def __getitem__(self, idx):
        # Input sequence: all tokens except the last one
        input_seq = self.jokes[idx][:-1]
        # Target sequence: all tokens except the first one
        target_seq = self.jokes[idx][1:]
        return torch.tensor(input_seq, dtype=torch.long), torch.tensor(target_seq, dtype=torch.long)


In [15]:
# Split the data
train_data, val_data = train_test_split(jokes_df['joke'].tolist(), test_size=0.1, random_state=42)

# Define maximum sequence length
max_length = length_99_percentile + 2  # Including <SOS> and <EOS>

# Create Dataset instances
train_dataset = JokesDataset(train_data, max_length)
val_dataset = JokesDataset(val_data, max_length)

# Create DataLoader instances
batch_size = 64  # Adjust based on your GPU memory
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=4, pin_memory=True)

In [16]:
# Hyperparameters
num_layers = 6
num_heads = 8
embedding_dim = 512
vocab_size = len(word_to_index) + 1  # Including PAD
max_length = int(max_length)
dropout_prob = 0.4
learning_rate = 2e-4
num_epochs = 25  # Adjust as needed

# Initialize the model
model = TransformerDecoder(
    num_layers=num_layers,
    embedding_dim=embedding_dim,
    num_heads=num_heads,
    vocab_size=vocab_size,
    max_length=max_length,
    dropout_prob=dropout_prob
)


# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss function (CrossEntropyLoss ignores the PAD token by using ignore_index)
criterion = nn.CrossEntropyLoss(ignore_index=word_to_index[PAD_TOKEN])

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)

# Scheduler 
scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=5e-6)


In [17]:
# # Load the saved weights
# model.load_state_dict(torch.load("weights/transformer_model_6.pth"))
# # Move model to GPU
# model.to(device)


In [30]:
def generate_text(model, start_sequence, max_length, k=2
                  , temperature=1.0):
    model.eval()
    generated = start_sequence
    with torch.no_grad():
        for _ in range(max_length - len(start_sequence)):
            inputs = torch.tensor([generated]).to(device)
            outputs = model(inputs)  # Shape: (1, sequence_length, vocab_size)

            # Get the logits for the next token
            logits = outputs[:, -1, :]  # Shape: (1, vocab_size)
            logits = logits / temperature  # Apply temperature scaling

            # Mask out the UNK token by setting its logit to a very negative value
            logits[:, word_to_index[UNK]] = float('-inf')

            # Get the top k tokens and their probabilities
            top_k_probs, top_k_indices = torch.topk(torch.softmax(logits, dim=-1), k, dim=-1)  # Shape: (1, k)
            top_k_probs = top_k_probs.squeeze(0)  # Shape: (k,)
            top_k_indices = top_k_indices.squeeze(0)  # Shape: (k,)

            # Normalize probabilities for sampling
            normalized_probs = top_k_probs / top_k_probs.sum()
            next_token = torch.multinomial(normalized_probs, num_samples=1).item()

            # Append the sampled token
            generated.append(top_k_indices[next_token].item())

            # Stop if EOS token is generated
            if top_k_indices[next_token].item() == word_to_index[EOS]:
                break

    # Convert generated token indices to words
    return " ".join([index_to_word[idx] for idx in generated if idx not in [word_to_index[EOS], word_to_index[UNK]]])


# Example usage
start_sequence = [word_to_index.get(word, word_to_index[UNK]) for word in ["what"]]
print(generate_text(model, start_sequence, max_length))

what do you call a dog that can do magic ? a labracadabrador


In [None]:
model.load_state_dict(torch.load("awien/transformer_model_17.pth"))

  model.load_state_dict(torch.load("awien/transformer_model_17.pth"))


<All keys matched successfully>

In [None]:
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu
import math

# Ensure NLTK's BLEU implementation uses tokenized references
def compute_bleu(references, candidates):
    # references: List of List of reference sentences (each reference is a list of tokens)
    # candidates: List of candidate sentences (each candidate is a list of tokens)
    return corpus_bleu(references, candidates)

# Training loop
for epoch in (range(num_epochs)):
    # Training phase
    model.train()
    train_loss = 0
    total_train_correct = 0
    total_train_tokens = 0

    for batch in tqdm(train_loader, desc="Training Batches", leave=False):
        inputs, targets = batch
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)  # Shape: [batch_size, seq_len - 1, vocab_size]
        outputs = outputs.view(-1, vocab_size)  # [batch_size * (seq_len - 1), vocab_size]
        targets = targets.contiguous().view(-1)  # [batch_size * (seq_len - 1)]

        loss = criterion(outputs, targets)
        train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()

        # Accuracy
        predictions = torch.argmax(outputs, dim=-1)
        total_train_correct += (predictions == targets).sum().item()
        total_train_tokens += targets.numel()

    # Compute Training Metrics
    avg_train_loss = train_loss / len(train_loader)
    train_perplexity = math.exp(avg_train_loss)
    train_accuracy = total_train_correct / total_train_tokens

    # Validation phase
    model.eval()
    val_loss = 0
    total_val_correct = 0
    total_val_tokens = 0
    references = []
    candidates = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation Batches", leave=False):
            inputs, targets = batch
            inputs = inputs.to(device)
            targets = targets.to(device)

            outputs = model(inputs)  # Shape: [batch_size, seq_len - 1, vocab_size]
            outputs = outputs.view(-1, vocab_size)  # [batch_size * (seq_len - 1), vocab_size]
            targets = targets.contiguous().view(-1)  # [batch_size * (seq_len - 1)]

            loss = criterion(outputs, targets)
            val_loss += loss.item()

            # Accuracy
            predictions = torch.argmax(outputs, dim=-1)
            total_val_correct += (predictions == targets).sum().item()
            total_val_tokens += targets.numel()

            # Prepare for BLEU Score
            # Reshape back to [batch_size, seq_len -1]
            batch_size = inputs.size(0)
            seq_len = inputs.size(1)

            predictions = predictions.view(batch_size, seq_len)
            targets = targets.view(batch_size, seq_len)

            for i in range(batch_size):
                # Convert indices to words, exclude PAD_TOKEN
                pred_tokens = [index_to_word[idx] for idx in predictions[i].cpu().numpy() if idx != word_to_index[PAD_TOKEN]]
                target_tokens = [index_to_word[idx] for idx in targets[i].cpu().numpy() if idx != word_to_index[PAD_TOKEN]]

                # Append to lists
                references.append([target_tokens])  # Each reference should be a list of references
                candidates.append(pred_tokens)

    # Compute Validation Metrics
    avg_val_loss = val_loss / len(val_loader)
    val_perplexity = math.exp(avg_val_loss)
    val_accuracy = total_val_correct / total_val_tokens
    bleu_score = compute_bleu(references, candidates)

    # Step the scheduler based on validation loss
    scheduler.step()

    # Get current LR
    current_lr = optimizer.param_groups[0]['lr']

    # Logging
    print(f"Epoch {epoch + 1}/{num_epochs}, "
          f"LR: {current_lr:.6f}, "
          f"Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, "
          f"Train Perplexity: {train_perplexity:.2f}, Val Perplexity: {val_perplexity:.2f}, "
          f"Train Accuracy: {train_accuracy:.4f}, Val Accuracy: {val_accuracy:.4f}, "
          f"Val BLEU Score: {bleu_score:.4f}")
    start_sequence = [word_to_index.get(word, word_to_index[UNK]) for word in ["why"]]
    print(generate_text(model, start_sequence, max_length))

    # Save model checkpoint every 5 epochs
    
    torch.save(model.state_dict(), f'awien/transformer_model_{epoch + 1}.pth')


In [44]:
# # saving model 
# torch.save(model.state_dict(), 'transformer_model.pth')

########################################

In [37]:
def generate(model, start_sequence, max_length, temperature=1.0):
    model.eval()
    generated = start_sequence
    with torch.no_grad():
        for _ in range(max_length - len(start_sequence)):
            inputs = torch.tensor([generated]).to(device)
            outputs = model(inputs)  # Shape: (1, sequence_length, vocab_size)

            # Get the logits for the next token
            logits = outputs[:, -1, :]  # Shape: (1, vocab_size)
            logits = logits / temperature  # Apply temperature scaling

            # Apply a mask to the logits to exclude the UNK token
            logits[:, word_to_index[UNK]] = float('-inf')  # Mask UNK token

            # Get the top 2 tokens and their probabilities
            top_k_probs, top_k_indices = torch.topk(torch.softmax(logits, dim=-1), k=2, dim=-1)  # Shape: (1, 2)
            top_k_probs = top_k_probs.squeeze(0)  # Shape: (2,)
            top_k_indices = top_k_indices.squeeze(0)  # Shape: (2,)

            # Normalize probabilities for sampling
            normalized_probs = top_k_probs / top_k_probs.sum()
            next_token = torch.multinomial(normalized_probs, num_samples=1).item()

            # Append the sampled token
            generated.append(top_k_indices[next_token].item())

            # Stop if EOS token is generated
            if top_k_indices[next_token].item() == word_to_index[EOS]:
                break

    # Convert generated token indices to words
    return " ".join([index_to_word[idx] for idx in generated if idx not in [word_to_index[EOS], word_to_index[UNK]]])


# Example usage

for i in range(5):
    start_sequence = [word_to_index.get(word, word_to_index[UNK]) for word in ["what"]]
    print(generate(model, start_sequence, max_length))


what do you call a dog that likes to eat hamburgers ? labrador .
what do a pirate and a pirate have a big ears ? a buck-an-ear
what do you call a man with no body and no nose. nobody knows
what do you call a man with no arms and no legs lying in front of a pile of leaves ? russell .
what do you call a man who is n't sure if he 's alright ? a boomerang
