In [1]:
# %%
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from tqdm import tqdm
from gensim.models import Word2Vec


In [2]:
# %%
def load_json_files(file_paths):
    all_data = []
    for path in file_paths:
        with open(path, 'r') as file:
            data = json.load(file)
            if isinstance(data, list):
                all_data.extend(data)
            elif isinstance(data, dict):
                all_data.append(data)
    return all_data

def prepare_dataset(data):
    # Extract the 'body' key from each entry, which contains the joke text
    flattened_data = [entry.get('body', '') for entry in data if 'body' in entry]
    
    # Create a DataFrame with 'input_text' as the joke text
    df = pd.DataFrame({'input_text': flattened_data})
    
    # Drop rows with missing 'input_text' and reset the index
    df = df.dropna(subset=['input_text']).reset_index(drop=True)
    return df

def create_splits(df, train_frac=0.8, val_frac=0.1):
    train_size = int(len(df) * train_frac)
    val_size = int(len(df) * val_frac)
    train, val, test = np.split(df.sample(frac=1, random_state=52), [train_size, train_size + val_size])
    return train, val, test


In [3]:
# %%
def train_word2vec(df, vector_size=256, window=5, min_count=2):
    tokenized_text = [text.split() for text in df['input_text']]
    w2v_model = Word2Vec(sentences=tokenized_text, vector_size=vector_size, window=window, min_count=min_count, sg=1)
    return w2v_model

def build_embedding_matrix(w2v_model, tokenizer, embed_dim):
    vocab_size = len(tokenizer)
    embedding_matrix = np.random.uniform(-0.01, 0.01, (vocab_size, embed_dim))
    for word, idx in tokenizer.items():
        if word in w2v_model.wv:
            embedding_matrix[idx] = w2v_model.wv[word]
    return torch.tensor(embedding_matrix, dtype=torch.float)


In [4]:
# %%
class JokeDatasetWithWord2Vec(Dataset):
    def __init__(self, df, tokenizer, max_length=128, pad_token="<pad>", eos_token="<eos>"):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pad_token_id = tokenizer[pad_token]
        self.eos_token_id = tokenizer[eos_token]

    def encode(self, text):
        encoded = [self.tokenizer.get(word, 2) for word in text.split()]
        encoded = encoded[:self.max_length - 1] + [self.eos_token_id]
        return encoded + [self.pad_token_id] * (self.max_length - len(encoded))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx]['input_text']
        encoded = self.encode(text)
        return {
            'input_ids': torch.tensor(encoded[:-1]),
            'labels': torch.tensor(encoded[1:])
        }


In [5]:
# %%
class DecoderBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        attn_output, _ = self.attention(x, x, x)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.ff(x)
        return self.norm2(x + self.dropout(ff_output))


class JokeGeneratorWithWord2Vec(nn.Module):
    def __init__(self, vocab_size, embed_dim, embedding_matrix, num_layers=4, num_heads=8, ff_dim=512, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.pos_embedding = nn.Parameter(torch.randn(1, 128, embed_dim))
        self.layers = nn.ModuleList([DecoderBlock(embed_dim, num_heads, ff_dim, dropout) for _ in range(num_layers)])
        self.output_layer = nn.Linear(embed_dim, vocab_size)

    def forward(self, input_ids):
        seq_len = input_ids.size(1)
        x = self.embedding(input_ids) + self.pos_embedding[:, :seq_len, :]
        for layer in self.layers:
            x = layer(x)
        return self.output_layer(x)


In [6]:
# %%
def train_model_with_w2v(model, train_loader, val_loader, device, num_epochs=10, lr=3e-4):
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = OneCycleLR(optimizer, max_lr=lr, steps_per_epoch=len(train_loader), epochs=num_epochs)
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            logits = model(input_ids)
            loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            train_loss += loss.item()
        print(f"Epoch {epoch+1}: Train Loss = {train_loss / len(train_loader)}")

        validate_model_with_w2v(model, val_loader, device, criterion)

def validate_model_with_w2v(model, val_loader, device, criterion):
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            logits = model(input_ids)
            loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
            val_loss += loss.item()
    print(f"Validation Loss: {val_loss / len(val_loader)}")


In [7]:
# %%
# Load Data
file_paths = ["/home/aayushjeevan/Desktop/NLP/reddit_jokes.json", "/home/aayushjeevan/Desktop/NLP/stupidstuff.json", "/home/aayushjeevan/Desktop/NLP/wocka.json"] 
raw_data = load_json_files(file_paths)

# Prepare Dataset
df = prepare_dataset(raw_data)

# Train Word2Vec
w2v_model = train_word2vec(df, vector_size=256)

# Tokenizer and Embedding Matrix
tokenizer = {word: idx for idx, word in enumerate(w2v_model.wv.index_to_key, start=4)}
tokenizer.update({"<pad>": 0, "<unk>": 1, "<eos>": 2, "<sos>": 3})
embedding_matrix = build_embedding_matrix(w2v_model, tokenizer, embed_dim=256)

# Dataset and DataLoader
train, val, test = create_splits(df)
train_dataset = JokeDatasetWithWord2Vec(train, tokenizer)
val_dataset = JokeDatasetWithWord2Vec(val, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Initialize Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = JokeGeneratorWithWord2Vec(
    vocab_size=len(tokenizer), embed_dim=256, embedding_matrix=embedding_matrix)

# Train and Validate
train_model_with_w2v(model, train_loader, val_loader, device, num_epochs=11, lr=3e-4)


Epoch 1/11: 100%|██████████| 20835/20835 [36:01<00:00,  9.64it/s]


Epoch 1: Train Loss = 7.214015742391885
Validation Loss: 6.360261820648545


Epoch 2/11: 100%|██████████| 20835/20835 [36:06<00:00,  9.61it/s]


Epoch 2: Train Loss = 6.099423872265908
Validation Loss: 5.972874195699271


Epoch 3/11: 100%|██████████| 20835/20835 [36:10<00:00,  9.60it/s]


Epoch 3: Train Loss = 5.8317500662683495
Validation Loss: 5.878082489097873


Epoch 4/11: 100%|██████████| 20835/20835 [36:11<00:00,  9.60it/s]


Epoch 4: Train Loss = 5.679184747297243
Validation Loss: 5.776484123987794


Epoch 5/11: 100%|██████████| 20835/20835 [36:11<00:00,  9.59it/s]


Epoch 5: Train Loss = 5.551323809599688
Validation Loss: 5.743610115380754


Epoch 6/11: 100%|██████████| 20835/20835 [36:12<00:00,  9.59it/s]


Epoch 6: Train Loss = 5.442734844129035
Validation Loss: 5.725601047471938


Epoch 7/11: 100%|██████████| 20835/20835 [36:11<00:00,  9.59it/s]


Epoch 7: Train Loss = 5.330412269525457
Validation Loss: 5.726684239272193


Epoch 8/11: 100%|██████████| 20835/20835 [36:13<00:00,  9.58it/s]


Epoch 8: Train Loss = 5.220041471011696
Validation Loss: 5.7204284474854274


Epoch 9/11: 100%|██████████| 20835/20835 [36:12<00:00,  9.59it/s]


Epoch 9: Train Loss = 5.1176265754315216
Validation Loss: 5.739931574892861


Epoch 10/11: 100%|██████████| 20835/20835 [36:11<00:00,  9.59it/s]


Epoch 10: Train Loss = 5.032012863642081
Validation Loss: 5.749354455933232


Epoch 11/11: 100%|██████████| 20835/20835 [36:11<00:00,  9.60it/s]


Epoch 11: Train Loss = 4.9695179175438
Validation Loss: 5.759552019525627


In [84]:
def generate_joke(model, tokenizer, input_text, max_length=128, temperature=1.0, device='cpu'):
    # Tokenize the input text
    model.eval()
    input_ids = torch.tensor([tokenizer.get(word, tokenizer['<unk>']) for word in input_text.split()]).unsqueeze(0).to(device)
    
    # Initialize the generated sequence with the input text
    generated_ids = input_ids[0].tolist()
    
    # Generate tokens step by step
    for _ in range(max_length - len(generated_ids)):
        input_tensor = torch.tensor([generated_ids]).to(device)  # Input the current generated tokens
        with torch.no_grad():
            logits = model(input_tensor)  # Get the logits for the next token
        logits = logits[:, -1, :]  # Get the logits of the last token
        probabilities = torch.nn.functional.softmax(logits / temperature, dim=-1)  # Apply softmax for probabilities
        next_token_id = torch.multinomial(probabilities, 1).item()  # Sample the next token
        
        if next_token_id == tokenizer['<eos>']:  # Stop if the end of sequence token is generated
            break
        
        generated_ids.append(next_token_id)  # Add the generated token to the sequence
    
    # Decode the generated token IDs back to text
    generated_text = ' '.join([word for word, idx in tokenizer.items() if idx in generated_ids])
    
    return generated_text


# Example Usage:
sample_input =  "why did the nazi"  

generated_joke = generate_joke(model, tokenizer, sample_input, max_length=128, temperature=0.7, device=device)
print(f"Generated Joke: {generated_joke}")


Generated Joke: the did why nazi


In [95]:
# Define the prompt for the joke
sample_input = "why did the chicken"         

# Generate a joke
generated_joke = generate_joke(model, tokenizer, sample_input, max_length=128, temperature=0.7, device=device)

# Print the generated joke
print(f"Generated Joke: {generated_joke}")


Generated Joke: the a did give why chicken gun


In [87]:
# Test the model on 5 samples from the test dataset
def test_model_on_samples(model, tokenizer, test_dataset, device, num_samples=10, max_length=128, temperature=1.0):
    # Ensure the model is in evaluation mode
    model.eval()
    
    # Randomly sample `num_samples` examples from the test dataset
    sampled_data = test_dataset.df.sample(n=num_samples, random_state=38)       
    
    print("Testing on 5 samples from the test dataset:\n")
    
    # Loop through the sampled data
    for i, row in sampled_data.iterrows():
        input_text = row['input_text']
        print(f"Sample {i + 1}:")
        print(f"Input Text: {input_text}")
        
        # Generate a joke
        generated_joke = generate_joke(model, tokenizer, input_text, max_length=max_length, temperature=temperature, device=device)
        print(f"Generated Joke: {generated_joke}\n{'-' * 50}\n")

# Call the test function with your model, tokenizer, and test dataset
test_model_on_samples(model, tokenizer, JokeDatasetWithWord2Vec(test, tokenizer), device, num_samples=15, max_length=128, temperature=0.7)


Testing on 5 samples from the test dataset:

Sample 9043:
Input Text: There's 2 grains of rice in the sink
Generated Joke: the of in 2 There's sink rice grains
--------------------------------------------------

Sample 50300:
Input Text: Reddit
Generated Joke: Reddit
--------------------------------------------------

Sample 123918:
Input Text: I would tell you but the joke would finish to fast.
Generated Joke: the to I you but would tell joke finish fast.
--------------------------------------------------

Sample 186644:
Input Text: A man walks into a pub and sees a sign hanging over the bar which reads:

Cheese Sandwich: $1.50
Hand Job: $10.00

He checks his wallet for the necessary payment, then he walks up to the bar and beckons to one of the exceptionally attractive blondes serving drinks to an eager-looking group of men.

"Yes?" she inquires, with a knowing smile, "can I help you?"

"Yep, I was wondering," whispers the man, "are you the one who gives the handjobs?"

"Yes," she pu

GPT-2 FINETUNING USING GIVEN DATASET


In [1]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

2024-11-18 02:41:52.974352: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-18 02:41:52.981837: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731877912.990993 2211617 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731877912.993658 2211617 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-18 02:41:53.003017: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Custom dataset class
class JokesDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt')

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = item['input_ids'].clone()
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

def load_and_preprocess_data(file_paths):
    all_texts = []
    
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as f:
            jokes = json.load(f)
            for joke in jokes:
                # Combine title and body if both exist
                text = ""
                if 'title' in joke:
                    text += joke['title'] + "\n"
                text += joke['body']
                # Add special tokens to mark start and end of jokes
                text = "<|startoftext|>" + text + "<|endoftext|>"
                all_texts.append(text)
    
    return all_texts

def train_model(model, train_dataloader, device, num_epochs=3):
    model.train()
    optimizer = AdamW(model.parameters(), lr=5e-5)
    
    # Create scheduler with warmup
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_dataloader) * num_epochs
    )
    
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        progress_bar = tqdm(train_dataloader, desc="Training")
        
        for batch in progress_bar:
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}
            
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            
            progress_bar.set_postfix({"loss": loss.item()})

def generate_joke(prompt, model, tokenizer, device):
    model.eval()
    
    # Encode prompt
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(device)
    
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=100,
            num_return_sequences=1,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [3]:
def main():
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    # Initialize tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
    
    # Add special tokens
    special_tokens = {"pad_token": "<|pad|>", "additional_special_tokens": ["<|startoftext|>", "<|endoftext|>"]}
    tokenizer.add_special_tokens(special_tokens)
    model.resize_token_embeddings(len(tokenizer))
    
    # Load and preprocess data
    file_paths = ['/home/aayushjeevan/Desktop/nlp/reddit_jokes.json', '/home/aayushjeevan/Desktop/nlp/stupidstuff.json', '/home/aayushjeevan/Desktop/nlp/wocka.json']  # Replace with your actual file paths
    texts = load_and_preprocess_data(file_paths)
    
    # Create dataset and dataloader
    dataset = JokesDataset(texts, tokenizer)
    train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
    
    # Train the model
    print("Starting training...")
    train_model(model, train_dataloader, device)
    
    # Save the model
    model.save_pretrained("finetuned_gpt2_jokes")
    tokenizer.save_pretrained("finetuned_gpt2_jokes")
    
    # Test the model
    test_prompts = [
        "What do you call a cow",
        "A blonde woman",
        "At a dinner party",
    ]
    
    print("\nTesting the model with some prompts:")
    for prompt in test_prompts:
        generated_text = generate_joke(prompt, model, tokenizer, device)
        print(f"\nPrompt: {prompt}")
        print(f"Generated: {generated_text}")

if __name__ == "__main__":
    main()

Using device: cuda


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Starting training...

Epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Training: 100%|██████████| 26044/26044 [2:20:52<00:00,  3.08it/s, loss=2.44]   



Epoch 2/3


Training: 100%|██████████| 26044/26044 [2:20:51<00:00,  3.08it/s, loss=0.673]  



Epoch 3/3


Training: 100%|██████████| 26044/26044 [2:20:41<00:00,  3.09it/s, loss=2.2]    
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Testing the model with some prompts:

Prompt: What do you call a cow
Generated: What do you call a cow with three legs?
Ground beef.

Prompt: A blonde woman
Generated: A blonde woman got on a bus with her baby
The bus driver says: "That's the ugliest baby I've ever seen!" 

The woman walks to the rear of the bus and sits down, fuming. She says to a man next to her: "The driver just insulted me!" The man says: "You go up there and tell him off – go ahead, I'll hold your monkey for you."

Prompt: At a dinner party
Generated: At a dinner party for a group of friends
A man is invited to a dinner party for a group of friends. As the host sets the table for the party, he notices a group of people at the table. He asks one of the people if they know anyone who knows anyone.

The man says, "I don't know. I don't know anyone."

"Well, I guess you could say I do."

The host, intrigued, asks, "So


In [5]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def generate_joke(prompt, model, tokenizer, device, max_length=100):
    """
    Generate a joke completion for the given prompt using the fine-tuned model.
    """
    model.eval()
    
    # Encode the prompt
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(device)
    
    # Generate output
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def test_model_on_prompts(prompts, model_path="finetuned_gpt2_jokes", max_length=100):
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    # Load fine-tuned model and tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    model = GPT2LMHeadModel.from_pretrained(model_path).to(device)
    
    # Test each prompt
    for prompt in prompts:
        generated_joke = generate_joke(prompt, model, tokenizer, device, max_length)
        print(f"\nPrompt: {prompt}")
        print(f"Generated Joke: {generated_joke}")

# List your test prompts here
custom_prompts = [
    "Why did the chicken cross the road",
    "A programmer walks into a bar",
    "Knock knock"
]

# Call the testing function
test_model_on_prompts(custom_prompts)


Using device: cuda

Prompt: Why did the chicken cross the road
Generated Joke: Why did the chicken cross the road?
To get to the other side.

Prompt: A programmer walks into a bar
Generated Joke: A programmer walks into a bar...
And says "I'll have a H20."

The bartender says "Sorry, we don't serve alcohol."

The programmer says "Well, I'll have a H20 too."

The bartender says "Sorry, we don't serve alcohol."

The programmer says "Well, I'll have a H20 too."

The bartender says "Sorry, we don't serve alcohol."

The programmer says "Well,

Prompt: Knock knock
Generated Joke: Knock knock. Who's there?
The pilot.

The pilot who?

The pilot who blew the plane.


Finally I conclude that my model after training on given dataset is not much successful in generating meaningful content whereas the GPT-2 model after finetuning on the same dataset  is giving excellent results.

GPT-2 OUTPUT:
Using device: cuda

Prompt: There's 2 grains of rice in the sink
Generated Joke: There's 2 grains of rice in the sink
And a third is rice in the bath.

Prompt: Reddit
Generated Joke: RedditWhat's the difference between an epileptic oyster farmer and a hooker with diarrhea?
One shucks between fits.

Prompt: I would tell you but the joke would finish to fast.
Generated Joke: I would tell you but the joke would finish to fast.
So a man goes to a bar, and he sits down. A man next to him, a very attractive woman, leans in close and whispers something in his ear. The man gets up and goes to the bar. He orders a drink. He sits there for a while, and then hears a knock on the door. He opens it. A man standing nearby, in a very drunken state, says to the man, "Exc

Prompt: It was just an alternative event.
Generated Joke: It was just an alternative event.
I had a friend that had a dog, but he didn't like it.  He told me this joke about a dog that lived with two people.  One day he and his dog were walking down the street when they saw a man walking down the sidewalk.  He said to the dog, "Hey man, what are you doing?"  The dog replied, "I'm walking down the street and I saw two people walking down the sidewalk." 

Prompt: Thats your puke your eating
Generated Joke: Thats your puke your eating?
I'm going to have to put my dick in the toilet


MY DECODER MODEL OUTPUT :
Input Text: There's 2 grains of rice in the sink
Generated Joke: the of in 2 There's sink rice grains


Input Text: Reddit
Generated Joke: Reddit


Input Text: I would tell you but the joke would finish to fast.
Generated Joke: the to I you but would tell joke finish fast.

Input Text: Thats your puke your eating
Generated Joke: your eating Thats puke


Input Text: It was just an alternative event.
Generated Joke: was just an It alternative event.



It can be  clearly  seen that my decoder model is not giving much relevant output whereas finetuned GPT-2 gives quite good results 
