In [None]:
# Install required libraries
!pip install torch transformers tqdm datasets praw google-cloud-secret-manager pdfplumber



Scraping Reddit to build a jokes dataset

In [None]:
from google.colab import userdata
import praw

# Build the Reddit client; the three credentials are looked up by name through
# Colab's `userdata` helper so nothing secret is written in the notebook.
reddit = praw.Reddit(
    check_for_async=False,
    **{
        secret_name: userdata.get(secret_name)
        for secret_name in ('client_id', 'client_secret', 'user_agent')
    },
)

In [None]:
import praw
import re
from hashlib import md5
import json

# Parameters
# Passed to scrape_jokes as the `limit` of each subreddit's "hot" listing.
num_posts = 1000
# Submissions with a score below this value are not kept.
score_threshold = 10
# Minimum combined length (in characters) of the cleaned title + body.
min_length = 5
# NOTE(review): not referenced by any other visible cell — possibly a leftover.
cleaned_jokes = []

# Cleaning function
def clean_text(text):
    """Normalise a Reddit title or body into plain joke text.

    Steps, in order:
      1. Remove links (anything starting with ``http``).
      2. Drop everything from an edit marker onwards. A marker is the word
         "edit", optionally numbered, followed by a colon ("EDIT:", "edit 2:").
         The plain word "edit" inside a sentence is left alone.
      3. Remove special characters, keeping word characters, whitespace,
         ``. , ? !`` and apostrophes, so contractions such as "what's" survive.

    Args:
        text: Raw text; ``None`` or an empty string is accepted.

    Returns:
        The cleaned text with surrounding whitespace stripped ("" for empty input).
    """
    if not text:
        return ""

    # Remove links
    text = re.sub(r'http\S+', '', text)

    # Cut at the edit marker BEFORE stripping punctuation: the colon is what
    # distinguishes an "EDIT:" note from an ordinary use of the word.
    text = re.split(r'\bedit\s*\d*\s*:', text, maxsplit=1, flags=re.IGNORECASE)[0]

    # Typographic apostrophes become plain ones so the filter below keeps them.
    text = text.replace('\u2019', "'").replace('\u2018', "'")

    # Remove special characters (apostrophes are deliberately preserved)
    text = re.sub(r"[^\w\s.,?!']", '', text)

    return text.strip()

# Scrape jokes
def scrape_jokes(subreddit_name, num_posts, score_threshold):
    """Collect cleaned, text-only jokes from the "hot" listing of one subreddit.

    Uses the module-level ``reddit`` client, ``clean_text`` and ``min_length``.

    Args:
        subreddit_name: Subreddit name without the "r/" prefix.
        num_posts: Maximum number of submissions to request (the listing ``limit``).
        score_threshold: Minimum score a submission needs to be kept.

    Returns:
        A list of dicts with the keys 'title', 'body' and 'score'.
    """
    # Matches text that opens with an edit marker such as "EDIT:" or "edit 2:".
    # It is applied to the RAW text: after clean_text the marker is already gone,
    # and a plain startswith("edit") would also reject jokes that merely begin
    # with words like "Editor".
    edit_prefix = re.compile(r'\s*edit\s*\d*\s*:', re.IGNORECASE)

    subreddit = reddit.subreddit(subreddit_name)
    jokes = []

    for submission in subreddit.hot(limit=num_posts):
        # Skip posts with images or other media (is_self is True for text-only posts)
        if not submission.is_self:
            continue

        # Cheap score filter first, so low-scoring posts are never cleaned
        if submission.score < score_threshold:
            continue

        raw_title = submission.title or ''
        raw_body = submission.selftext or ''

        # Skip posts that start with "EDIT:" or "edit:"
        if edit_prefix.match(raw_title) or edit_prefix.match(raw_body):
            continue

        # Clean title and selftext
        title = clean_text(raw_title)
        body = clean_text(raw_body)

        # Apply the length filter on the cleaned text
        if len(title + body) >= min_length:
            jokes.append({
                'title': title,
                'body': body,
                'score': submission.score
            })

    print(f"Posts scraped from r/{subreddit_name}: {len(jokes)}")
    return jokes

# Scrape each configured subreddit and pool everything into one list
subreddits = ["cleandadjokes", "cleanjokes"]
raw_jokes = []
for subreddit_name in subreddits:
    raw_jokes += scrape_jokes(subreddit_name, num_posts, score_threshold)

# De-duplicate on the MD5 digest of title + body, keeping the first occurrence
seen = set()
unique_jokes = []
for candidate in raw_jokes:
    combined_text = candidate['title'] + candidate['body']
    digest = md5(combined_text.encode('utf-8')).hexdigest()
    if digest in seen:
        continue
    seen.add(digest)
    unique_jokes.append(candidate)

# Report how many posts came in and how many distinct jokes remain
print(f"Total posts scraped: {len(raw_jokes)}")
print(f"Unique jokes after cleaning: {len(unique_jokes)}")

Posts scraped from r/cleandadjokes: 925
Posts scraped from r/cleanjokes: 878
Total posts scraped: 1803
Unique jokes after cleaning: 1775


In [None]:
# Flatten each scraped record into a single "title body" string. Entries that are
# already strings pass through unchanged, so running this cell a second time
# does not fail on the already-flattened list.
unique_jokes = [
    joke if isinstance(joke, str) else joke['title'] + " " + joke['body']
    for joke in unique_jokes
]

In [None]:
# Preview a handful of entries instead of dumping the whole list into the output
print(f"{len(unique_jokes)} jokes collected; first 5:")
for sample_joke in unique_jokes[:5]:
    print("-", sample_joke)



Adding more jokes from a joke book

In [None]:
# Mount Google Drive at /content/drive; later cells read their input files
# (the joke-book PDF and the jokes CSV) from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pdfplumber
import re

# Location of the joke book on the mounted Drive
pdf_file = "/content/drive/MyDrive/jokesbook.pdf"

# Walk the book page by page. Each page's text is cut wherever a standalone
# 2-3 digit number followed by whitespace occurs, and every non-empty fragment
# is appended to the running joke list.
# NOTE(review): presumably those numbers are the jokes' running numbers; a
# number inside a joke's own text would split it as well — confirm against the PDF.
with pdfplumber.open(pdf_file) as book:
    for book_page in book.pages:
        page_text = book_page.extract_text()
        if not page_text:
            continue
        for fragment in re.split(r"\b\d{2,3}\b\s", page_text):
            fragment = fragment.strip()
            if fragment:
                unique_jokes.append(fragment)

# Normalise every entry: trim it, drop it if nothing is left, and turn line
# breaks into spaces.
single_line_jokes = []
for entry in unique_jokes:
    entry = entry.strip()
    if entry:
        single_line_jokes.append(entry.replace("\n", " "))

# NOTE(review): the slice discards the first element of the WHOLE list (which
# already held the Reddit jokes), not the first fragment taken from the PDF —
# confirm which of the two was meant. Re-running this cell drops one more entry.
unique_jokes = single_line_jokes[1:]

print(len(unique_jokes))

2725


Adding more jokes by scraping a web page

In [None]:
import requests
from bs4 import BeautifulSoup
import re

url = "https://www.lalo.app/story/jokes-for-kids.html"

# Bound the wait and fail loudly on an HTTP error instead of parsing an error page
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Find the start and end articles that delimit the section to collect
start_article = soup.find('article', {'id': '200_jokes'})
end_article = soup.find('article', {'id': 'chistes_jokes'})
if start_article is None:
    raise ValueError(f"Start marker <article id='200_jokes'> not found at {url}")

# Collect jokes from the section between the two articles

skip_first_h2 = True  # The first h2 after the start article is skipped (not a joke)

for h2 in start_article.find_all_next('h2', class_='h2'):
    if skip_first_h2:
        skip_first_h2 = False
        continue  # Skip the first h2
    # Stop at the first heading that lives inside the end article. Identity is
    # used because the aim is "this very element", not "an equal-looking one".
    # If the end article is missing, collection runs to the end of the page.
    if end_article is not None and h2.find_parent('article') is end_article:
        break
    joke = h2.get_text().strip()
    # The punchline is taken from the next <p class="p"> after the heading
    punchline_tag = h2.find_next('p', class_='p')
    if punchline_tag:
        punchline = punchline_tag.get_text().strip()
        # Strip a leading "12. " style numbering from the heading text
        joke_text = re.sub(r'^\d+\.\s*', '', joke)
        unique_jokes.append(f"{joke_text} {punchline}")

print(len(unique_jokes))

3636


Also adding a jokes dataset

In [None]:
import csv

csv_file = "/content/drive/MyDrive/onelinefun.csv"

# newline='' hands line-ending handling to the csv module, which keeps quoted
# fields that contain line breaks intact; the explicit encoding removes the
# dependence on the platform's default.
with open(csv_file, 'r', newline='', encoding='utf-8') as file:
    reader = csv.DictReader(file)  # Rows are dicts keyed by the header line
    for row in reader:
        unique_jokes.append(row['Joke'].strip())  # Access 'Joke' column by name

print(len(unique_jokes))

6586


In [None]:
# Load pre-trained GPT-2 model and tokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Tokenizer first; the EOS token is reused as the padding token
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Pre-trained GPT-2 weights, with the embedding matrix sized to the tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

# Prefer the GPU when one is visible, otherwise stay on the CPU
import torch
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)  # Final expression of the cell, so the model summary is echoed

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
import os
# Switch off Weights & Biases logging for the Trainer via its environment flag
os.environ.update({"WANDB_DISABLED": "true"})

Fine-tuning GPT-2 on roughly 6.6k jokes

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
from datasets import Dataset

# Fresh GPT-2 tokenizer for the fine-tuning run; when it comes without a
# padding token, the EOS token is used for padding instead.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of jokes and build the labels for causal-LM training.

    Uses the module-level ``tokenizer``.

    Args:
        examples: Batch mapping whose 'text' entry is a list of joke strings.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``, padded or
        truncated to 128 tokens) with an added ``labels`` entry. Labels equal
        ``input_ids`` on real tokens and -100 on padding positions.
    """
    # Terminate every joke with the EOS token so there is an explicit end marker
    # to learn. (A joke longer than 128 tokens loses it to truncation.)
    texts = [text + tokenizer.eos_token for text in examples['text']]

    # Plain Python lists are enough here: Dataset.map stores them as-is and
    # set_format converts them to tensors later.
    tokenized = tokenizer(texts, padding="max_length", truncation=True, max_length=128)

    # -100 is the index the loss ignores. Without this mask the padding (which
    # shares its id with EOS) is scored like real text, and on short jokes padded
    # to 128 tokens it dominates the reported loss.
    tokenized['labels'] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(tokenized['input_ids'], tokenized['attention_mask'])
    ]
    return tokenized

# Convert the jokes list to a Hugging Face Dataset with a single 'text' column
jokes_dataset = Dataset.from_dict({"text": unique_jokes})

# Tokenize in batches; the raw 'text' column is dropped once it has been encoded
tokenized_datasets = jokes_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# Prepare for PyTorch
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Split the dataset once (80/20, fixed seed) and use both halves of that split
split_datasets = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Define training arguments
# NOTE(review): warmup_steps=500 is about half of the 987 optimizer steps the
# recorded run took — confirm that such a long warm-up is intended.
training_args = TrainingArguments(
    output_dir="./gpt2-jokes",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=200,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,  # Learning Rate
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # Use mixed precision if a GPU is available
    load_best_model_at_end=True,
    push_to_hub=False,  # Change to True if you want to push to Hugging Face Hub
    optim="adamw_torch",  # Optimizer: AdamW
    report_to="none"  # No external loggers; replaces the deprecated WANDB_DISABLED variable
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

Map:   0%|          | 0/6586 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
0,2.3955,0.684777
2,0.6222,0.665406


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=987, training_loss=0.9942766168318017, metrics={'train_runtime': 483.5228, 'train_samples_per_second': 32.685, 'train_steps_per_second': 2.041, 'total_flos': 1031058358272000.0, 'train_loss': 0.9942766168318017, 'epoch': 2.9954476479514414})

In [None]:
def generate_joke(max_length=70, seed_text="[FUNNY JOKE]"):
    """Sample one joke from the fine-tuned GPT-2 model.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        max_length: Upper bound on the total sequence length in tokens (prompt
            included). The prompt itself is truncated to this length as well.
        seed_text: Prompt the generation starts from.

    Returns:
        The generated continuation with the prompt removed and surrounding
        whitespace stripped; this can be an empty string.
    """
    model.eval()  # Set the model to evaluation mode

    # Tokenize input without padding and ensure truncation for large inputs
    inputs = tokenizer(seed_text, return_tensors="pt", truncation=True, max_length=max_length).to(model.device)
    prompt_length = len(inputs['input_ids'][0])

    # Generate text with sampling parameters that encourage more variety
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        temperature=0.8,
        top_p=0.9,
        num_return_sequences=1,
        top_k=50,
        do_sample=True,   # Enable sampling (not greedy generation)
        pad_token_id=tokenizer.pad_token_id
    )

    # Remove the prompt by token count and decode only the continuation. Slicing
    # the decoded string by len(seed_text) would rely on decoding reproducing the
    # prompt character for character.
    continuation = outputs[0][prompt_length:]
    return tokenizer.decode(continuation, skip_special_tokens=True).strip()

In [None]:
# Print 10 non-empty jokes. Empty generations are retried, but the number of
# attempts is capped so the cell cannot spin forever if the model keeps
# returning empty continuations.
target_jokes = 10
max_attempts = 10 * target_jokes

i = 1
for _ in range(max_attempts):
    if i > target_jokes:
        break
    joke = generate_joke()
    if joke != "":
        print(f"JOKE {i}:", joke)
        i += 1

JOKE 1: What is it that makes you go through the motions of running in circles?
JOKE 2: Whats the difference between a pig and a chicken? A pig's a chicken and a pig's a chicken!
JOKE 3: I know you're jealous but I'm also a really sweet girl so I'm guessing you'd like to have a kiss on the cheek.
JOKE 4: I'll bet you two pennies on a dollar, and that one will be worth $3.
JOKE 5: Are you sure youre a clown? I mean, I think youre a clown.
JOKE 6: What's the difference between a duck and a turkey? A duck doesn't come with a turkey. A turkey doesn't come with a duck.
JOKE 7: How do you know my wife likes to go shopping?   A lady  a lady walks in and says, "I'm not buying anything, you know."
JOKE 8: is an alcoholic beverage.
JOKE 9: What do you call a woman who is married? A knot.
JOKE 10: What's the difference between a monkey and a duck? A duck is a monkey. A monkey is a duck.


# Some funny jokes that were generated

*   Are you a princess? I'm a princess of my dreams.
*   What does a donkey say when he gets into a fight with a pig?  A: Knock, knock!  And when you get in, you can knock him out!
*   What do you call a man who doesn't get laid? A man with no legs.
*   What do you call a person who can't remember a word? A liar.
*   Who's there? A blonde. A blonde who knows where I am.
*   What's the difference between a man and a witch? A witch is always in the mood for something.

