In [1]:
import hashlib
import json
import jsonlines
import openai
import os
import unicodedata
import random
import spacy

from collections import Counter, defaultdict
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment before
# anything below reads them.
load_dotenv()

# spaCy pipeline; used further down to split each joke into sentences.
nlp = spacy.load("en_core_web_sm")

# Fail fast with a KeyError if the key is not configured.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
# Load the blackcellmagic IPython extension (presumably to make a
# black-formatting cell magic available -- TODO confirm it is still needed).
%load_ext blackcellmagic

In [3]:
# Load the combined multi-host joke dump.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) because the
# jokes contain non-ASCII punctuation such as curly quotes; relying on the
# platform default encoding can mis-decode them on some systems.
with open("./data/raw/jokes-all-hosts.json", encoding="utf-8") as f:
    data_all = json.load(f)

# Peek at the first few records to see the schema.
data_all[:5]

[{'host': 'Leno',
  'date': '2009-12-03',
  'source': 12,
  'joke': 'Twenty-eight years old! That shows you how quickly things can change. Britney is now back on top of her game and Tiger Woods is in the tabloids for crashing his car. Who could have seen . . .'},
 {'host': 'Leno',
  'date': '2009-12-03',
  'source': 12,
  'joke': 'President Obama and the Democratic majority in congress are now assembling a new jobs package. The area with the most job openings: White House security. A lot of jobs there.'},
 {'host': 'Leno',
  'date': '2009-12-03',
  'source': 12,
  'joke': 'How is this for nerve? That White House party crashing couple, the Salahis . . . They refused an invitation to testify before Congress today. They refused. Isn’t that unbelievable? The one thing they actually get invited to and they don’t show up.'},
 {'host': 'Leno',
  'date': '2009-12-03',
  'source': 12,
  'joke': 'This week, America’s last living World War I veteran, a man named Frank Buckles, 108 years old, said

In [4]:
# Tally how many jokes each host contributes to the combined dataset.
counts = defaultdict(int)
for entry in data_all:
    host = entry["host"]
    counts[host] = counts[host] + 1

counts

defaultdict(int,
            {'Leno': 3354,
             'Conan': 4800,
             'Ferguson': 3342,
             'Kimmel': 4735,
             'Fallon': 6318,
             'Letterman': 3753,
             'Meyers': 2549,
             'Corden': 1703,
             'Colbert': 975})

In [5]:
# Hosts whose jokes should be kept from the combined dataset.
HOSTS = ["Meyers"]

# Keep only the joke text for the selected hosts.
jokes_filtered = []
for entry in data_all:
    if entry["host"] in HOSTS:
        jokes_filtered.append(entry["joke"])

# Report the size of the selection and preview the first five jokes.
preview = "\n\n".join(jokes_filtered[:5])
print("total: {}\n".format(len(jokes_filtered)))
print(preview)

total: 2549

Chelsea Clinton has announced that she is pregnant with her first child. The baby is expected to crawl after nine months and run in 2055.

This weekend over 37,000 people went to Denver to participate in the 4th annual Cannabis Cup. And they all made memories that would last a few minutes.

Last week, NBA Commissioner Adam Silver announced that he wants to raise the league’s age minimum from 19 to 20. The league’s age maximum will continue to be Kevin Garnett.

Today is Earth Day. At least according to the guy who saw me throw a banana peel in the blue trash can.

The Christian Science Monitor is claiming “Hillary Clinton will be a tad less interested in running for president now that she’s about to be a grandmother.” And if you put a grain of sand in your pocket there’s a tad less sand on the beach.


In [6]:
# Load the Conan joke dump.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# non-ASCII punctuation in the joke bodies is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open("./data/raw/jokes-coco.json", encoding="utf-8") as f:
    data_conan = json.load(f)

# Inspect one record to see which fields are available.
data_conan[0]

{'tags': [{'tags': '',
   'hrefTag': 'misc',
   'status': 1,
   'slug': 'jokes/misc',
   'tid': 7402,
   'data': None,
   'type': {'name': 'jokes', 'id': 6, 'title': 'Jokes category'},
   'name': 'Misc',
   'description': '',
   'content': 0},
  {'tags': '',
   'hrefTag': 'conan',
   'status': 1,
   'slug': 'category/tags/conan',
   'tid': 24,
   'data': None,
   'type': {'name': 'tags', 'id': 2, 'title': 'Tags'},
   'name': 'CONAN',
   'description': '',
   'content': 8}],
 'id': 64106,
 'thumbWithText': 'http://cdn.teamcococdn.com/jokes/64106/1,1/650/jokes/nov-8-2010-yes-i-know-what-you-guys-are-thinking-hey-its-the-guy.jpg',
 'slug': 'jokes/nov-8-2010-yes-i-know-what-you-guys-are-thinking-hey-its-the-guy',
 'body': 'Yes, I know what you guys are thinking, "Hey, it\'s the guy from Twitter."',
 'title': 'Nov 8, 2010 - Yes, I know what you guys are thinking, "Hey, it\'s the guy',
 'credit-date': 'November 08, 2010',
 'hier1': '',
 'thumb': 'http://cdn.teamcococdn.com/image/650x650,fram

In [7]:
# Ids of records flagged as duplicates; they are left out of the corpus.
EXCLUDE_IDS = {88572, 99457, 99483}

# Collect the joke body of every record that is not excluded.
jokes_conan = []
for record in data_conan:
    if record["id"] in EXCLUDE_IDS:
        continue
    jokes_conan.append(record["body"])

# Report the corpus size and preview the first five jokes.
preview = "\n\n".join(jokes_conan[:5])
print("total: {}\n".format(len(jokes_conan)))
print(preview)

total: 10041

Yes, I know what you guys are thinking, "Hey, it's the guy from Twitter."

I'm glad to be on cable. The truth is, I've dreamed of being a talk show host on basic cable ever since I was 46.

I'm happy to report that we're already #1 in TBS's key demographic—people who can't afford HBO.

It's not easy doing a late-night show on a channel without a lot of money and that viewers have trouble finding. So that's why I left NBC.

That's right—the whitest man in show business is back…on the second blackest channel on TV.


In [8]:
# Working set of jokes for the rest of the notebook. Only the Conan corpus is
# used for now; to also include the host-filtered jokes, build the list as
# [*jokes_conan, *jokes_filtered] instead.
#
# Shuffle a copy: `jokes = jokes_conan` would merely alias the list, so the
# in-place shuffle would also reorder `jokes_conan` and make earlier cells'
# displayed previews stale on re-run.
jokes = list(jokes_conan)
random.shuffle(jokes)

In [9]:
def _normalize_joke(raw):
    """Return `raw` NFKD-normalized, with em dashes written as "--" and
    surrounding whitespace stripped.

    NOTE: the double-space replacement is a single pass, so a run of three or
    more spaces is shortened but not fully collapsed.
    """
    text = unicodedata.normalize("NFKD", raw)
    text = text.replace("  ", " ")
    text = text.replace("—", "--")
    return text.strip()


# Histogram of sentence counts, plus the cleaned entries used downstream.
counts = defaultdict(int)
jokes_cleaned = []

for raw in jokes:
    # Skip any joke whose text contains the literal string "Conan".
    if "Conan" in raw:
        continue

    text = _normalize_joke(raw)

    # Let spaCy segment the cleaned joke into sentences.
    doc = nlp(text)
    sents = [span.text for span in doc.sents]
    n_sents = len(sents)

    jokes_cleaned.append({"text": text, "sentences": sents, "sentence_ct": n_sents})
    counts[n_sents] += 1

print(counts)

defaultdict(<class 'int'>, {2: 8783, 3: 347, 1: 844, 4: 20, 5: 6, 7: 1})


In [10]:
# sample jokes with given num of sentences
sentence_ct = 2
filtered = [j["text"] for j in jokes_cleaned if j["sentence_ct"] == sentence_ct]

print("\n\n".join(random.sample(filtered, 5)))

Over the weekend, President Trump was in Mississippi speaking at the Civil Rights Museum, and he called civil rights "big, big stuff." To be fair, Trump was just paraphrasing Dr. King’s famous speech, "I Have A Thing."

Whole Foods has started selling rabbit meat? That’s great, I was looking for a place to buy way-too-expensive rabbit meat.

Google has banned a porn app for Google Glass. A Google spokesman said, “We’re a principled company, so if you want to look at smut and filth, you’ll just have to use our website.”

A doctor is being accused of illegal lion hunting in Zimbabwe. Here’s my question: whatever happened to golf?

After his speech, President Obama surprised Clinton by walking on stage and giving him a hug. Out of habit, when Bill Clinton saw a woman's husband coming at him he threw his shoe and hit in a closet.


In [11]:
# Upper bound on the number of training examples written to the file.
MAX_ENTRIES = 5000
SYSTEM_PROMPT = "You are a creative and hilarious comedy writer that loves to craft jokes"

fname = "./data/training-latest.jsonl"

# Punchlines already written. The full text is stored (rather than a short
# truncated hash) so that two different punchlines can never be mistaken for
# duplicates because of a hash collision.
seen = set()

# Number of examples actually written so far.
i = 0

# Format and output training data: one chat example per line, with the first
# sentence as the user "setup" and the remaining sentences as the assistant
# "punchline".
with jsonlines.open(fname, mode="w") as f:
    for joke in jokes_cleaned:
        if i >= MAX_ENTRIES:
            break

        # don't use single sentence or really long jokes
        ct = joke["sentence_ct"]
        if ct < 2 or ct > 4:
            continue

        setup = joke["sentences"][0]
        punchline = " ".join(joke["sentences"][1:])

        # Skip repeated punchlines; skipped jokes do not count toward
        # MAX_ENTRIES because the counter is only bumped after a write.
        if punchline in seen:
            continue
        seen.add(punchline)

        f.write({
            "messages": [
                { "role": "system", "content": SYSTEM_PROMPT },
                { "role": "user", "content": setup },
                { "role": "assistant", "content": punchline }
            ]
        })
        i += 1

In [None]:
# Upload the training file for fine-tuning. The file is opened in a `with`
# block so the handle is closed once the upload call returns, instead of being
# left open for the lifetime of the kernel.
with open("./data/training-latest.jsonl", "rb") as training_fh:
    upload_response = openai.File.create(
        file=training_fh,
        purpose='fine-tune'
    )

# Display the API response as the cell output.
upload_response

In [None]:
# Placeholder: replace with the id of the uploaded training file
# (presumably taken from the upload response -- confirm).
TRAINING_FILE = "FILL_IN"

# Start a fine-tuning job on the base chat model and display the response.
fine_tuning_job = openai.FineTuningJob.create(
    model="gpt-3.5-turbo",
    training_file=TRAINING_FILE,
)
fine_tuning_job

In [None]:
# Placeholder: replace with the id of the fine-tuning job to inspect.
JOB_ID = "FILL_IN"

# Fetch the job record and display it as the cell output.
job_status = openai.FineTuningJob.retrieve(JOB_ID)
job_status

In [43]:
# Placeholder: replace with the name of the fine-tuned model to query.
MODEL_ID = 'FILL_IN'

# Only sample jokes that have at least two sentences. A one-sentence joke has
# no punchline, so its "setup" would be the entire joke and there would be
# nothing to compare the model's completions against.
test_candidates = [j for j in jokes_cleaned if j["sentence_ct"] >= 2]
test_entry = random.choice(test_candidates)
print("ORIGINAL JOKE:\n{}\n".format(test_entry["text"]))

# The first sentence is the setup, mirroring how the training examples
# split each joke into a user message and an assistant message.
prompt = test_entry["sentences"][0]
print("PROMPT:\n{}".format(prompt))

messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": prompt},
]

# Request three alternative completions for the same setup.
response = openai.ChatCompletion.create(
    model=MODEL_ID,
    messages=messages,
    max_tokens=150,
    temperature=0.5,
    n=3,
)

print("MODEL RESPONSE:\n")
print(
    "\n\n".join(
        "{}: {}".format(rank, choice.message["content"])
        for rank, choice in enumerate(response["choices"], start=1)
    )
)

ORIGINAL JOKE:
Everyone’s OK, but a rapper has been arrested for allegedly stabbing an NFL player. The rapper is being charged with “impersonating an NFL player.”

PROMPT:
Everyone’s OK, but a rapper has been arrested for allegedly stabbing an NFL player.
MODEL RESPONSE:

1: The rapper is being charged with "keeping it too real."

2: The rapper is facing charges of assault and being a rapper.

3: The rapper is being charged with "being a rapper."
