In [2]:
import pandas as pd

# Show up to 200 characters per cell so long jokes are readable in the output.
# The fully-qualified option name is used: abbreviated names rely on pattern
# matching and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', 200)

In [3]:
# Load the short-jokes CSV (path is relative to the notebook's working directory).
data = pd.read_csv('data/shortjokes.csv')

In [4]:
# Display the loaded table; the rendered output shows the ID and Joke columns.
data

Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""I can't hear what they're saying cuz I'm talking"""
1,2,"Telling my daughter garlic is good for you. Good immune system and keeps pests away.Ticks, mosquitos, vampires... men."
2,3,I've been going through a really rough period at work this week It's my own fault for swapping my tampax for sand paper.
3,4,"If I could have dinner with anyone, dead or alive... ...I would choose alive. -B.J. Novak-"
4,5,Two guys walk into a bar. The third guy ducks.
...,...,...
231652,231653,The Spicy Sausage by Delia Katessen
231653,231654,"TIL That I Shouldn't have gone to law school, because everyone in /r/news already has their law degree"
231654,231655,What did the RAM stick say to the politician? I'm PC2!
231655,231656,what do you call a play about victorian era menstruation? A period piece!


In [5]:
# Work on an independent copy of the raw frame. `pd.DataFrame(data)` does not
# copy a DataFrame input by default, so columns added to `full_df` later could
# show up in (or depend on) `data`; an explicit copy keeps the raw data intact
# and makes the cells below safe to re-run.
full_df = data.copy()

full_df.head(20)

Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""I can't hear what they're saying cuz I'm talking"""
1,2,"Telling my daughter garlic is good for you. Good immune system and keeps pests away.Ticks, mosquitos, vampires... men."
2,3,I've been going through a really rough period at work this week It's my own fault for swapping my tampax for sand paper.
3,4,"If I could have dinner with anyone, dead or alive... ...I would choose alive. -B.J. Novak-"
4,5,Two guys walk into a bar. The third guy ducks.
5,6,Why can't Barbie get pregnant? Because Ken comes in a different box. Heyooooooo
6,7,Why was the musician arrested? He got in treble.
7,8,Did you hear about the guy who blew his entire lottery winnings on a limousine? He had nothing left to chauffeur it.
8,9,What do you do if a bird shits on your car? Don't ask her out again.
9,10,He was a real gentlemen and always opened the fridge door for me


In [6]:
def clean_jokes(joke_text):
    """Return ``joke_text`` converted to ``str`` and lower-cased.

    :param joke_text: The raw joke; any object is accepted because it is
        passed through ``str()`` first.
    :return: The lower-cased string form of ``joke_text``.
    """
    as_text = str(joke_text)
    return as_text.lower()

In [7]:
# Add a `clean_jokes` column holding the lower-cased text of every joke.
full_df['clean_jokes'] = full_df['Joke'].apply(clean_jokes)

In [8]:
# Check the new column side by side with the original jokes.
full_df.head(20)

Unnamed: 0,ID,Joke,clean_jokes
0,1,"[me narrating a documentary about narrators] ""I can't hear what they're saying cuz I'm talking""","[me narrating a documentary about narrators] ""i can't hear what they're saying cuz i'm talking"""
1,2,"Telling my daughter garlic is good for you. Good immune system and keeps pests away.Ticks, mosquitos, vampires... men.","telling my daughter garlic is good for you. good immune system and keeps pests away.ticks, mosquitos, vampires... men."
2,3,I've been going through a really rough period at work this week It's my own fault for swapping my tampax for sand paper.,i've been going through a really rough period at work this week it's my own fault for swapping my tampax for sand paper.
3,4,"If I could have dinner with anyone, dead or alive... ...I would choose alive. -B.J. Novak-","if i could have dinner with anyone, dead or alive... ...i would choose alive. -b.j. novak-"
4,5,Two guys walk into a bar. The third guy ducks.,two guys walk into a bar. the third guy ducks.
5,6,Why can't Barbie get pregnant? Because Ken comes in a different box. Heyooooooo,why can't barbie get pregnant? because ken comes in a different box. heyooooooo
6,7,Why was the musician arrested? He got in treble.,why was the musician arrested? he got in treble.
7,8,Did you hear about the guy who blew his entire lottery winnings on a limousine? He had nothing left to chauffeur it.,did you hear about the guy who blew his entire lottery winnings on a limousine? he had nothing left to chauffeur it.
8,9,What do you do if a bird shits on your car? Don't ask her out again.,what do you do if a bird shits on your car? don't ask her out again.
9,10,He was a real gentlemen and always opened the fridge door for me,he was a real gentlemen and always opened the fridge door for me


In [11]:
# Draw a fixed-seed random sample of 20,000 cleaned jokes for training.
cleaned_column = 'clean_jokes'
train_jokes_df = full_df[cleaned_column].sample(n=20000, random_state=42)

# Every row whose index label was not sampled above goes to the test set;
# only the cleaned-text column is kept.
test_jokes_df = full_df.drop(train_jokes_df.index)[cleaned_column]

In [12]:
# Preview the sampled training jokes (index labels are the original row labels).
train_jokes_df.head()

4510                                                                                                                   what do all battered women have in common? they don't listen.
161744    who invented the north america? teacher: sarah, go to the map and find north america. sarah: here it is. teacher: correct. now class, who discovered america?class: sarah!
155603                           i feel like this election ended up being a good thing for hilary clinton. at least now she knows what it feels like to get fucked by the president.
100593                                                                                                                               what do you call a pile of kittens? a meowntain
23208                                                                                i feel bad for people named john smith. they probably didn't get the gmail account they wanted.
Name: clean_jokes, dtype: object

In [13]:
# Preview the jokes that were left for testing.
test_jokes_df.head()

0                             [me narrating a documentary about narrators] "i can't hear what they're saying cuz i'm talking"
1      telling my daughter garlic is good for you. good immune system and keeps pests away.ticks, mosquitos, vampires... men.
2    i've been going through a really rough period at work this week it's my own fault for swapping my tampax for sand paper.
3                                  if i could have dinner with anyone, dead or alive... ...i would choose alive. -b.j. novak-
4                                                                              two guys walk into a bar. the third guy ducks.
Name: clean_jokes, dtype: object

In [14]:
# Report how many jokes ended up in each split.
print(f'Size of train jokes dataset = {len(train_jokes_df)}')
print(f'Size of test jokes dataset = {len(test_jokes_df)}')

Size of train jokes dataset = 20000
Size of test jokes dataset = 211657


In [16]:
# Create the short jokes train and test datasets as plain-text files.
# The raw joke strings are joined with newlines rather than written through
# `Series.to_string()`: that method is a display formatter, so it pads rows to
# a common width and may shorten long values according to pandas display
# options, which would make the saved corpus differ from the cleaned jokes.
with open('train_jokes.txt', mode='w', encoding='utf-8') as f:
    f.write('\n'.join(train_jokes_df))

with open('test_jokes.txt', mode='w', encoding='utf-8') as f:
    f.write('\n'.join(test_jokes_df))

In [17]:
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

In [18]:
try: # Use the default NLTK tokenizer.
    from nltk import word_tokenize, sent_tokenize
    # Testing whether it works.
    # Sometimes it doesn't work on some machines because of setup issues.
    word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])
except Exception: # Use a naive sentence tokenizer and toktok.
    # `Exception` (not a bare `except:`) keeps the best-effort fallback for any
    # setup failure while still letting KeyboardInterrupt/SystemExit through.
    import re
    from nltk.tokenize import ToktokTokenizer

    # See https://stackoverflow.com/a/25736515/610569
    # NOTE(review): this pattern only splits before an upper-case letter, so it
    # will not split text that has already been lower-cased.
    def sent_tokenize(x):
        """Naively split ``x`` into sentences after '.' or '?' followed by spaces."""
        return re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)

    # Use the toktok tokenizer that requires no dependencies.
    toktok = ToktokTokenizer()
    word_tokenize = toktok.tokenize

In [19]:
import os
import requests
import io #codecs

# Read the whole training corpus back into a single string.
# The built-in `open` is the same function as `io.open` in Python 3.
with open('train_jokes.txt', mode='r', encoding='utf-8') as fin:
    text = fin.read()

In [20]:
# Split the corpus into sentences, then each sentence into lower-cased word
# tokens, giving a list of token lists.
tokenized_text = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(text)
]

In [24]:
# Dump the first 1000 tokenized sentences for inspection (very long output).
print(tokenized_text[:1000])

tions', ',', 'i', "'ll", 'never', 'dine', 'with', 'a', 'russian', 'again', '.'], ['they', 'ca', "n't", 'stop', 'talking', 'about', 'going', 'putin-free', '.'], ['a', 'vegan', ',', 'a', 'priest', ',', '&', 'a', 'rabbi', 'walk', 'into', 'a', 'bar', '.'], ['the', 'vegan', 'pretty', 'much', 'just', 'blabs', 'on', '&', 'on', 'about', 'how', 'he', "'s", 'a', 'vegan', 'for', 'the', 'next', 'hour', '.'], ['the', 'end', '.'], ['why', 'did', "n't", 'the', 'skeleton', 'go', 'for', 'the', 'halloween', 'party', '?'], ['he', 'had', '``', 'no', 'body', '``', 'to', 'go', 'with', '.'], ['hilary', 'clinton', 'will', 'be', 'the', 'first', 'f', 'president', 'of', 'the', 'united', 'states', '...', '.', 'i', 'was', 'gon', 'na', 'say', 'female', 'but', 'someone', 'deleted', 'the', 'emale', '.'], ['i', 'made', 'a', 'joke', 'for', 'my', 'dog', 'tonight', '...', 'why', 'did', 'the', 'cat', 'cross', 'the', 'road', '?'], ['splat', '*clap', 'my', 'hands*', '.'], ['it', 'did', "n't", '!', '!'], ['(', 'my', 'dog', '

In [25]:
from nltk.lm.preprocessing import padded_everygram_pipeline

# Preprocess the tokenized text for 3-gram language modelling.
# `n` is the highest n-gram order and is reused when the model is constructed.
# NOTE(review): NLTK documents both return values as lazy, single-use
# iterators — confirm; if so, re-run this cell before fitting the model again.
n = 3
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

In [26]:
from nltk.lm import MLE
model = MLE(n) # Create an (unfitted) 3-gram MLE model; n was set to 3 in the previous cell.

In [27]:
# Vocabulary size before fitting; the output shows it is still empty.
len(model.vocab)

0

In [28]:
# Fit the model on the preprocessed training data, then print the vocabulary
# summary to confirm it has been populated.
model.fit(train_data, padded_sents)
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 23394 items>


In [29]:
# Vocabulary size after fitting.
len(model.vocab)

23394

In [30]:
# Summarize the n-gram counts collected during fitting.
print(model.counts)

<NgramCounter with 3 ngram orders and 1531080 ngrams>


In [31]:
# Generate 20 raw tokens with a fixed random seed; padding tokens such as
# '<s>' are still present in this output.
print(model.generate(20, random_seed=321))

['<s>', '<s>', 'me', ':', 'yes', "ma'am", 'they', 'ca', "n't", 'recall', 'where', 'i', 'ran', 'over', 'your', 'own', 'tweets', ',', 'you', 'will']


In [32]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Callable used below to join a list of generated tokens back into one string.
detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(model, num_words, random_seed=42):
    """
    Generate text from the model and return it as one detokenized string.

    '<s>' padding tokens are skipped and the output is cut at the first
    '</s>' token, so the result may contain fewer than `num_words` words.

    :param model: An ngram language model from `nltk.lm.model`.
    :param num_words: Max no. of words to generate.
    :param random_seed: Seed value for random.
    :return: The kept tokens joined into a string by the module-level
        `detokenize` callable.
    """
    tokens = model.generate(num_words, random_seed=random_seed)
    # `generate` hands back a bare string instead of a list when a single word
    # is requested; wrap it so the loop walks tokens rather than characters.
    if isinstance(tokens, str):
        tokens = [tokens]

    content = []
    for token in tokens:
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)

In [33]:
# Print one generated sentence (up to 20 tokens) for each seed from 321 to 329.
for i in range(321, 330):
    print(generate_sent(model, 20, random_seed=i))

me: yes ma'am they can't recall where i ran over your own tweets, you will
the second most popular family car in a lightbulb?
no one.
hiding my shit i just stopped at a cafe and said``no thank-you", then most facebook status
fund.
: checking your phone when i discovered a contraceptive better than sex i mostly hit 3's and copy exactly
the smallest region in the breakroom, i'm cold and dry ;) guess who's there?
is such a rip-off.
sorry, i can't steven hawking runs.. ..on batteries.
