In [92]:
import pickle
import pandas as pd
import re

In [3]:
# Link to original dataset: https://github.com/CrowdTruth/Short-Text-Corpus-For-Humor-Detection/blob/master/datasets/humorous_oneliners.pickle
# The copy obtained through git has its Unix newlines ('\n') rewritten as DOS
# newlines ('\r\n'), so it is converted back before it is loaded.

from library.dos2unixfile import dos2unixfile

# NOTE(review): arguments are presumably (source path, destination path) --
# confirm against library.dos2unixfile.
crlf_pickle_path = "datasets/original_humorous_oneliners.pickle"
lf_pickle_path = "datasets/humorous_oneliners.pickle"
dos2unixfile(crlf_pickle_path, lf_pickle_path)

Done. Saved 10502 bytes.


In [79]:
# Load the newline-converted pickle. compression=None reads the file as-is
# instead of inferring a compression scheme from the file name.
# SECURITY: unpickling can execute arbitrary code -- only load this file from a
# trusted source.
jokes_dataL = pd.read_pickle("datasets/humorous_oneliners.pickle", compression=None)
# Wrap the loaded object in a single-column DataFrame; the column is named "joke".
# NOTE(review): jokes_dataL is presumably a flat list of strings -- confirm.
jokes_data = pd.DataFrame(jokes_dataL, columns = ["joke"])

In [80]:
# Show each joke in full instead of truncating long cells. None means
# "no width limit"; the former sentinel -1 has been deprecated since pandas 1.0
# and is no longer accepted by pandas 2.x.
pd.set_option('display.max_colwidth', None)
jokes_data.head(10)

Unnamed: 0,joke
0,"Insanity is hereditary, - You get it from your children."
1,"An honest politician is one who, when bought, stays bought."
2,"You can tune a piano, but you can't tuna fish."
3,A closed mouth gathers no foot.
4,What's black and white and red all over? An embarassed zebra.
5,What's black and white and red all over? Certainly not the Halifax newspapers.
6,Gravity doesn't exist: the earth sucks.
7,What's the most popular form of birth control? The headache.
8,He who laughs last probably doesn't understand the joke.
9,Let him who takes the plunge remember to return it by Tuesday.


In [269]:
# data preprocessing

# Inclusive bounds on the character length of a joke that is kept.
MIN_JOKE_LEN = 10
MAX_JOKE_LEN = 200

def regex_match(sentence):
    """Tell whether ``sentence`` uses only the accepted character set.

    The accepted set is the contiguous ASCII range from ' ' (0x20) to 'z'
    (0x7A): space, digits, letters and most punctuation. The characters
    '{', '|', '}', '~', control characters and all non-ASCII characters are
    rejected.

    Args:
        sentence: the string to check.

    Returns:
        True if ``sentence`` is non-empty and every character is in the
        accepted range, otherwise False.
    """
    # fullmatch anchors the whole string. The earlier ``^...$`` form also
    # accepted a string ending in '\n', because ``$`` matches just before a
    # trailing newline.
    return re.fullmatch(r"[ -z]+", sentence) is not None

print("Number of jokes originally: ", jokes_data.shape)

# Normalise the raw text, in this order:
#   1. drop a leading hyphen, map odd characters to plain ASCII equivalents,
#      spell out '&' and collapse every whitespace run to a single space;
#   2. lowercase and strip leading/trailing whitespace.
# Regex patterns containing backslashes are raw strings so that escapes such as
# \s reach the regex engine instead of being parsed as (invalid) Python string
# escapes.
clean_jokes_data = (jokes_data
                    .replace('^-', '', regex=True)
                    # Inside a character class '|' is a literal, so pipes are
                    # replaced with a space here as well as tab/CR/LF.
                    .replace(r'[\t\r\n|]', ' ', regex=True)
                    .replace(r'\\', ' ', regex=True)             # single backslash
                    .replace("`|´", "'", regex=True)
                    .replace("\u2018|\u2019", "'", regex=True)   # curly single quotes
                    .replace("\u201C|\u201D", '"', regex=True)   # curly double quotes
                    .replace("–", '-', regex=True)
                    .replace("…", '...', regex=True)
                    .replace("&", " and ", regex=True)
                    .replace(r"\s+", ' ', regex=True)
                    # Vectorised per column; avoids DataFrame.applymap, which is
                    # deprecated since pandas 2.1.
                    .apply(lambda column: column.str.lower().str.strip()))

# Keep only jokes made of accepted characters whose length lies within
# [MIN_JOKE_LEN, MAX_JOKE_LEN] (both bounds inclusive).
clean_jokes_data = clean_jokes_data[
    clean_jokes_data["joke"].apply(regex_match) &
    clean_jokes_data["joke"].str.len().between(MIN_JOKE_LEN, MAX_JOKE_LEN)]

print("Number of jokes after cleaning: ", clean_jokes_data.shape)
clean_jokes_data.head(10)

Number of jokes originally:  (5251, 1)
Number of jokes after cleaning:  (5218, 1)


Unnamed: 0,joke
0,"insanity is hereditary, - you get it from your children."
1,"an honest politician is one who, when bought, stays bought."
2,"you can tune a piano, but you can't tuna fish."
3,a closed mouth gathers no foot.
4,what's black and white and red all over? an embarassed zebra.
5,what's black and white and red all over? certainly not the halifax newspapers.
6,gravity doesn't exist: the earth sucks.
7,what's the most popular form of birth control? the headache.
8,he who laughs last probably doesn't understand the joke.
9,let him who takes the plunge remember to return it by tuesday.


In [271]:
# Build the sorted list of distinct characters that occur in the cleaned jokes.
all_joke_text = clean_jokes_data["joke"].str.cat(sep='')
unique_chars = sorted(set(all_joke_text))
print(unique_chars)
print("Number of unique chars = ", len(unique_chars))

[' ', '!', '"', '#', '$', '%', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '@', '^', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Number of unique chars =  58
