In [1]:
import os
import json
import numpy as np

import re
import string
from nltk.stem import WordNetLemmatizer
from wordcloud import STOPWORDS

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [45]:
# LOAD DATA

# Root folder of the project; the empty string makes every path below
# relative to the notebook's working directory.
main_dir = ""
dataset_folder = "joke-dataset"
# <main_dir>/joke-dataset
jokes_dir = os.path.join(main_dir, dataset_folder)
# <main_dir>/joke-dataset/stupidstuff.json
jokes_path = os.path.join(jokes_dir, "stupidstuff.json")

def load(filepath):
    '''
    reads filepath and returns jokes and scores

    filepath: path to a JSON file containing a list of objects, each with a
              "body" entry (the joke text) and a numeric "rating" entry

    returns (jokes_data, scores_data): two lists in file order; every score is
    the rating passed through round(), i.e. an integer
    '''
    # JSON text is UTF-8; without an explicit encoding open() uses the
    # platform default, which can mangle or reject non-ASCII jokes.
    with open(filepath, encoding="utf-8") as f:
        records = json.load(f)

    jokes_data = [record["body"] for record in records]
    scores_data = [round(record["rating"]) for record in records]

    return jokes_data, scores_data

# jokes: joke texts; scores: their ratings rounded to integers (same order)
jokes, scores = load(jokes_path)

In [46]:
# LIMIT DATA

# Keep only the first `limit` jokes (and their scores) for the rest of the notebook.
# NOTE(review): the saved outputs further down show 70 samples per fold, which
# looks like a smaller limit than this one was in effect when they were
# produced - re-run the notebook to refresh them.
limit = 4000
jokes = jokes[:limit]
scores = scores[:limit]

In [4]:
# PREPROCESS DATA

def preprocess(X, lowercase=True, numbers=True, punctuation=True, whitespaces=True, stopwords=True, lemmatization=True):
    '''
    X: list of str (raw texts)
    lowercase: lowercase the text
    numbers: remove runs of digits
    punctuation: replace every punctuation character except ' with a space
    whitespaces: strip leading and trailing whitespace
    stopwords: drop wordcloud STOPWORDS and words of 2 characters or fewer
    lemmatization: lemmatize every word with nltk's WordNetLemmatizer

    returns list of list of words (one list of tokens per text, no empty tokens)
    '''
    # replace newlines and tabs with a space (str -> str)
    X = [re.sub(r"[\n\t]", " ", line) for line in X]

    # replace \" with " and \' with ' (str -> str)
    # plain str.replace on the literal backslash+quote pair: a regex such as
    # r"\'" only matches the bare quote and therefore changes nothing
    X = [line.replace("\\'", "'").replace('\\"', '"') for line in X]

    # lowercase (str -> str)
    if lowercase:
        X = [line.lower() for line in X]

    # remove numbers (str -> str)
    if numbers:
        X = [re.sub(r'\d+', '', line) for line in X]

    # replace ‘ and ’ with ' and replace “ and ” with " (str -> str)
    quote_table = str.maketrans({"’": "'", "‘": "'", "“": "\"", "”": "\""})
    X = [line.translate(quote_table) for line in X]

    # replace punctuation with space except ' (str -> str)
    # the apostrophe is kept for now so that contractions can still match the
    # stopword list below
    if punctuation:
        punctuation_table = {ord(char): ' ' for char in string.punctuation if char != "'"}
        X = [line.translate(punctuation_table) for line in X]

    # remove whitespaces (str -> str)
    if whitespaces:
        X = [line.strip() for line in X]

    # tokenization (str -> list)
    # split() without argument collapses runs of whitespace, so the spaces left
    # behind by the replacements above never produce empty tokens
    X = [line.split() for line in X]

    # remove stopwords (list -> list)
    if stopwords:
        stopword_set = set(STOPWORDS)
        X = [[word for word in line if len(word) > 2 and word not in stopword_set] for line in X]

    # remove ' (list -> list); a token made only of apostrophes is dropped
    X = [
        [cleaned for cleaned in (word.replace("'", "") for word in line) if cleaned]
        for line in X
    ]

    # Lemmatization (list -> list)
    if lemmatization:
        # a single lemmatizer instance is shared by every word
        lemmatizer = WordNetLemmatizer()
        X = [[lemmatizer.lemmatize(word) for word in line] for line in X]

    return X

# Preprocessing 1: every step enabled (these are preprocess's defaults).
jokes1 = preprocess(jokes)
# Preprocessing 2: same pipeline, but original casing and digits are kept.
jokes2 = preprocess(jokes, lowercase=False, numbers=False)

In [5]:
# BAG_OF_WORDS

def get_vocab(X):
    '''
    X: list of list of words

    returns vocabulary of X (list of the distinct words, sorted)
    '''
    # Sorting pins the order: iterating a set of strings can give a different
    # order on every interpreter start, which would shuffle the bag-of-words
    # columns between runs.
    return sorted(set().union(*X))

def bag_of_words(X, vocab):
    '''
    X: list of list of words
    vocab: list of words; position i of every returned vector counts vocab[i]

    returns list of bag_of_words of each sentence in X
    (one list of len(vocab) integer counts per sentence; words that are not in
    vocab are ignored and a sentence without any word gives [0] * len(vocab))
    '''
    length = len(vocab)

    # word -> column lookup built once; setdefault keeps the first position of
    # a word listed several times in vocab, exactly like vocab.index(word)
    word_to_index = {}
    for index, word in enumerate(vocab):
        word_to_index.setdefault(word, index)

    X_bag_of_words = []
    for sentence in X:
        # count occurrences directly instead of summing one-hot vectors
        sentence_bag_of_words = [0] * length
        for word in sentence:
            word_index = word_to_index.get(word)
            if word_index is not None:
                sentence_bag_of_words[word_index] += 1
        X_bag_of_words.append(sentence_bag_of_words)

    return X_bag_of_words

# Vocabularies must be built while jokes1 / jokes2 still hold token lists.
vocab1 = get_vocab(jokes1)
vocab2 = get_vocab(jokes2)

# WARNING: from here on jokes1 / jokes2 hold count vectors instead of tokens,
# so this cell is not idempotent - re-run the preprocessing cell before
# running it a second time.
jokes1 = bag_of_words(jokes1, vocab1)
jokes2 = bag_of_words(jokes2, vocab2)

In [6]:
# SPLIT DATA

def split(X, Y, split=0.3, random_state=109):
    '''
    X: list of samples
    Y: list of labels
    split: forwarded to train_test_split as test_size
           (default 0.3 -> 70% training and 30% test)
    random_state: seed forwarded to train_test_split

    returns the result of train_test_split(X, Y, ...)
    '''
    test_size = split
    return train_test_split(X, Y, random_state=random_state, test_size=test_size)

# One train/test partition per preprocessing variant, both against the same scores.
# NOTE(review): both calls use split's default random_state, so the two
# partitions presumably contain the same jokes - confirm if that matters.
X_train1, X_test1, Y_train1, Y_test1 = split(jokes1, scores, split=0.3)
X_train2, X_test2, Y_train2, Y_test2 = split(jokes2, scores, split=0.3)

In [62]:
# PREPARE MODELS

def cross_validation(model, X, Y, cv, scoring):
    '''
    thin wrapper around sklearn's cross_validate

    model: estimator to evaluate
    X, Y: samples and labels
    cv, scoring: forwarded unchanged to cross_validate

    returns the object returned by cross_validate
    '''
    results = cross_validate(model, X, Y, scoring=scoring, cv=cv)
    return results

def get_confusion_matrices(results):
    '''
    results: mapping holding one entry "test_{i}{j}" per confusion matrix cell
             (i and j in 0..5), each a sequence with one value per fold

    returns numpy array of shape (number of folds, 6, 6); element [k][i][j]
    is results["test_{i}{j}"][k]
    '''
    n_classes = 6
    # The fold count is read from the results themselves rather than from the
    # module-level `iterations`, so the function also works when it is called
    # before that variable exists or with a different number of folds.
    n_folds = len(results["test_00"])
    confusion_matrices = np.array([
        [
            [
                results[f"test_{i}{j}"][k] for j in range(n_classes)
            ] for i in range(n_classes)
        ] for k in range(n_folds)
    ])
    return confusion_matrices

def show_results(results):
    '''
    results: cross validation results containing the confusion matrix cells
             plus "test_accuracy", "test_recall", "test_precision", "test_f1"

    prints the confusion matrices followed by the four metrics
    returns the confusion matrices
    '''
    confusion_matrices = get_confusion_matrices(results)
    print('confusion matrices :')
    print(confusion_matrices)
    # same order as the printed report: accuracy, recall, precision, f1
    for metric in ('accuracy', 'recall', 'precision', 'f1'):
        print(f'{metric} = {results["test_" + metric]}')
    return confusion_matrices

def conf_mat(y_true, y_pred, i, j):
    '''
    y_true: true labels
    y_pred: predicted labels
    i, j: row and column of the cell to extract

    returns cell [i, j] of the confusion matrix computed over labels 0..5
    '''
    labels = list(range(6))
    matrix = confusion_matrix(y_true, y_pred, labels=labels)
    return matrix[i, j]

def make_score(metric):
    '''
    metric: metric function accepting an `average` keyword

    returns a scorer built with make_scorer that calls metric with average='macro'
    '''
    macro_scorer = make_scorer(metric, average='macro')
    return macro_scorer

# One estimator per (model, preprocessing) pair. The decision trees get the
# same fixed random_state as the random forests so that a re-run of the
# notebook gives the same trees and the same scores.
dec_tree1 = DecisionTreeClassifier(random_state=6)
rand_forest1 = RandomForestClassifier(random_state=6)
log_reg1 = LogisticRegression()
dec_tree2 = DecisionTreeClassifier(random_state=6)
rand_forest2 = RandomForestClassifier(random_state=6)
log_reg2 = LogisticRegression()

# passed as `cv` to cross_validation in the model cells
iterations = 3

# one scorer per confusion matrix cell, keyed '<i><j>' (row, column)
scoring = {
    f'{i}{j}' : make_scorer(conf_mat, i=i, j=j) for i in range(6) for j in range(6)
}

# global metrics; precision, recall and f1 are macro-averaged (see make_score)
scoring['accuracy'] = make_scorer(accuracy_score)
scoring['precision'] = make_score(precision_score)
scoring['recall'] = make_score(recall_score)
scoring['f1'] = make_score(f1_score)

In [63]:
# PREPROCESSING 1
# MODEL 1 : Decision Tree

# cross-validate on the training set of preprocessing 1 and print the per-fold scores
dec_tree1_results = cross_validation(dec_tree1, X_train1, Y_train1, iterations, scoring)
dec_tree1_confusion_matrices = show_results(dec_tree1_results)
# fit on the whole training set so the model can be used in the prediction cell
dec_tree1 = dec_tree1.fit(X_train1, Y_train1)

confusion matrices :
[[[ 0  0  0  0  0  0]
  [ 0  0  0  0  0  0]
  [ 0  0  2  2  4  0]
  [ 0  0  4  6 10  1]
  [ 0  1  1 11 19  2]
  [ 0  0  2  4  1  0]]

 [[ 0  0  0  0  0  0]
  [ 0  0  0  0  0  1]
  [ 0  0  1  5  2  0]
  [ 0  0  3  9  8  1]
  [ 0  0  1  7 21  4]
  [ 0  0  0  4  3  0]]

 [[ 0  0  0  0  0  0]
  [ 0  0  0  1  0  0]
  [ 0  0  2  3  3  0]
  [ 0  0  1 10  8  1]
  [ 0  0  1 13 20  0]
  [ 0  0  1  2  4  0]]]
accuracy = [0.38571429 0.44285714 0.45714286]
recall = [0.21890756 0.23798701 0.26764706]
precision = [0.20838306 0.23552941 0.26325123]
f1 = [0.21336898 0.23440323 0.25911314]


In [64]:
# PREPROCESSING 1
# MODEL 2 : Random Forest

# cross-validate on the training set of preprocessing 1 and print the per-fold scores
rand_forest1_results = cross_validation(rand_forest1, X_train1, Y_train1, iterations, scoring)
rand_forest1_confusion_matrices = show_results(rand_forest1_results)
# fit on the whole training set so the model can be used in the prediction cell
rand_forest1 = rand_forest1.fit(X_train1, Y_train1)

confusion matrices :
[[[ 0  0  0  0  0  0]
  [ 0  0  0  0  0  0]
  [ 0  0  0  1  7  0]
  [ 0  0  0  1 20  0]
  [ 0  0  0  2 32  0]
  [ 0  0  0  0  7  0]]

 [[ 0  0  0  0  0  0]
  [ 0  0  0  0  1  0]
  [ 0  0  0  0  8  0]
  [ 0  0  0  0 21  0]
  [ 0  0  0  1 32  0]
  [ 0  0  0  0  7  0]]

 [[ 0  0  0  0  0  0]
  [ 0  0  0  0  1  0]
  [ 0  0  0  2  6  0]
  [ 0  0  0  2 18  0]
  [ 0  0  0  5 29  0]
  [ 0  0  0  2  5  0]]]
accuracy = [0.47142857 0.45714286 0.44285714]
recall = [0.24719888 0.19393939 0.19058824]
precision = [0.18371212 0.09275362 0.13466872]
f1 = [0.18       0.1254902  0.15053763]


In [65]:
# PREPROCESSING 1
# MODEL 3 : Logistic Regression

# cross-validate on the training set of preprocessing 1 and print the per-fold scores
log_reg1_results = cross_validation(log_reg1, X_train1, Y_train1, iterations, scoring)
log_reg1_confusion_matrices = show_results(log_reg1_results)
# fit on the whole training set so the model can be used in the prediction cell
log_reg1 = log_reg1.fit(X_train1, Y_train1)

confusion matrices :
[[[ 0  0  0  0  0  0]
  [ 0  0  0  0  0  0]
  [ 0  0  0  5  3  0]
  [ 0  0  0  5 15  1]
  [ 0  0  0  8 25  1]
  [ 0  0  1  1  5  0]]

 [[ 0  0  0  0  0  0]
  [ 0  0  0  0  1  0]
  [ 0  0  0  2  5  1]
  [ 0  0  1  4 16  0]
  [ 0  0  0 12 20  1]
  [ 0  0  2  0  5  0]]

 [[ 0  0  0  0  0  0]
  [ 0  0  0  1  0  0]
  [ 0  0  1  5  2  0]
  [ 0  0  0  6 14  0]
  [ 0  0  0 14 19  1]
  [ 0  0  0  5  2  0]]]
accuracy = [0.42857143 0.34285714 0.37142857]
recall = [0.24334734 0.15930736 0.19676471]
precision = [0.19599781 0.12955083 0.34141238]
f1 = [0.21493902 0.14102564 0.19854552]


In [66]:
# PREPROCESSING 2
# MODEL 1 : Decision Tree

# cross-validate on the training set of preprocessing 2 and print the per-fold scores
dec_tree2_results = cross_validation(dec_tree2, X_train2, Y_train2, iterations, scoring)
dec_tree2_confusion_matrices = show_results(dec_tree2_results)
# fit on the whole training set so the model can be used in the prediction cell
dec_tree2 = dec_tree2.fit(X_train2, Y_train2)

confusion matrices :
[[[ 0  0  0  0  0  0]
  [ 0  0  0  0  0  0]
  [ 0  1  0  4  2  1]
  [ 0  0  3  9  8  1]
  [ 0  0  2  9 19  4]
  [ 0  0  1  3  3  0]]

 [[ 0  0  0  0  0  0]
  [ 0  0  0  0  1  0]
  [ 0  0  1  5  2  0]
  [ 0  0  2  9  6  4]
  [ 0  0  1 11 15  6]
  [ 0  0  0  3  4  0]]

 [[ 0  0  0  0  0  0]
  [ 0  0  0  1  0  0]
  [ 0  0  2  5  1  0]
  [ 0  0  1  9  7  3]
  [ 0  0  1 15 14  4]
  [ 0  0  0  2  4  1]]]
accuracy = [0.4        0.35714286 0.37142857]
recall = [0.19747899 0.20162338 0.25092437]
precision = [0.19075    0.22142857 0.28894231]
f1 = [0.19341238 0.20516338 0.25589744]


In [67]:
# PREPROCESSING 2
# MODEL 2 : Random Forest

# cross-validate on the training set of preprocessing 2 and print the per-fold scores
rand_forest2_results = cross_validation(rand_forest2, X_train2, Y_train2, iterations, scoring)
rand_forest2_confusion_matrices = show_results(rand_forest2_results)
# fit on the whole training set so the model can be used in the prediction cell
rand_forest2 = rand_forest2.fit(X_train2, Y_train2)

confusion matrices :
[[[ 0  0  0  0  0  0]
  [ 0  0  0  0  0  0]
  [ 0  0  0  2  6  0]
  [ 0  0  0  1 20  0]
  [ 0  0  0  0 34  0]
  [ 0  0  0  1  6  0]]

 [[ 0  0  0  0  0  0]
  [ 0  0  0  0  1  0]
  [ 0  0  0  1  7  0]
  [ 0  0  0  1 20  0]
  [ 0  0  0  1 32  0]
  [ 0  0  0  1  6  0]]

 [[ 0  0  0  0  0  0]
  [ 0  0  0  0  1  0]
  [ 0  0  0  3  5  0]
  [ 0  0  0  5 15  0]
  [ 0  0  0  9 25  0]
  [ 0  0  0  3  4  0]]]
accuracy = [0.5        0.47142857 0.42857143]
recall = [0.26190476 0.2034632  0.19705882]
precision = [0.19128788 0.1469697  0.15      ]
f1 = [0.19       0.14529293 0.16904762]


In [34]:
# PREPROCESSING 2
# MODEL 3 : Logistic Regression

# cross-validate on the training set of preprocessing 2 and print the per-fold scores
log_reg2_results = cross_validation(log_reg2, X_train2, Y_train2, iterations, scoring)
log_reg2_confusion_matrices = show_results(log_reg2_results)
# fit on the whole training set so the model can be used in the prediction cell
log_reg2 = log_reg2.fit(X_train2, Y_train2)

confusion matrices :
[[[ 0  0  0  0  0  0]
  [ 0  0  0  0  0  0]
  [ 0  0  0  5  3  0]
  [ 0  0  0  4 17  0]
  [ 0  0  0  9 24  1]
  [ 0  0  0  3  4  0]]

 [[ 0  0  0  0  0  0]
  [ 0  0  0  0  1  0]
  [ 0  0  0  3  4  1]
  [ 0  0  0  4 17  0]
  [ 0  0  1  9 23  0]
  [ 0  0  1  0  6  0]]

 [[ 0  0  0  0  0  0]
  [ 0  0  0  1  0  0]
  [ 0  0  1  5  2  0]
  [ 0  0  0  7 13  0]
  [ 0  0  0 13 20  1]
  [ 0  0  0  3  3  1]]]
accuracy = [0.4        0.38571429 0.41428571]
recall = [0.4        0.38571429 0.41428571]
precision = [0.4        0.38571429 0.41428571]
f1 = [0.4        0.38571429 0.41428571]


In [61]:
# ENTER YOUR JOKE TO TEST IT ON THE DIFFERENT MODELS

JOKE = ''

# EXAMPLES
# JOKE = 'What do you call a sleeping bull? A bulldozer!'
# JOKE = 'What has four wheels and flies? A garbage truck!'
# # Score = 2
# JOKE = jokes[7]
# JOKE = jokes[42]
# # Score = 3
# JOKE = jokes[0]
# JOKE = jokes[2]
# JOKE = jokes[11]

# Each family of models must see the joke through the same preprocessing that
# produced its vocabulary: vocab1 comes from fully normalised text, vocab2
# from text that kept its casing and digits.
joke1 = preprocess([JOKE], lowercase=True, numbers=True, punctuation=True, whitespaces=True, stopwords=True, lemmatization=True)
joke2 = preprocess([JOKE], lowercase=False, numbers=False, punctuation=True, whitespaces=True, stopwords=True, lemmatization=True)
# Dedicated names: storing these vectors in jokes1 / jokes2 would overwrite the
# training matrices and break any later re-run of the split cell.
joke1_bag_of_words = bag_of_words(joke1, vocab1)
joke2_bag_of_words = bag_of_words(joke2, vocab2)
# model 1 : decision tree
print("decision tree : ", dec_tree1.predict(joke1_bag_of_words)[0])
print("decision tree : ", dec_tree2.predict(joke2_bag_of_words)[0])
# model 2 : random forest
print("random forest : ", rand_forest1.predict(joke1_bag_of_words)[0])
print("random forest : ", rand_forest2.predict(joke2_bag_of_words)[0])
# model 3 : logistic regression
print("logistic regression : ", log_reg1.predict(joke1_bag_of_words)[0])
print("logistic regression : ", log_reg2.predict(joke2_bag_of_words)[0])

decision tree :  4
decision tree :  4
random forest :  4
random forest :  4
logistic regression :  4
logistic regression :  3
