In [1]:
import pandas as pd
import numpy as np
import os
import re # regex
import shutil
import string
import nltk

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, confusion_matrix, auc, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import *
from tensorflow.keras import Model, Input, Sequential
from datetime import datetime
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import *
from tensorflow.keras.utils import plot_model

print(tf.__version__)

2.8.0-rc0


# Read in the data #

In [2]:
test_dataset = pd.read_json('./data/Sarcasm_Headlines_Dataset.json', lines=True)
test_dataset.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [3]:
train_dataset = pd.read_json('./data/Sarcasm_Headlines_Dataset_v2.json', lines=True)
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB


# Basic data cleanup and column removal #

In [4]:
train_dataset = train_dataset.drop('article_link', axis=1)

In [5]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
dtypes: int64(1), object(1)
memory usage: 447.3+ KB


# Pre-processing #
1. contractions
2. stop words
3. lowercase
4. stemming
5. tokenize

Q: Do we need to do all of these?

### Contractions ###

In [6]:
def remove_contractions(sentence):
    sentence = re.sub(r"won\'t", "will not", sentence)
    sentence = re.sub(r"can\'t", "can not", sentence)
    sentence = re.sub(r"n\'t", " not", sentence)
    sentence = re.sub(r"\'re", " are", sentence)
    sentence = re.sub(r"\'s", " is", sentence)
    sentence = re.sub(r"\'d", " would", sentence)
    sentence = re.sub(r"\'ll", " will", sentence)
    sentence = re.sub(r"\'t", " not", sentence)
    sentence = re.sub(r"\'ve", " have", sentence)
    sentence = re.sub(r"\'m", " am", sentence)
    return sentence.lower()

In [7]:
train_dataset["headline"].apply(remove_contractions)
test_dataset["headline"].apply(remove_contractions)

0        former versace store clerk sues over secret 'b...
1        the 'roseanne' revival catches up to our thorn...
2        mom starting to fear son is web series closest...
3        boehner just wants wife to listen, not come up...
4        j.k. rowling wishes snape happy birthday in th...
                               ...                        
26704                 american politics in moral free-fall
26705                             america is best 20 hikes
26706                                reparations and obama
26707    israeli ban targeting boycott supporters raise...
26708                    gourmet gifts for the foodie 2014
Name: headline, Length: 26709, dtype: object

In [8]:
y_train = train_dataset["is_sarcastic"]
y_test = test_dataset["is_sarcastic"]

In [9]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
dtypes: int64(1), object(1)
memory usage: 447.3+ KB


### Stop Words ###

In [10]:
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer 
from nltk.tokenize import RegexpTokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andraszolyomi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### One way of doing it: ####

In [11]:
# stopwords = stopwords.words('english')
# stemmer = SnowballStemmer('english')
# tokenizer = RegexpTokenizer(r'\w+')

# def preprocess_text(sentence, stem = False):
#     text = []
#     for word in sentence:
        
#         if word not in stopwords:
            
#             if stem:
#                 text.append(stemmer.stem(word).lower())
#             else:
#                 text.append(word.lower())
#     return tokenizer.tokenize(" ".join(text))

In [12]:
print(train_dataset["headline"][0])

thirtysomething scientists unveil doomsday clock of hair loss


## Tokenize ##

In [13]:
t = Tokenizer()
t.fit_on_texts(train_dataset["headline"])

encoded_train = t.texts_to_sequences(train_dataset["headline"])
encoded_test = t.texts_to_sequences(test_dataset["headline"])

max_length = 25

padded_train = pad_sequences(encoded_train, 
    maxlen = max_length, 
    padding = "post", 
    truncating = "post")

padded_test = pad_sequences(encoded_test, 
    maxlen = max_length, 
    padding = "post", 
    truncating = "post")

print(padded_train.shape, padded_test.shape, type(padded_train))

vocab_size = len(t.word_index) + 1
vocab_size

(28619, 25) (26709, 25) <class 'numpy.ndarray'>


30885

In [14]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip -q glove.6B.zip

In [15]:
path_to_glove_file = "./glove/glove.6B.100d.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [16]:
num_tokens = vocab_size + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 24737 words (6147 misses)


In [17]:
earlystop = EarlyStopping(monitor = "val_accuracy", 
                          patience = 7, 
                          verbose = 1,  
                          restore_best_weights = True, 
                          mode = 'max')

reduce_lr = ReduceLROnPlateau(monitor = "val_accuracy", 
                              factor = .4642,
                              patience = 3,
                              verbose = 1, 
                              min_delta = 0.001,
                              mode = 'max')

In [18]:
input = Input(shape = (max_length, ), name = "input")

embedding = Embedding(input_dim = vocab_size + 2, 
                      output_dim = 100, 
                      weights = [embedding_matrix], 
                      trainable = False)(input)

lstm = LSTM(32)(embedding)
flatten = Flatten()(lstm)

dense = Dense(16, activation = None, 
              kernel_initializer = "he_uniform")(flatten)

dropout = Dropout(.25)(dense)
activation = Activation("relu")(dropout)
output = Dense(2, activation = "softmax", name = "output")(activation)
model = Model(inputs = input, outputs = output)

model.compile(optimizer = "adam", loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])

model.summary()

2022-01-13 15:31:31.850871: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 25)]              0         
                                                                 
 embedding (Embedding)       (None, 25, 100)           3088700   
                                                                 
 lstm (LSTM)                 (None, 32)                17024     
                                                                 
 flatten (Flatten)           (None, 32)                0         
                                                                 
 dense (Dense)               (None, 16)                528       
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 activation (Activation)     (None, 16)                0     

In [20]:
model.fit(padded_train, y_train, 
        validation_data = (padded_test, y_test), 
        epochs = 30, 
        batch_size = 32)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x15785e9b0>

In [31]:
y_pred_softmax = model.predict(padded_test)
y_pred = []
for i in range(len(y_pred_softmax)):
    if  y_pred_softmax[i][1] >= 0.6:
        y_pred.append(1)
    else:
        y_pred.append(0)
        
print("Accuracy:", 100*accuracy_score(y_test, y_pred))

Accuracy: 98.44996068740875


In [23]:
exported_model = tf.keras.Sequential([
    model,
    tf.keras.layers.Activation('sigmoid')
])

exported_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy, optimizer="adam", metrics=['accuracy']
)

In [24]:
def standardize(input_data):
    lowercase = tf.strings.lower(input_data)
    decontracted = remove_contractions(input_data)
    sequenced = t.texts_to_sequences([input_data])
    padded_sequenced = pad_sequences(
        sequenced,
        maxlen=max_length,
        padding = "post", 
        truncating = "post")
    return padded_sequenced
    

In [32]:
examples = [
    "Breakthrough Procedure Allows Surgeons To Transplant Pig Rib Directly Into Human Mouth",
    "Jan. 6 Committee Seeks Interview With Kevin McCarthy",
    "CDC Shortens COVID Isolation Guidelines to One Pump Up Song on Way to Work",
    "Report: Snickers Basically Protein Bar",
    "Crappy Music Has Helped Moron Through Hardest Times In His Pointless Life",
]


standardized = map(standardize, examples)

prediction = exported_model.predict(standardized)

for i in range(len(prediction)):
    if (prediction[i][1] >= 0.6):
        print("I'm pretty sure that's sarcastic...")
    else:
        print("I buy it!")

I'm pretty sure that's sarcastic...
I buy it!
I'm pretty sure that's sarcastic...
I'm pretty sure that's sarcastic...
I'm pretty sure that's sarcastic...
