In [1]:
import pandas as pd

In [2]:
sarcasm_headlines = pd.read_json("../data/sarcasm-headlines.json", lines=True)

sentences = sarcasm_headlines['headline'].tolist()
labels = sarcasm_headlines['is_sarcastic'].tolist()
urls = sarcasm_headlines['article_link'].tolist()

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
training_sentences, testing_sentences, training_labels, testing_labels = train_test_split(
    sentences,
    labels,
    test_size=0.20,
    random_state=42
)

In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 

In [6]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, padding='post', truncating='post', maxlen=40)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, padding='post', truncating='post', maxlen=40)

In [7]:
len(word_index)

26536

In [8]:
from tensorflow.keras import layers

In [16]:
embedding_dim = 64
vocab_size = len(word_index) + 1

model = keras.Sequential([
    layers.Embedding(vocab_size, embedding_dim),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(32)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 64)          1698368   
                                                                 
 bidirectional_2 (Bidirectio  (None, None, 128)        66048     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,809,857
Trainable params: 1,809,857
No

In [12]:
training_padded = tf.constant(training_padded)
testing_padded = tf.constant(testing_padded)
training_labels = tf.constant(training_labels)
testing_labels = tf.constant(testing_labels)

In [18]:
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=1e-4,
    patience=5,
    verbose=1,
    restore_best_weights=True
)

history = model.fit(
    training_padded, 
    training_labels, 
    epochs=30, 
    validation_data=(testing_padded, testing_labels), 
    verbose=0,
    callbacks=[early_stopping]
)

Restoring model weights from the end of the best epoch: 1.
Epoch 6: early stopping


In [19]:
sentence = [
    "Oh great, another Monday—just what I needed!",
    "I’m excited to start my new project today."
]

sequences = tokenizer.texts_to_sequences(sentence)

padded = pad_sequences(sequences, padding='post', truncating='post', maxlen=40)

print(model.predict(padded))

[[0.24245593]
 [0.7388185 ]]


In [22]:
loss, acc = model.evaluate(testing_padded, testing_labels, verbose=0)
print(acc)

0.865218997001648
