In [1]:
# Importing required libraries
import json
import tensorflow as tf
import requests
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Get the dataset
srcsm_json = requests.get('https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json')
# Inspecting the data, print 450 characters
print(srcsm_json.text[0:450])

[
{"article_link": "https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5", "headline": "former versace store clerk sues over secret 'black code' for minority shoppers", "is_sarcastic": 0},
{"article_link": "https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365", "headline": "the 'roseanne' revival catches up to our thorny political mood, for better and worse", "is_sarcastic": 0},


In [4]:
# Separate the json into sentences and labels
sentences = []
labels = []
urls = []
for item in srcsm_json.json():
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])
print(pd.DataFrame({'sentence' : sentences[0:10], 'label':labels[0:10],'urls':labels[0:10]}))

                                            sentence  label  urls
0  former versace store clerk sues over secret 'b...      0     0
1  the 'roseanne' revival catches up to our thorn...      0     0
2  mom starting to fear son's web series closest ...      1     1
3  boehner just wants wife to listen, not come up...      1     1
4  j.k. rowling wishes snape happy birthday in th...      0     0
5                        advancing the world's women      0     0
6     the fascinating case for eating lab-grown meat      0     0
7  this ceo will send your kids to school, if you...      0     0
8  top snake handler leaves sinking huckabee camp...      1     1
9  friday's morning email: inside trump's presser...      0     0


In [5]:
# Splitting the dataset into Train and Test
training_size = round(len(sentences) * .75)
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]
# Setting tokenizer properties
vocab_size = 10000
oov_tok = "<oov>"
# Fit the tokenizer on Training data
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
# Setting the padding properties
max_length = 100
trunc_type='post'
padding_type='post'
# Creating padded sequences from train and test data
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [6]:
# Setting the model parameters
embedding_dim = 16
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Converting the lists to numpy arrays for Tensorflow 2.x
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)
# Training the model
num_epochs = 30
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=2)

Epoch 1/30
626/626 - 5s - loss: 0.6662 - accuracy: 0.5764 - val_loss: 0.5957 - val_accuracy: 0.7428
Epoch 2/30
626/626 - 2s - loss: 0.4566 - accuracy: 0.8234 - val_loss: 0.4023 - val_accuracy: 0.8287
Epoch 3/30
626/626 - 2s - loss: 0.3250 - accuracy: 0.8744 - val_loss: 0.3599 - val_accuracy: 0.8496
Epoch 4/30
626/626 - 2s - loss: 0.2709 - accuracy: 0.8948 - val_loss: 0.3775 - val_accuracy: 0.8290
Epoch 5/30
626/626 - 2s - loss: 0.2336 - accuracy: 0.9098 - val_loss: 0.3431 - val_accuracy: 0.8553
Epoch 6/30
626/626 - 2s - loss: 0.2053 - accuracy: 0.9217 - val_loss: 0.3497 - val_accuracy: 0.8520
Epoch 7/30
626/626 - 2s - loss: 0.1827 - accuracy: 0.9316 - val_loss: 0.3569 - val_accuracy: 0.8564
Epoch 8/30
626/626 - 2s - loss: 0.1647 - accuracy: 0.9375 - val_loss: 0.3668 - val_accuracy: 0.8555
Epoch 9/30
626/626 - 2s - loss: 0.1473 - accuracy: 0.9468 - val_loss: 0.3834 - val_accuracy: 0.8564
Epoch 10/30
626/626 - 2s - loss: 0.1345 - accuracy: 0.9528 - val_loss: 0.4080 - val_accuracy: 0.8495

In [8]:
sentence = ["Coworkers At Bathroom Sink Locked In Tense Standoff Over Who Going To Wash Hands Longer", 
            "Spiking U.S. coronavirus cases could force rationing decisions similar to those made in Italy, China."]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(model.predict(padded))

[[0.9996165 ]
 [0.20619968]]
