In [1]:
import json
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# --- Text-preprocessing / training configuration ---

# Vocabulary cap handed to the tokenizer (`num_words`): only the most
# frequent words are kept, everything else is treated as out-of-vocabulary.
vocab_size = 10_000
# Length of the vector each word is embedded into.
embedding_dim = 16
# Every headline is padded or truncated to exactly this many tokens.
max_length = 100
# Sequences longer than `max_length` lose tokens from the END ('post').
trunc_type = 'post'
# Sequences shorter than `max_length` get zeros appended at the END ('post').
padding_type = 'post'
# Placeholder token substituted for any word outside the vocabulary.
oov_tok = "<OOV>"
# Number of leading examples used for training; the rest are held out for testing.
training_size = 20_000

In [6]:
# Library allows python to download data from the internet
import requests

# Download the sarcasm-headlines dataset and keep a copy in the working directory.
url = "https://storage.googleapis.com/learning-datasets/sarcasm.json"
# requests has no default timeout; without one a stalled connection would
# hang this cell indefinitely.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error (404, 500, ...) instead of saving the error page
# as "sarcasm.json" and failing later with a confusing JSON decode error.
response.raise_for_status()
with open("sarcasm.json", 'wb') as f:  # saves in current folder
    f.write(response.content)

# Load JSON data. JSON is decoded explicitly as UTF-8 so the result does not
# depend on the platform's default text encoding.
with open("sarcasm.json", 'r', encoding="utf-8") as f:
    datastore = json.load(f)

# Prepare sentences and labels: two parallel lists, one entry per record,
# taken from each record's 'headline' and 'is_sarcastic' fields.
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]

print("Number of samples:", len(sentences))
print("Example headline:", sentences[0], "| Label:", labels[0])

Number of samples: 26709
Example headline: former versace store clerk sues over secret 'black code' for minority shoppers | Label: 0


In [7]:
# Positional hold-out split: the first `training_size` examples are used for
# training and everything after them for testing (no shuffling is applied).
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

In [8]:
# Fit the tokenizer on the TRAINING headlines only, so the vocabulary is
# learned without looking at the test split.
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)
tokenizer.fit_on_texts(training_sentences)

# Word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index

# Training split: headlines -> integer sequences -> fixed-length rows.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Test split: encoded with the same tokenizer and the same padding settings.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [9]:
import numpy as np
# Materialise inputs and labels as NumPy arrays (the label lists in particular
# are plain Python lists up to this point).
training_padded, training_labels = np.array(training_padded), np.array(training_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)