In [1]:
# Import Dependencies
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Download the dataset
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O ./sarcasm.json

--2020-08-16 12:09:50--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 2607:f8b0:4006:806::2010, 2607:f8b0:4006:807::2010, 2607:f8b0:4006:81b::2010, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|2607:f8b0:4006:806::2010|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘./sarcasm.json’


2020-08-16 12:09:50 (12.4 MB/s) - ‘./sarcasm.json’ saved [5643545/5643545]



In [3]:
# Load the dataset
# JSON text is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's default encoding (which is not UTF-8 on every OS).
with open("./sarcasm.json", 'r', encoding='utf-8') as f:
    datastore = json.load(f)

In [4]:
# Peek at the first record to see which fields each entry carries
datastore[0]

{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
 'is_sarcastic': 0}

In [5]:
# Split every record into three parallel lists: headline text, sarcasm label and article URL
sentences = [record['headline'] for record in datastore]
labels = [record['is_sarcastic'] for record in datastore]
urls = [record['article_link'] for record in datastore]

In [6]:
# Tokenize the text
# oov_token reserves a placeholder entry for out-of-vocabulary words, i.e. words
# that were not seen when the tokenizer was fitted (see the Keras Tokenizer docs).
tokenizer = Tokenizer(oov_token='<OOV>')
# Build the vocabulary from all of the headlines.
tokenizer.fit_on_texts(sentences)
# word_index is the word -> integer index mapping learned during fitting.
word_idx = tokenizer.word_index

In [7]:
# Vocabulary size: number of distinct entries in the word -> index mapping
# NOTE(review): this count presumably includes the '<OOV>' placeholder entry — confirm.
len(word_idx)

29657

In [8]:
# Encode each headline as a list of integer word indices, then show the first
# headline followed by its encoded form
sequences = tokenizer.texts_to_sequences(sentences)
print(sentences[0], sequences[0], sep='\n')

former versace store clerk sues over secret 'black code' for minority shoppers
[308, 15115, 679, 3337, 2298, 48, 382, 2576, 15116, 6, 2577, 8434]


In [9]:
# Pad the encoded headlines to a common length; padding='post' puts the zeros
# after the real tokens. Show the first headline next to its padded encoding.
padded = pad_sequences(sequences, padding='post')
print(sentences[0], padded[0], sep='\n')

former versace store clerk sues over secret 'black code' for minority shoppers
[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]


In [10]:
# Shape of the encoded dataset: (number of sequences, padded sequence length)
padded.shape

(26709, 40)

Here we see that we have 26709 sentences, each encoded as a sequence of 40 integer tokens. The sequence length is determined by the longest sentence in the dataset: every shorter sentence is padded up to that length, with the missing positions filled with 0s (placed at the end, because `padding='post'` was used).