In [2]:
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Small toy corpus used to demonstrate word-level tokenization.
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

# Build the tokenizer with an explicit out-of-vocabulary placeholder, then
# learn the vocabulary from the toy corpus.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=100)
tokenizer.fit_on_texts(sentences)

# word_index: word -> integer id; sequences: each sentence as a list of ids.
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)

print(f"Word Index = { word_index}")
print(f"Sequences = { sequences}")

Word Index = {'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
Sequences = [[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]


In [4]:
# Bring every encoded sentence to exactly 5 tokens: with "post" padding and
# "post" truncating, short rows gain trailing zeros and long rows are cut at
# the end.
padded = pad_sequences(sequences, maxlen=5, padding="post", truncating="post")

# Header line followed by the padded matrix on the next line.
print("padded sequences:", padded, sep="\n")

padded sequences:
[[5 3 2 4 0]
 [5 3 2 7 0]
 [6 3 2 4 0]
 [8 6 9 2 4]]


In [5]:
# Sentences that include words absent from the toy corpus the tokenizer
# was fitted on ("really", "loves", "manatee").
test_data = [
    "i really love my dog",
    "my dog loves my manatee",
]

# Encode with the already-fitted tokenizer (no re-fitting here).
test_seq = tokenizer.texts_to_sequences(test_data)
print(f"Test Sequences = { test_seq}")

# Post-pad / post-truncate to a fixed width of 7.
# NOTE: this rebinds `padded`, replacing the value from the previous cell.
padded = pad_sequences(test_seq, padding="post", truncating="post", maxlen=7)

print("padded test sequences")
print(padded)

Test Sequences = [[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]
padded test sequences
[[5 1 3 2 4 0 0]
 [2 4 1 2 1 0 0]]


In [6]:
# Load the sarcasm headlines dataset from the working directory.
# The encoding is given explicitly: JSON text is UTF-8, whereas open() would
# otherwise fall back to the platform's locale encoding, which is not UTF-8
# on every system and can mis-decode or reject non-ASCII characters.
with open("./sarcasm.json", "r", encoding="utf-8") as f:
    datastore = json.load(f)

# Peek at two records (the first one and the one at index 20000) to inspect
# their fields.
print(datastore[0])
print(datastore[20000])

{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}
{'article_link': 'https://www.theonion.com/pediatricians-announce-2011-newborns-are-ugliest-babies-1819572977', 'headline': 'pediatricians announce 2011 newborns are ugliest babies in 30 years', 'is_sarcastic': 1}


In [8]:
# Split the loaded records into two parallel lists, index-aligned with
# `datastore`: the headline text and its "is_sarcastic" value.
# NOTE: this rebinds `sentences`, replacing the toy corpus defined earlier.
sentences = [item["headline"] for item in datastore]
labels = [item["is_sarcastic"] for item in datastore]

In [9]:
# --- Compute -----------------------------------------------------------
# Fit a fresh tokenizer on the headline corpus; only the OOV placeholder is
# configured this time (no num_words argument).
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode every headline and post-pad. No maxlen is passed, so pad_sequences
# chooses the row width itself.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding="post")

# --- Report ------------------------------------------------------------
print(f"number of words in word_index: { len( word_index)}")

# Show one headline next to its encoded, padded row.
print(f"sample headline: { sentences[ 2]}")
print(f"padded sequence: { padded[ 2]}")

print(f"shape of padded sequences: { padded.shape}")

number of words in word_index: 29657
sample headline: mom starting to fear son's web series closest thing she will have to grandchild
padded sequence: [  145   838     2   907  1749  2093   582  4719   221   143    39    46
     2 10736     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
shape of padded sequences: (26709, 40)
