In [None]:
# Download the sarcasm headlines JSON file and store it as /tmp/sarcasm.json.
# NOTE(review): --no-check-certificate turns off TLS certificate verification
# for this download; confirm it is really required before keeping it.
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json

In [None]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow_datasets as tfds
import json

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Example

In [None]:
# Two tiny sentences that share most of their words, used to demonstrate the
# tokenizer in the cells below.
sentences = ['I love dogs and dont love cats', 'Cats dont love dogs']

In [None]:
# Fit a tokenizer on the example sentences; words it has not seen are mapped
# to the '<OOV>' token.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=None)
tokenizer.fit_on_texts(sentences)

# Encode the training sentences and one sentence containing an unseen word.
sequences = tokenizer.texts_to_sequences(sentences)
unseen_sequences = tokenizer.texts_to_sequences(['I love cars'])

print(tokenizer.word_index)
print(sequences)
print(unseen_sequences)  ## unknown word cars -> <OOV> -> 1

In [None]:
# Bring every example sequence to exactly 5 entries, padding and truncating
# at the end of the sequence.
padded = pad_sequences(sequences, maxlen=5, padding='post', truncating='post')
padded

## Load dataset

In [None]:
# Read the downloaded file; the code below treats it as a sequence of records
# with 'headline', 'is_sarcastic' and 'article_link' keys.
with open('/tmp/sarcasm.json', mode='r', encoding='utf-8') as json_file:
    datastore = json.load(json_file)

# Split the records into three parallel lists.
sentences = [record['headline'] for record in datastore]
labels = [record['is_sarcastic'] for record in datastore]
urls = [record['article_link'] for record in datastore]

In [None]:
# Build the vocabulary from every headline and report how many words it holds.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(len(word_index))

# Encode the headlines as integer sequences, pad them at the end into one
# array, then show the first row and the overall shape.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')
for summary in (padded[0], padded.shape):
    print(summary)

## IMDB reviews

In [None]:
# Load the IMDB reviews dataset in supervised (input, label) form, together
# with its metadata object.
imdb, info = tfds.load(name='imdb_reviews', as_supervised=True, with_info=True)

In [None]:
# Pick out the two splits used in the rest of the notebook and display them.
train_data = imdb['train']
test_data = imdb['test']
train_data, test_data

## Preprocess data

In [None]:
# Materialise both splits as plain Python lists: each text is decoded from
# UTF-8 bytes to str, each label is converted with .numpy().
train_sent, train_labels = [], []
for text_tensor, label_tensor in train_data:
    train_sent.append(text_tensor.numpy().decode('UTF-8'))
    train_labels.append(label_tensor.numpy())

test_sent, test_labels = [], []
for text_tensor, label_tensor in test_data:
    test_sent.append(text_tensor.numpy().decode('UTF-8'))
    test_labels.append(label_tensor.numpy())

# Peek at the first two training examples.
train_sent[:2], train_labels[:2]

In [None]:
# The vocabulary is fitted on the training texts only; the same tokenizer is
# then used to encode both splits.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=10000)
tokenizer.fit_on_texts(train_sent)

# Both splits share the same sequence length and end-truncation settings.
pad_options = dict(maxlen=120, truncating='post')

train_seq = tokenizer.texts_to_sequences(train_sent)
train_padded = pad_sequences(train_seq, **pad_options)

test_seq = tokenizer.texts_to_sequences(test_sent)
test_padded = pad_sequences(test_seq, **pad_options)

## Create model

In [None]:
def create_model(vocab_size=10000, embedding_dim=16, max_length=120):
    """Build the (uncompiled) binary text classifier used in this notebook.

    The network is an embedding layer, two stacked LSTM layers that both
    return their full output sequence, a flatten step, a small ReLU dense
    layer and a single sigmoid output unit.

    Args:
        vocab_size: Number of rows in the embedding table. Should match the
            `num_words` given to the tokenizer that produced the inputs.
        embedding_dim: Size of each embedding vector.
        max_length: Length of the padded input sequences.

    Returns:
        A `tf.keras.Sequential` model; the caller is responsible for
        compiling and fitting it.
    """
    return tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.LSTM(16, return_sequences=True),
        # The second LSTM also keeps every time step so that Flatten receives
        # the whole sequence of outputs.
        tf.keras.layers.LSTM(16, return_sequences=True),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(8, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
model = create_model()
model.summary()

In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# The labels were collected as Python lists, so convert them to arrays first.
train_targets = np.array(train_labels)
test_targets = np.array(test_labels)

# Train for 10 epochs, evaluating on the test split after each one.
model.fit(
    train_padded,
    train_targets,
    validation_data=(test_padded, test_targets),
    epochs=10,
)

In [None]:
# The model's first layer is the embedding; its first weight array is the
# embedding matrix itself.
embedding_layer = model.layers[0]
weights = embedding_layer.get_weights()[0]
weights.shape

In [None]:
# Show the embedding row stored at index 0.
weights[0]

In [None]:
import io

# Invert the tokenizer's word -> index mapping so that each embedding row can
# be labelled with the word it belongs to.
reverse_word_index = {index: word for word, index in tokenizer.word_index.items()}

# Export starts at index 1 (row 0 is skipped) and stops at whichever runs out
# first: rows in the embedding matrix or words known to the tokenizer.
# Assumes word indices run contiguously from 1 — TODO confirm if the
# tokenizer implementation changes.
last_index = min(weights.shape[0], len(reverse_word_index) + 1)

# vecs.tsv gets one tab-separated embedding vector per line; meta.tsv gets the
# matching word on the same line number. The `with` block guarantees both
# files are closed even if writing fails part-way through.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(1, last_index):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [None]:
# Encode one ad-hoc sentence with the object currently bound to `tokenizer`
# and print the resulting list of index sequences.
sentence = "I really think this is amazing. honest."
sequence = tokenizer.texts_to_sequences([sentence])
print(sequence)

## Use a generator instead of arrays
Reference: [TensorFlow tutorial — encode text lines as numbers](https://www.tensorflow.org/tutorials/load_data/text#encode_text_lines_as_numbers)

In [None]:
# Display the two dataset splits again before building a vocabulary from them.
train_data, test_data

In [None]:
# Newer TensorFlow Datasets releases expose the text helpers under
# `tfds.deprecated.text`; older ones only have `tfds.features.text`.
# Use whichever is available so the cell runs on both.
try:
    tfds_text = tfds.deprecated.text
except AttributeError:
    tfds_text = tfds.features.text

# NOTE: this rebinds `tokenizer`, which earlier cells used for the Keras
# Tokenizer; re-run those cells before using the Keras one again.
tokenizer = tfds_text.Tokenizer()

# Collect the set of distinct tokens that occur in the training texts.
vocabulary_set = set()
for text_tensor, label in train_data:
    some_tokens = tokenizer.tokenize(text_tensor.numpy().decode('UTF-8'))
    vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)
vocab_size

In [None]:
# Newer TensorFlow Datasets releases expose the text helpers under
# `tfds.deprecated.text`; older ones only have `tfds.features.text`.
# Use whichever is available so the cell runs on both.
try:
    tfds_text = tfds.deprecated.text
except AttributeError:
    tfds_text = tfds.features.text

# Build an encoder over the collected vocabulary, then fetch the text of the
# first training example to try it on.
encoder = tfds_text.TokenTextEncoder(vocabulary_set)
example_text = next(iter(train_data))[0].numpy()
print(example_text)

In [None]:
# Run the sample text through the encoder and print the encoded result.
encoded_example = encoder.encode(example_text)
print(encoded_example)

In [None]:
# Round-trip a short string through encode/decode to see what the encoder preserves.
encoder.decode(encoder.encode('Hello, Friend'))