# Imports

In [266]:
# handle math and data
import numpy as np
import pandas as pd
import time
import math

# to plot nice figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# handle files
import os
import sys
import joblib

# hash table classes
from collections import Counter

# output
import tqdm

# deep learning
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import tensorflow_addons as tfa
import tensorflow_hub as hub

# Setup

In [2]:
SEED = 69

K = keras.backend

AUTO = tf.data.AUTOTUNE

def reset_backend():
    """Reset Keras' global state and re-seed the NumPy and TensorFlow RNGs.

    Both generators are seeded with the module-level ``SEED`` constant, so
    every call puts them back into the same starting state.
    """
    keras.backend.clear_session()  # same module as the ``K`` alias above
    for seed_rng in (np.random.seed, tf.random.set_seed):
        seed_rng(SEED)

# Learning

## Generate Shakespearean Text With Character RNN

### Get Data

In [3]:
DOWNLOAD_URL = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", DOWNLOAD_URL)

Downloading data from https://homl.info/shakespeare


In [4]:
filepath

'/Users/calvinhuang/.keras/datasets/shakespeare.txt'

In [7]:
# Pin the encoding so the read does not depend on the platform's default
# locale encoding (the vocabulary printed further down is ASCII-only, which
# UTF-8 decodes unchanged).
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [8]:
len(shakespeare_text)

1115394

Over 1 million characters.

Let's encode our text, basically char to indices.

In [9]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)

Now we can use the tokenizer.

Wrap the text in a list: `texts_to_sequences` treats its argument as a collection of texts and encodes each element separately. If you pass a bare string, every character is treated as its own text, so you get back a list of per-character lists instead of a single sequence.

In [36]:
tokenizer.texts_to_sequences(["Hi! My name is Calvin!\n"])

[[7, 6, 31, 1, 15, 16, 1, 10, 5, 15, 2, 1, 6, 8, 1, 19, 5, 12, 26, 6, 10, 31, 11]]

In [37]:
tokenizer.sequences_to_texts([[7, 6, 31, 1, 15, 16, 1, 10, 5, 15, 2, 1, 6, 8, 1, 19, 5, 12, 26, 6, 10, 31, 11]])

['h i !   m y   n a m e   i s   c a l v i n ! \n']

In [40]:
tokenizer.word_index

{' ': 1,
 'e': 2,
 't': 3,
 'o': 4,
 'a': 5,
 'i': 6,
 'h': 7,
 's': 8,
 'r': 9,
 'n': 10,
 '\n': 11,
 'l': 12,
 'd': 13,
 'u': 14,
 'm': 15,
 'y': 16,
 'w': 17,
 ',': 18,
 'c': 19,
 'f': 20,
 'g': 21,
 'b': 22,
 'p': 23,
 ':': 24,
 'k': 25,
 'v': 26,
 '.': 27,
 "'": 28,
 ';': 29,
 '?': 30,
 '!': 31,
 '-': 32,
 'j': 33,
 'q': 34,
 'x': 35,
 'z': 36,
 '3': 37,
 '&': 38,
 '$': 39}

In [41]:
max_id = len(tokenizer.word_index)
max_id

39

In [42]:
dataset_size = tokenizer.document_count
dataset_size

1115394

So the tokenizer can be fit to text and it will create a word_index hashtable aka dict for each char or word. Then you can easily map texts to sequences and vice versa. Let's create our encoded data now.

Subtract 1 so that our encoded indices run from 0 to 38.

In [48]:
# texts_to_sequences returns one id sequence per input text; we pass a single
# text, so take row 0 and shift the ids down by one (1..39 -> 0..38).
encoded = np.array(tokenizer.texts_to_sequences([shakespeare_text]))[0] - 1

In [49]:
encoded.shape

(1115394,)

In [50]:
np.unique(encoded)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38])

Now let's create our efficient datasets.

### Create Datasets

- Set train set to first 90% of text
- Create dataset with windows of 100 characters
    - Window size of 101 because target is next char
- Flat map with batch on windows
- Shuffle
- Batch dataset
- Split input and target data
- Apply one hot encoding to inputs
- Prefetch

In [59]:
train_size = dataset_size * 90 // 100
train_size

1003854

In [65]:
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

Now let's get windows of data.

In [66]:
n_steps = 100
window_length = n_steps + 1
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

This creates a dataset of window datasets (yes, each window is itself a dataset). To get a single flat dataset whose elements are tensors, we use `flat_map` and batch each window to its own size, so every window becomes one tensor of `window_length` ids.

In [68]:
dataset = dataset.flat_map(lambda window: window.batch(window_length))

Now let's shuffle our windows and then batch them.

In [69]:
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)

Let's split our data into input and target sequences.

Shape of our data: `(batch_size, 101)`

In [70]:
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

Now let's apply one hot to our inputs.

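Shape of our data before one-hot encoding: inputs `(batch_size, 100)`, targets `(batch_size, 100)`. After one-hot encoding, the inputs become `(batch_size, 100, max_id)` (i.e. `(batch_size, 100, 39)`); the targets stay as integer ids.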

In [72]:
dataset = dataset.map(lambda X_batch, Y_batch:
                          (tf.one_hot(X_batch, max_id), Y_batch))

Finally, let's prefetch.

In [74]:
dataset = dataset.prefetch(1)

### Model Arch.

Let's use two GRU layers with 128 units each for long term memory. Add 20% dropout to both the inputs and the hidden states to regularize the model, which is effectively very deep: it runs through 100 time steps, i.e. 100 layers when unrolled. Finally, we have a `TimeDistributed` dense layer (which means it is applied at every time step, not just to the last output) that outputs 39 probabilities, one for each of the 39 possible characters, using a softmax activation.

- 2 GRU layers, 128 units, 20% dropout
- Dense layer, 39 units, softmax

In [None]:
# Stateless character-level RNN: two stacked GRU layers followed by a
# per-time-step softmax over the max_id character classes.
model = keras.models.Sequential([
    # input_shape=[None, max_id]: variable-length sequences of one-hot vectors
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True,
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, 
                                                    activation="softmax"))
])

# Targets are left as integer character ids (only the inputs are one-hot
# encoded in the dataset pipeline), hence the sparse loss.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer="nadam",
              metrics=["accuracy"])

history = model.fit(dataset, epochs=10)

This model takes way too long to train, let's use a stateful model and just copy the weights over.

### Stateful Model

Stateful means copying the last hidden state of the previous iteration over to the new iteration instead of using 0s.

To do this we will need our batches to be consecutive, an easy way is to have batches of size 1. Also our windows will need to have a shift of n_steps (not n_steps + 1 since we only add 1 for the outputs).

In [80]:
# Stateful training needs consecutive windows: no shuffling, one window per
# batch, and a shift of n_steps between the starts of successive windows.
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_length))
dataset = dataset.batch(1) # no shuffle and batch of size 1
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:])) # batches are just one window
dataset = dataset.map(lambda X_batch, Y_batch: 
                      (tf.one_hot(X_batch, max_id), Y_batch)) # again, batches are just one window
# prefetch() returns a new dataset rather than modifying this one in place, so
# the result has to be assigned or the prefetching is silently discarded.
dataset = dataset.prefetch(1)
dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(None, None, 39), dtype=tf.float32, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

But then we won't be able to take advantage of vectorization performance improvements.

So, let's do this:
- Split our data into 32 chunks in order
- Window each dataset
- Stack the datasets
    - Each element should be a (32, 101) tensor
    - The first element is the first 101 steps or first window of the 32 datasets
    - No need to batch now, since it's already in batch shape after stacking
- Split input and target
- Apply one hot 
- Prefetch

In [89]:
batch_size = 32

# Cut the training text into batch_size consecutive chunks; each chunk feeds
# one row of every batch, so row i of successive batches walks through chunk i.
encoded_parts = np.array_split(encoded[:train_size], batch_size)

datasets = []
for encoded_part in encoded_parts:
    datasets.append(
        tf.data.Dataset.from_tensor_slices(encoded_part)
        .window(window_length, shift=n_steps, drop_remainder=True)
        .flat_map(lambda window: window.batch(window_length))
    )

# Zip the per-chunk window streams and stack each tuple of windows into a
# single tensor (one row per chunk), then split into inputs/targets, one-hot
# encode the inputs and prefetch.
dataset = (
    tf.data.Dataset.zip(tuple(datasets))
    .map(lambda *windows: tf.stack(windows))
    .map(lambda windows: (windows[:, :-1], windows[:, 1:]))
    .map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
    .prefetch(1)
)

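Now let's train our model - we need to set `stateful=True` on our RNN layers. We also need to reset the states at the start of every epoch: each epoch goes back to the beginning of the text, so the hidden state left over from the end of the previous epoch no longer matches the input and would only bias the first batches.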

In [91]:
# Stateful variant of the character RNN. A stateful layer needs a fixed batch
# size, which is why batch_input_shape (with batch_size first) is given here
# instead of input_shape.
model = keras.models.Sequential([
    keras.layers.GRU(128, batch_input_shape=[batch_size, None, max_id], 
                     return_sequences=True, stateful=True, 
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, 
                     return_sequences=True, stateful=True, 
                     dropout=0.2, recurrent_dropout=0.2),
    # per-time-step softmax over the max_id character classes
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, 
                                                    activation="softmax"))
])

In [92]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (32, None, 128)           64896     
                                                                 
 gru_1 (GRU)                 (32, None, 128)           99072     
                                                                 
 time_distributed (TimeDistr  (32, None, 39)           5031      
 ibuted)                                                         
                                                                 
Total params: 168,999
Trainable params: 168,999
Non-trainable params: 0
_________________________________________________________________


In [93]:
# reset states between epochs
class ResetStatesCallback(keras.callbacks.Callback):
    """Keras callback that resets the model's states at the start of each epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        """Reset the states of the attached model; ``epoch`` and ``logs`` are unused."""
        # `logs=None` mirrors the base-class hook signature, so the hook also
        # works when it is invoked without a logs dict.
        self.model.reset_states()

model.compile(loss="sparse_categorical_crossentropy",
              optimizer="nadam",
              metrics=["accuracy"])

history = model.fit(dataset, epochs=50,
                    callbacks=[ResetStatesCallback()])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


After training this, let's now save the weights and copy to the stateless model (so we can predict on batches of any size, not just 32).

In [102]:
SAVE_DIR = os.path.join(".", "_models", "15_nlp")
os.makedirs(SAVE_DIR, exist_ok=True)
model.save_weights(os.path.join(SAVE_DIR, "shakespeare_rnn_weights.h5"))

In [103]:
# Stateless copy of the trained network: same layers and sizes as the stateful
# model, but with input_shape instead of a fixed batch_input_shape and without
# stateful=True, so the weights saved above can be loaded into it and it can
# be run on batches of any size.
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True,
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, 
                                                    activation="softmax"))
])

model.compile(loss="sparse_categorical_crossentropy",
              optimizer="nadam",
              metrics=["accuracy"])

In [104]:
model.load_weights(os.path.join(SAVE_DIR, "shakespeare_rnn_weights.h5"))

### Generate Text

To generate new shakespearean text:
- Preprocess input text
    - text to seq encode, subtract 1 from indices, to one hot
- Predict next char probas
- Randomly select a char using probas
    - Add a temperature var to control how much variability you want
- Convert selected char index to char
    - Add 1, seq to text

In [118]:
def preprocess(text):
    """One-hot encode a single string at character level.

    ``text`` is wrapped in a one-element list before tokenising, the resulting
    ids are shifted down by one so they start at 0, and the result is passed
    through ``tf.one_hot`` with depth ``max_id``.
    """
    sequences = tokenizer.texts_to_sequences([text])  # one id list per text
    zero_based_ids = np.array(sequences) - 1
    return tf.one_hot(zero_based_ids, max_id)

def next_char(model, text, temperature):
    """Sample the character that follows ``text`` according to ``model``.

    Args:
        model: Keras model mapping one-hot character sequences to per-step
            character probabilities.
        text: seed string, encoded with the module-level ``tokenizer``.
        temperature: strictly positive number that divides the
            log-probabilities before sampling. Values below 1 sharpen the
            distribution, values above 1 flatten it.

    Returns:
        The sampled character as a string.

    Raises:
        ValueError: if ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be > 0, got {!r}".format(temperature))
    # Encode inline instead of calling the global `preprocess`: a later cell
    # rebinds that name to the two-argument IMDB batch preprocessor, which
    # would break this function if it were called after that cell has run.
    char_ids = np.array(tokenizer.texts_to_sequences([text])) - 1
    X_new = tf.one_hot(char_ids, max_id)
    # first batch, last step's output (aka last char), all probas
    y_proba = model.predict(X_new, verbose=0)[0, -1:, :]
    rescaled_logits = tf.math.log(y_proba) / temperature
    # +1 undoes the id shift applied when encoding (tokenizer ids start at 1)
    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]

Now let's create a function that generates `n_chars` chars of shakespearean text.

In [119]:
def shakespearean_text(model, seed_text, n_chars=50, temperature=1):
    """Extend ``seed_text`` by ``n_chars`` characters, sampled one at a time.

    Each new character is obtained from ``next_char`` using everything
    generated so far as context. Returns the seed followed by the generated
    characters.
    """
    generated = seed_text
    for _step in range(n_chars):
        following = next_char(model, generated, temperature)
        generated = generated + following
    return generated

Let's try it out!

In [123]:
seed_text = "t"
print(shakespearean_text(model, seed_text, temperature=0.1))

ter:
the seem the contraction of the court of the c


In [124]:
seed_text = "t"
print(shakespearean_text(model, seed_text, temperature=1))

t, bear for the
good more one with your triar:
nor,


In [122]:
seed_text = "t"
print(shakespearean_text(model, seed_text, temperature=2))

t, ly yearss, margand?
ty-murr incewertess, kness y


Cool! Let's try using a longer seed.

In [126]:
seed_text = "Hamlet said thou are"
print(shakespearean_text(model, seed_text, n_chars=300, temperature=1))

Hamlet said thou are
since to serve go alive a timplint of such gentlemans,
the suition warwick. then!
him thou best one thy fortune one hisband.

all:
i will go should being give my cieizen's or thou
so cursed wife with a a.---

bygy:
aid you had be rather, our conession that?

sicinius:
ha, sheve i horse of a lunious


## IMDB Review Sentiment Analysis

### Get Data

In [136]:
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

In [130]:
info.splits

{Split('train'): <SplitInfo num_examples=25000, num_shards=1>,
 Split('test'): <SplitInfo num_examples=25000, num_shards=1>,
 Split('unsupervised'): <SplitInfo num_examples=50000, num_shards=1>}

In [134]:
train_size = info.splits["train"].num_examples
train_size

25000

Let's look at what the input data looks like.

In [142]:
datasets

{'train': <PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
 'test': <PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
 'unsupervised': <PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>}

In [143]:
dataset = datasets["train"]

In [150]:
for x, y in dataset.take(100).shuffle(100):
    pass
x

<tf.Tensor: shape=(), dtype=string, numpy=b'In this film we have the fabulous opportunity to see what happened to Timon and Pumbaa in the film when they are not shown - which is a lot! This film even goes back to before Simba and (presumbably) just after the birth of Kiara. <br /><br />Quite true to the first film, "Lion King 1/2 (or Lion King 3 in other places)" is a funny, entertaining, exciting and surprising film (or sequel if that\'s what you want to call it). A bundle of surprises and hilarity await for you!<br /><br />While Timon and Pumbaa are watching a film at the cinema (with a remote control), Timon and Pumbaa have an argument of what point of "The Lion King" they are going to start watching, as Timon wants to go to the part when he and Pumbaa come in and Pumbaa wants to go back to the beginning. They have a very fair compromise of watching the film of their own story, which is what awaits... It starts with Timon\'s first home...<br /><br />For anyone with a good sense of h

We need to preprocess the text.

In [152]:
def preprocess(X_batch, y_batch):
    """Turn a batch of raw review strings into a dense tensor of word tokens.

    Steps, in order: truncate each review to its first 300 units (bytes with
    ``tf.strings.substr``'s default unit), replace ``<br />`` tags with a
    space, replace every character other than letters and apostrophes with a
    space, split on whitespace, and pad the ragged result with ``b"<pad>"``.
    The labels in ``y_batch`` are returned untouched.
    """
    truncated = tf.strings.substr(X_batch, 0, 300)
    without_breaks = tf.strings.regex_replace(truncated, b"<br\\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    words = tf.strings.split(letters_only)  # rows have different lengths here
    return words.to_tensor(default_value=b"<pad>"), y_batch

Now let's encode the words to ids
- Create a vocabulary w/ Counter
- Create a lookup table

Create a vocabulary in descending order most common to least common words.

In [157]:
batch_size = 32
vocabulary = Counter()
# Count every token (the b"<pad>" filler included) over the preprocessed
# training reviews, one batch at a time.
for X_batch, y_batch in dataset.batch(batch_size).map(preprocess):
    for review in X_batch.numpy():
        vocabulary.update(review.tolist())

In [170]:
vocabulary.most_common(5)

[(b'<pad>', 214309),
 (b'the', 61137),
 (b'a', 38564),
 (b'of', 33983),
 (b'and', 33431)]

Looks about right.

In [163]:
len(vocabulary)

53893

In [171]:
vocabulary.most_common(10005)[-5:]

[(b'Legion', 7),
 (b'Republic', 7),
 (b'Cassie', 7),
 (b'hallucinations', 7),
 (b'Clinton', 7)]

Let's keep just the 10,000 most common words and use 1,000 OOV (out-of-vocabulary) buckets for every other word (we probably don't even need 1,000, since each review is truncated to its first 300 characters).

In [172]:
vocab_size = 10000
# most_common() yields (word, count) pairs, most frequent first; keep the words.
truncated_vocabulary = [entry[0] for entry in vocabulary.most_common(vocab_size)]

In [173]:
len(truncated_vocabulary)

10000

In [174]:
truncated_vocabulary[:5]

[b'<pad>', b'the', b'a', b'of', b'and']

Create a lookup table initializer.

In [176]:
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)

Create a lookup table.

In [177]:
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [185]:
review = tf.constant([b"This movie sucks real fucking ass man <pad>".split()])
table.lookup(review)

<tf.Tensor: shape=(1, 8), dtype=int64, numpy=array([[   22,    12,  1488,   175, 10716,  1831,   167,     0]])>

Now let's map the lookup table to each input sequence.

In [187]:
def encode_words(X_batch, y_batch):
    """Replace each word token in ``X_batch`` with its id from ``table``.

    The labels in ``y_batch`` are passed through unchanged.
    """
    word_ids = table.lookup(X_batch)
    return word_ids, y_batch

### Create Datasets

Now let's wrap everything up together
- Shuffle data
- Batch the data
- Preprocess text
- Encode words
- Prefetch

In [228]:
train, valid, test = tfds.load("imdb_reviews", as_supervised=True, split=["train", "test[:50%]", "test[50%:]"])

In [229]:
batch_size = 32

# Only the training split is shuffled; all three splits then go through the
# same batch -> preprocess -> encode_words -> prefetch pipeline.
train_set = (
    train.shuffle(10000)
    .batch(batch_size)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)
valid_set = (
    valid.batch(batch_size)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)
test_set = (
    test.batch(batch_size)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

### Model

We need to embed our word sequences so the model can learn good patterns quickly.
- Input (batch_size, seq_length)
- Embed (batch_size, seq_length, embed_dimen)
- Conv1D (kernel size 2, stride 2) followed by BatchNormalization
- GRU
- GRU
- Dense binary output, final step only (batch_size, 1)

In [216]:
embed_dimen = 128
# Sentiment classifier: embedding -> strided 1D convolution -> batch norm ->
# two GRU layers -> a single sigmoid unit.
model = keras.models.Sequential([
    # One embedding row per vocabulary word plus one per OOV bucket.
    keras.layers.Embedding(vocab_size + num_oov_buckets, 
                          embed_dimen, input_shape=[None]),
    # kernel_size=2 with strides=2 roughly halves the sequence length.
    keras.layers.Conv1D(filters=embed_dimen, kernel_size=2, strides=2),
    keras.layers.BatchNormalization(),
    keras.layers.GRU(128, return_sequences=True, 
                     dropout=0.2, recurrent_dropout=0.2),
    # No return_sequences here: only the last step's output reaches the Dense layer.
    keras.layers.GRU(128, 
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.Dense(1, activation="sigmoid")
])

In [217]:
model.compile(loss="binary_crossentropy",
              optimizer="nadam",
              metrics=["accuracy"])

In [218]:
model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe7f926bf40>

In [219]:
model.evaluate(test_set)



[1.018828272819519, 0.7404000163078308]

Seems like it overfit. Let's use masks to get rid of padding from the learning process.

### Model Masking

Using a mask to omit input values that equal 0, (id 0 is `<pad>`)

In [230]:
# Functional-API model that passes an explicit padding mask to both GRU layers.
inputs = keras.layers.Input(shape=[None])
# The mask is computed from the word ids themselves: id 0 is the b"<pad>" token.
mask = keras.layers.Lambda(lambda inputs: K.not_equal(inputs, 0))(inputs) # returns True for all non zero inputs
z = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_dimen)(inputs)
z = keras.layers.GRU(128, return_sequences=True)(z, mask=mask)
# The second GRU returns only its final output, which feeds the classifier.
z = keras.layers.GRU(128)(z, mask=mask)
outputs = keras.layers.Dense(1, activation="sigmoid")(z)
model = keras.Model(inputs=[inputs], outputs=[outputs])

In [231]:
model.compile(loss="binary_crossentropy",
              optimizer="nadam",
              metrics=["accuracy"])

Let's add tensorboard callback for visualization.

In [232]:
TENSORBOARD_DIR = os.path.join(".", "_tf_logs", "15_nlp")
os.makedirs(TENSORBOARD_DIR, exist_ok=True)

In [233]:
run_index = 1
run_logdir = os.path.join(TENSORBOARD_DIR, "imdb_masked_model_run_{:05d}".format(run_index))
tensorboard_cb = keras.callbacks.TensorBoard(log_dir=run_logdir, embeddings_freq=10)
print(run_logdir)

./_tf_logs/15_nlp/imdb_masked_model_run_00001


In [234]:
early_stopping_cb = keras.callbacks.EarlyStopping(patience=5)
lr_scheduler_cb = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=1)

In [235]:
model.fit(train_set, epochs=10,
          validation_data=valid_set,
          callbacks=[tensorboard_cb, early_stopping_cb, lr_scheduler_cb])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<keras.callbacks.History at 0x7fe7f8db1450>

Using tensorboard, we can visualize the closest embeddings to the two words below. Let's see what words their neighbors are!

In [244]:
sample_words = tf.constant(["amazing", "awful"])
table.lookup(sample_words)

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([438, 260])>

In [250]:
# 100 closest neighbors including amazing itself
amazing_neighbor_ids = [438,674,332,327,269,284,325,1498,813,93,687,107,1573,2719,1291,577,902,870,1921,751,68,783,10126,2304,2197,2554,3101,1238,1015,3369,2474,7271,5496,2811,592,2440,2756,3093,1744,142,3530,2830,849,5639,434,6034,3925,3665,2463,2359,335,3454,1172,6188,4264,376,3762,1982,8960,8737,1196,3977,1597,276,1519,10035,2489,2169,1124,3051,4542,2772,2672,1166,579,1499,5808,1030,3350,2821,6080,2075,1040,4482,1542,3559,2931,3173,5640,4556,1618,7308,2137,3792,6371,8059,8326,9408,695,5392,3180]

# 100 closest neighbors including awful itself
awful_neighbor_ids = [260,143,1044,302,396,981,731,313,420,450,1069,1256,388,1572,788,2503,972,1669,1724,537,2893,631,966,623,1329,1619,908,3175,2523,2912,2786,460,2149,2261,1488,1129,749,391,988,1927,221,514,498,1339,3829,2841,1882,853,633,1200,1361,1052,10838,4227,1657,4736,8873,3082,2364,1037,2530,1701,1184,5948,5242,1022,1814,2232,2500,3886,2386,2621,3116,624,3512,880,1702,805,4708,1464,5311,728,1666,2736,3599,961,4834,6331,2130,5558,1065,2920,493,4003,5134,2432,356,3305,3730,5277,8640]


Now let's see what words they are!

In [262]:
# Ids at or beyond len(truncated_vocabulary) are OOV-bucket ids with no word
# attached, so they are skipped. The bound is taken from the list itself
# rather than a hard-coded 10000 so the filter stays correct if the
# vocabulary size changes.
amazing_neighbors = [str(truncated_vocabulary[id_]) for id_ in amazing_neighbor_ids
                     if id_ < len(truncated_vocabulary)]
awful_neighbors = [str(truncated_vocabulary[id_]) for id_ in awful_neighbor_ids
                   if id_ < len(truncated_vocabulary)]

In [263]:
# Print the two neighbour lists side by side; zip stops at the shorter list.
for amazing_neighbor, awful_neighbor in zip(amazing_neighbors, awful_neighbors):
    print(f"{amazing_neighbor:20} {awful_neighbor}")

b'amazing'           b'awful'
b'fantastic'         b'worst'
b'favorite'          b'pathetic'
b'wonderful'         b'terrible'
b'excellent'         b'worse'
b'loved'             b'fails'
b'enjoyed'           b'poorly'
b'underrated'        b'boring'
b'unique'            b'horrible'
b'best'              b'waste'
b'enjoyable'         b'dumb'
b'love'              b'laughable'
b'wonderfully'       b'stupid'
b'extraordinary'     b'unfunny'
b'touching'          b'weak'
b'simple'            b'tedious'
b'awesome'           b'disappointment'
b'Great'             b'bottom'
b'feelings'          b'promising'
b'masterpiece'       b'crap'
b'great'             b'WORST'
b'superb'            b'Unfortunately'
b'refreshing'        b'mess'
b'Excellent'         b'ridiculous'
b'captured'          b'redeeming'
b'Wonderful'         b'lacks'
b'plenty'            b'badly'
b'funniest'          b'stinker'
b'stellar'           b'Worst'
b'friendship'        b'turkey'
b'limitations'       b'idiotic'
b'advanced'       

Wow, embeddings really do work!!!

### Pretrained Embeddings

Let's use Google's pretrained model for english sentence embedding.

This model does not require preprocessing, so let's create new datasets.

In [316]:
# Raw-text splits: train on the full train split plus the first half of test,
# validate on the next quarter of test, and test on the last quarter.
train, valid, test = tfds.load(
    "imdb_reviews",
    split=["train+test[:50%]", "test[50%:75%]", "test[75%:]"],
    as_supervised=True,
)

batch_size = 32
# No text preprocessing here: the datasets are only shuffled (train), batched
# and prefetched.
train_set = train.shuffle(10000).batch(batch_size).prefetch(1)
valid_set, test_set = (
    split.batch(batch_size).prefetch(1) for split in (valid, test)
)

Now let's create our model using the tfhub model.

In [317]:
os.environ["TFHUB_CACHE_DIR"] = os.path.join(".", "my_tfhub_cache")

In [318]:
# Classifier on top of a TF-Hub sentence-embedding module.
model = keras.models.Sequential([
    # Takes one raw string per example (input_shape=[], dtype=tf.string) and
    # emits a 50-dimensional vector (output_shape=[50]).
    # NOTE(review): presumably not trainable at this point; a later cell
    # switches it to trainable.
    hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1", output_shape=[50],
                   input_shape=[], dtype=tf.string),
    keras.layers.Dense(150, activation="relu"),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.25),
    # single sigmoid unit: binary sentiment probability
    keras.layers.Dense(1, activation="sigmoid"),
])

In [320]:
model.compile(loss="binary_crossentropy",
              optimizer="nadam",
              metrics=["accuracy"])

In [321]:
model.fit(train_set, epochs=5,
          validation_data=valid_set)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe7dc6d7940>

In [323]:
model.layers[0].trainable = True
# A change to `trainable` only takes effect once the model is compiled again,
# so recompile (same loss/optimizer/metrics as before) ahead of the next fit().
model.compile(loss="binary_crossentropy",
              optimizer="nadam",
              metrics=["accuracy"])

In [324]:
run_index = 6
run_logdir = os.path.join(TENSORBOARD_DIR, "imdb_tfhub_model_run_{:05d}".format(run_index))
tensorboard_cb = keras.callbacks.TensorBoard(log_dir=run_logdir, embeddings_freq=2)
print(run_logdir)

./_tf_logs/15_nlp/imdb_tfhub_model_run_00006


In [325]:
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10)
lr_scheduler_cb = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=2)

In [326]:
model.fit(train_set, epochs=100,
          validation_data=valid_set,
          callbacks=[tensorboard_cb, early_stopping_cb, lr_scheduler_cb])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100


<keras.callbacks.History at 0x7fe8054888b0>