In [1]:
import os.path as path
import numpy as np
from imdb.imdb_reviews import ImdbReviews

In [2]:
import logging
# Configure the root logger to emit records at DEBUG level and above, so log
# output from the libraries used below is shown in the notebook.
logging.basicConfig(level=logging.DEBUG)

# Load Data

In [3]:
# Directory holding the IMDB reviews data, under the current user's home
# directory; it is passed to every ImdbReviews(...) call below.
dataroot = path.expanduser("~/mldata/imdb-reviews")

In [4]:
# Training dataset built with the class defaults (no explicit split, sequence
# length or encoder). The val/test datasets created later reuse this object's
# `max_seq_len` and `encoder`.
imdb_train = ImdbReviews(dataroot=dataroot)

100%|██████████| 40000/40000 [00:28<00:00, 1381.86it/s]


In [5]:
# Length of the dataset object. Indexing it yields an (x_batch, y_batch) pair,
# so this is presumably the number of batches, not the number of reviews.
len(imdb_train)

1250

In [6]:
# Peek at the private tensor of encoded reviews to check its shape.
# NOTE(review): relies on a private attribute of ImdbReviews.
imdb_train._encoded_texts.shape

TensorShape([40000, 250])

In [7]:
# Peek at the private tensor of targets to check its shape.
# NOTE(review): relies on a private attribute of ImdbReviews.
imdb_train._targets.shape

TensorShape([40000, 1])

In [8]:
# Fetch the item at index 0 of the dataset: a pair of (inputs, targets).
x_batch, y_batch = imdb_train[0]

In [10]:
# Take the first example of the batch: its encoded token ids and its target.
x, y = x_batch[0], y_batch[0]

In [11]:
# Display the encoded token ids of the first example.
x

<tf.Tensor: shape=(250,), dtype=int32, numpy=
array([70250, 33996, 80935, 61359, 79231, 17293, 27466, 46996, 34619,
       22602,  6740, 24905, 45344, 86548, 23421, 19207, 20005, 41857,
       19019, 45344, 45677, 70250, 36330, 57129, 70250, 45624, 54547,
       83303, 32491, 10786, 46996, 20300, 84837, 63962, 84837, 54547,
        1893,  7907, 35602,  9638, 46996, 33996, 70975, 78291, 84837,
       63962,  6740, 50229, 84837, 14272, 91648, 81956, 33996,  1515,
       43191, 33996, 15574, 80984, 46996, 78803, 79747, 32253, 70250,
       33996, 33819, 45624, 33996, 24574,  9137, 33996, 34077, 59044,
       45703, 67968,  9137, 76738, 13468, 88421, 13468, 33996, 44164,
       27444, 26351, 81956, 33996, 41286, 82273, 43191, 54547, 86305,
       79231, 45344, 69074,  8744, 10446, 63900, 81956, 45703, 54547,
       65679, 33996, 19140, 70250, 79231, 61359, 65677, 54345, 12529,
       15197,  8621,  9137, 19855, 36330, 54547, 83303, 27697, 45554,
       86548, 51817, 20300, 84837, 46114, 84

In [12]:
# Display the target of the first example.
y

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>

In [13]:
# Decode the example back to text as a sanity check. `x[x>0]` keeps only the
# positive ids -- presumably id 0 is padding; TODO confirm against the encoder.
imdb_train.encoder.decode(x[x>0])

'of the puzzle and sometimes saying aha is what makes or breaks a show like this one surface had a couple of flaws first of all it s basic premise is not as exciting as it could have been nor is the revealed story as exciting or daring as i hoped in the beginning also the tv feeling is very present much of the time all the way from the crappy cgi that ranges from decent to awful to the rather shifting quality in the acting department also it feels sometimes a bit too family oriented in that it takes the edge of sometimes and becomes almost cutesy but aside from these flaws it s an enjoyable show maybe not as spectacular as some of the other sci fi shows out there but it manages to keep me interested the whole season and it offers a couple of nice cliffhangers between shows as well the ending for me is not that appealing i don t like shows that end without ending so to speak leaving the story unresolved it s especially unfortunate in this case since the show seems to be canceled after t

In [14]:
# Convert the scalar target value into the dataset's label for it.
imdb_train.label(y.numpy()[0])

'pos'

# Build Embeddings

In [15]:
import pickle

In [16]:
# NOTE: despite its name, `glove_embeddings` is the *path* of the pickle file;
# the loaded object is `embeddings_ndx`, which is indexed by token string below.
glove_embeddings = path.expanduser("~/mldata/glove/glove_6b_100d.pkl")
# SECURITY: unpickling can execute arbitrary code. Only load this file if it
# was produced locally or comes from a trusted source.
with open(glove_embeddings, "rb") as f:
    embeddings_ndx = pickle.load(f)

In [17]:
# Number of entries in the loaded GloVe index.
len(embeddings_ndx)

400000

In [18]:
# Inspect one GloVe entry (the token "in"): print its shape, then display the
# vector itself as the cell output.
in_vector = embeddings_ndx["in"]
print(in_vector.shape)
in_vector

(100,)


array([ 0.086, -0.222,  0.166,  0.134,  0.382,  0.354,  0.013,  0.225,
       -0.438,  0.502, -0.359, -0.35 ,  0.055,  0.696, -0.18 ,  0.068,
        0.391,  0.16 , -0.266, -0.211,  0.537,  0.494,  0.937,  0.669,
        0.218, -0.466,  0.224, -0.362, -0.177,  0.175, -0.204,  0.139,
        0.02 , -0.104, -0.202,  0.55 , -0.155,  0.987, -0.269, -0.291,
       -0.329, -0.342, -0.169, -0.42 , -0.047, -0.163,  0.708, -0.749,
       -0.092, -0.962, -0.197,  0.103,  0.552,  1.382, -0.656, -3.25 ,
       -0.316, -1.206,  1.771,  0.403, -0.798,  1.16 , -0.33 ,  0.314,
        0.774,  0.226,  0.525, -0.034,  0.32 ,  0.08 ,  0.178, -0.494,
       -0.7  , -0.446,  0.172,  0.203,  0.023, -0.207, -1.016,  0.183,
        0.568,  0.318, -0.65 ,  0.683, -0.866, -0.059, -0.293, -0.557,
       -0.347, -0.329,  0.402, -0.127, -0.202,  0.874, -0.545,  0.792,
       -0.207, -0.074,  0.758, -0.342], dtype=float32)

In [19]:
# Number of tokens known to the training encoder; used below to size the
# embedding matrix and the Embedding layers (as vocab_size + 1).
vocab_size = len(imdb_train.encoder.tokens)
vocab_size

94923

In [21]:
# Embedding matrix with one 100-dimensional row per token id, plus one extra
# row: the fill loop below writes token i to row i + 1, so row 0 stays zero.
# Allocate as float32: the GloVe vectors copied in are float32, so NumPy's
# default float64 would double the memory of this matrix with no extra precision.
embeddings = np.zeros((vocab_size + 1, 100), dtype=np.float32)

In [22]:
# Copy the GloVe vector of each vocabulary token into the matrix. Rows are
# numbered from 1, so row 0 is never written here. Tokens absent from the
# GloVe index are skipped and keep their initial all-zero row.
for idx, token in enumerate(imdb_train.encoder.tokens, start=1):
    if token not in embeddings_ndx:
        continue
    embeddings[idx] = embeddings_ndx[token]

In [23]:
# Row 0 is never assigned by the fill loop, so it should still be all zeros.
embeddings[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [24]:
# Spot check: look up the token stored at position 2 of the encoder vocabulary.
imdb_train.encoder.tokens[2]

'anda'

In [25]:
# Spot check: with the i + 1 offset used by the fill loop, row 3 holds the
# vector of encoder.tokens[2].
embeddings[3]

array([-0.231, -0.524, -0.034,  0.287,  0.296, -0.305, -0.271,  0.444,
        0.744,  0.346, -0.236,  1.017, -1.25 ,  0.01 ,  0.251,  0.217,
        0.161, -0.02 , -0.225, -0.028,  0.169,  0.334, -0.328,  0.78 ,
       -0.545, -0.029, -0.171, -0.019, -0.245, -0.617, -0.513, -0.106,
       -0.37 ,  0.085,  0.506,  0.142,  0.835,  0.265, -0.153,  0.183,
        0.55 , -0.061, -0.724,  0.562, -0.7  , -0.277, -0.349,  0.068,
       -0.697,  0.403, -0.107, -0.75 ,  0.272, -0.809, -0.589,  0.292,
        0.4  ,  0.439, -0.836,  0.233,  0.578, -0.545, -0.121,  0.211,
        0.237, -0.211, -0.436,  0.407, -0.399,  0.366, -0.09 , -0.238,
        0.118, -0.899,  0.25 ,  0.258,  0.638, -0.08 , -0.28 , -0.095,
        0.13 , -1.236, -0.235, -0.264,  0.311,  0.278, -0.258,  0.334,
       -0.406,  0.124,  0.591, -0.419,  0.026,  1.196,  0.066,  0.214,
       -0.088, -0.188, -0.438, -0.222])

In [26]:
# GloVe vector for the token "extract", displayed for manual comparison.
# NOTE(review): the row inspected just above (embeddings[3]) belongs to
# encoder.tokens[2], so a like-for-like check would look up that token in
# embeddings_ndx instead -- confirm which comparison was intended.
embeddings_ndx["extract"]

array([-0.891,  0.892, -0.563,  0.453,  0.027, -0.169,  0.35 , -0.007,
        0.37 ,  0.051,  0.235,  0.682, -0.403,  0.503, -0.261,  0.463,
        0.784, -0.232,  0.004,  0.515,  0.052, -0.472, -0.258,  0.123,
       -0.576,  0.879,  0.096,  0.028,  0.331,  0.432,  0.144,  0.222,
       -0.997, -0.552, -0.166,  0.032, -0.139,  0.147, -0.044, -0.24 ,
       -0.325, -0.978, -0.61 , -0.603, -0.199,  0.203, -0.124, -0.093,
       -0.155, -0.007, -0.304,  0.422,  0.379,  1.169, -0.002, -0.349,
       -0.376, -0.911,  0.408, -0.494,  0.368,  0.148, -0.413, -0.401,
        0.598, -0.882,  0.758, -0.238,  0.011, -1.046, -0.161,  0.325,
        0.178,  0.07 ,  0.498,  0.489, -0.041, -0.921, -0.841,  0.381,
        0.488,  0.105, -1.121,  0.546, -0.652, -0.009,  1.551,  0.712,
        0.121,  0.693, -0.334, -0.053, -0.821, -0.266,  0.12 , -0.946,
       -0.037, -0.469,  0.052,  0.095], dtype=float32)

# Build Model

In [27]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from haikunator import Haikunator

In [None]:
# Baseline classifier: embed each token id into 100 dimensions, flatten the
# embedded sequence, then a 32-unit ReLU layer and a single sigmoid output.
model = Sequential([
    Embedding(vocab_size+1, 100, input_length=imdb_train.max_seq_len),
    Flatten(),
    Dense(32, activation="relu"),
    Dense(1, activation="sigmoid"),
])
model.summary()

In [None]:
# Load the prepared embedding matrix into the first layer (the Embedding layer)
# and freeze it so training does not update those weights. The summary is
# printed again to show the resulting trainable / non-trainable split.
embedding_layer = model.layers[0]
embedding_layer.set_weights([embeddings])
embedding_layer.trainable = False
model.summary()

In [30]:
# Validation split, built with the training set's sequence length and encoder
# so that both datasets share the same encoding.
imdb_val = ImdbReviews(
    dataroot=dataroot,
    split="val",
    max_seq_len=imdb_train.max_seq_len,
    encoder=imdb_train.encoder,
)

100%|██████████| 5000/5000 [00:03<00:00, 1331.75it/s]


In [None]:
# Generate an id for this run and log TensorBoard data into a directory named
# after it.
run_id = Haikunator().haikunate()
print(run_id)
tblog = path.expanduser(path.join("~/mldata/tblogs/imdb-reviews/", run_id))
tb = tf.keras.callbacks.TensorBoard(
    log_dir=tblog,
    histogram_freq=0,
    update_freq="epoch",
)

# Binary classification: sigmoid output trained with binary cross-entropy.
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["acc"],
)
history = model.fit(
    imdb_train,
    validation_data=imdb_val,
    epochs=10,
    callbacks=[tb],
    verbose=2,
)

In [None]:
# Test split, built with the training set's sequence length and encoder so
# that it shares the encoding the model was trained on.
imdb_test = ImdbReviews(
    dataroot=dataroot,
    split="test",
    max_seq_len=imdb_train.max_seq_len,
    encoder=imdb_train.encoder,
)

In [None]:
# Evaluate the trained model on the test split; reports the compiled loss and
# the "acc" metric.
model.evaluate(imdb_test)

# Build LSTM Model

In [28]:
from tensorflow.keras.layers import LSTM

In [35]:
# Recurrent classifier: embed each token id into 100 dimensions, run the
# sequence through a 32-unit LSTM, then a single sigmoid output.
# NOTE: this rebinds `model`, replacing the dense model built earlier.
model = Sequential([
    Embedding(vocab_size+1, 100, input_length=imdb_train.max_seq_len),
    LSTM(32),
    Dense(1, activation="sigmoid"),
])
model.summary()


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 250, 100)          9492400   
_________________________________________________________________
lstm_3 (LSTM)                (None, 32)                17024     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 9,509,457
Trainable params: 9,509,457
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Load the prepared embedding matrix into the first layer (the Embedding layer)
# and freeze it so training does not update those weights. The summary is
# printed again to show the resulting trainable / non-trainable split.
embedding_layer = model.layers[0]
embedding_layer.set_weights([embeddings])
embedding_layer.trainable = False
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 250, 100)          9492400   
_________________________________________________________________
lstm_3 (LSTM)                (None, 32)                17024     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 9,509,457
Trainable params: 17,057
Non-trainable params: 9,492,400
_________________________________________________________________


In [37]:
# Generate an id for this run and log TensorBoard data into a directory named
# after it.
run_id = Haikunator().haikunate()
print(run_id)
tblog = path.expanduser(path.join("~/mldata/tblogs/imdb-reviews/", run_id))
tb = tf.keras.callbacks.TensorBoard(
    log_dir=tblog,
    histogram_freq=0,
    update_freq="epoch",
)

# Binary classification: sigmoid output trained with binary cross-entropy.
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["acc"],
)
history = model.fit(
    imdb_train,
    validation_data=imdb_val,
    epochs=10,
    callbacks=[tb],
    verbose=2,
)

falling-bread-5890
Train for 1250 steps, validate for 157 steps
Epoch 1/10
1250/1250 - 124s - loss: 0.5003 - acc: 0.7566 - val_loss: 0.4073 - val_acc: 0.8124
Epoch 2/10
1250/1250 - 112s - loss: 0.3718 - acc: 0.8362 - val_loss: 0.3232 - val_acc: 0.8616
Epoch 3/10
1250/1250 - 116s - loss: 0.3159 - acc: 0.8650 - val_loss: 0.3175 - val_acc: 0.8578
Epoch 4/10
1250/1250 - 118s - loss: 0.2878 - acc: 0.8792 - val_loss: 0.2873 - val_acc: 0.8794
Epoch 5/10
1250/1250 - 111s - loss: 0.2670 - acc: 0.8896 - val_loss: 0.2824 - val_acc: 0.8808
Epoch 6/10
1250/1250 - 111s - loss: 0.2509 - acc: 0.8978 - val_loss: 0.2669 - val_acc: 0.8898
Epoch 7/10
1250/1250 - 112s - loss: 0.2364 - acc: 0.9027 - val_loss: 0.2867 - val_acc: 0.8784
Epoch 8/10
1250/1250 - 113s - loss: 0.2237 - acc: 0.9099 - val_loss: 0.2620 - val_acc: 0.8966
Epoch 9/10
1250/1250 - 111s - loss: 0.2127 - acc: 0.9153 - val_loss: 0.2856 - val_acc: 0.8832
Epoch 10/10
1250/1250 - 111s - loss: 0.2024 - acc: 0.9190 - val_loss: 0.2758 - val_acc: 0.