In [6]:
import os

# Values accepted by TensorFlow's TF_CPP_MIN_LOG_LEVEL environment variable,
# which controls how much the native (C++) runtime logs.
# NOTE(review): the variable is a minimum-severity filter, so "3" is the most
# restrictive setting and is believed to hide ERROR messages as well -- confirm
# that the names below match the intended meaning.
DEBUG, INFO, WARNING, ERROR = "0", "1", "2", "3"

# Set ahead of the `import tensorflow` in the next cell; presumably the native
# runtime only reads the variable when it is first loaded.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = ERROR

In [7]:
import tensorflow as tf
# Raise the threshold of TensorFlow's Python-side logger to ERROR so that
# lower-severity messages do not clutter the notebook output.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [8]:
import tensorflow_datasets as tfds
from tensorflow import keras
import tensorflow_hub as hub
import numpy as np
import os.path as path

In [9]:
# Shorthand for tf.data's autotune sentinel; it is passed as the second
# argument of Dataset.map in the pipeline cells further down.
auto = tf.data.experimental.AUTOTUNE

# Explore the data

In [10]:
# Directory (under the user's home) that TFDS is pointed at via data_dir.
dataroot = path.expanduser("~/mldata/tf")
# No split is requested, so `imdb` is indexed by split name below
# (e.g. imdb["train"]); with_info=True additionally returns the metadata object.
imdb, imdb_info = tfds.load("imdb_reviews", data_dir=dataroot, with_info=True)
# Display the dataset metadata (features, split sizes, citation).
imdb_info



tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=0.1.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    urls=['http://ai.stanford.edu/~amaas/data/sentiment/'],
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word V

In [11]:
# Names of the label classes as recorded in the dataset metadata.
# Presumably the list position corresponds to the integer label value
# (0 -> 'neg', 1 -> 'pos') -- verify against a few examples.
classes = imdb_info.features["label"].names
classes

['neg', 'pos']

In [12]:
# Materialise the first three training examples so they can be inspected
# one by one in the following cells.
elems = list(imdb["train"].take(3))

Based on the info above, I expect each elem to be a dict with two keys - `text` and `label`.

In [13]:
# Look at the first sampled example and list the feature names it carries.
elem = elems[0]
elem.keys()

dict_keys(['label', 'text'])

In [14]:
elem["text"]

<tf.Tensor: id=153, shape=(), dtype=string, numpy=b'There is no way to avoid a comparison between The Cat in the Hat and The Grinch Who Stole Christmas, so let\'s get that part out of the way. First of all, let me start by saying that I think Grinch was an underrated and unappreciated film. Cat was... well, just awful.<br /><br />Jim Carey was cast because he is a brilliant physical comedian, and fearlessly commits to over the top, outrageous characters. Mike Myers fell back on his old bag of tricks.<br /><br />Why, why, why Mike Myers?? The kids could care less, and the Austin Powers demographic isn\'t going to spy this film. So, what was the studio thinking?<br /><br />The Cat was also apparently related to Linda Richmond. Can we talk? Why a New York Accent? Not entirely consistent with anything Dr. Seuss has ever written. Myers was even allowed to sneak in his Scottish shtick. I wonder how many different voices the director and the studio tried to edit out of before they just gave i

In [15]:
elem["label"]

<tf.Tensor: id=152, shape=(), dtype=int64, numpy=0>

Now let's find out whether the texts all have the same length or are of variable length. `numpy()` will probably return a byte string that I'll have to decode.

In [16]:
# .numpy() extracts the value of the scalar string tensor; .decode() (default
# codec) converts it into a regular Python str.
text = elem["text"].numpy().decode()
type(text)

str

In [17]:
text

'There is no way to avoid a comparison between The Cat in the Hat and The Grinch Who Stole Christmas, so let\'s get that part out of the way. First of all, let me start by saying that I think Grinch was an underrated and unappreciated film. Cat was... well, just awful.<br /><br />Jim Carey was cast because he is a brilliant physical comedian, and fearlessly commits to over the top, outrageous characters. Mike Myers fell back on his old bag of tricks.<br /><br />Why, why, why Mike Myers?? The kids could care less, and the Austin Powers demographic isn\'t going to spy this film. So, what was the studio thinking?<br /><br />The Cat was also apparently related to Linda Richmond. Can we talk? Why a New York Accent? Not entirely consistent with anything Dr. Seuss has ever written. Myers was even allowed to sneak in his Scottish shtick. I wonder how many different voices the director and the studio tried to edit out of before they just gave in and said "as long as you don\'t say fahklempt\', 

In [18]:
len(text)

1929

In [19]:
# Repeat the decoding for the second example to compare review lengths.
# Note that this rebinds `text`, replacing the first review.
text = elems[1]["text"].numpy().decode()
print(len(text))
text

1049


'Just because someone is under the age of 10 does not mean they are stupid. If your child likes this film you\'d better have him/her tested. I am continually amazed at how so many people can be involved in something that turns out so bad. This "film" is a showcase for digital wizardry AND NOTHING ELSE. The writing is horrid. I can\'t remember when I\'ve heard such bad dialogue. The songs are beyond wretched. The acting is sub-par but then the actors were not given much. Who decided to employ Joey Fatone? He cannot sing and he is ugly as sin.<br /><br />The worst thing is the obviousness of it all. It is as if the writers went out of their way to make it all as stupid as possible. Great children\'s movies are wicked, smart and full of wit - films like Shrek and Toy Story in recent years, Willie Wonka and The Witches to mention two of the past. But in the continual dumbing-down of American more are flocking to dreck like Finding Nemo (yes, that\'s right), the recent Charlie & The Chocola

# Split into train, val, and test sets

In [20]:
# Train set will have 25,000 instances
# The full "train" split is used for training (25,000 examples according to
# the dataset info printed earlier).
split = tfds.Split.TRAIN
trainset = tfds.load("imdb_reviews", data_dir=dataroot, split=split)



In [21]:
# Val set will have 5,000 instances
# Validation data: the first 20% of the "test" split (20% of 25,000 = 5,000).
# NOTE(review): subsplit()/tfds.percent belong to the legacy TFDS split API;
# newer releases are believed to use string specs such as "test[:20%]" --
# confirm against the installed tensorflow_datasets version before upgrading.
first_20pct = tfds.Split.TEST.subsplit(tfds.percent[:20])
valset = tfds.load("imdb_reviews", data_dir=dataroot, split=first_20pct)

In [22]:
# Test set will have 20,000 instances
# Held-out test data: the last 80% of the "test" split (80% of 25,000 =
# 20,000), i.e. the part not taken by the validation slice.
# NOTE(review): subsplit()/tfds.percent belong to the legacy TFDS split API;
# newer releases are believed to use string specs such as "test[-80%:]" --
# confirm against the installed tensorflow_datasets version before upgrading.
last_80pct = tfds.Split.TEST.subsplit(tfds.percent[-80:])
testset = tfds.load("imdb_reviews", data_dir=dataroot, split=last_80pct)

# Set up initial pipeline 

In [23]:
def to_tpl(elem):
    """Turn one feature mapping into a ``(text, label)`` pair.

    Args:
        elem: Mapping that provides at least the keys ``"text"`` and
            ``"label"``.

    Returns:
        A 2-tuple ``(elem["text"], elem["label"])``, the (input, target)
        layout used by the dataset pipelines in this notebook.
    """
    text, label = elem["text"], elem["label"]
    return text, label

# Training input: (text, label) pairs, shuffled through a 512-element buffer
# and grouped into full batches of 32 (a short final batch is discarded).
# No .repeat() is applied, so iterating train_ds once is exactly one pass
# over the training data.
train_ds = (
    trainset
    .map(to_tpl, auto)
    .shuffle(512)
    .batch(32, drop_remainder=True)
)

In [24]:
# Validation input: same (text, label) mapping as for training, served in
# batches of 1000 and without shuffling.
val_ds = valset.map(to_tpl, auto).batch(1000)

# Build the model
I'll use the gnews-swivel-20dim embedding as my first layer. It takes a text of variable length and outputs a vector in $\mathbb R^{20}$.

In [25]:
# Pre-trained text embedding from TF Hub, wrapped as a Keras layer.
# input_shape=[] with dtype=tf.string declares that each example is a single
# scalar string; trainable=True lets the embedding weights be fine-tuned.
# NOTE(review): the recorded output of this cell is a TypeError ("Variable is
# unhashable if Tensor equality is enabled"), and all later cells are
# unexecuted. This looks like a tensorflow / tensorflow_hub version mismatch
# rather than a problem with this call -- confirm by upgrading tensorflow_hub.
hub_url = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
embedding = hub.KerasLayer(hub_url, input_shape=[], dtype=tf.string, trainable=True)

TypeError: Variable is unhashable if Tensor equality is enabled. Instead, use tensor.experimental_ref() as the key.

In [None]:
# Grab the texts of a single training batch to probe the layers with.
# one_batch stays None if the dataset yields nothing; the labels of the batch
# are not kept.
one_batch = None
for texts, labels in train_ds.take(1):
    one_batch = texts

In [None]:
type(one_batch)

In [None]:
one_batch.shape

In [None]:
# Run the embedding layer on its own to check what it produces.
# Presumably the result has shape (32, 20): 32 texts per batch, 20 dimensions
# per embedding -- confirm with the shape cell below.
out = embedding(one_batch)

In [None]:
out.shape

In [None]:
# Classifier: text embedding -> 16-unit ReLU hidden layer -> one sigmoid unit
# whose output is read as the probability of the positive class.
model = keras.Sequential()
model.add(embedding)
model.add(keras.layers.Dense(16, activation="relu"))
model.add(keras.layers.Dense(1, activation="sigmoid"))
model.summary()

In [None]:
# Smoke-test the untrained model on the sample batch; this rebinds `out`,
# replacing the embedding output computed earlier.
out = model(one_batch)

In [None]:
out

# Train the model

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit of the model;
# accuracy is tracked as an additional reported metric.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Number of full batches in one pass over the 25,000 training reviews at a
# batch size of 32 (the training pipeline drops the short final batch).
num_batches = 25000 // 32

# With steps_per_epoch set, Keras counts batches itself instead of waiting for
# the dataset to end, so the input must be able to supply
# epochs * steps_per_epoch batches. train_ds is a single finite pass, hence it
# is repeated here; num_batches then marks the epoch boundary, and the value
# is taken from the computation above instead of a hard-coded 781.
history = model.fit(
    train_ds.repeat(),
    epochs=5,
    steps_per_epoch=num_batches,
    validation_data=val_ds,
    verbose=1,
)

In [None]:
# Variant of the training call without steps_per_epoch: each epoch is one full
# pass over train_ds.
# NOTE(review): if the previous fit cell has also been run, this call continues
# from the already-trained weights (10 epochs in total) and overwrites
# `history` -- confirm whether both calls are meant to stay in the notebook.
history = model.fit(train_ds, epochs=5, validation_data=val_ds)

# Evaluate the model

In [None]:
# Prepare the held-out split the same way as the validation data (tuple
# mapping, batches of 1000) and score the trained model on it. `results`
# holds the values reported by evaluate() for the compiled loss and metrics.
test_ds = testset.map(to_tpl, auto).batch(1000)
results = model.evaluate(test_ds)

In [None]:
model.metrics_names

In [None]:
results