In [1]:
import io
import os
import re
import shutil
import string
import tensorflow as tf

from datetime import datetime
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization 

In [2]:
# Root directory of the dataset, given as a path relative to the working directory.
dataset_dir = 'aclImdb'

In [3]:
# Show the top-level entries of the dataset directory.
os.listdir(dataset_dir)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [4]:
# Path of the 'train' subdirectory; its entries are listed to inspect what it contains.
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [5]:
batch_size = 1024
seed = 2020

# Build the training and validation datasets from the same directory.
# Both calls use identical batch_size, validation_split and seed; only
# `subset` differs, and 'training' is loaded before 'validation'.
train_data, val_data = (
    tf.keras.preprocessing.text_dataset_from_directory(
        train_dir,
        batch_size=batch_size,
        validation_split=0.2,
        subset=subset_name,
        seed=seed)
    for subset_name in ('training', 'validation')
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


<h4>Label encoding: 1 = positive review, 0 = negative review</h4>

In [6]:
# Preview up to five (label, review) pairs from the first training batch.
for text_batch, label_batch in train_data.take(1):
    # Convert each tensor to NumPy once instead of on every loop iteration.
    labels = label_batch.numpy()
    texts = text_batch.numpy()
    # Slicing (rather than indexing 0..4) also works if the batch holds fewer
    # than five examples.
    for label, text in zip(labels[:5], texts[:5]):
        print(label, text)

1 b'Although this movie has some weaknesses, it is worth seeing. I chose it because of the cast, and applaud Bonham Carter and Branagh for choosing roles different from those they have taken in the past. Both portray very troubled people, complete with warts, but make them likeable because of their humanity. The story is touching, but it is the performances that soar. Bonham Carter\'s "Jane" is a remarkable achievement, whose quest for romance opened my eyes to aspects of being disabled that I had not thought of before, but was interesting as well for other reasons. I felt the movie ended too abruptly, but better that than a drawn out emotionally manipulative ending (see "Stepmom.") The very real English setting added to my enjoyment - it was England in the 90\'s, both urban and rural, without being depressing.'
1 b"This film pulls you in from the get-go because it grabs our attention by acknowledging, yeah, that this story is opening with a clich\xc3\xa9 \xc2\x96 a funeral.<br /><br /

<h3>Using the Embedding Layer</h3> 

In [7]:
# Embedding table for a 1000-token vocabulary, each token mapped to a
# 5-dimensional vector. The vectors start out randomly initialized; after
# training, similarities between words would be encoded in them.
embedding_layer = Embedding(input_dim=1000, output_dim=5)

# Look up the vectors for token indices 1, 2 and 3.
result = embedding_layer(tf.constant([1, 2, 3]))

In [9]:
# Display the looked-up embedding vectors as a NumPy array.
result.numpy()

array([[ 0.04029671, -0.02687234,  0.01883718, -0.00956687, -0.03077844],
       [ 0.04513909,  0.00062316,  0.00826148, -0.01962583, -0.01752422],
       [ 0.00568271,  0.0040072 , -0.02780952,  0.04252643,  0.01310693]],
      dtype=float32)

In [10]:
# The 1-D lookup (`result`) is already computed and displayed in the cells
# above, so it is not repeated here: recomputing it and evaluating a bare
# `result.numpy()` mid-cell produced no output.

# A batch of index sequences: input of shape (2, 3) yields one 5-dimensional
# vector per index, i.e. an output of shape (2, 3, 5).
result_2 = embedding_layer(tf.constant([[0,1,2],[3,4,5]]))
print(result_2.numpy())
print(result_2.shape)

[[[-0.00080323 -0.01714882  0.0058575  -0.0066018  -0.03084812]
  [ 0.04029671 -0.02687234  0.01883718 -0.00956687 -0.03077844]
  [ 0.04513909  0.00062316  0.00826148 -0.01962583 -0.01752422]]

 [[ 0.00568271  0.0040072  -0.02780952  0.04252643  0.01310693]
  [ 0.00251956 -0.03746142  0.02632819  0.02475611 -0.01004137]
  [ 0.03897863 -0.03387398 -0.04908519 -0.01402403  0.03230834]]]
(2, 3, 5)


<h3>Text Preprocessing</h3>

In [13]:
def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    Applies three steps in order: lowercase the text, replace every
    ``<br />`` tag with a single space, and delete every character found
    in ``string.punctuation``.

    Args:
        input_data: String tensor of raw text.

    Returns:
        String tensor with the normalized text.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    # Runs before punctuation removal so the tag is still intact when matched.
    text = tf.strings.regex_replace(text, '<br />', ' ')
    text = tf.strings.regex_replace(text, punctuation_pattern, '')
    return text

In [14]:
# Maximum vocabulary size and the fixed number of tokens output per review.
vocab_size = 10000
sequence_length = 100

# Turns raw strings into integer token sequences, using the custom
# standardization function defined earlier in the notebook.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int')

# Build the vocabulary from the training text only: drop the label from each
# (text, label) element before calling adapt().
text_data = train_data.map(lambda text, label: text)
vectorize_layer.adapt(text_data)

<h3>Model Definition</h3>

In [15]:
embedding_dim = 16

# Assemble the classifier layer by layer.
model = Sequential()
# Raw review strings -> integer token ids.
model.add(vectorize_layer)
# Token ids -> embedding_dim-dimensional vectors; the layer is named so its
# weights can be retrieved by name after training.
model.add(Embedding(vocab_size, embedding_dim, name='embedding'))
# Average the token vectors of each review into a single vector.
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
# Single output unit with no activation, i.e. the model emits a raw logit.
model.add(Dense(1))

<h3>Training</h3>

In [16]:
# from_logits=True because the model's final Dense(1) layer has no activation.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])

# Write training logs to the "logs" directory for TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

model.fit(
    train_data,
    epochs=15,
    validation_data=val_data,
    callbacks=[tensorboard_callback])

Epoch 1/15
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x2075357c0d0>

In [19]:
# Load the TensorBoard notebook extension, then launch TensorBoard on the
# "logs" directory (the log_dir given to the TensorBoard callback in training).
%load_ext tensorboard
%tensorboard --logdir logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 40300), started 0:27:25 ago. (Use '!kill 40300' to kill it.)

In [20]:
# Vocabulary tokens known to the vectorization layer.
vocab = vectorize_layer.get_vocabulary()

# Learned embedding matrix of shape (vocab_size, embedding_dim): the first
# (and only) weight array of the layer named 'embedding'.
weights = model.get_layer(name='embedding').get_weights()[0]

In [21]:
# Export the learned embeddings as two line-aligned TSV files:
#   vectors.tsv  - one tab-separated embedding vector per line
#   metadata.tsv - the vocabulary token for the vector on the same line
# The `with` statement closes both files even if a write fails part-way.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(vocab):
        # Index 0 is skipped; presumably this is the padding token that
        # TextVectorization reserves at position 0 (confirm for the TF version in use).
        if index == 0:
            continue
        vec = weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(word + "\n")