# LSTM for sentiment classification (IMDb dataset)

### Import stuff

In [1]:
import io
import os
import re
import shutil
import string
import tqdm
# import kormos
import tensorflow as tf
import numpy as np
from tensorflow.keras import Sequential, layers
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, LSTM, Flatten
from keras.layers import TextVectorization
from aux_we import generate_training_data
%load_ext tensorboard
%reload_ext tensorboard
# mirrored_strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3", "/gpu:4"])
# print("Num GPUs Available: ", len(tf.config.list_physical_devices('CPU')))

#### Dataset

Downloading dataset

In [2]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Only fetch and unpack the archive when it is not already present in the
# working directory.
if not os.path.exists('aclImdb_v1.tar.gz'):
    print("===== Downloading Imdb Dataset =====")
    dataset = tf.keras.utils.get_file(
        "aclImdb_v1.tar.gz",
        url,
        untar=True,
        cache_dir='.',
        cache_subdir='',
    )

    # Delete the 'unsup' sub-directory of the training split so that only the
    # remaining folders are picked up when the directory is loaded later.
    dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
    train_dir = os.path.join(dataset_dir, 'train')
    remove_dir = os.path.join(train_dir, 'unsup')
    shutil.rmtree(remove_dir)

Processing downloaded dataset

In [3]:
batch_size = 1024
seed = 127

# Both subsets are read from the same directory; the same split fraction and
# seed are passed to the two calls.
train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed,
)
val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed,
)

# Set to True to print one (review, label) pair from the first training batch.
print_one = False
if print_one:
    for text_batch, label_batch in train_ds.take(1):
        for i in range(1):
            print(f"Review: {text_batch.numpy()[i]}")
            print(f"Label: {label_batch.numpy()[i]}")
            print()

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [4]:
AUTOTUNE = tf.data.AUTOTUNE

# Apply the same cache + prefetch stages to the training and validation
# pipelines.
train_ds, val_ds = (
    ds.cache().prefetch(buffer_size=AUTOTUNE) for ds in (train_ds, val_ds)
)

Standardizing the data

In [5]:
def custom_standardization(input_data):
  """Normalize raw review text before it is tokenized.

  The steps are applied in this order:
    1. lowercase the text;
    2. rewrite "n't" as " not";
    3. drop "'s";
    4. replace the literal "<br />" tag with a space;
    5. delete every character listed in ``string.punctuation``;
    6. collapse every run of two or more spaces into a single space.

  Args:
    input_data: string tensor holding the raw text.

  Returns:
    String tensor holding the cleaned text.
  """
  lowercase = tf.strings.lower(input_data)
  nots = tf.strings.regex_replace(lowercase, 'n\'t', ' not')
  ss = tf.strings.regex_replace(nots, '\'s', '')
  stripped_html = tf.strings.regex_replace(ss, '<br />', ' ')
  no_punctuation = tf.strings.regex_replace(
      stripped_html, '[%s]' % re.escape(string.punctuation), '')
  # A quantified pattern collapses runs of any length in a single pass;
  # repeatedly halving double spaces a fixed number of times leaves long runs
  # (more than 8 spaces) only partially collapsed.
  return tf.strings.regex_replace(no_punctuation, ' {2,}', ' ')

dictionary_size = 500
max_review_size = 250

# Integer-encoding layer: text is cleaned with custom_standardization, the
# vocabulary is capped at dictionary_size entries and every review is emitted
# as exactly max_review_size token ids.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=dictionary_size,
    output_mode='int',
    output_sequence_length=max_review_size,
)

# Build the dictionary from the review texts only (labels are dropped).
text_ds = (
    train_ds
    .map(lambda review, _label: review)
    .cache()
    .prefetch(buffer_size=10)
)
vectorize_layer.adapt(text_ds)

#Print one
# text_batch, label_batch = next(iter(train_ds))
# first_review, first_label = text_batch[10], label_batch[10]
# print("Review", custom_standardization(first_review))
# print("Label", train_ds.class_names[first_label])
# print("Vectorized review", vectorize_layer(first_review))

2022-10-12 18:16:38.355600: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [6]:
# Show element 10 of the first training batch: cleaned text, label and the
# integer sequence produced by the vectorization layer.
print_one = True
if print_one:
    text_batch, label_batch = next(iter(train_ds))
    first_review = text_batch[10]
    first_label = label_batch[10]
    print("Review", custom_standardization(first_review))
    print("Label", first_label)
    print("Vectorized review", vectorize_layer(first_review))

Review tf.Tensor(b'i never really knew who robert wuhl was before seeing this but after seeing it i realized what a funny man he is this hbo special features him teaching american history to new york university film students and the man was just phenomenal he poked fun at almost every key historic event that occurred not just in the us but some other parts of the world this documentarycomedy was a great satire that made me question if what i accept as the infallible true history is really true i enjoyed how mr wuhl managed to mix useful information with great comedy and made learning a lot more exciting i would recommend this to anyone interested in history and is willing to question what hisher beliefs', shape=(), dtype=string)
Label tf.Tensor(1, shape=(), dtype=int32)
Vectorized review tf.Tensor(
[ 10 111  63   1  33   1   1  14 149 301  11  19 100 301   8  10   1  47
   4 156 123  23   7  11   1 303   1  89   1 307 458   6 160   1   1  20
   1   3   2 123  14  41   1  23   1 239  29

## Word Embedding

Dataset for training the word embedding

In [7]:
print_stuff = False
neg_samples = 100

inverse_vocab = vectorize_layer.get_vocabulary()
if print_stuff:
    print('Part of vocab:', inverse_vocab[:200])

# Vectorize the data in text_ds
text_vector_ds = text_ds.prefetch(AUTOTUNE).map(vectorize_layer)

# Each element of `lst` is one batch of integer sequences. Stack all batches
# along axis 0 in a single call (np.append inside a loop re-copies the whole
# array on every iteration).
lst = list(text_vector_ds.as_numpy_iterator())
sequences = np.concatenate(lst, axis=0)
print(sequences.shape)

if print_stuff:
    # `sequences` is already un-batched, so its length is the review count.
    print(len(sequences))
    for seq in sequences[:1]:
        print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

# Create the (target, context, label) training examples; negative sampling is
# configured through neg_samples and handled inside generate_training_data.
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=5,
    neg_samples=neg_samples,
    vocab_size=dictionary_size,
    seed=seed)

targets = np.array(targets)
# Drop the trailing axis of the contexts array by keeping its index 0.
contexts = np.array(contexts)[:,:,0]
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")


(20000, 250)


100%|██████████| 20000/20000 [01:20<00:00, 247.38it/s]




targets.shape: (1238736,)
contexts.shape: (1238736, 101)
labels.shape: (1238736, 101)


Optimize dataset

In [8]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000

# Inputs are (target, context) pairs, outputs are the matching label rows.
# Examples are shuffled, then grouped into full batches only (the incomplete
# final batch is dropped).
dataset = (
    tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
print(dataset)

<BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 101), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 101), dtype=tf.int64, name=None))>


Train embedding

In [9]:
embedding_dim=4

#Train WE
class Word2Vec(tf.keras.Model):
  """Model that scores (target word, context words) pairs by dot product.

  Two separate embedding tables are held: one looked up with the target word
  id (named "w2v_embedding") and one looked up with the context word ids.
  """

  def __init__(self, dictionary_size=dictionary_size, embedding_dim=embedding_dim):
    """Create the target and context embedding tables.

    Args:
      dictionary_size: number of rows in each embedding table.
      embedding_dim: length of each embedding vector.
    """
    super().__init__()
    self.target_embedding = layers.Embedding(
        dictionary_size,
        embedding_dim,
        input_length=1,
        name="w2v_embedding")
    self.context_embedding = layers.Embedding(
        dictionary_size,
        embedding_dim,
        input_length=neg_samples+1)

  def call(self, pair):
    """Return the dot product of the target vector with each context vector.

    Args:
      pair: tuple ``(target, context)`` of word-id tensors.

    Returns:
      Tensor of scores with axes (batch, context).
    """
    center_ids, context_ids = pair
    # A rank-2 target carries a dummy second axis (absent in TF2.7+); drop it
    # so the target ids are (batch,).
    if len(center_ids.shape) == 2:
      center_ids = tf.squeeze(center_ids, axis=1)
    center_vecs = self.target_embedding(center_ids)      # (batch, embed)
    context_vecs = self.context_embedding(context_ids)   # (batch, context, embed)
    # Contract over the embedding axis -> (batch, context).
    return tf.einsum('be,bce->bc', center_vecs, context_vecs)

w2v = Word2Vec(dictionary_size, embedding_dim)

w2v.compile(optimizer='adam',
            loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
            metrics=['accuracy'])

# The callback has to be handed to fit() to have any effect. It writes to its
# own sub-directory so that these event files are not mixed with those of
# other training runs that log directly under "logs".
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=os.path.join("logs", "w2v"))
w2v.fit(dataset,
        epochs=2,
        callbacks=[tensorboard_callback])


Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x15cdd61d0>

Extract the weights of the trained embedding and the vocabulary

In [10]:
# Weight matrix of the layer named 'w2v_embedding' (first entry of its
# weight list) and the vocabulary held by the vectorization layer.
weights = w2v.get_layer(name='w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

Save vectors and words in .tsv

In [11]:
# Write one line per vocabulary entry: tab-separated vector components go to
# vectors.tsv and the word itself to metadata.tsv. The context manager closes
# both files even if a write fails part-way through.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

Process vectors from tsv

In [12]:
import pandas as pd

# The TSV files contain no header row. With the default header=0 pandas would
# silently consume the first data row (vocabulary index 1) as column names, so
# header=None is passed explicitly. Words are read as plain strings so that
# entries such as "nan" or "null" are not converted to missing values.
df = pd.read_csv('vectors.tsv', sep="\t", header=None)
df2 = pd.read_csv('metadata.tsv', sep="\t", header=None,
                  dtype=str, keep_default_na=False)

vecs = df.values
wrds = df2.values
assert len(vecs) == len(wrds), "vectors.tsv and metadata.tsv must have the same number of rows"

num_tokens = dictionary_size

#Prepare embedding matrix
# Row k of the files belongs to vocabulary index k + 1, because index 0
# (padding) was not written. Indices 0 (padding) and 1 (first file row) are
# left at zero; every other index, including the last ones, gets its vector.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for i in range(2, len(vecs) + 1):
    embedding_matrix[i] = vecs[i-1]
    if (i%10==0):
        print(i, wrds[i-1], '=', inverse_vocab[i],embedding_matrix[i])

# Frozen embedding layer initialised from the matrix built above.
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

# os.remove('metadata.tsv')
# os.remove('vectors.tsv')

10 ['i'] = i [ 0.54804885 -0.49318093 -0.58473796  0.5114748 ]
20 ['film'] = film [ 0.61194366 -0.5633387  -0.6212554   0.62394625]
30 ['all'] = all [ 0.55780363 -0.5729802  -0.59353685  0.6133544 ]
40 ['or'] = or [ 0.6076582 -0.5825299 -0.5710637  0.6185312]
50 ['more'] = more [ 0.56771666 -0.6696513  -0.5372741   0.6229178 ]
60 ['which'] = which [ 0.61152905 -0.6384574  -0.64311516  0.6083362 ]
70 ['did'] = did [ 0.59142256 -0.524214   -0.5511373   0.5434798 ]
80 ['get'] = get [ 0.55965173 -0.54531926 -0.60965157  0.55570817]
90 ['most'] = most [ 0.630753   -0.6164144  -0.5603311   0.62022763]
100 ['after'] = after [ 0.624686   -0.63663393 -0.5651988   0.6253247 ]
110 ['plot'] = plot [ 0.59741545 -0.59914213 -0.6299575   0.65324765]
120 ['ever'] = ever [ 0.59768796 -0.56610227 -0.51649904  0.52790487]
130 ['should'] = should [ 0.5859335 -0.5634419 -0.5330011  0.5589434]
140 ['back'] = back [ 0.5582274  -0.5864925  -0.59653825  0.6087693 ]
150 ['another'] = another [ 0.60563797 -0.572

## LSTM itself

In [16]:
model = Sequential([
  vectorize_layer,
  embedding_layer,
  LSTM(4),
  Dense(15),
  Dense(1, activation='sigmoid')
])

# if os.path.exists("logs"):
#   os.rmdir("logs")

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15,
    callbacks=[tensorboard_callback])

#docs_infra: no_execute
%tensorboard --logdir logs

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


Reusing TensorBoard on port 6006 (pid 32479), started 0:21:15 ago. (Use '!kill 32479' to kill it.)