# LSTM for sentiment classification (IMDb dataset)

### Imports

In [1]:
import io
import os
import re
import shutil
import string
import tqdm
# import kormos
import tensorflow as tf
import numpy as np
from tensorflow.keras import Sequential, layers
from tensorflow.keras.layers import Dense, Embedding, LSTM
from keras.layers import TextVectorization
from aux_we import generate_training_data
%load_ext tensorboard
%reload_ext tensorboard
# mirrored_strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3", "/gpu:4"])
# print("Num GPUs Available: ", len(tf.config.list_physical_devices('CPU')))

#### Dataset

Downloading dataset

In [2]:
# Archive of the IMDb movie-review sentiment dataset.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Skip the download when the archive is already present in the working directory.
if not(os.path.exists('aclImdb_v1.tar.gz')):
    print("===== Downloading Imdb Dataset =====")
    # cache_dir='.' with an empty cache_subdir keeps the archive next to the
    # notebook; untar=True asks Keras to extract it after downloading.
    dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz", url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

    dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
    train_dir = os.path.join(dataset_dir, 'train')
    # Remove train/unsup so that only the labelled class folders remain under
    # train/ (the loader below reports 2 classes).
    remove_dir = os.path.join(train_dir, 'unsup')
    shutil.rmtree(remove_dir)

Processing downloaded dataset

In [3]:
# Batch size and seed shared by both splits. The same seed must be passed to
# the 'training' and 'validation' calls so the two subsets do not overlap.
batch_size = 1024
seed = 127

train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed,
)
val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed,
)

# Set to True to print the first review and label of one training batch.
print_one = False
if print_one:
    for text_batch, label_batch in train_ds.take(1):
        # Convert each batch to NumPy once instead of once per printed field.
        reviews = text_batch.numpy()
        sentiments = label_batch.numpy()
        for i in range(1):
            print(f"Review: {reviews[i]}")
            print(f"Label: {sentiments[i]}")
            print()

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [4]:
# Cache both splits and let tf.data choose the prefetch buffer size.
# NOTE(review): this cell rebinds train_ds/val_ds, so re-running it wraps the
# already-cached datasets in another cache/prefetch stage.
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

Standardizing the data

In [5]:
def custom_standardization(input_data):
  """Normalise raw review text before it is tokenised.

  Steps, in order: lowercase the text; rewrite the contraction suffix "n't"
  as " not"; drop the possessive "'s"; replace "<br />" tags with a space;
  delete every character listed in ``string.punctuation``; collapse each run
  of consecutive spaces into a single space.

  Args:
    input_data: string tensor holding the raw review text.

  Returns:
    A string tensor with the cleaned text.
  """
  lowercase = tf.strings.lower(input_data)
  nots = tf.strings.regex_replace(lowercase, 'n\'t', ' not')
  ss = tf.strings.regex_replace(nots, '\'s', '')
  stripped_html = tf.strings.regex_replace(ss, '<br />', ' ')
  no_punctuation = tf.strings.regex_replace(
      stripped_html, '[%s]' % re.escape(string.punctuation), '')
  # One quantified pattern collapses runs of any length. Repeating the
  # two-spaces -> one-space replacement three times only handled runs of up
  # to eight spaces and left longer runs partially collapsed.
  return tf.strings.regex_replace(no_punctuation, ' {2,}', ' ')

# max_tokens caps the vocabulary size; every review is padded or truncated to
# max_review_size integer tokens.
dictionary_size = 500
max_review_size = 250
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=dictionary_size,
    output_mode='int',
    output_sequence_length=max_review_size)

# Build the vocabulary from the training reviews only (labels are dropped).
text_ds = train_ds.map(lambda x, y: x)
text_ds = text_ds.cache().prefetch(buffer_size=10)
vectorize_layer.adapt(text_ds)

#Print one
# text_batch, label_batch = next(iter(train_ds))
# first_review, first_label = text_batch[10], label_batch[10]
# print("Review", custom_standardization(first_review))
# print("Label", train_ds.class_names[first_label])
# print("Vectorized review", vectorize_layer(first_review))

2022-10-12 19:05:08.273720: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [6]:
# Sanity check: show one review after standardization, its label, and the
# integer sequence produced by the vectorization layer.
print_one = True
if print_one:
    text_batch, label_batch = next(iter(train_ds))
    # Index 10 is just an arbitrary sample from the first batch.
    first_review, first_label = text_batch[10], label_batch[10]
    print("Review", custom_standardization(first_review))
    print("Label", first_label)
    print("Vectorized review", vectorize_layer(first_review))

Review tf.Tensor(b'i never really knew who robert wuhl was before seeing this but after seeing it i realized what a funny man he is this hbo special features him teaching american history to new york university film students and the man was just phenomenal he poked fun at almost every key historic event that occurred not just in the us but some other parts of the world this documentarycomedy was a great satire that made me question if what i accept as the infallible true history is really true i enjoyed how mr wuhl managed to mix useful information with great comedy and made learning a lot more exciting i would recommend this to anyone interested in history and is willing to question what hisher beliefs', shape=(), dtype=string)
Label tf.Tensor(1, shape=(), dtype=int32)
Vectorized review tf.Tensor(
[ 10 111  63   1  33   1   1  14 149 301  11  19 100 301   8  10   1  47
   4 156 123  23   7  11   1 303   1  89   1 307 458   6 160   1   1  20
   1   3   2 123  14  41   1  23   1 239  29

## Word Embedding

Dataset for training the word embedding

In [7]:
# Set to True to print part of the vocabulary and one decoded sequence.
print_stuff = False
# Number of negative samples per positive pair; each context row therefore
# holds neg_samples + 1 candidates.
neg_samples = 8

inverse_vocab = vectorize_layer.get_vocabulary()
if print_stuff:
    print('Part of vocab:',inverse_vocab[:200])

# Vectorize the data in text_ds and stack all batches into one
# (num_reviews, max_review_size) array. A single concatenate replaces the
# repeated np.append, which copied the growing array on every iteration.
text_vector_ds = text_ds.prefetch(AUTOTUNE).map(vectorize_layer)
lst = list(text_vector_ds.as_numpy_iterator())
sequences = np.concatenate(lst, axis=0)
print(sequences.shape)

if print_stuff:
    # `sequences` is already flattened across batches, so its length is the
    # number of reviews (no multiplication by batch_size needed).
    print(len(sequences))
    # Decode the first review back into words.
    for seq in sequences[:1]:
        print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

# Create the word2vec training examples with the project helper from aux_we.
# NOTE(review): the helper's exact output layout is not visible here; the
# trailing [:, :, 0] assumes contexts carry a size-1 last axis — confirm.
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=5,
    neg_samples=neg_samples,
    vocab_size=dictionary_size,
    seed=seed)

targets = np.array(targets)
contexts = np.array(contexts)[:,:,0]
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")


(20000, 250)


100%|██████████| 20000/20000 [00:57<00:00, 350.79it/s]




targets.shape: (1241232,)
contexts.shape: (1241232, 9)
labels.shape: (1241232, 9)


Optimize dataset

In [8]:
# Batch size and shuffle-buffer size for the embedding-training pipeline.
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
# Each element is ((target, context), label): the inner tuple is the model
# input, the label is the training target.
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
# drop_remainder=True keeps every batch at exactly BATCH_SIZE rows.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 9), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 9), dtype=tf.int64, name=None))>


Train embedding

In [9]:
# Width of each learned word vector.
embedding_dim = 20


class Word2Vec(tf.keras.Model):
  """Scores a target word against a set of candidate context words.

  Two embedding tables are kept: one for target words (named
  "w2v_embedding" so it can be fetched by name later) and one for context
  words. The model output is the dot product between the target vector and
  each candidate context vector.
  """

  def __init__(self, dictionary_size=dictionary_size, embedding_dim=embedding_dim):
    super().__init__()
    self.target_embedding = layers.Embedding(
        dictionary_size, embedding_dim, input_length=1, name="w2v_embedding")
    self.context_embedding = layers.Embedding(
        dictionary_size, embedding_dim, input_length=neg_samples+1)

  def call(self, pair):
    """Return dot-product logits of shape (batch, context) for a (target, context) pair."""
    target, context = pair
    # Older TF versions deliver target as (batch, 1); newer ones as (batch,).
    # Normalise to (batch,) before the lookup.
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    target_vecs = self.target_embedding(target)      # (batch, embed)
    context_vecs = self.context_embedding(context)   # (batch, context, embed)
    # Contract over the embedding axis: one score per candidate context word.
    return tf.einsum('be,bce->bc', target_vecs, context_vecs)

w2v = Word2Vec(dictionary_size, embedding_dim)

# The model emits raw dot-product scores, hence from_logits=True.
w2v.compile(optimizer='adam',
            loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
            metrics=['accuracy'])

# The callback must be handed to fit(); previously it was created but never
# used, so no training logs were written for the embedding model.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
w2v.fit(dataset,
        epochs=20,
        callbacks=[tensorboard_callback])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x167dcf310>

Save weights of the trained embedding

In [10]:
# Weight matrix of the target-word embedding layer, looked up by its layer name.
weights = w2v.get_layer('w2v_embedding').get_weights()[0]
# Vocabulary list of the vectorization layer; list position is the token id.
vocab = vectorize_layer.get_vocabulary()

Save vectors and words in .tsv

In [11]:
# Write one embedding vector per line to vectors.tsv and the matching word to
# metadata.tsv. The context manager closes both files even if a write fails.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

Process vectors from tsv

In [12]:
import pandas as pd

# The .tsv files have no header row: line k (0-based) holds vocabulary index
# k + 1, because index 0 (padding) was skipped when writing. Without
# header=None pandas would silently consume the first data row as a header.
df = pd.read_csv('vectors.tsv', sep="\t", header=None)
# dtype=str / keep_default_na=False keep words such as "null" or "nan" as text.
df2 = pd.read_csv('metadata.tsv', sep="\t", header=None,
                  dtype=str, keep_default_na=False)

vecs = df.values
wrds = df2.values

num_tokens = dictionary_size

# Prepare embedding matrix: row 0 (padding) stays zero, rows 1..len(vecs)
# receive the trained vectors. The earlier loop stopped at len(vecs), which
# left the last vocabulary rows (and the [UNK] row) at zero.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
embedding_matrix[1:len(vecs) + 1] = vecs

# Spot-check the alignment between the saved words and the vocabulary.
for i in range(100, len(vecs) + 1, 100):
    print(i, wrds[i-1], '=', inverse_vocab[i],embedding_matrix[i])

# Embedding layer initialised with the pre-trained vectors; trainable=True
# lets the classifier keep fine-tuning them.
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=True,
)

# The .tsv files were only an intermediate step; clean them up.
os.remove('metadata.tsv')
os.remove('vectors.tsv')

100 ['after'] = after [-0.25236744  0.22294095 -0.47748455  0.8000439  -0.27904424  0.6006815
 -0.3504664   0.27771127 -0.18391116 -0.2848318  -0.01555239 -0.13021846
 -0.24619533 -0.2636221   0.42959565 -0.33255276 -0.0963175  -0.28762874
  0.20503695 -0.02305496]
200 ['right'] = right [-0.24869512  0.3069537  -0.4729333   0.45070252  0.02212025  0.27604362
 -0.0543763   0.24817306 -0.23229143  0.1652415  -0.319195   -0.26941577
 -0.34723032  0.0796494   0.15716659 -0.32968554  0.4386617  -0.44977346
  0.19259012  0.38791212]
300 ['effects'] = effects [-8.6872090e-01  7.1108200e-01  3.1629488e-01 -4.8028690e-01
  1.0770323e-01  2.1325263e-01 -5.2093870e-01 -4.6367460e-01
 -1.2160134e+00  2.3402283e-02  5.6021210e-01 -8.7606170e-01
 -8.9379440e-02  7.0010360e-04  1.1745342e-01 -1.3862087e-01
  1.2602201e-01 -5.2380973e-01 -7.4549645e-01 -4.9149744e-02]
400 ['supposed'] = supposed [-0.22659782 -0.16609944 -0.53715944 -0.30274904 -0.168148    0.52029735
  0.6035064   0.57658684 -0.300329

## LSTM itself

In [17]:
model1 = Sequential([
  vectorize_layer,
  Embedding(dictionary_size,embedding_dim),
  LSTM(50),
  Dense(1, activation='sigmoid')
])

# if os.path.exists("logs"):
#   os.rmdir("logs")

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs/lstm/model1")

model1.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

model1.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15,
    callbacks=[tensorboard_callback])

#docs_infra: no_execute
%load_ext tensorboard
%reload_ext tensorboard
%tensorboard --logdir logs

Epoch 1/15


  return dispatch_target(*args, **kwargs)


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 32479), started 1:22:37 ago. (Use '!kill 32479' to kill it.)

In [18]:
model = Sequential([
  vectorize_layer,
  embedding_layer,
  LSTM(50),
  Dense(1, activation='sigmoid')
])

# if os.path.exists("logs"):
#   os.rmdir("logs")

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs/lstm")

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15,
    callbacks=[tensorboard_callback])

#docs_infra: no_execute
%load_ext tensorboard
%reload_ext tensorboard
%tensorboard --logdir logs

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 32479), started 1:26:14 ago. (Use '!kill 32479' to kill it.)

In [25]:
# Model outputs for the whole validation set; each row holds a single score.
predictions = model.predict(val_ds)



In [40]:
# NOTE(review): `vali` is created but not used in this cell.
vali = val_ds.as_numpy_iterator()
# First validation batch. Assumes its order matches the first rows of
# `predictions` (i.e. val_ds yields the same order on each pass) — TODO confirm.
text_batch, label_batch = next(iter(val_ds))

# Print every 100th example among the first 500 of that batch.
for i in range(0,500,100):
    print(f"Review: {text_batch[i]}\nTrue prediction: {predictions[i][0]}\tRounded prediction: {round(predictions[i][0])}\tTrue:{label_batch[i]}\n\n")


Review: b'Nightmare Weekend stars a cast of ridiculous actors with even less of an idea of what is going on than the director had, if you can imagine that. There is no decipherable plot or story, the special effects are a joke, and even the sound is terrible. This film was directed by Henry Sala. It was the only film that he ever directed, and the reason is obvious.'
True prediction: 0.035815365612506866	Rounded prediction: 0	True:0


Review: b"When I first saw this movie I was with my dad. He encouraged me to watch this movie because it was one of his favorites. After watching the movie it instantly became one of my favorites. <br /><br />A River Runs Through It is about two brothers who each take a different path in life. Norman Maclean (Craig Sheffer) is the older of the two brothers and he is set on the path of education. Paul Maclean (Brad Pitt) is the rebellious younger brother who travels on a path full of obstacles. The movie follows these characters as the each follow their ow

## Results

![](loss%20per%20epoch.png)

#### A few predictions

Review: b'Nightmare Weekend stars a cast of ridiculous actors with even less of an idea of what is going on than the director had, if you can imagine that. There is no decipherable plot or story, the special effects are a joke, and even the sound is terrible. This film was directed by Henry Sala. It was the only film that he ever directed, and the reason is obvious.'

True prediction: 0.035815365612506866	Rounded prediction: 0	True:0

-----


Review: b"When I first saw this movie I was with my dad. He encouraged me to watch this movie because it was one of his favorites. After watching the movie it instantly became one of my favorites. A River Runs Through It is about two brothers who each take a different path in life. Norman Maclean (Craig Sheffer) is the older of the two brothers and he is set on the path of education. Paul Maclean (Brad Pitt) is the rebellious younger brother who travels on a path full of obstacles. The movie follows these characters as the each follow their own path. There is no downside to this movie. You will be entertained the whole way through. The acting, directing, and script is all perfect. The two things that are exceptional are the cinematography and the score. Both of which entrap you in the world Robert Redford creates for you. This is an all around great movie that is destined to be a classic. It sure is in my book. If you haven't seen this movie definitely watch it as soon as you can because it will stay with you forever."

True prediction: 0.9545835852622986	Rounded prediction: 1	True:1

-----


Review: b'The Egyptian Movies has A Lot Of Filmes With High Level Of Drama Or Romance Or Comedy Or Action Even Sports... "Ziab la Ta\'Kohl AL lam" Was banned In Egypt Because It Content Nudity (Full Frontal Female Nudity) And This Kind Of Nudity Is Prohibited In The Egyptian Movies.. When I Saw this Movies I Felt Down... Fool Story.. Nude Actress.. Bad Action.. Some Horror & Awful Colors.. Dear Friend.. If You Wanna See A great Egyptian Movie...Simply: Stay Away Form "Ziab la Ta\'Kohl AL lam".. We Have Great Movies In Egypt... We Have A Great Actors Who Won A Global Wins Like: Omar El Sheriff Or Gameel Rateb.. We Have Great Directors Like "Yousef Shahin" So Believe Me Pall.. You Don\'t Need To See This Movie..'

True prediction: 0.7179211974143982	Rounded prediction: 1	True:0

-----


Review: b"He-he-hello!! This is a really fun movie. Basically, in Party Girl, you have your fun-lovin', independent, early 90's New Yorker chick. Along with her party friends, she meets a mature Turkish Vendor. It is a comming of age story for those new adults who are searching for what they want to do. It is comforting to see a female slacker develop into a mature woman. Hope is given to all of us slackers who might feel like their only skills are being able to maintain while hammered and a nack for throwing good parties. On a side note, Parker Posey makes this movie great. I have never been a great fan of her, but this movie makes me just want to watch all of her movies. There are subtle manerisms that perfected her character. If you want good laughs and a fun time, make sure to watch this movie. Repeated viewings are a must."

True prediction: 0.9345836639404297	Rounded prediction: 1	True:1

-----


Review: b'Just saw a pre-screening tonight. What can I say? It lived up to it\'s mediocre trailer run, though that\'s saying nothing at all. It did absolutely nothing that any movie before it hasn\'t done, and it played out in such a clich\xc3\xa9 fashion that eventually I got to the point where I stopped laughing only because I was laughing with the audience, and instead let the humorless movie play out. So let\'s see... we have the less-than-spectacular main character that is trying to get back with his ex-girlfriend but he\'s not good enough for her, check. We have the three buddies that all have their own "personality" with one being the best friend who tries to get with the main girl character\'s best friend but is constantly rejected, another friend being the super awkward one that can\'t live down seeing the positive in everything 24/7 and is thrown in for the one-liners (which in this case is just a bunch of movie references, specifically from Disney), and the third guy whose name you won\'t ever remember but is there to complete the square and throw in consoling messages to whomever will care to listen... check. We have the girl\'s ex-boyfriend and her parents ****-block the relationship at any possible means when things are looking up, not to mention the awkward family members from the main character\'s side... check. We have the downer period an hour into the movie where everyone is depressed, check. We have the movie\'s "funny" moments come from incessant swearing, people falling down or being hit, scenes from the trailer, and homosexual innuendos... check. And dare I call it a spoiler, but we have an ending that unfolds exactly as one thought that it would unfold before even seeing the movie... check. Honestly, this could have... no, wait... should have been a PG-13 movie. All that needed to be dropped were any F-bombs. 
Honestly, it would have gotten much more publicity from the crowd that enjoys this kind of humor, would have gotten less media exposure, and thusly would have not been disliked as much from people like myself who should try and hold it up higher to the recent R-rated comedies like Superbad and Knocked Up. The humor in this movie is just so awkward that it doesn\'t fit in with what general people look for. I bet even the actors were often times unsettled with some of the dialogue and action they had to deliver on camera. Let\'s put it this way... in the theater, it will help you laugh because it\'s on the big screen and others are laughing. When this movie hits Showtime and you\'re checking it out at 2:00 PM on an off-day, you may be inclined to change the channel. The only thing that will keep you watching is Alice Eve\'s hotness (who is not quite a 10, but still very good looking). Aside from the main resolution, this film kicked a lot of subplots to the side of the curb and seemed to forget to write more story that they tried to develop in the beginning of the movie, where everything else pretty much flies out the window. So there is a main resolution, but what comes of it? It\'s never really clear-cut, nor does it allow the ending to be "feel-good" with the abruptness. There was only one thing worth nothing in this movie, and that was the good soundtrack. Aside from the nice choice of 90\'s alternative rock songs, there was a nice upbeat score that would play in some parts of the movie (more so the beginning of it) that reminds me of something David Holmes would mix up/compose. I\'ll give them props for a great choice of sound. One last thing, this movie was probably filmed sometime late last summer, because the inadvertent yet proud Pepsi sponsorship showed the yellow bottle caps that they had during that Rock Band promotion. I just figured a lot of Rock Band gamers would catch onto that one if you saw it. But I say hold onto your money. 
If this was PG-13 and you were 15 years old on a Friday night with a group of friends, I\'d say knock yourselves out. Otherwise, definitely pass. It doesn\'t try and compete with the R-rated movies of the past few years, and ideally it definitely isn\'t as good.'

True prediction: 0.19758467376232147	Rounded prediction: 0	True:0
