## Import the Libraries

In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
from google.colab import files

# Opens Colab's file-upload widget and keeps the returned result.
# NOTE(review): presumably used to upload the local `imdb` helper module
# that is imported further down — confirm.
uploaded = files.upload()

## Download IMDB dataset

In [0]:
import imdb

In [0]:
imdb.maybe_download_and_extract()

Data has apparently already been downloaded and unpacked.


## Load the Dataset

In [0]:
# Load the train and test splits through the local `imdb` helper.
# Each call returns a pair: the review texts and the matching targets
# (presumably sentiment labels — confirm against the helper).
x_train_text, y_train = imdb.load_data(train=True)
x_test_text, y_test = imdb.load_data(train=False)
print("Train-set size: ", len(x_train_text))
print("Test-set size:  ", len(x_test_text))

Train-set size:  25000
Test-set size:   25000


In [0]:
# All review texts (train followed by test); the tokenizer is fitted on this.
# NOTE(review): the vocabulary is therefore built from the test texts as well.
data_text = x_train_text + x_test_text

## Preprocess the Dataset

### Tokenize and Fit

In [0]:
# Vocabulary size limit; also used as `input_dim` of the Embedding layers
# and as the row count of the GloVe embedding matrix further down.
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)

In [0]:
tokenizer.fit_on_texts(data_text)

CPU times: user 12 s, sys: 22.8 ms, total: 12 s
Wall time: 12 s


In [0]:
tokenizer.word_index  # dict mapping word -> integer token, ordered by frequency (most frequent word has token 1)

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'one': 27,
 'he': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'so': 34,
 'who': 35,
 'from': 36,
 'like': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'out': 41,
 'about': 42,
 'if': 43,
 "it's": 44,
 'has': 45,
 'there': 46,
 'some': 47,
 'what': 48,
 'good': 49,
 'when': 50,
 'more': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'my': 56,
 'even': 57,
 'would': 58,
 'she': 59,
 'which': 60,
 'only': 61,
 'really': 62,
 'see': 63,
 'story': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'well': 69,
 'were': 70,
 'than': 71,
 'much': 72,
 'we': 73,
 'bad': 74,
 'been': 75,
 'get': 76,
 'do': 77,
 'great': 78,
 'other': 79,
 'will': 80,
 'also': 81,
 'into': 82,
 'p

In [0]:
# Convert every review text into its sequence of integer tokens.
x_train_tokens = tokenizer.texts_to_sequences(x_train_text)
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)

### Truncate and Pad

In [0]:
# Number of tokens in every review, train and test together.
num_tokens = np.array([len(seq) for seq in x_train_tokens + x_test_tokens])

# Sequence length used for padding/truncation: mean + 2 standard deviations,
# truncated to an integer.
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
max_tokens

544

In [0]:
# Fraction of reviews that are shorter than max_tokens (i.e. not truncated).
np.mean(num_tokens < max_tokens)

0.94528

In [0]:
# The same mode is used for padding and truncating: 'pre' means zeros are
# added at the start of short sequences and tokens are dropped from the
# start of long ones, so every row ends up with exactly max_tokens entries.
pad = 'pre'
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens, padding=pad, truncating=pad)
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens, padding=pad, truncating=pad)

### Convert from tokens to strings


In [0]:
# Reverse lookup table: integer token -> word.
idx = tokenizer.word_index
inverse_map = {token: word for word, token in idx.items()}
def tokens_to_string(tokens):
    """
    Turn a sequence of integer tokens back into a space-separated string.

    Tokens equal to 0 are skipped; every other token is looked up in the
    module-level `inverse_map`.
    """
    return " ".join(inverse_map[tok] for tok in tokens if tok != 0)

## Build the Model (without GloVe Pre-trained Weights)

In [0]:
# Length of the embedding vector learned for each token.
embedding_size = 8

# Embedding -> three stacked LSTMs -> single sigmoid unit.
# The first two LSTMs return their full output sequence so that the next
# LSTM receives one input per time-step; the last one returns only its
# final output.
model_one = Sequential([
    Embedding(input_dim=num_words,
              output_dim=embedding_size,
              input_length=max_tokens,
              name='layer_embedding'),
    LSTM(units=16, return_sequences=True),
    LSTM(units=8, return_sequences=True),
    LSTM(units=4),
    Dense(1, activation='sigmoid'),
])

In [0]:
# Adam optimizer with step size 1e-3; binary cross-entropy goes with the
# single sigmoid output unit, and accuracy is reported as an extra metric.
# NOTE(review): `lr` is the legacy spelling of this argument; newer Keras
# releases expect `learning_rate` — confirm against the installed version.
optimizer = Adam(lr=1e-3)
model_one.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [0]:
model_one.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 544, 8)            80000     
_________________________________________________________________
lstm_12 (LSTM)               (None, 544, 16)           1600      
_________________________________________________________________
lstm_13 (LSTM)               (None, 544, 8)            800       
_________________________________________________________________
lstm_14 (LSTM)               (None, 4)                 208       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 5         
Total params: 82,613
Trainable params: 82,613
Non-trainable params: 0
_________________________________________________________________


### Fit the Model

In [0]:
# Train for 3 epochs in batches of 512, holding out 5% of the training
# data for validation.
model_one.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=3, batch_size=512)

Train on 23750 samples, validate on 1250 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 8min 13s, sys: 51.8 s, total: 9min 4s
Wall time: 4min 52s


<tensorflow.python.keras.callbacks.History at 0x7fcfa61a1390>

## Test the model

In [0]:
result = model_one.evaluate(x_test_pad, y_test)

CPU times: user 3min 30s, sys: 13.8 s, total: 3min 44s
Wall time: 1min 59s


In [0]:
print("Accuracy: {0:.2%}".format(result[1]))

Accuracy: 83.78%


In [0]:
text1 = "This movie is fantastic! I really like it because it is so good!"
text2 = "Good movie!"
text3 = "Maybe I like this movie."
text4 = "Meh ..."
text5 = "If I were a drunk teenager then this movie might be good."
text6 = "Bad movie!"
text7 = "Not a good movie!"
text8 = "This movie really sucks! Can I get my money back please?"
texts = [text1, text2, text3, text4, text5, text6, text7, text8]

In [0]:
# Tokenize and pad the sample texts with the same tokenizer, length and
# padding mode as the training data, so the model can consume them.
tokens = tokenizer.texts_to_sequences(texts)
tokens_pad = pad_sequences(tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)
tokens_pad.shape

(8, 544)

In [0]:
model_one.predict(tokens_pad)

array([[0.19948664],
       [0.16436228],
       [0.16335413],
       [0.16440535],
       [0.1616945 ],
       [0.16367534],
       [0.16376519],
       [0.16184837]], dtype=float32)

In [0]:
# Fetch the embedding layer by name and take its first weight array: the
# embedding matrix, which is indexed by token below (shape shown as output).
layer_embedding = model_one.get_layer('layer_embedding')
weights_embedding = layer_embedding.get_weights()[0]
weights_embedding.shape

(10000, 8)

In [0]:
# Integer tokens of three words whose embeddings are compared below.
token_good = tokenizer.word_index['good']
token_great = tokenizer.word_index['great']
token_bad = tokenizer.word_index['bad']

In [0]:
# The embedding matrix is indexed by token, so each lookup yields the
# embedding vector of that word.
emb_good = weights_embedding[token_good]
emb_great = weights_embedding[token_great]
emb_bad = weights_embedding[token_bad]

In [0]:
# Distance between the embeddings of 'good'/'great' and of 'good'/'bad'.
# [None,:] turns each vector into a one-row matrix for cdist; no metric is
# passed, so cdist's default metric is used.
print(cdist(emb_good[None,:], emb_great[None,:]))
print(cdist(emb_good[None,:], emb_bad[None,:]))

[[0.26918621]]
[[0.47154388]]


## Find Relations in Meaning Between Words

In [0]:
def print_sorted_words(word, metric='cosine', k=10):
    """
    Print the vocabulary words closest to and farthest from the given
    word, measured by embedding-distance.

    Reads the module-level `tokenizer`, `weights_embedding` and
    `inverse_map`.

    Args:
        word: Word to look up. It must be a key of `tokenizer.word_index`
            and its token must be a valid row of `weights_embedding`.
        metric: Metric name forwarded to `cdist`, e.g. 'cosine' or
            'euclidean'.
        k: Number of words to print from each end of the ranking
            (values below 0 are treated as 0).

    Raises:
        KeyError: If `word` is not in `tokenizer.word_index`.
        IndexError: If the word's token has no row in `weights_embedding`.
    """

    # Get the token (i.e. integer ID) for the given word.
    token = tokenizer.word_index[word]

    # The embedding-weight-matrix is indexed by the word-tokens.
    embedding = weights_embedding[token]

    # Distance from this word's embedding to every row of the matrix.
    distances = cdist(weights_embedding, [embedding],
                      metric=metric).T[0]

    # Tokens ordered from smallest to largest distance.
    sorted_index = np.argsort(distances)

    # Token 0 has no entry in inverse_map. Remove it from the index BEFORE
    # deriving both the words and the distances, so that the two lists stay
    # aligned element by element (dropping it from the words only would
    # shift every later word onto its neighbour's distance).
    sorted_index = sorted_index[sorted_index != 0]
    sorted_distances = distances[sorted_index]
    sorted_words = [inverse_map[t] for t in sorted_index]

    # Helper-function for printing words and embedding-distances.
    def _print_words(words, word_distances):
        for w, d in zip(words, word_distances):
            print("{0:.3f} - {1}".format(d, w))

    k = max(int(k), 0)
    # A slice of [-0:] would select everything, so k == 0 needs its own
    # empty slice.
    tail = slice(-k, None) if k > 0 else slice(0, 0)

    print("Distance from '{0}':".format(word))

    # Print the words with smallest embedding-distance.
    _print_words(sorted_words[:k], sorted_distances[:k])

    print("...")

    # Print the words with highest embedding-distance.
    _print_words(sorted_words[tail], sorted_distances[tail])

In [0]:
print_sorted_words('great', metric='cosine')

Distance from 'great':
0.000 - great
0.015 - loved
0.016 - voight
0.018 - heart
0.020 - ramones
0.021 - dvd
0.021 - wonderful
0.022 - atmosphere
0.023 - greatest
0.024 - tears
...
1.981 - instead
1.982 - horrible
1.983 - wretched
1.983 - unwatchable
1.984 - unfunny
1.985 - point
1.986 - salvage
1.988 - named
1.991 - mates
1.991 - 000


In [0]:
print_sorted_words('worst', metric='cosine')

Distance from 'worst':
0.000 - worst
0.005 - total
0.011 - avoid
0.012 - lame
0.013 - disgusting
0.020 - complete
0.020 - cheap
0.020 - dull
0.022 - seagal
0.022 - bad
...
1.976 - voight
1.976 - dylan
1.979 - favorite
1.980 - number
1.980 - great
1.985 - tears
1.986 - brilliant
1.986 - amazing
1.990 - touching
1.996 - best


# Use GloVe Pre-Trained Weights



## Download the weights

In [0]:
# -nc (no-clobber): skip the download when glove.6B.zip is already present,
# so re-running the notebook does not fetch the archive a second time.
!wget -nc 'http://nlp.stanford.edu/data/glove.6B.zip'

In [0]:
# -n: never overwrite files that were already extracted; without it a
# re-run stops at an interactive "replace?" prompt.
!unzip -n glove.6B.zip

*Steps*:

1) Load the embeddings.
2) Understand the file format (already discussed in the lab).
3) Map each word to its vector of weights.
4) Build an embedding weight matrix that will be used to initialize the Embedding layer in Keras.
5) Adjust the Embedding layer accordingly.

## Map each word to its vector of weights

In [0]:
# word -> float32 coefficient vector, read from the 100-d GloVe text file.
embeddings_index = dict()

# Each line is parsed as: the word, then its coefficients, separated by
# whitespace. The file is opened in a `with` block so the handle is closed
# even if parsing fails, and decoded explicitly as UTF-8 so the result does
# not depend on the platform's default encoding.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

## Make an embeddings weight matrix 

In [0]:
# Width of the vectors stored in embeddings_index (read from the 100-d file).
glove_dim = 100

# Row i holds the GloVe vector of the word whose token is i. Rows stay
# all-zero for token 0 and for words that have no GloVe vector.
embedding_matrix = np.zeros((num_words, glove_dim))
for word, index in tokenizer.word_index.items():
    # Skip (rather than stop at) tokens that fall outside the matrix, so
    # the result does not depend on the iteration order of word_index.
    if index >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

## Build The New Model

In [0]:
# Must match the width of embedding_matrix (the 100-d GloVe vectors).
embedding_size = 100

# Same architecture as the first model, except that the Embedding layer
# starts from the GloVe-based embedding_matrix instead of random weights.
model_two = Sequential([
    Embedding(input_dim=num_words,
              output_dim=embedding_size,
              input_length=max_tokens,
              name='layer_embedding', weights = [embedding_matrix]),
    LSTM(units=16, return_sequences=True),
    LSTM(units=8, return_sequences=True),
    LSTM(units=4),
    Dense(1, activation='sigmoid'),
])

## Train The New Model

In [0]:
# Fresh Adam optimizer for the second model; same loss and metric as the
# first model so the two accuracies can be compared.
# NOTE(review): `lr` is the legacy spelling of this argument; newer Keras
# releases expect `learning_rate` — confirm against the installed version.
optimizer = Adam(lr=1e-3)
model_two.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [0]:
model_two.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 544, 100)          1000000   
_________________________________________________________________
lstm_15 (LSTM)               (None, 544, 16)           7488      
_________________________________________________________________
lstm_16 (LSTM)               (None, 544, 8)            800       
_________________________________________________________________
lstm_17 (LSTM)               (None, 4)                 208       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 5         
Total params: 1,008,501
Trainable params: 1,008,501
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Train for 6 epochs (the first model used 3) in batches of 512, holding
# out 5% of the training data for validation.
model_two.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=6, batch_size=512)

Train on 23750 samples, validate on 1250 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
CPU times: user 23min 33s, sys: 2min 13s, total: 25min 46s
Wall time: 13min 39s


<tensorflow.python.keras.callbacks.History at 0x7fcfa37b7c88>

In [0]:
result = model_two.evaluate(x_test_pad, y_test)

CPU times: user 3min 43s, sys: 14 s, total: 3min 57s
Wall time: 2min 7s


## Test the new model

In [0]:
print("Accuracy: {0:.2%}".format(result[1]))

Accuracy: 85.85%
