In [1]:
## Most Important
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## less Important
from functools import partial
import os
from scipy import stats
import missingno as msno
import joblib
import tarfile
import shutil
import urllib

## Sklearn
from sklearn import datasets
## Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
## Metrics
from sklearn.metrics import accuracy_score

## tensorflow & Keras
import tensorflow as tf    ## i will use tf for every thing and for keras using tf.keras
import tensorflow_datasets as tfds

## In this Notebook:

### Stateless RNN 
* `trained to predict the next character in a sentence`
* `a stateless RNN learns from a random portion of the text at each iteration, without any information about the rest of the text`

### Statefull RNN
* `which preserves the hidden state between training iterations and continues reading where it left off, allowing it to learn longer patterns`

### RNN for Sentiment Analysis
* `e.g, reading movies reviews, and extract the rater's feelings about the movie`
* `this time treating sentences as sequences of words rather than characters`

### Encoder-Decoder architecture
* `to perform NMT (neural machine translation) using seq2seq API provided by TensorFlow`

### Attention Mechanism
* `these are neural network components that learn to select the part of the inputs that the rest of the model should focus on at each time step; they boost the performance of the RNN encoder-decoder. Then we will drop the RNN and use an attention-only architecture called the (Transformer)`

### Stateless RNN
`trained to predict the next character in a sentence`

In [2]:
## reading the data => using shakespeare (Karpathy's "tiny shakespeare" corpus)
shakespeare_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'

## get_file takes the local file name and the url, downloads the file
## and returns the path of the local copy
filepath = tf.keras.utils.get_file('shakespeare.txt', shakespeare_url)

## decode explicitly as UTF-8 instead of relying on the platform's default
## encoding, so the text is read the same way on every OS
with open(filepath, encoding='utf-8') as f:
    shakespeare_txt = f.read()

In [3]:
## tokenization
## char_level=True => every character (not every word) is a token
## NOTE: fit_on_texts receives the raw string (not a list of strings), so each
## character is iterated as its own "text"; the cell that reads
## `document_count` relies on this to get the total number of characters

char_tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
char_tokenizer.fit_on_texts(shakespeare_txt)

In [4]:
## test your tokenizer with a round trip: text -> IDs -> text

## from text to sequence (a list with one string => one sequence of char IDs)
text_to_seq = char_tokenizer.texts_to_sequences(['mohammed agoor'])
print('text_to_seq =>', text_to_seq)

print()

## take the above output and go back from sequence to text
seq_to_text = char_tokenizer.sequences_to_texts(text_to_seq)
print('seq_to_text =>', seq_to_text)

text_to_seq => [[15, 4, 7, 5, 15, 15, 2, 13, 1, 5, 21, 4, 4, 9]]

seq_to_text => ['m o h a m m e d   a g o o r']


In [5]:
## vocabulary size and corpus size

## number of distinct characters known to the tokenizer
max_chars = len(char_tokenizer.word_index)
print('max_chars =>', max_chars)

## the tokenizer was fitted on the raw string, so every character was counted
## as one document => document_count is the total number of characters
size_dataset = char_tokenizer.document_count
print('size_dataset =>', size_dataset)

max_chars => 39
size_dataset => 1115394


In [6]:
## tokenize the whole corpus from text to a sequence of char IDs
## because the tokenizer indexing starts from 1, decrease by 1 to start from 0
## (the same shift must be undone with +1 when decoding predictions)

full_dataset = np.array(char_tokenizer.texts_to_sequences([shakespeare_txt])) - 1 
## the list held a single text => drop the leading axis of length 1
full_dataset = np.squeeze(full_dataset)
full_dataset

array([19,  5,  8, ..., 20, 26, 10])

In [7]:
##  num_total_chars = 1115394
## take 90% for training
## NOTE: despite its name, ratio_train is a number of characters (an integer
## count thanks to //), not a ratio
ratio_train = size_dataset * 90 // 100
train_set = full_dataset[:ratio_train]

## create a tf.data dataset that yields one char ID per element

train_set = tf.data.Dataset.from_tensor_slices(train_set)

## show some values (4 groups of 5 consecutive char IDs)
for val in train_set.batch(5).take(4):
    print(val)

tf.Tensor([19  5  8  7  2], shape=(5,), dtype=int32)
tf.Tensor([ 0 18  5  2  5], shape=(5,), dtype=int32)
tf.Tensor([35  1  9 23 10], shape=(5,), dtype=int32)
tf.Tensor([21  1 19  3  8], shape=(5,), dtype=int32)


#### `Chopping the Sequential Dataset into Multiple Windows`

* `the train_set now consists of a single sequence of over a million chars, so we cannot train the model on it directly: the RNN would be equivalent to a deep net with over a million layers`
* Instead, we will use the dataset's `window()` method to convert this long sequence of chars into many small windows of text; every window will be a short substring of the long sequence
* `but how long should this substring be? It is easier to train an RNN on shorter strings, but the RNN will not be able to learn any pattern longer than this length, so don't make it too small`

In [8]:
## use n_steps=100
n_steps = 100
window_length = n_steps + 1  ## 100 input chars + 1 extra char so the target can be the input shifted by 1

## with shift=1 consecutive windows OVERLAP (they share all chars but one):
## the first window covers chars 0 to 100, the second 1 to 101 and so on
## drop_remainder=True drops the last windows that are shorter than
## window_length, so all windows have the same length
train_set_window = train_set.window(size=window_length, shift=1, drop_remainder=True)


## show some values
## (each window is itself a nested dataset, so only opaque objects are printed)
for val in train_set_window.batch(5).take(4):
    print(val)

<tensorflow.python.data.ops.dataset_ops._NestedVariant object at 0x0000016477BF7C40>
<tensorflow.python.data.ops.dataset_ops._NestedVariant object at 0x0000016477BF7D00>
<tensorflow.python.data.ops.dataset_ops._NestedVariant object at 0x0000016477BF7C40>
<tensorflow.python.data.ops.dataset_ops._NestedVariant object at 0x0000016477BF7D00>


In [9]:
## the windowed dataset is nested (a dataset of datasets), we should flatten it
## flat_map applies a function to every nested dataset and flattens the result
## e.g => if the nested dataset is {{1,2},{3,4,5,6}}
## then after applying flat_map(lambda ds: ds.batch(2))
## the result will be {[1,2],[3,4],[5,6]}  => this is what we want
## batching each window by window_length turns it into ONE tensor of 101 chars

train_set_mapped = train_set_window.flat_map(lambda window: window.batch(window_length))

## show some values (batch(1) only adds a leading axis for display)
for val in train_set_mapped.batch(1).take(2):
    print(val)

tf.Tensor(
[[19  5  8  7  2  0 18  5  2  5 35  1  9 23 10 21  1 19  3  8  1  0 16  1
   0 22  8  3 18  1  1 12  0  4  9 15  0 19 13  8  2  6  1  8 17  0  6  1
   4  8  0 14  1  0  7 22  1  4 24 26 10 10  4 11 11 23 10  7 22  1  4 24
  17  0  7 22  1  4 24 26 10 10 19  5  8  7  2  0 18  5  2  5 35  1  9 23
  10 15  3 13  0]], shape=(1, 101), dtype=int32)
tf.Tensor(
[[ 5  8  7  2  0 18  5  2  5 35  1  9 23 10 21  1 19  3  8  1  0 16  1  0
  22  8  3 18  1  1 12  0  4  9 15  0 19 13  8  2  6  1  8 17  0  6  1  4
   8  0 14  1  0  7 22  1  4 24 26 10 10  4 11 11 23 10  7 22  1  4 24 17
   0  7 22  1  4 24 26 10 10 19  5  8  7  2  0 18  5  2  5 35  1  9 23 10
  15  3 13  0  4]], shape=(1, 101), dtype=int32)


In [10]:
## shuffle and batch_size of 32 (each batch contains 32 windows, each window=101 chars)

use_batch_size = 32

## shuffle the windows, then group them: each batch is 32 windows of 101 chars
train_set_shuffled = train_set_mapped.shuffle(buffer_size=10000).batch(use_batch_size)
## features = the first 100 chars of every window, target = the same window
## shifted by one char (the next char at every time step)
train_set_splitted = train_set_shuffled.map(lambda window: (window[:, :-1], window[:, 1:]))

## show some values (batch(1) only adds a leading axis for display)
for feat, target in train_set_splitted.batch(1).take(1): 
    print('Features is : ==== ')
    print(feat)
    print('===='*20)
    print(target)   ## Looks very GREAT!

Features is : ==== 
tf.Tensor(
[[[15 10 16 ...  7  1  0]
  [24  1  0 ...  5 18  6]
  [12  0 18 ... 20  3 12]
  ...
  [ 9 23 10 ...  3 13 10]
  [ 5  2  5 ...  6  3  9]
  [ 0 13  7 ...  7  6  4]]], shape=(1, 32, 100), dtype=int32)
tf.Tensor(
[[[10 16  5 ...  1  0  4]
  [ 1  0  1 ... 18  6 17]
  [ 0 18 13 ...  3 12  7]
  ...
  [23 10  2 ... 13 10  5]
  [ 2  5 35 ...  3  9  1]
  [13  7 26 ...  6  4 11]]], shape=(1, 32, 100), dtype=int32)


In [11]:
## we must encode these IDs => I will use ONE HOT encoding
## only the features are one-hot encoded (depth = number of distinct chars);
## the targets stay as integer IDs
train_set_encoded = train_set_splitted.map(lambda X, y: (tf.one_hot(X, depth=max_chars), y))

## show some values (batch(1) only adds a leading axis for display)
for feat, target in train_set_encoded.batch(1).take(1): 
    print('Features is : ==== ')
    print(feat)
    print('===='*20)
    print(target)   ## Looks very GREAT!

Features is : ==== 
tf.Tensor(
[[[[1. 0. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   [0. 1. 0. ... 0. 0. 0.]
   ...
   [0. 0. 0. ... 0. 0. 0.]
   [0. 1. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]]

  [[0. 0. 0. ... 0. 0. 0.]
   [0. 0. 1. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   ...
   [0. 0. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]]

  [[0. 0. 1. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   ...
   [0. 0. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]]

  ...

  [[0. 0. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   ...
   [0. 0. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   [0. 0. 1. ... 0. 0. 0.]]

  [[0. 0. 0. ... 0. 0. 0.]
   [1. 0. 0. ... 0. 0. 0.]
   [0. 0. 1. ... 0. 0. 0.]
   ...
   [0. 0. 1. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   [0. 1. 0. ... 0. 0. 0.]]

  [[0. 0. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   ...
   [0. 0. 0. ... 0.

In [12]:
## finally prefetch: keep 1 batch prepared ahead of the training step
train_set_final = train_set_encoded.prefetch(1)

#### Building the Model

In [15]:
## stateless char-RNN: two GRU layers followed by a per-time-step softmax
## over the max_chars possible characters
model_char = tf.keras.models.Sequential([
    ## input_shape=[None, max_chars] => sequences of any length, one-hot chars
    tf.keras.layers.GRU(128, return_sequences=True, dropout=0.2, 
                        input_shape=[None, max_chars]),
    tf.keras.layers.GRU(128, return_sequences=True, dropout=0.2),
    ## the Dense layer is applied at every time step (one prediction per step)
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(max_chars, activation='softmax'))
])

## sparse loss because the targets are integer char IDs, not one-hot vectors
model_char.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model_char.fit(train_set_final, epochs=3)  ## need more training (saving my time)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [16]:
## save the trained stateless char model to an .h5 file
model_char.save('model_char_stateless_1.h5')

In [20]:
### define a function to process the new data
def preprocess(texts):
    ''' Encode raw text the same way as the training windows.

    texts: a list of strings (one string per sequence); a single bare string
           is treated as one sequence. When several strings are given they
           must all have the same length, because the IDs are stacked into
           one array.
    returns: the one-hot encoded, 0-based char IDs as a tensor with one row
             per text, one step per char and max_chars values per step.
    '''
    ## a bare string would be iterated char by char by the tokenizer, giving
    ## one single-char sequence per char instead of one sequence for the text
    if isinstance(texts, str):
        texts = [texts]
    ## tokenizer IDs start at 1, the training data was shifted to start at 0
    x = np.array(char_tokenizer.texts_to_sequences(texts)) - 1
    x = tf.one_hot(x, depth=max_chars)
    return x


X_new = preprocess(['what is your nam'])
## most likely char ID at every time step
y_pred = np.argmax(model_char.predict(X_new), axis=-1)

## decode only the prediction of the last time step (the char after 'nam')
char_tokenizer.sequences_to_texts(y_pred + 1)[0][-1]    ## add 1 which you decreased before

## Great work!

'e'

#### `Generating Fake Text`

* `to generate new text using the stateless char-RNN model, we could feed it some text, make the model predict the most likely next char, append it to the text, and so on`
* but in practice, this often leads to the same words being repeated over and over again
* `Instead, we can pick the next char randomly, with a probability equal to the estimated probability`
* using tf.random.categorical => it samples random class indices, given the class log probabilities (logits); to have more control over the diversity of the generated text, we can divide the logits by a number called the (temperature)
* `A temperature close to 0 will favor the high-probability chars, and a very high temperature will give all chars an equal probability`

In [29]:
def next_char(text, temperature=1.0):
    ''' Sample the character that follows `text` from model_char's prediction.

    text: the string generated so far (a single sequence).
    temperature: divisor applied to the log-probabilities before sampling;
                 values close to 0 favor the most likely chars, large values
                 flatten the distribution.
    returns: the sampled next character as a string.
    '''
    ## wrap the string in a list so it is encoded as ONE sequence; with a bare
    ## string the tokenizer treats every char as a separate text, and the
    ## prediction read below would only depend on the first char
    X_new = preprocess([text])
    ## keep the probabilities of the last time step only
    y_proba = model_char.predict(X_new)[0, -1:, :]
    y_logits = tf.math.log(y_proba) / temperature
    ## IDs were decreased by 1 in preprocess, shift them back up
    char_id = tf.random.categorical(logits=y_logits, num_samples=1) + 1
    char_next = char_tokenizer.sequences_to_texts(char_id.numpy())[0]
    return char_next


## next_char only samples a single char; complete_char calls it repeatedly,
## feeding the growing text back in each time

def complete_char(text, temperature=1.0, n_char=50):  ## required n_char
    ''' Extend `text` by `n_char` sampled characters and return the result.
    '''
    generated = text
    for _ in range(n_char):
        generated = generated + next_char(generated, temperature)
    return generated

In [33]:
## generate 50 more chars after the seed 't' (sampling is random, so the
## output changes from run to run)
complete_char('t', temperature=1.0, n_char=50)  ## need more training and tuning (save my time)

't ho heifrh r\ne!ho  e ooh uloeho, h hsiyw,hhe  iao '

-------
------

### Statefull RNN
* `In a stateless RNN, the model starts each batch with a hidden state full of zeros, updates this state at each time step, and throws it away after the last time step`
* `what if we told the RNN to preserve the final hidden state after processing each training batch and use it as the initial state for the next batch of the same epoch? The state is not carried across epochs: each epoch starts again from a zero state`
* `each batch must start where the previous batch stopped, so there is no overlapping and no shuffling`

In [36]:
##  num_total_chars = 1115394
## take 90% for training
ratio_train = size_dataset * 90 // 100   ## to avoid any fraction
train_set_2 = full_dataset[:ratio_train]

train_set_2 = tf.data.Dataset.from_tensor_slices(train_set_2)

n_steps = 100
window_length = n_steps + 1


## the same steps as for the stateless model, with three differences:
## * shift=n_steps => each window starts at the last char of the previous one,
##   so consecutive input sequences follow each other in the text
## * no shuffle => the order of the text is preserved
## * batch(1) => one window per batch
train_set_window_2 = train_set_2.window(size=window_length, shift=n_steps, drop_remainder=True)
train_set_mapped_2 = train_set_window_2.flat_map(lambda window: window.batch(window_length))
train_set_mapped_2 = train_set_mapped_2.batch(1)
## features = first 100 chars, target = the same window shifted by one char
train_set_divided_2 = train_set_mapped_2.map(lambda window: (window[:, :-1], window[:, 1:]))
train_set_encoded_2 = train_set_divided_2.map(lambda x, y: (tf.one_hot(x, depth=max_chars), y))
train_set_final_2 = train_set_encoded_2.prefetch(1)

#### Training the Model
Note: set `stateful=True` on every recurrent layer, and specify the batch size through `batch_input_shape` on the first layer

In [40]:
## must match the batch size of train_set_final_2 (built with batch(1))
use_batch = 1
## same architecture as model_char, but stateful and without dropout
model_char_2 = tf.keras.models.Sequential([
    tf.keras.layers.GRU(128, return_sequences=True, stateful=True, 
                       batch_input_shape=[use_batch, None, max_chars]),
    tf.keras.layers.GRU(128, return_sequences=True, stateful=True),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(max_chars, activation='softmax'))
])

In [41]:
## the hidden state is passed from one batch to the next within the same epoch
## but every epoch must start again from zeroed hidden states
class ResetStatesCallback(tf.keras.callbacks.Callback):  ## put it in the callbacks in fit
    ''' Reset the model's recurrent states at the beginning of every epoch.
    '''
    def on_epoch_begin(self, epoch, logs=None):
        ## logs defaults to None to match the signature of the base Callback
        ## hook, so the method also works when it is called without logs
        self.model.reset_states()

In [42]:
## same loss/optimizer as the stateless model; the callback resets the
## recurrent states at the start of every epoch
model_char_2.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model_char_2.fit(train_set_final_2, epochs=3, 
                           callbacks=[ResetStatesCallback()])  

## model needs to be trained for many more epochs (save my time)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [43]:
## save the trained stateful char model to an .h5 file
model_char_2.save('model_char_2.h5')

* `Note that you cannot use this stateful model with a different batch size. Instead, we create a stateless copy with the same architecture and load the weights of the stateful model into it, as done below`

In [46]:
## create a stateless model with the same structure as model_char_2
## (same layers and sizes, only without stateful=True and without a fixed
## batch size) so that the trained weights can be copied into it
stateless_model = tf.keras.models.Sequential([
    tf.keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_chars]),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(max_chars, activation="softmax"))
])

#### `To set the weights, we first need to build the model (so the weights get created):`

In [48]:
## shape is (batch, time steps, max_chars) with batch and time steps left open
stateless_model.build(tf.TensorShape([None, None, max_chars]))  ## must build first

In [49]:
## copy the trained weights of the stateful model into the stateless copy
stateless_model.set_weights(model_char_2.get_weights())  ## set the weights

In [50]:
def next_char(text, temperature=1.0):
    ''' Sample the character that follows `text` from stateless_model's prediction.

    NOTE: this redefines the earlier next_char of this notebook; the only
    intended difference is the model used for the prediction.

    text: the string generated so far (a single sequence).
    temperature: divisor applied to the log-probabilities before sampling.
    returns: the sampled next character as a string.
    '''
    ## wrap the string in a list so it is encoded as ONE sequence; with a bare
    ## string the tokenizer treats every char as a separate text, and the
    ## prediction read below would only depend on the first char
    X_new = preprocess([text])
    ## keep the probabilities of the last time step only
    y_proba = stateless_model.predict(X_new)[0, -1:, :]  ## change the model to > stateless_model
    y_logits = tf.math.log(y_proba) / temperature
    ## IDs were decreased by 1 in preprocess, shift them back up
    char_id = tf.random.categorical(logits=y_logits, num_samples=1) + 1
    char_next = char_tokenizer.sequences_to_texts(char_id.numpy())[0]
    return char_next


## next_char only samples a single char; complete_char calls it repeatedly,
## feeding the growing text back in each time

def complete_char(text, temperature=1.0, n_char=50):  ## required n_char
    ''' Extend `text` by `n_char` sampled characters and return the result.
    '''
    generated = text
    for _ in range(n_char):
        generated = generated + next_char(generated, temperature)
    return generated

In [51]:
## generate 50 chars with a higher temperature (a flatter distribution)
complete_char('t', temperature=2, n_char=50)  ## model need to be trained and tuned (save my time)

't3tsl:\nj,:y y,h,s\n:?h o:?;,\n\na\n.:s wau:a- r .e::ez;'

----
----

### Sentiment Analysis
`imdb reviews dataset`

In [52]:
## loading the imdb data (the Keras version, already encoded as word IDs)
(X_train_full, y_train_full), (X_test, y_test) = tf.keras.datasets.imdb.load_data()

## the arrays are 1-D: one entry per review
print('X_train_full shape', X_train_full.shape)
print('y_train_full shape', y_train_full.shape)
print('X_test shape', X_test.shape)
print('y_test shape', y_test.shape)

X_train_full shape (25000,)
y_train_full shape (25000,)
X_test shape (25000,)
y_test shape (25000,)


In [53]:
## look inside the data: the first review is a list of integers
print(X_train_full[0], end='')  ## it is word indexed not char indexed

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]

In [54]:
## largest word ID found in any training review
max_idx_word = max(max(sequence) for sequence in X_train_full)
print('max_idx_word =>', max_idx_word)

max_idx_word => 88586


In [57]:
### get the word index
word_index = tf.keras.datasets.imdb.get_word_index() ## dict with word as key, and index as value
## NOTE: this displays the whole (very large) dict
word_index

{'fawn': 34701,
 'tsukino': 52006,
 'nunnery': 52007,
 'sonja': 16816,
 'vani': 63951,
 'woods': 1408,
 'spiders': 16115,
 'hanging': 2345,
 'woody': 2289,
 'trawling': 52008,
 "hold's": 52009,
 'comically': 11307,
 'localized': 40830,
 'disobeying': 30568,
 "'royale": 52010,
 "harpo's": 40831,
 'canet': 52011,
 'aileen': 19313,
 'acurately': 52012,
 "diplomat's": 52013,
 'rickman': 25242,
 'arranged': 6746,
 'rumbustious': 52014,
 'familiarness': 52015,
 "spider'": 52016,
 'hahahah': 68804,
 "wood'": 52017,
 'transvestism': 40833,
 "hangin'": 34702,
 'bringing': 2338,
 'seamier': 40834,
 'wooded': 34703,
 'bravora': 52018,
 'grueling': 16817,
 'wooden': 1636,
 'wednesday': 16818,
 "'prix": 52019,
 'altagracia': 34704,
 'circuitry': 52020,
 'crotch': 11585,
 'busybody': 57766,
 "tart'n'tangy": 52021,
 'burgade': 14129,
 'thrace': 52023,
 "tom's": 11038,
 'snuggles': 52025,
 'francesco': 29114,
 'complainers': 52027,
 'templarios': 52125,
 '272': 40835,
 '273': 52028,
 'zaniacs': 52130,

In [56]:
## decode the first review of X_train_full back to words
## note that the integers (0, 1, 2) are reserved for
## (padding <pad>, start of sequence <sos>, unknown words <unk>),
## so every ID in the reviews is offset by 3 relative to word_index

## invert the mapping: index -> word
reversed_word_index = {idx: word for word, idx in word_index.items()}

## undo the offset of 3, use '?' for IDs without a word, join with spaces
decoded_first_review = ' '.join(
    reversed_word_index.get(word_id - 3, '?') for word_id in X_train_full[0]
)
decoded_first_review

"? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert redford's is an amazing actor and now the same being director norman's father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for retail and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also congratulations to the two little boy's that played the part's of norman and paul they were just brilliant children are often left out of the praising list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should b

* `the data above is already preprocessed; what about preprocessing it ourselves?`
* `let's load the raw data and get our hands dirty`

In [58]:
## loading the original dataset (raw review text) from TensorFlow Datasets
## as_supervised=True => every element is a (review, label) pair
## with_info=True => also return the dataset metadata
imdb_dataset, imdb_info = tfds.load('imdb_reviews', as_supervised=True, with_info=True)

In [60]:
## number of examples of each split, read from the dataset metadata
size_train = imdb_info.splits['train'].num_examples
size_test = imdb_info.splits['test'].num_examples

print('size_train =>', size_train)
print('size_test =>', size_test)

size_train => 25000
size_test => 25000


In [74]:
## showing samples: the first 200 chars and the label of 2 reviews
for X, y in imdb_dataset['train'].batch(1).take(2):
    for review, label in zip(X.numpy(), y.numpy()):
        ## the reviews are byte strings => decode them before printing
        print('Review is =>', review.decode('utf-8')[:200], '....')
        print('Label is =>', label, '==> Negative' if label==0 else '==> Positive')
        print()

Review is => This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ....
Label is => 0 ==> Negative

Review is => I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ....
Label is => 0 ==> Negative



In [81]:
## this function prepares the reviews for building the lookup table
def preprocess_imdb(X_batch, y_batch):
    ''' Clean a batch of raw reviews and split them into padded word tokens.

    X_batch: batch of review strings.
    y_batch: the labels, returned unchanged.
    returns: (tokens, y_batch) where tokens is a dense string tensor in which
             shorter reviews are padded with b"<pad>".
    '''
    ## take only the first 300 chars (enough for sentiment analysis)
    ## NOTE: substr counts bytes by default, and the cut can fall in the
    ## middle of a word or of a <br /> tag
    X_batch = tf.strings.substr(X_batch, 0, 300)
    ## replace the <br /> tags with spaces
    ## (raw bytes literal: "\s" is not a valid escape in a normal literal)
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ")
    ## replace any char that is not a letter or an apostrophe with a space
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    ## split every review into words (on whitespace) => ragged tensor
    X_batch = tf.strings.split(X_batch)
    ## convert this ragged tensor to a dense one, padding all reviews of the
    ## batch to the same length
    X_batch = X_batch.to_tensor(default_value=b"<pad>")
    return X_batch, y_batch

In [87]:
## we must loop over the whole dataset and count every word
## NOTE(review): this import would normally live in the imports cell at the top
from collections import Counter
vocab = Counter()
for X_batch, y_batch in  imdb_dataset['train'].batch(32).map(preprocess_imdb):
    for review in X_batch:
        ## add the words of one review (the "<pad>" tokens are counted too)
        vocab.update(list(review.numpy()))

In [88]:
## the 3 most frequent tokens with their counts
vocab.most_common()[:3]

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

In [91]:
## we can truncate the vocab and keep only the 10000 most common words
vocab_size = 10000
## most_common() is sorted by decreasing count, so the position of a word in
## this list is its frequency rank
truncated_vocab = [word for word, count in vocab.most_common()[:vocab_size]]

In [92]:
## map each word to its ID ==> the keys and values of the lookup table
words = tf.constant(truncated_vocab)
## the ID of a word is its position in truncated_vocab (0, 1, 2, ...)
words_idxs = tf.range(len(truncated_vocab), dtype=tf.int64)

#### `What is lookup table`

``` python
        init = tf.lookup.KeyValueTensorInitializer(
                    keys=tf.constant(['emerson', 'lake', 'palmer']),
                    values=tf.constant([0, 1, 2], dtype=tf.int64))
        table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets=5)

```
The `Vocabulary` object will perform the following mapping:
* `emerson -> 0`
* `lake -> 1`
* `palmer -> 2`
* `<other term> -> bucket_id`, where `bucket_id` will be between `3` and
`3 + num_oov_buckets - 1 = 7`, calculated by:
`hash(<term>) % num_oov_buckets + vocab_size`


In [93]:
## init lookup: word -> ID pairs for the known vocabulary
vocab_init = tf.lookup.KeyValueTensorInitializer(words, words_idxs)

## use oov (out of vocab) buckets: words that are not in the vocabulary are
## hashed into one of these 1000 extra IDs
use_oov = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets=use_oov)

In [94]:
## test your table

txt_test = 'this film is so fantaaaaastic'
table.lookup(tf.constant([txt_test.split()])).numpy() 
## note that all words except the last one get an ID below the vocab size (< 10000)
## but the last word is out of the vocabulary
## so it was mapped to one of the oov buckets, with an ID greater than or equal to 10000

array([[    9,    15,     7,    34, 10703]], dtype=int64)

In [95]:
## finally, turn the words of every review into their IDs
def encode_text(X_batch, y_batch):
    ''' Replace every word of X_batch by its ID from the lookup table;
    the labels are passed through unchanged.
    '''
    encoded_batch = table.lookup(X_batch)
    return encoded_batch, y_batch

In [101]:
## full input pipeline: batches of 32 reviews -> cleaned word tokens -> word IDs
train_set_imdb = imdb_dataset['train'].batch(32).map(preprocess_imdb).map(encode_text)
train_set_imdb = train_set_imdb.prefetch(1)  ## keep 1 batch prepared ahead

In [107]:
## show the result of the pipeline so far (batch(1) only adds a leading axis for display)
for X, y in train_set_imdb.batch(1).take(1):
    print(X) ## 32 reviews, each padded to 60 words here (from the 300 chars we kept before)
    print()
    print(y)

tf.Tensor(
[[[  22   11   28 ...    0    0    0]
  [   6   21   70 ...    0    0    0]
  [4099 6881    1 ...    0    0    0]
  ...
  [  22   12  118 ...  331 1047    0]
  [1757 4101  451 ...    0    0    0]
  [3365 4392    6 ...    0    0    0]]], shape=(1, 32, 60), dtype=int64)

tf.Tensor([[0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0]], shape=(1, 32), dtype=int64)


#### Training the Model

In [97]:
## the embedding layer needs one row per possible ID: vocab words + oov buckets
vocab_size, use_oov

(10000, 1000)

In [98]:
try_embed_dims = 128  ## should be tuned

## word IDs -> embeddings -> two GRU layers -> one sigmoid unit (positive / negative)
## NOTE(review): the padding token "<pad>" gets ID 0 (it is the most common
## entry of the vocab) and the Embedding layer does not mask it, so the GRUs
## also process the padding steps - consider mask_zero=True; confirm before changing
model_imdb = tf.keras.models.Sequential([
    ## input_dim covers the vocab IDs plus the oov bucket IDs
    tf.keras.layers.Embedding(input_dim=vocab_size+use_oov, 
                              output_dim=try_embed_dims, input_shape=[None]),
    tf.keras.layers.GRU(128, return_sequences=True),
    ## the last GRU returns only its final output (no return_sequences)
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [99]:
## binary labels with a sigmoid output => binary cross-entropy
model_imdb.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

## NOTE: this rebinds `history`, which also held the char-model training histories
history =  model_imdb.fit(train_set_imdb, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [108]:
## save the trained sentiment model to an .h5 file
model_imdb.save('model_imdb.h5')