# Introduction
- Text classification and generation using RNNs
- Types/modes of RNNs
- Using IMDB reviews dataset 
- [Ref: TensorFlow guide — Text classification with an RNN](https://www.tensorflow.org/tutorials/text/text_classification_rnn)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk

import os, random, pickle 

from termcolor import colored

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# 1. Load Data - TFDS 
TFDS (TensorFlow Datasets) = public datasets in an easy-to-use format, exposed as `tf.data.Dataset` objects

In [2]:
import tensorflow as tf
# a collection of ready to use datasets of type tf.data.Datasets
import tensorflow_datasets as tfds 

In [3]:
##### Helper functions
## Plot training curves
def plot_graphs(fhist, metric):
    """Plot a training metric and its validation counterpart against epochs.

    Args:
        fhist: mapping of metric name -> per-epoch values. It must contain
            both `metric` and `'val_' + metric`.
        metric: name of the training metric to draw, e.g. 'accuracy' or 'loss'.
    """
    val_metric = 'val_' + metric

    # Training curve first, validation second, so the legend order below matches.
    for curve_name, extra_args in ((metric, ()), (val_metric, ('',))):
        plt.plot(fhist[curve_name], *extra_args)

    plt.xlabel('Epochs')
    plt.ylabel(metric)
    plt.legend([metric, val_metric])
    plt.show()
    

In [4]:
###### Download IMDB dataset using TFDS
# with_info=True additionally returns the dataset metadata object (`infor`);
# as_supervised=True makes every element a (text, label) tuple rather than a dict.
dataset, infor = tfds.load("imdb_reviews/subwords8k", with_info=True, as_supervised=True)

# Despite the `_x` suffix, both splits carry the features AND the labels.
train_x, test_x = dataset['train'], dataset['test']

# info has the text encoder, in this case tfds.features.text.SubwordTextEncoder 
# NOTE(review): the built-in subword encoder/config appears tied to older TFDS
# releases — confirm the pinned tensorflow_datasets version still ships it.
encoder = infor.features['text'].encoder

In [5]:
# Dataset metadata as reported by TFDS.
print("{} {}".format( colored("Dataset Infor:", "blue"), infor) )

print("Number of training observations: {}".format( infor.splits['train'].num_examples  ) )
print("Number of testing observations: {}".format( infor.splits['test'].num_examples ) )

print("Number of unique labels/classes: {}\n\tClass Labels: {}".format(
    infor.features['label'].num_classes, infor.features['label'].names ) )

# Encoder round trip: encode a short string, then decode each subword id on its
# own to show how the text is split.
print("\nEncoder: {}".format(encoder) )
print("Encoder.Vocab_size: {}".format(encoder.vocab_size ) )
s = "The quick brown fox"
es = encoder.encode(s)
print("Encoder.example: {} ==> {}".format(s, es ) )
for i in es:
    e = encoder.decode([i])
    print("\t{} -->{}".format(i, e ), end="" )

# Show one real training example. Formatting `tfds.as_numpy(train_x)` directly
# only prints the repr of a generator object, not any review text.
# This cell must run BEFORE the batching cell: it expects unbatched
# (text_ids, label) elements.
for sample_text, sample_label in tfds.as_numpy(train_x.take(1)):
    print("\nSample text: {}".format( encoder.decode(sample_text.tolist()) ) )
    print("Sample label: {}".format( sample_label ) )

Dataset Infor: tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=8185>),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andr

# 2. Prepare Data

In [6]:
##### 
# 1. Batch datasets and zero pad sequence lengths to longest len or max size 
#    NOTE: this cell rebinds train_x / test_x to their batched versions, so it is
#    not idempotent — running it twice would batch already-batched data.

# Size of the buffer used by Dataset.shuffle.
BUFFER_SIZE = 10000
# Number of (text, label) examples per batch.
BATCH_SIZE = 64

# padded_shapes=([None], []): the text component is padded along its single
# (variable-length) axis; the label is a scalar and needs no padding.
# Only the training split is shuffled.
train_x = (train_x.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, padded_shapes=([None], [])))
test_x = (test_x.padded_batch(BATCH_SIZE, padded_shapes=([None], [])))


# 3. Build Model
LSTM RNN
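- Model Arch = \[ embed, LSTM RNN, Dense, Output(1 unit: a logit, i.e. p(x) before the sigmoid) \]
- Wrap the LSTM in a Bidirectional layer so that each position is encoded with context from both directions, which helps with long-range dependencies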

In [7]:
#### Model Arch = [ embed, n x (Bi)LSTM, Dense(relu), [Dropout], Dense(1) ]
# Optionally wrap every LSTM in a Bidirectional layer so that each timestep is
# encoded with context from both directions.

def get_lstm_model(emb_vocab_size, unitz=64, wrap_bidirection=True, n_lstmz=1):
    """Build an (uncompiled) Keras Sequential LSTM classifier.

    Layers, in order: Embedding -> n_lstmz LSTM layers -> Dense(unitz, relu)
    -> Dropout(0.5) (only when n_lstmz > 1) -> Dense(1).

    Args:
        emb_vocab_size: input dimension of the Embedding layer (vocabulary size).
        unitz: embedding width, units of every LSTM layer and of the hidden
            Dense layer.
        wrap_bidirection: when True, each LSTM is wrapped in a Bidirectional layer.
        n_lstmz: number of stacked LSTM layers; must be >= 1.

    Returns:
        A tf.keras.Sequential model whose last layer is Dense(1) with no
        activation.

    Raises:
        ValueError: if n_lstmz is smaller than 1.
    """
    # Validate first so a bad depth fails fast instead of silently building
    # a one-layer model.
    if n_lstmz < 1:
        raise ValueError("n_lstmz must be >= 1, got {}".format(n_lstmz))

    md = tf.keras.Sequential()
    # embedding
    md.add( tf.keras.layers.Embedding( emb_vocab_size, unitz) )

    # n lstm layers
    for layer_idx in range(n_lstmz):
        is_last = layer_idx == n_lstmz - 1
        # A fresh LSTM instance per position: the same layer object must not be
        # added to the stack more than once.
        # Intermediate layers emit the full sequence so the next LSTM has a
        # sequence to consume; the last one emits only its final state so the
        # Dense head produces one output per review, not one per timestep.
        # TODO: the matrix dimensions math and see how to change unitz
        #       (all layers currently use the full `unitz`).
        lstm = tf.keras.layers.LSTM(unitz, return_sequences=not is_last)
        md.add( tf.keras.layers.Bidirectional(lstm) if wrap_bidirection else lstm )

    # outputs @ relu, dropout=0.5 and a single output unit
    md.add( tf.keras.layers.Dense(unitz, activation='relu') )
    if n_lstmz > 1:
        md.add( tf.keras.layers.Dropout(rate=0.5) ) #TODO: move out
    md.add( tf.keras.layers.Dense( 1 ) )

    return md

def compile_2class_lstm_model(emb_vocab_size, unitz=64, wrap_bidirection=True, n_lstmz=1):
    """Build the LSTM model via get_lstm_model and compile it for 2-class output.

    All four arguments are forwarded unchanged to get_lstm_model.

    The model is compiled with binary cross-entropy on logits (from_logits=True),
    the Adam optimizer at learning rate 1e-4, and the 'accuracy' metric.

    Returns:
        The compiled model.
    """
    classifier = get_lstm_model(emb_vocab_size, unitz, wrap_bidirection, n_lstmz)

    classifier.compile(
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(1e-4),
        metrics=['accuracy'],
    )

    return classifier

In [8]:
# Baseline model: all defaults (64 units, one LSTM layer wrapped in Bidirectional).
model = compile_2class_lstm_model( encoder.vocab_size)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 598,209
Trainable params: 598,209
Non-trainable params: 0
_________________________________________________________________


# 4. Train Model

In [None]:
##### A. Train
def train_model( model, train_x, epochs=10, test_epochs=3, batch_size=64,
                 validation_data=None, save_path='IMDB_ltsm_model.rnn' ):
    """Fit `model` with early stopping, save it, and return the training history.

    Args:
        model: compiled Keras model to train (modified in place by fitting).
        train_x: training dataset passed to model.fit as-is.
        epochs: maximum number of epochs.
        test_epochs: unused; kept so existing callers keep working.
        batch_size: unused (it is not forwarded to model.fit); kept so existing
            callers keep working.
        validation_data: dataset evaluated after each epoch and monitored by
            early stopping. When None, the notebook-level `test_x` is used,
            which is what this function always did before.
        save_path: where the trained model is saved; None skips saving.

    Returns:
        The `history` dict of the fit (metric name -> list of per-epoch values).
    """
    if validation_data is None:
        # Backward-compatible default: fall back to the notebook-level test split.
        # NOTE: validating on the test split means the test score reported later
        # is not an unbiased estimate — a held-out validation split is preferable.
        validation_data = test_x

    # 1. early stopping: stop once val_loss has not improved for 2 consecutive epochs
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    ]

    # 2. supervised training; the dataset yields (text, label) pairs
    train_history = model.fit(
        train_x,
        epochs=epochs,
        callbacks=callbacks,
        validation_data=validation_data,
        verbose=2, # log once per epoch = 2
    )

    # 3. save model
    if save_path is not None:
        model.save( save_path )

    return train_history.history

# Train the baseline model; `fhist` is the per-epoch metrics dict used by the plots below.
fhist = train_model(model, train_x)

print("\n{}\n".format( fhist ) )

Epoch 1/10
391/391 - 4372s - loss: 0.6493 - accuracy: 0.5594 - val_loss: 0.4673 - val_accuracy: 0.7819
Epoch 2/10
391/391 - 2471s - loss: 0.3486 - accuracy: 0.8515 - val_loss: 0.3293 - val_accuracy: 0.8505
Epoch 3/10
391/391 - 2552s - loss: 0.2508 - accuracy: 0.9016 - val_loss: 0.3225 - val_accuracy: 0.8725
Epoch 4/10
391/391 - 2632s - loss: 0.2066 - accuracy: 0.9238 - val_loss: 0.3497 - val_accuracy: 0.8737
Epoch 5/10


In [None]:
##### B. Validate
def pad_to_size(vec, size):
    """Return the items of `vec` as a new list, right-padded with zeros to `size`.

    The input is never modified. A `vec` that is already `size` items long or
    longer is returned (as a copy) without truncation.

    Args:
        vec: any sized iterable of token ids (list, tuple, 1-D array, ...).
        size: minimum length of the returned list.

    Returns:
        A new list of length max(len(vec), size).
    """
    padded = list(vec)
    # A non-positive multiplier yields [], so longer inputs pass through untouched.
    padded.extend([0] * (size - len(padded)))
    return padded

# Evaluate on the batched test split. `test_x` was already zero-padded per batch
# by padded_batch, so "Without Padding" here means "no extra fixed-size padding".
test_loss, test_acc = model.evaluate( test_x )
print("Without Padding: loss = {} \taccuracy={}".format(test_loss, test_acc ) )


# Disabled experiment: pad_to_size works on a single encoded sequence, not on
# the batched dataset in test_x, so this call would not work as written.
# test_loss, test_acc = model.evaluate( pad_to_size(test_x, 64 ) ) 
# print("With Padding: loss = {} \taccuracy={}".format(test_loss, test_acc ) )

In [None]:
##### Graphs 
# Training vs. validation accuracy per epoch for the most recent training run.
plot_graphs(fhist, 'accuracy')


In [None]:
# Training vs. validation loss per epoch for the most recent training run.
plot_graphs(fhist, 'loss')

# 5. Predict

In [None]:
def predict_sentiment(observation, pad=False, pad_size=64):
    """Score one raw review string with the notebook-level `model`.

    The text is encoded with the notebook-level `encoder`, optionally
    zero-padded, cast to float32, given a leading batch axis and passed to
    model.predict.

    Args:
        observation: review text to score.
        pad: when True, zero-pad the encoded ids with pad_to_size first.
        pad_size: target length used when `pad` is True (previously fixed at 64).

    Returns:
        Whatever model.predict returns for a batch of one. For the models built
        in this notebook that is a raw logit, not a probability: the final
        Dense(1) has no activation and the loss is configured with
        from_logits=True. A value above 0 therefore leans towards label 1.
    """
    enc_observation = encoder.encode( observation )

    if pad:
        enc_observation = pad_to_size( enc_observation, pad_size )

    enc_observation = tf.cast( enc_observation, tf.float32 )

    # The model expects a batch, so add a leading axis of size 1.
    return model.predict( tf.expand_dims(enc_observation, 0 ) )

In [None]:
# Hand-written reviews used as a quick sanity check of the trained model.
sample_reviews = [ "That was such a good movie!", "That was terribly good!", 
                  "That was such a bad movie!", "Amazing! How can something so bad be out there!"]

# Each review is scored individually (batch of one), without extra padding.
print("\n ==== NO PADDING ==== \n")
for s in sample_reviews:
    print("{} ===> {}".format(s, predict_sentiment(s) ) ) 
    
# Disabled experiment: the same reviews, zero-padded before prediction.
# print("\n ==== YES PADDING ==== \n")
# for s in sample_reviews:
#     print("{} ===> {}".format(s, predict_sentiment(s, pad=True) ) ) 

# 6. Stack multiple LSTM layers

In [None]:
# Number of stacked LSTM layers for this experiment.
n_lsmz = 3

## create model
# NOTE: this rebinds the notebook-level `model`; predict_sentiment reads that
# name, so every prediction below comes from the stacked model.
model = compile_2class_lstm_model( encoder.vocab_size, n_lstmz=n_lsmz)
model.summary()

## train model 
# NOTE: `fhist` is rebound too, and train_model saves to its default path, so
# the model file written by the earlier baseline run is overwritten.
fhist = train_model(model, train_x)
print("\n{}\n".format( fhist ) )

## evaluate
test_loss, test_acc = model.evaluate( test_x )
print("\nWithout Padding: loss = {} \taccuracy={}".format(test_loss, test_acc ) )

# Disabled experiment: pad_to_size works on a single encoded sequence, not on
# the batched dataset in test_x, so this call would not work as written.
# test_loss, test_acc = model.evaluate( pad_to_size(test_x , 64) ) 
# print("\nWith Padding: loss = {} \taccuracy={}".format(test_loss, test_acc ) )

## predict
print("\n ==== NO PADDING ==== \n")
for s in sample_reviews:
    print("{} ===> {}".format(s, predict_sentiment(s) ) ) 
    
# Disabled experiment: the same reviews, zero-padded before prediction.
# print("\n ==== YES PADDING ==== \n")
# for s in sample_reviews:
#     print("{} ===> {}".format(s, predict_sentiment(s, pad=True) ) ) 

In [None]:
##### Graphs 
plot_graphs(fhist, 'accuracy')
plot_graphs(fhist, 'loss')