## Importing packages

In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
import os
import glob

In [2]:
# We need to import several things from Keras.
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

## Setting up the data

Here we use the IMDB movie-review dataset, in which every review is labelled as either positive or negative.

You can download the dataset from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [3]:
def load_data(train  = True):
    """Load the IMDB review texts and their sentiment labels from disk.

    Reads every ``*.txt`` file below ``<data_dir>/aclImdb/<split>/pos`` and
    ``<data_dir>/aclImdb/<split>/neg``. ``data_dir`` is a module-level
    variable that must be defined before this function is called.

    Args:
        train: If true, read the "train" split, otherwise the "test" split.

    Returns:
        A tuple ``(x, y)``. ``x`` is a list of review texts with all positive
        reviews first, followed by all negative ones. ``y`` is a list of
        floats of the same length: 1.0 marks a positive review and 0.0 a
        negative one.

    Warns:
        UserWarning: If no review files are found at all, which usually
            means ``data_dir`` does not point at the extracted dataset.
    """
    import warnings

    # Part of the path-name for either train or test-set
    train_test_path = "train" if train else "test"
    # Base directory where the extracted data is located
    dir_base = os.path.join(data_dir, "aclImdb", train_test_path)

    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # makes positions such as x[2] refer to the same review on every run.
    path_pos = sorted(glob.glob(os.path.join(dir_base, "pos", "*.txt")))
    path_neg = sorted(glob.glob(os.path.join(dir_base, "neg", "*.txt")))

    if not path_pos and not path_neg:
        # Keep returning empty lists (as before), but make the problem visible
        # instead of letting the notebook continue silently with no data.
        warnings.warn(
            "No review files found under {!r}; check that data_dir points to "
            "the folder containing the extracted 'aclImdb' directory.".format(dir_base),
            stacklevel=2,
        )

    # Read all the text files
    data_pos = [_read_text_file(path) for path in path_pos]
    data_neg = [_read_text_file(path) for path in path_neg]

    # Positive reviews first, then negative ones, with matching labels
    x = data_pos + data_neg
    y = [1.0] * len(data_pos) + [0.0] * len(data_neg)

    return x, y

def _read_text_file(path):
    # Read and return alll the content of the text file with the given path

    with open(path, 'rt', encoding = 'utf-8') as file:
        # Read a list of string
        lines = file.readlines()

        # Concatenate to a single string.
        text = " ".join(lines)

    return text

In [4]:
# Directory that contains the extracted "aclImdb" folder; load_data() reads
# <data_dir>/aclImdb/train and <data_dir>/aclImdb/test.
# Replace the placeholder below with your own path before running the notebook.
data_dir = "**Add your location to dataset**"

In [5]:
x_train_text, y_train = load_data(train = True)

In [6]:
x_test_text, y_test = load_data(train = False)

In [7]:
print("Train-set size: ", len(x_train_text))
print("Test-set size: ", len(x_test_text))

Train-set size:  0
Test-set size:  0


In [23]:
data_text = x_train_text + x_test_text
print(len(data_text))

50000


<b>Sample of training data</b>

In [24]:
x_train_text[2]

'Brilliant over-acting by Lesley Ann Warren. Best dramatic hobo lady I have ever seen, and love scenes in clothes warehouse are second to none. The corn on face is a classic, as good as anything in Blazing Saddles. The take on lawyers is also superb. After being accused of being a turncoat, selling out his boss, and being dishonest the lawyer of Pepto Bolt shrugs indifferently "I\'m a lawyer" he says. Three funny words. Jeffrey Tambor, a favorite from the later Larry Sanders show, is fantastic here too as a mad millionaire who wants to crush the ghetto. His character is more malevolent than usual. The hospital scene, and the scene where the homeless invade a demolition site, are all-time classics. Look for the legs scene and the two big diggers fighting (one bleeds). This movie gets better each time I see it (which is quite often).'

<b>its label</b>

In [25]:
y_train[2]

1.0

## Tokenizer
Tokenization is a technique for converting words to numeric values. Here we'll consider 10000 words to be converted to numeric values. This is done because all the mathematical operations inside a model require numeric inputs, so we tokenize our text data.

In [26]:
# Vocabulary size: passed to the Tokenizer here and reused further down as the
# Embedding layer's input_dim.
num_words = 10000
tokenizer = Tokenizer(num_words = num_words)

In [27]:
%%time
tokenizer.fit_on_texts(data_text)

Wall time: 10.8 s


<b>Here we can see that each word is assigned a numeric value</b>

In [128]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'one': 27,
 'he': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'so': 34,
 'who': 35,
 'from': 36,
 'like': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'out': 41,
 'about': 42,
 'if': 43,
 "it's": 44,
 'has': 45,
 'there': 46,
 'some': 47,
 'what': 48,
 'good': 49,
 'when': 50,
 'more': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'my': 56,
 'even': 57,
 'would': 58,
 'she': 59,
 'which': 60,
 'only': 61,
 'really': 62,
 'see': 63,
 'story': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'well': 69,
 'were': 70,
 'than': 71,
 'much': 72,
 'we': 73,
 'bad': 74,
 'been': 75,
 'get': 76,
 'do': 77,
 'great': 78,
 'other': 79,
 'will': 80,
 'also': 81,
 'into': 82,
 'p

In [129]:
len(tokenizer.word_index)

124252

In [130]:
x_train_tokens = tokenizer.texts_to_sequences(x_train_text)

In [131]:
x_train_text[1]

'Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they\'ll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it\'s like to be homeless? That is Goddard Bolt\'s lesson.<br /><br />Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the streets for thirty days withou

In [132]:
np.array(x_train_tokens[1])

array([  38,   14,  744, 3506,   45,   75,   32, 1771,   15,  153,   18,
        110,    3, 1344,    5,  343,  143,   20,    1,  920,   12,   70,
        281, 1228,  395,   35,  115,  267,   36,  166,    5,  368,  158,
         38, 2058,   15,    1,  504,   88,   83,  101,    4,    1, 4339,
         14,   39,    3,  432, 1148,  136, 8697,   42,  177,  138,   14,
       2791,    1,  295,   20, 5276,  351,    5, 3029, 2310,    1,   38,
       8697,   43, 3611,   26,  365,    5,  127,   53,   20,    1, 2032,
          7,    7,   18,   48,   43,   22,   70,  358,    3, 2343,    5,
        420,   20,    1, 2032,   15,    3, 3346,  208,    1,   22,  281,
         66,   36,    3,  344,    1,  728,  730,    3, 3864, 1320,   20,
          1, 1543,    3, 1293,    2,  267,   22,  281, 2734,    5,   63,
         48,   44,   37,    5,   26, 4339,   12,    6, 2079,    7,    7,
       3425, 2891,   35, 4446,   35,  405,   14,  297,    3,  986,  128,
         35,   45,  267,    8,    1,  181,  366, 69

In [133]:
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)

## Padding and truncating the data

The problem now is that the reviews have different lengths, so we need to make sure that all the sequences we feed to the model have the same length.

In [134]:
# Token count of every review (train and test together) as a NumPy array
num_tokens = np.array([len(seq) for seq in x_train_tokens + x_test_tokens])

In [135]:
# Average number of words in a sequence is 
np.mean(num_tokens)

221.27716

In [136]:
# max number of words is
np.max(num_tokens)

2209

In [137]:
# Allow at most mean + 2 standard deviations of the token counts,
# truncated to a whole number of tokens
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
print(max_tokens)

544


In [138]:
# Fraction of reviews whose token count is below the limit
# (the mean of a boolean mask is the share of True entries)
np.mean(num_tokens < max_tokens)

0.9453

Padding works as follows: if a sequence is shorter than the desired fixed length,
it is filled up with zeros, either at the start (pad = 'pre') or at the end (pad = 'post').
Why 'pre' here? In this model a text is taken to start from the initial (zero) state.
When the zeros come first, the model stays near that starting state while it reads the padding
and only begins to change its state once the actual words of the sentence arrive,
so the real text is the last thing it processes.


In [139]:
pad = 'pre'


 What if a text is longer than the required length? Then we truncate the text so that it fits
 within our limit. But there's an issue: when we truncate, there's a chance that we
 lose some important features or information, which is a 'COMPROMISE' that we have to make.

In [140]:
x_train_pad = pad_sequences(x_train_tokens, maxlen = max_tokens, padding= pad, truncating = pad)

In [141]:
x_train_pad

array([[   0,    0,    0, ...,   12,    9,  213],
       [   0,    0,    0, ...,    5,  343,  400],
       [   0,    0,    0, ...,    6,  179,  403],
       ...,
       [   0,    0,    0, ...,   17,   96,   74],
       [   0,    0,    0, ...,  260, 1219,  793],
       [   0,    0,    0, ...,   11,    6, 1377]])

In [142]:
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens , padding = pad, truncating = pad)

## Tokenizer inverse map

We also need a function that will convert our tokenized words back to original. (numbers -> words)

In [143]:
# word -> integer token, as built by the tokenizer
idx = tokenizer.word_index
# Flip it around: integer token -> word
inverse_map = {token: word for word, token in idx.items()}

In [144]:
# Helper function for converting a list of tokens back to a string of words
def tokens_to_string(tokens):
    """Turn a sequence of integer tokens back into a space-separated string.

    Each token is looked up in the module-level ``inverse_map``; tokens equal
    to 0 are skipped. A token missing from ``inverse_map`` raises ``KeyError``.
    """
    return " ".join(inverse_map[tok] for tok in tokens if tok != 0)

In [145]:
# Lets see how well it converts 
x_train_text[1]

'Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they\'ll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it\'s like to be homeless? That is Goddard Bolt\'s lesson.<br /><br />Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the streets for thirty days withou

In [146]:
tokens_to_string(x_train_tokens[1])

"or as george stated has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school work or vote for the matter most people think of the homeless as just a lost cause while worrying about things such as racism the war on iraq kids to succeed technology the or worrying if they'll be next to end up on the streets br br but what if you were given a bet to live on the streets for a month without the you once had from a home the entertainment sets a bathroom pictures on the wall a computer and everything you once treasure to see what it's like to be homeless that is lesson br br mel brooks who directs who stars as plays a rich man who has everything in the world until deciding to make a bet with a sissy rival to see if he can live in the streets for thirty days without the if succeeds he can do what he wants with a future project of making more buildings the on where is thrown on the street with a on his leg t

## Defining the Model

In [147]:
model = Sequential()

The first layer in the RNN is an Embedding layer, which converts
each integer-token into a vector of values

In [148]:
embedding_size = 8

In [149]:
model.add(Embedding(input_dim = num_words, output_dim = embedding_size, input_length = max_tokens, 
                    name = 'layer_embedding'))


Adding the first recurrent layer:
a Gated Recurrent Unit (GRU).
GRU and LSTM perform similarly on most datasets, but the structure of an LSTM is quite
complex, whereas a GRU is simpler.
Here we want an output dimensionality of 16.

In [150]:
model.add(GRU(16, return_sequences=True))

This is the second GRU, with 8 output units. This will be followed by another GRU,
so it must also return sequences

In [None]:
model.add(GRU(8, return_sequences = True))

 Now we have the third and the final GRU with 4 output units. This will be followed by a dense layer
 so it should only give the final output of the GRU and not a whole sequence of outputs

In [152]:
model.add(GRU(4))

Adding a fully connected layer which computes a value between 0.0 and 1.0 that will be
used as the classification output

In [153]:
model.add(Dense(1, activation='sigmoid'))

In [154]:
# Adam optimizer created with lr=1e-3; it is handed to model.compile below.
# NOTE(review): newer Keras releases spell this keyword `learning_rate` —
# confirm against the installed TensorFlow version.
optimizer = Adam(lr=1e-3)

In [155]:
model.compile(loss ='binary_crossentropy', optimizer =  optimizer, metrics = ['accuracy'])

In [157]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 544, 8)            80000     
_________________________________________________________________
gru_25 (GRU)                 (None, 544, 16)           1200      
_________________________________________________________________
gru_26 (GRU)                 (None, 544, 8)            600       
_________________________________________________________________
gru_27 (GRU)                 (None, 4)                 156       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 5         
Total params: 81,961
Trainable params: 81,961
Non-trainable params: 0
_________________________________________________________________


## Training the model with our processed data

In [160]:
%%time
model.fit(x_train_pad, y_train, validation_split = 0.05, epochs = 3, batch_size = 64)

Train on 23750 samples, validate on 1250 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Wall time: 10min 9s


<tensorflow.python.keras.callbacks.History at 0x1e0310d58d0>

In [161]:
%%time
result = model.evaluate(x_test_pad, y_test)

Wall time: 1min 13s


In [163]:
print("Accuracy : {0:.2%}".format(result[1]))

Accuracy : 86.78%


In [166]:
# Examples of mis-classified text

# Predict on the first 1000 test reviews and keep column 0 of the output,
# giving a flat array with one score per review
y_pred = model.predict(x = x_test_pad[0:1000])[:, 0]

In [168]:
# Threshold the model outputs at 0.5: above -> 1.0, otherwise -> 0.0
cls_pred = np.where(y_pred > 0.5, 1.0, 0.0)

In [169]:
cls_true = np.array(y_test[0:1000])

In [176]:
# Positions at which the predicted class differs from the true class;
# np.where returns a tuple, of which the first entry holds those positions
incorrect = np.where(cls_pred != cls_true)[0]

In [178]:
len(incorrect)

125

In [179]:
idx = incorrect[0]
idx

7

In [180]:
text = x_test_text[idx]
text

"I felt this film did have many good qualities. The cinematography was certainly different exposing the stage aspect of the set and story. The original characters as actors was certainly an achievement and I felt most played quite convincingly, of course they are playing themselves, but definitely unique. The cultural aspects may leave many disappointed as a familiarity with the Chinese and Oriental culture will answer a lot of questions regarding parent/child relationships and the stigma that goes with any drug use. I found the Jia Hongsheng story interesting. On a down note, the story is in Beijing and some of the fashion and music reek of early 90s even though this was made in 2001, so it's really cheesy sometimes (the Beatles crap, etc). Whatever, not a top ten or twenty but if it's on the television, check it out."

In [185]:
#what model said
y_pred[idx]

0.34652925

In [186]:
#What it actually is
cls_true[idx]

1.0

## Testing our model with our own reviews

 Now let's see how well our model performs on our own data.
You can edit the texts and try it on your own.

In [236]:
text1 = "This is a great movie! I really like it because it was so good you know!"
text2 = "Good one, really this is how you make a movie. But the sad part is the hero died in the end"
text3 = "Worst thing that i've ever seen in my life"
text4 = "You call that acting? It was pathetic"
text5 = "This movie is so bad man. Can I get my money back??"
text6 = "I really like the way they played their roles. I love the story and the climax"
text7 = " extremely  bad acting"
text8 = "I really hate this movie, it sucks"

texts = [text1,text2,text3,text4,text5,text6,text7,text8]


In [237]:
# Convert the text to integer tokens because that is what our machine demands
tokens = tokenizer.texts_to_sequences(texts)

In [238]:
# Padding out input so that it gonna be of the same length
tokens_pad = pad_sequences(tokens, maxlen=max_tokens, padding = pad, truncating= pad)

In [239]:
tokens_pad.shape

(8, 544)

In [240]:
model_out = model.predict(tokens_pad)

In [255]:
print(model_out)

[[0.9501376 ]
 [0.53903544]
 [0.17056009]
 [0.71476394]
 [0.41013265]
 [0.94253695]
 [0.3093451 ]
 [0.48402002]]


In [256]:
# Turn each model output into a readable label: above 0.5 is positive
for score in model_out:
    print("POSITIVE" if score > 0.5 else "NEGATIVE")

POSITIVE
POSITIVE
NEGATIVE
POSITIVE
NEGATIVE
POSITIVE
NEGATIVE
NEGATIVE


In [257]:
# Lets analyse the model

layer_embedding = model.get_layer('layer_embedding')


In [259]:
weights_embedding = layer_embedding.get_weights()[0]


In [262]:
weights_embedding.shape

(10000, 8)

In [267]:
# Lets us get the integer-token for the word 'good', which is just an index into the vocabulary
token_good = tokenizer.word_index['good']
token_good

49

In [268]:
# Lets see for another word
token_suck = tokenizer.word_index['suck']
token_suck

2679

In [271]:
# Now lets see the embedding vectors
weights_embedding[token_good]


array([-0.06454835, -0.02658044, -0.03438243,  0.06982238, -0.0149335 ,
        0.07477991, -0.05760748, -0.00995041], dtype=float32)

In [272]:
weights_embedding[token_suck]

array([ 0.03425425,  0.04777166,  0.04658137, -0.03020082,  0.0352446 ,
       -0.03772571,  0.09680279,  0.09218275], dtype=float32)