In [9]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

# Load the IMDB review dataset, keeping only the 88000 most frequent word ids.
data = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = data.load_data(num_words=88000)
print(train_data[0])

# get_word_index() returns a dict mapping each word to an integer id.
# Every id is shifted up by 3 so that ids 0-3 stay free for the four
# special tokens registered right after.
word_index = {word: idx + 3 for word, idx in data.get_word_index().items()}
word_index.update({
    "<PAD>": 0,     # filler appended so that every review has the same length
    "<START>": 1,   # marks the beginning of a review
    "<UNK>": 2,     # stands in for a word that has no id
    "<UNUSED>": 3,
})

# Inverse mapping (id -> word) used to turn encoded reviews back into text.
reverse_word_index = {idx: word for word, idx in word_index.items()}

# Preprocess both splits so the model receives fixed-length input: each review
# is brought to 260 ids, with <PAD> (0) appended at the end of short reviews.
train_data, test_data = (
    keras.preprocessing.sequence.pad_sequences(
        sequences, value=word_index["<PAD>"], padding="post", maxlen=260
    )
    for sequences in (train_data, test_data)
)

def decode_review(text):
    """Convert a sequence of word ids back into a space-separated string.

    Each id is looked up in ``reverse_word_index``; an id with no entry is
    rendered as "?".
    """
    words = (reverse_word_index.get(token_id, "?") for token_id in text)
    return " ".join(words)

# Sanity check: after padding, the first two test reviews should have the same length.
print(*(len(test_data[position]) for position in (0, 1)))


[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
260 260


In [10]:
# ========model down here=========

# A Sequential model can be built either by calling model.add() once per layer
# or, as done here, by handing the whole layer list to the constructor.
model = keras.Sequential([
    # Learns a 16-dimensional vector for each of the 88000 word ids.
    keras.layers.Embedding(88000, 16),
    # Averages the word vectors over the sequence axis, giving one
    # 16-dimensional vector per review.
    keras.layers.GlobalAveragePooling1D(),
    # Hidden layer with 16 units.
    keras.layers.Dense(16, activation="relu"),
    # Single sigmoid output unit.
    keras.layers.Dense(1, activation="sigmoid"),
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# The first 10000 training reviews are held out for validation;
# the remaining ones are used to fit the model.
X_val, X_train = train_data[:10000], train_data[10000:]
y_val, y_train = train_labels[:10000], train_labels[10000:]

# batch_size = number of reviews fed to the model per gradient step.
fitModel = model.fit(
    X_train,
    y_train,
    epochs=40,
    batch_size=512,
    validation_data=(X_val, y_val),
    verbose=1,
)

# Loss and accuracy on the held-out test split.
results = model.evaluate(test_data, test_labels)
print(results)





Train on 15000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


[0.3296835299110413, 0.87212]


In [11]:
test_review = test_data[0]
# predict() works on batches, so the single review needs an explicit leading
# batch axis: (n_tokens,) -> (1, n_tokens). Wrapping the 1-D array in a list
# does not add that axis and can be interpreted as one sample per token, in
# which case predict[0] would not be the score of the whole review.
predict = model.predict(np.expand_dims(test_review, axis=0))
print("Review: ")
print(decode_review(test_review))
print("Prediction: " + str(predict[0]))
print("Actual: " + str(test_labels[0]))
# 0 means the review is negative
# 1 means the review is positive



Review: 
<START> please give this one a miss br br kristy swanson and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite lacklustre so all you madison fans give this a miss <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD

In [12]:
def review_encode(s, vocab=None, num_words=88000):
    """Encode a tokenised review as a list of word ids.

    Args:
        s: iterable of word strings (one review, already split into words).
        vocab: mapping from lower-case word to id. Defaults to the
            notebook-level ``word_index``.
        num_words: vocabulary size the data was loaded with and the embedding
            layer was built for. Ids at or above this value have no embedding
            row, so they are encoded as <UNK> instead.

    Returns:
        A list of ints starting with 1 (<START>). Words missing from the
        vocabulary, or whose id is >= ``num_words``, are encoded as 2 (<UNK>).
    """
    if vocab is None:
        vocab = word_index
    encoded = [1]
    for word in s:
        token_id = vocab.get(word.lower(), 2)
        # Clamp out-of-range ids: the vocabulary lists more words than the
        # model's embedding table can index.
        encoded.append(token_id if token_id < num_words else 2)
    return encoded



In [15]:
# Score every non-blank line of test.txt as one review.
with open("test.txt", encoding="utf-8") as f:
    # Punctuation is turned into spaces rather than deleted, so text such as
    # `art."The` still splits into two words, and every double quote is
    # handled, not only the ones that have a space on both sides.
    punctuation_to_space = str.maketrans({ch: " " for ch in ',.():"'})
    for line in f:
        # split() without an argument also collapses repeated whitespace, so
        # no empty-string tokens reach the encoder.
        nline = line.translate(punctuation_to_space).split()
        if not nline:
            continue  # blank line: nothing to score
        encode = review_encode(nline)
        # Pad to 260 ids, the same length used for train_data and test_data.
        encode = keras.preprocessing.sequence.pad_sequences(
            [encode], value=word_index["<PAD>"], padding="post", maxlen=260
        )
        predicted = model.predict(encode)
        print(line)
        print(encode)
        print(predicted[0])

Of all the animation classics from the Walt Disney Company, there is perhaps none that is more celebrated than "The Lion King." Its acclaim is understandable: this is quite simply a glorious work of art."The Lion King" gets off to a fantastic start. The film's opening number, "The Circle of Life," is outstanding. The song lasts for about four minutes, but from the first sound, the audience is floored. Not even National Geographic can capture something this beautiful and dramatic. Not only is this easily the greatest moment in film animation, this is one of the greatest sequences in film history. The story that follows is not as majestic, but the film has to tell a story. Actually, the rest of the film holds up quite well. The story takes place in Africa, where the lions rule. Their king, Mufasa (James Earl Jones) has just been blessed with a son, Simba (Jonathan Taylor Thomas), who goes in front of his uncle Scar (Jeremy Irons) as next in line for the throne. Scar is furious, and sets 