# Neural Network Text Classification

In [5]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [6]:
# Handle to the Keras IMDB dataset module; load_data() and get_word_index() are called on it below
data = keras.datasets.imdb

In [7]:
# Load the train/test splits as integer-encoded reviews with their labels.
# num_words keeps only that many of the most frequent words.
# A previous version of this cell used num_words=88000.

(train_data, train_labels), (test_data, test_labels) = data.load_data(num_words=200000)


In [8]:
# Each review is stored as a list of integer word IDs rather than as text

print(test_data[0])

[1, 591, 202, 14, 31, 6, 717, 10, 10, 18142, 10698, 5, 4, 360, 7, 4, 177, 5760, 394, 354, 4, 123, 9, 1035, 1035, 1035, 10, 10, 13, 92, 124, 89, 488, 7944, 100, 28, 1668, 14, 31, 23, 27, 7479, 29, 220, 468, 8, 124, 14, 286, 170, 8, 157, 46, 5, 27, 239, 16, 179, 15387, 38, 32, 25, 7944, 451, 202, 14, 6, 717]


# Need to create mapping for words

In [9]:
# Dictionary mapping each word to its integer index in the dataset vocabulary
word_index = data.get_word_index()

In [10]:
# Build the word -> ID lookup that matches the encoded reviews.
# Every word index is shifted by 3 so that IDs 0-3 stay free for the special tokens below
# (load_data() applies the same offset to the encoded reviews through its default index_from=3).
# The mapping is rebuilt from data.get_word_index() instead of from the existing word_index,
# so re-running this cell does not shift the indexes a second time.

word_index = {k:(v+3) for k, v in data.get_word_index().items()}
word_index["<PAD>"] = 0 # Padding tag to make the words in each movie review the same length
word_index["<START>"] = 1 # Marks the beginning of a review
word_index["<UNK>"] = 2 # Stands in for words that are not in the vocabulary
word_index["<UNUSED>"] = 3

In [11]:
# Invert the lookup so that integer IDs map back to their words

reverse_word_index = {token_id: token for token, token_id in word_index.items()}

In [12]:
# A function to decode the training and testing data into human readable words

def decode_review(text):
    """Turn a sequence of integer word IDs back into a space-separated string.

    IDs that have no entry in ``reverse_word_index`` are rendered as "?".
    """
    words = (reverse_word_index.get(token_id, "?") for token_id in text)
    return " ".join(words)

print(decode_review(test_data[0]))

# The encoded review begins with ID 1, so the decoded text starts with <START>

<START> please give this one a miss br br kristy swanson and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite lacklustre so all you madison fans give this a miss


In [219]:
# Not all the reviews have the same length!
# That makes it difficult to define the input size of the network,
# so the reviews need to be brought to one common length first

print(len(test_data[0]), len(test_data[1]))

68 260


In [13]:
# Use the <PAD> ID to give every review the same fixed length
# maxlen=500: shorter reviews are padded at the end (padding="post"), longer ones are cut down to 500 IDs
# NOTE(review): pad_sequences truncates from the front by default (truncating="pre"), so long
# reviews lose their opening words, including the <START> ID - confirm this is intended
# The padded arrays replace the original train_data / test_data

train_data = keras.preprocessing.sequence.pad_sequences(train_data, value=word_index["<PAD>"], padding="post", maxlen=500)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, value=word_index["<PAD>"], padding="post", maxlen=500)

In [221]:
# Inspect the first training review after padding
print(train_data[0])

[    1    14    22    16    43   530   973  1622  1385    65   458  4468
    66  3941     4   173    36   256     5    25   100    43   838   112
    50   670 22665     9    35   480   284     5   150     4   172   112
   167 21631   336   385    39     4   172  4536  1111    17   546    38
    13   447     4   192    50    16     6   147  2025    19    14    22
     4  1920  4613   469     4    22    71    87    12    16    43   530
    38    76    15    13  1247     4    22    17   515    17    12    16
   626    18 19193     5    62   386    12     8   316     8   106     5
     4  2223  5244    16   480    66  3785    33     4   130    12    16
    38   619     5    25   124    51    36   135    48    25  1415    33
     6    22    12   215    28    77    52     5    14   407    16    82
 10311     8     4   107   117  5952    15   256     4 31050     7  3766
     5   723    36    71    43   530   476    26   400   317    46     7
     4 12118  1029    13   104    88     4   381   

In [222]:
# After padding, both test reviews have the same length
print(len(test_data[0]), len(test_data[1]))

500 500


In [223]:
# Decoding a padded review shows the <PAD> tokens appended after the text
print(decode_review(test_data[0]))

<START> please give this one a miss br br kristy swanson and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite lacklustre so all you madison fans give this a miss <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <

# Modeling

In [31]:
# Architecture of the network, built layer by layer
# Embedding - learns a 16-dimensional vector for each of the 200,000 possible word IDs
#   (the same limit as num_words in load_data); the aim is that similar words get similar vectors
# GlobalAveragePooling1D - averages the word vectors of a review, so every review is reduced
#   to a single 16-value vector (see the (None, 16) output shape in the summary)
# Dense(16, relu) - hidden layer working on that averaged vector
# Dense(1, sigmoid) - one output value between 0 and 1

model = keras.Sequential()  
model.add(keras.layers.Embedding(200000, 16)) # 200,000 word vectors in 16 dimensions
model.add(keras.layers.GlobalAveragePooling1D()) # Average out data from word vectors
model.add(keras.layers.Dense(16, activation="relu")) 
model.add(keras.layers.Dense(1, activation="sigmoid")) # Final output is a probability that the review is good, which can then be mapped to a binary good/bad label

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 16)          3200000   
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 3,200,289
Trainable params: 3,200,289
Non-trainable params: 0
_________________________________________________________________


# Compiling

In [15]:
# optimizer="adam" - TODO: read up on how the Adam optimiser works
# loss="binary_crossentropy" - measures how far the predicted probability (e.g. 0.2) is from the expected binary label (e.g. 0)
# This is a binary classification problem
# mean_squared_error could also be used, but binary_crossentropy is better at dealing with probabilities
# metrics=["accuracy"] - report accuracy next to the loss


model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [16]:
# Peek at the first ten padded training reviews
train_data[:10]

array([[   1,   14,   22, ...,    0,    0,    0],
       [   1,  194, 1153, ...,    0,    0,    0],
       [   1,   14,   47, ...,    0,    0,    0],
       ...,
       [   4, 3231,  152, ...,   72,   33,   32],
       [   1,   43,  188, ...,    0,    0,    0],
       [   1,   14,   20, ...,    0,    0,    0]])

In [17]:
# Hold out part of the training set as validation data
# The first 10000 reviews (and their labels) become the validation set, the remainder is used for fitting
# Validation data is not used to fit the model; it shows how well the model performs on reviews
# it has not been trained on while the network is tuned and tweaked, rather than how well it has memorised them
# np.asarray makes sure that NumPy arrays are handed to fit()

x_val, x_train = np.asarray(train_data[:10000]), np.asarray(train_data[10000:]) # up to index 10000 / after index 10000
y_val, y_train = np.asarray(train_labels[:10000]), np.asarray(train_labels[10000:]) # labels split at the same index

In [18]:
# batch_size - how many movie reviews are loaded per training step (30 steps per epoch in the log below)
# Working in batches saves memory compared with processing all reviews in one go
# validation_data - the held-out reviews behind the val_loss / val_accuracy values in the log
# The training history is kept in fitModel

fitModel = model.fit(x_train, y_train, epochs=20, batch_size=512, validation_data=(x_val,y_val), verbose=2)

Epoch 1/20
30/30 - 2s - loss: 0.6927 - accuracy: 0.5519 - val_loss: 0.6919 - val_accuracy: 0.5604
Epoch 2/20
30/30 - 1s - loss: 0.6902 - accuracy: 0.5876 - val_loss: 0.6883 - val_accuracy: 0.6275
Epoch 3/20
30/30 - 1s - loss: 0.6846 - accuracy: 0.6797 - val_loss: 0.6810 - val_accuracy: 0.6947
Epoch 4/20
30/30 - 1s - loss: 0.6734 - accuracy: 0.7432 - val_loss: 0.6675 - val_accuracy: 0.7401
Epoch 5/20
30/30 - 1s - loss: 0.6547 - accuracy: 0.7604 - val_loss: 0.6464 - val_accuracy: 0.7623
Epoch 6/20
30/30 - 1s - loss: 0.6270 - accuracy: 0.7849 - val_loss: 0.6178 - val_accuracy: 0.7819
Epoch 7/20
30/30 - 1s - loss: 0.5914 - accuracy: 0.8095 - val_loss: 0.5841 - val_accuracy: 0.7989
Epoch 8/20
30/30 - 1s - loss: 0.5507 - accuracy: 0.8297 - val_loss: 0.5477 - val_accuracy: 0.8102
Epoch 9/20
30/30 - 1s - loss: 0.5082 - accuracy: 0.8455 - val_loss: 0.5120 - val_accuracy: 0.8240
Epoch 10/20
30/30 - 1s - loss: 0.4665 - accuracy: 0.8597 - val_loss: 0.4776 - val_accuracy: 0.8349
Epoch 11/20
30/30 -

In [19]:
# Score the trained model on the test reviews, which were not used for fitting or validation
results = model.evaluate(test_data, test_labels)

# Some overfitting going on here
# NOTE(review): to confirm this, compare the final-epoch training accuracy with the test accuracy



In [20]:
# results holds the test loss followed by the test accuracy (the loss and metric given to compile)
print(results)

[0.3304603397846222, 0.8715599775314331]


In [21]:
# Predict the sentiment of a single test review and compare it with its label
i = 430

test_review = test_data[i]
# predict() works on batches, so the review is given an explicit batch axis: (500,) -> (1, 500).
# Wrapping the 1-D array in a Python list does not produce that (1, 500) batch.
predict = model.predict(np.expand_dims(test_review, axis=0))
print("Review: ")
print(decode_review(test_review))
# The batch holds exactly one review, so its score sits at position 0, not at dataset index i
print("Prediction: " + str(predict[0]))
print("Actual: " + str(test_labels[i]))

Review: 
<START> all in all big bad john was a hilarious and touching movie if you want romance tragedy and humor this movie's got it if you're a fan of the song like i am you pretty much know how it ends but if you don't or do and want to watch it anyway i strongly recommend this movie jack elam and jimmy dean are a hilarious pair with great chemistry however i wouldn't recommend this movie to strict urban folk you have to understand where these people are supposed to be coming from and only a handful of us are left but even a few urban folk might understand it and appreciate it for what it is a good down home movie that'll make you laugh cry and be inspired <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <P

In [27]:
# Predict the sentiment class (0 or 1) of a single test review and compare it with its label
i = 0

test_review = test_data[i]
# predict() works on batches, so the review is given an explicit batch axis: (500,) -> (1, 500)
predict = model.predict(np.expand_dims(test_review, axis=0))
print("Review: ")
print(decode_review(test_review))
# The model has a single sigmoid output, so the class is obtained by thresholding at 0.5;
# np.argmax over a one-element output would always return 0
print("Prediction: " + str(int(predict[0][0] > 0.5)))
print("Actual: " + str(test_labels[i]))

Review: 
<START> please give this one a miss br br kristy swanson and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite lacklustre so all you madison fans give this a miss <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD

# Saving the model so that it does not have to be retrained every time

In [23]:
# The .h5 file extension makes Keras save the model as an HDF5 file
# The model is stored as binary data, so it can be loaded quickly later to make predictions without retraining

model.save("model.h5")

# Loading the model

In [24]:
# Replace the in-memory model with the one stored in model.h5
# Models saved with different tweaks (under different file names) can be loaded and compared easily here
# E.g. changing the number of neurons in the dense layer, changing the activation function

model = keras.models.load_model("model.h5")

# Testing on data other than the datasets provided by Keras

In [30]:
# Need to convert the test.txt into encoded list of numbers
# Need to set the limit of the max number of words

def review_encode(s, index=None):
    """Encode a review as a list of integer word IDs.

    Args:
        s: Iterable of word tokens (already stripped of punctuation). A plain
            string is split on whitespace first instead of being walked
            character by character.
        index: Optional mapping from lower-case word to ID. Defaults to the
            notebook-level ``word_index``.

    Returns:
        A list that starts with the <START> ID (1), followed by one ID per
        non-empty token. Tokens missing from the mapping become <UNK> (2).
    """
    if index is None:
        index = word_index
    if isinstance(s, str):
        s = s.split()

    encoded = [1] # Add a starting tag like the existing data already has

    for word in s:
        if not word:
            # Empty tokens come from splitting on repeated spaces; they are not words
            continue
        # Look the word up in lower case; unknown words get the <UNK> ID
        encoded.append(index.get(word.lower(), 2))

    return encoded


# Punctuation that has no mapping in the word index is removed before encoding.
# Apostrophes inside words are kept on purpose: the decoded reviews above show that the
# vocabulary stores contractions such as "don't" and "wasn't" with the apostrophe, so
# removing it would turn those words into <UNK>.
strip_punctuation = str.maketrans("", "", ",!.():\"")

with open("test.txt", encoding="utf-8") as f:
    for line in f: # Iterate over the file directly instead of loading every line with readlines()
        # split() without an argument collapses repeated whitespace, so no empty tokens are produced
        tokens = [token.strip("'") for token in line.translate(strip_punctuation).split()] # strip("'") removes quote marks around a word
        nline = [token for token in tokens if token]
        if not nline:
            continue # Skip blank lines instead of scoring an empty review
        encode = review_encode(nline)
        # Same padding settings as the training data so the input matches what the model was fitted on
        encode = keras.preprocessing.sequence.pad_sequences([encode], value=word_index["<PAD>"], padding="post", maxlen=500)
        predict = model.predict(encode)
        print(line)
        print(encode)
        print(predict[0])
        
        

shell is a very guai i love her she is the best very cool good great wow
[[   1 6186    9    6   55    2   13  119   41   59    9    4  118   55
   646   52   87 1318    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0  