# Text Classification Using Neural Network

<font color='steelblue'>

<font size = 5>
    Use the movie review dataset for sentiment analysis<br><br>
</font>
</font>

<font color = 'grey'>
<font size = 4>
    
**The following steps are covered in this notebook:**

- `Download` data set from Google Cloud Storage
- `Load` dataset from csv file
- `Preprocess` data
- `Create` training and test dataset
- `Tokenize` the sequences
- `Explore` the tokenization
- `Create` a neural network
- `Train` model and make predictions
- `Explore` model performance

</font>
</font>

In [None]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf
import pandas as pd
import numpy as np
import timeit, time

from tensorflow.keras import layers
#from tensorflow.keras import losses
from tensorflow.keras import preprocessing

## Understanding Keras Embedding Layer<br>

<font size = 3>
Let us take these 2 sentences:
    
1. `Hello how are you doing`
2. `Hello how are you feeling`
    
- We have 6 unique words in our vocabulary
- Say we want to learn 2 weights for each of these words
- Say we assign integers to the words we have, Hello = 0, how = 1, are = 2 ....
</font>

In [None]:
# Seed TensorFlow's random generator so the initial embedding weights shown
# below are the same on every run
tf.random.set_seed(2345)
# A model made of a single Embedding layer: 6 word ids in the vocabulary,
# 2 weights per word, sequences of 6 ids
emModel = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim = 6, output_dim = 2, input_length = 6),
])

In [None]:
data = np.expand_dims(np.array([0,1,2,3,4,5]), axis=0)

In [None]:
data

In [None]:
outdata = emModel.predict(data)

In [None]:
outdata

<font size = 3>
 
`Note:` These are the initial (untrained) weights of the embedding vector for each word
    
</font>

## Movie Review Classification

## Download the dataset from Google Cloud Storage

In [None]:
# Remember the notebook's folder, then step up one level: the datasets
# folder is a sibling of the folder that holds this code
currLoc = os.getcwd()
print(f'current folder: {currLoc}')
os.chdir(os.pardir)
cwd = os.getcwd()
print(f'current working directory: {cwd}')
# Download target: <parent folder>/datasets
path = f'{cwd}/datasets'
print(f'path: {path}')

In [None]:
# define dataset to download

dnfile = "https://storage.googleapis.com/courses-datasets/AI-ML-Toolkit/IMDBDataset.csv"
print(f'Cloud file location: {dnfile}')

In [None]:
# download the dataset from Google Cloud

!wget {dnfile} -NP {path}

In [None]:
# change working directory to where code is

os.chdir(currLoc)
cwd = os.getcwd()
print(f'current working directory: {cwd}')

## Load Data

In [None]:
# 50,000 movie reviews labeled as "positive" or "negative"
df = pd.read_csv("../datasets/IMDBDataset.csv")

In [None]:
df.head(1)

In [None]:
# Map the sentiment (target variable) to numbers
df["sentiment"] = df["sentiment"].map({"positive": 1, "negative":0})

In [None]:
pd.set_option('display.max_colwidth', None)
df.head(1)

In [None]:
df.shape

## Preprocess function
<span style="font-family:verdana; font-size:1.2em;">
    Preprocess the input review to do the following:
    <ol>
        <li>Convert to lower case</li>
        <li>Remove html tags if found</li>
        <li>Remove punctuation</li>
    </ol>
</span>    

In [None]:
import re, string

# Matches an HTML tag such as <br />, <b>, </i> or <a href="...">.
# The tag name must follow "<" (or "</") directly, so plain comparisons
# like "3 < 5" are not mistaken for markup. Input is lower-cased before
# this pattern is applied, hence [a-z] only.
_HTML_TAG_RE = re.compile(r'</?[a-z][^<>]*>')
# Matches every character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


def preprocessData(input_data):
    """Normalise one raw review string before tokenization.

    Steps, in order:
      1. convert to lower case
      2. replace every HTML tag (not only '<br />') with a single space
      3. delete all remaining characters that are not word characters
         or whitespace (i.e. punctuation)

    Args:
        input_data: the review text as a ``str``.

    Returns:
        The cleaned review text as a ``str``.
    """
    lowercase = input_data.lower()
    # Replace tags with a space (not '') so the words on either side of a
    # tag do not get glued together
    stripped_html = _HTML_TAG_RE.sub(' ', lowercase)
    return _PUNCTUATION_RE.sub('', stripped_html)

In [None]:
# preprocess the review column
df['review'] = df['review'].map(preprocessData)

In [None]:
pd.set_option('display.max_colwidth', None)
df.head()

## Create training and test datasets

In [None]:
df.sentiment.value_counts()

In [None]:
from sklearn.model_selection import train_test_split
X = df["review"].values
y = df["sentiment"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.50, 
                                                    random_state = 2345)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_train.shape

In [None]:
X_train[0]

In [None]:
y_train[0]

In [None]:
len(y_train[y_train == 0]), len(y_train[y_train == 1])

In [None]:
len(y_test[y_test == 0]), len(y_test[y_test == 1])

## Tokenize

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# define hyper parameters that can be modified
vocab_size = 20000
embedding_dim = 32
# If a sentence is shorter than max_length it will be padded, 
# longer sentences will be truncated
max_length = 768

In [None]:
# Use Out of Vocabulary token rather than throwing away unknown words
tokenizer = Tokenizer(num_words=vocab_size, oov_token = "<oov>")
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [None]:
# convert words to numbers and pad for the neural network to use as input
# if the sequence length is greater than max length then truncate it at the end
# if the sequence length is less than max length then pad it at the end
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, maxlen=max_length, 
                             padding = "post", truncating="post")


# tokenized using the word_index learned from the training data
testing_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(testing_sequences, maxlen=max_length, 
                            padding = "post", truncating="post")

In [None]:
train_padded[1]

## Explore Tokenization

In [None]:
# Reverses keys: keys become the values, and values become the keys so that 
# we can look a word up (display padded as ?)
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
    """Turn a sequence of token ids back into a space-separated string.

    Any id that has no entry in ``reverse_word_index`` is rendered as '?'.
    """
    decoded_words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(decoded_words)

# This is what will be fed in
print(decode_review(train_padded[1]))

In [None]:
# This is the original text
print(X_train[1])

## Create Model
<span style="font-family:verdana; font-size:1.2em;">
    Model Parameters:
    <ol>
        <li><b>Embedding: </b>
            <ul>
                <li>Embedding layer stores one vector per word</li>
                <li>Converts sequences of word indices to sequences of vectors</li>
                <li>These vectors are trainable</li>
                <li>Once trained, words with similar meanings often have similar vectors</li>
                <li>This approach is more efficient than using a dense layer with one hot encoding</li>
            </ul>
        </li>
        <li><b>GlobalAveragePooling1D: </b>
            <ul>
                <li>Returns a fixed length output vector for each example by averaging over the sequence dimension</li>
                <li>Allows the model to handle input of variable length</li>
            </ul>
        </li>
        <li><b>Couple of Dense layers: </b>
            <ul>
                <li>Apply the dense layer with ReLU activation</li>
            </ul>
        </li>
    </ol>
    The final output layer uses a sigmoid activation to produce the probability that the review's sentiment is positive
</span>

In [None]:
# Create model
# Re-seed before building the layers (presumably so that weight
# initialisation is repeatable from run to run)
tf.random.set_seed(2345)
model = tf.keras.Sequential([
    # The Embedding layer is the key to text sentiment analysis:
    # it holds embedding_dim trainable values for each of the vocab_size token ids
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # NOTE(review): this Dense layer is placed BEFORE the pooling layer, so it
    # receives the un-pooled embedding sequence, whereas the markdown
    # description of the model lists pooling first -- confirm the intended order
    tf.keras.layers.Dense(128, activation = 'relu', name = 'FirstHidden'),
    # Average over the sequence dimension -> one fixed-length vector per review
    tf.keras.layers.GlobalAveragePooling1D(name = 'AvgPooling1'),
    # One sigmoid unit; pairs with the binary 0/1 sentiment labels
    tf.keras.layers.Dense(1, activation = 'sigmoid', name = 'Output')
])

In [None]:
# Compile model
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

In [None]:
%%time
tf.random.set_seed(2345)
history = model.fit(train_padded, y_train, epochs=3, validation_split = 0.3)

In [None]:
# Take the metric names from the recorded training history instead of
# model.metrics_names: newer Keras releases are reported to return
# ['loss', 'compile_metrics'] there, which does not match the keys of
# history.history that the plotting cell looks up.
# Keys starting with 'val_' are the validation counterparts and are skipped,
# because the plotting code adds that prefix itself.
metrics_names = [name for name in history.history if not name.startswith('val_')]
metrics_names

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, string1, string2):
    """Plot two training metrics, each together with its validation curve.

    Draws one figure with two stacked panels: ``string1`` on top and
    ``string2`` below. Every panel shows the metric and its
    ``'val_' + metric`` counterpart read from ``history.history``.

    Args:
        history: object whose ``history`` attribute maps metric names to
            their per-epoch values (as returned by ``model.fit``).
        string1: metric name for the top panel.
        string2: metric name for the bottom panel.
    """
    # 2 rows 1 column, independent axes
    _, (top_ax, bottom_ax) = plt.subplots(2, 1, sharex=False, sharey=False,
                                          figsize=(8,6))

    for axis, metric in ((top_ax, string1), (bottom_ax, string2)):
        val_metric = 'val_' + metric
        axis.plot(history.history[metric])
        axis.plot(history.history[val_metric])
        axis.set_ylabel(metric)
        axis.legend([metric, val_metric])

    # only the lower panel carries the x-axis label
    bottom_ax.set_xlabel('Epochs')

In [None]:
plot_graphs(history, metrics_names[0], metrics_names[1])

## Explore Embeddings

In [None]:
# Grab the Embedding layer itself (the first layer of the model)
embeddings = model.layers[0]

# First weight array of that layer: the embedding matrix,
# with one row per token id and one column per embedding dimension
weights = embeddings.get_weights()[0]
print(f"Vocabulary size: {weights.shape[0]},  Embedding dimensions: {weights.shape[1]}")

In [None]:
# the shape is:
# (vocab_size, embedding_dim), the two sizes the Embedding layer was created with.
# The first number is the capped vocabulary size, not the count of distinct words in the corpus
weights.shape

In [None]:
weights

## Evaluate the model

In [None]:
test_loss, test_acc = model.evaluate(test_padded, y_test)

print("Test accuracy: ", test_acc)
print("Test loss: ", test_loss)

In [None]:
# predict on a positive sample

sample_text_to_predict = \
["The movie was great. The animation and the graphics was excellent. I would recommend this movie."]

# Clean the text with the same preprocessData function that was applied to
# the training reviews, so the tokenizer sees the kind of input it was fitted on
cleaned_text = [preprocessData(text) for text in sample_text_to_predict]

# Store the result under its own name: reusing `train_sequences` here would
# overwrite the tokenized training set created earlier in the notebook
pos_sequences = tokenizer.texts_to_sequences(cleaned_text)
pos_padded = pad_sequences(pos_sequences, maxlen=max_length, padding = "post",
                           truncating="post")

#  make prediction (sigmoid output; label 1 was mapped to "positive")
prediction = model.predict(pos_padded)

print(prediction)

In [None]:
# predict on a negative sample

sample_text_to_predict = \
["The movie was horrible. The animation and the graphics were worst. I would not recommend this movie."]

# Clean the text with the same preprocessData function that was applied to
# the training reviews, so the tokenizer sees the kind of input it was fitted on
cleaned_text = [preprocessData(text) for text in sample_text_to_predict]

# Store the result under its own name: reusing `train_sequences` here would
# overwrite the tokenized training set created earlier in the notebook
neg_sequences = tokenizer.texts_to_sequences(cleaned_text)
neg_padded = pad_sequences(neg_sequences, maxlen=max_length, padding = "post",
                           truncating="post")

#  make prediction (sigmoid output; label 0 was mapped to "negative")
prediction = model.predict(neg_padded)

print(prediction)

In [None]:
import io

# writing the vectors and their metadata out to file.
# these 2 files ('vecs.tsv', 'meta.tsv') are used by the
# TensorFlow projector (http://projector.tensorflow.org/)
# to plot/visualize the vectors/embeddings in 3D

# One line per word in each file, in the same order:
#      out_v gets the weights (embedding) of the word, tab-separated,
#            weights.shape[1] values per line
#      out_m gets the actual word associated with that embedding

# `with` closes both files even if a lookup or a write fails part-way
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    # start at 1: id 0 is not looked up (presumably reserved for padding)
    for word_num in range(1, vocab_size):
        word = reverse_word_index.get(word_num)
        if word is None:
            # no word is registered under this id (vocabulary smaller than
            # vocab_size) -- skip it instead of raising KeyError
            continue
        # separate name on purpose: `embeddings` already refers to the
        # Embedding layer object and must not be rebound to a single vector
        word_vector = weights[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in word_vector]) + "\n")