In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
%matplotlib inline

In [2]:
import os

# Read the IMDB review CSV from the "Data Sets" folder under the current working directory.
# os.path.join picks the right separator for the host OS instead of relying on a
# hand-written '/' glued onto os.getcwd().
imdb = pd.read_csv(os.path.join(os.getcwd(), 'Data Sets', 'IMDB.csv'))
print(imdb.shape)
imdb.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [3]:
# Per-column summary: row count, number of distinct values, most frequent value and its frequency.
# The output below reports 49,582 unique reviews for 50,000 rows, i.e. some reviews are duplicated.
imdb.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [4]:
# Class balance check: number of rows per sentiment label.
imdb['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

# Splitting the data set

In [5]:
# Positional split: the first 40,000 rows form the train set, the remaining rows the test set.
# NOTE(review): these slices are taken before the normalization cells further down reassign
# imdb['review'], so they presumably keep the raw text — confirm this is intended.

# train set
train_reviews = imdb['review'].iloc[:40000]
train_sentiments = imdb['sentiment'].iloc[:40000]

# test dataset
test_reviews = imdb['review'].iloc[40000:]
test_sentiments = imdb['sentiment'].iloc[40000:]

# sanity check: reviews and labels must have matching lengths in each split
print(train_reviews.shape, train_sentiments.shape)
print(test_reviews.shape, test_sentiments.shape)

(40000,) (40000,)
(10000,) (10000,)


# Text normalization

In [6]:
import nltk
# Fetch the NLTK 'stopwords' corpus; the cells below read it via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asgha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer, WordNetLemmatizer

# Tokenizer instance shared by the text-cleaning helpers
tokenizer = ToktokTokenizer()

# English stopword list taken from the NLTK stopwords corpus
stopword_list = stopwords.words('english')

# Removing HTML strips and noise text

In [8]:
from bs4 import BeautifulSoup
import re, string, unicodedata

# Removing the html strips
def strip_html(text):
    """Return the text content of `text` with HTML markup removed.

    The input is parsed with BeautifulSoup using the "html.parser" backend
    and the result of ``get_text()`` is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

# Removing the square brackets
def remove_between_square_brackets(text):
    """Delete every ``[...]`` group (brackets included) from `text`.

    A group runs from ``[`` to the next ``]``; an opening bracket with no
    closing bracket is left untouched.
    """
    # Raw string: '\[' and '\]' in a normal string literal are invalid escape
    # sequences and trigger a DeprecationWarning/SyntaxWarning on current Python.
    return re.sub(r'\[[^]]*\]', '', text)

# Removing the noisy text
def denoise_text(text):
    """Clean one review: strip HTML first, then drop any ``[...]`` groups."""
    without_html = strip_html(text)
    return remove_between_square_brackets(without_html)

# Apply function on review column
# Note: this overwrites imdb['review'] in place, so re-running the cell
# processes the already-cleaned text again.
imdb['review'] = imdb['review'].apply(denoise_text)

# Removing special characters

In [9]:
# Define function for removing special characters
def remove_special_characters(text, remove_digits = True):
    """Strip every character that is not an ASCII letter or whitespace.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool, default True
        When True, digits are removed as well; when False, ``0-9`` are kept.
        (Previously this flag was accepted but ignored, so digits were
        always kept.)

    Returns
    -------
    str
        `text` with the disallowed characters deleted (not replaced by spaces).
    """
    # The range must be 'A-Z': 'A-z' also spans the ASCII characters between
    # 'Z' and 'a' ([ \ ] ^ _ `), which would therefore survive the cleaning.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# Apply function on review column (called with the function's default arguments)
# Note: overwrites imdb['review'] in place.
imdb['review'] = imdb['review'].apply(remove_special_characters)

# Text stemming

In [10]:
# Stemming is the process of reducing inflected 
# (or sometimes derived) words to their word stem, 
# base or root form—generally a written word form

# Stemming the text
def simple_stemmer(text):
    """Stem every whitespace-separated word of `text` with NLTK's Porter stemmer.

    The stemmed words are re-joined with single spaces, so runs of whitespace
    in the input collapse to one space.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

# Apply function on review column
# Note: overwrites imdb['review'] in place; a fresh PorterStemmer is built for every review.
imdb['review'] = imdb['review'].apply(simple_stemmer)

# Removing stopwords

In [11]:
# set stopwords to english
# NOTE(review): `stop` is only printed here; remove_stopwords below filters against
# `stopword_list`, not this set.
stop = set(stopwords.words('english'))
print(stop)

# Stop Words: A stop word is a commonly used word 
# (such as “the”, “a”, “an”, “in”) that a search engine 
# has been programmed to ignore, both when indexing entries 
# for searching and when retrieving them as the result of a search query.

# removing the stopwords
def remove_stopwords(text, is_lower_case = False):
    """Return `text` with stopwords removed.

    Parameters
    ----------
    text : str
        Text to filter; it is split with the module-level `tokenizer`.
    is_lower_case : bool, default False
        If True, tokens are compared to the stopwords as-is (the caller
        states the text is already lower-case). If False, each token is
        lower-cased for the comparison only; kept tokens retain their
        original casing.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Build a set once per call so each membership test is O(1); testing every
    # token directly against the `stopword_list` list scans the whole list each time.
    stopword_set = set(stopword_list)
    tokens = (token.strip() for token in tokenizer.tokenize(text))
    if is_lower_case:
        kept_tokens = [token for token in tokens if token not in stopword_set]
    else:
        kept_tokens = [token for token in tokens if token.lower() not in stopword_set]
    return ' '.join(kept_tokens)

# Apply function on review column
# NOTE(review): by this point the cells above have already removed punctuation
# (including apostrophes) and stemmed the text, so contractions such as "isn't" and
# possibly some stemmed words may no longer match the stopword entries — consider
# running this step earlier in the pipeline.
imdb['review'] = imdb['review'].apply(remove_stopwords)

{'whom', "isn't", 'off', "won't", 'than', 'very', 'him', 'through', 'did', 'me', 'were', 'for', "she's", 'itself', 'during', 'nor', 'your', "you've", 'doesn', "needn't", 'ma', 'yourself', 'himself', 'yourselves', 'own', 'more', 'each', 'am', 'not', "weren't", 'ours', 'most', "it's", 'them', 'doing', 'd', 'our', 'he', 'few', 'ourselves', 'hadn', 'a', 've', "that'll", 'weren', "you'll", 'some', 'have', "haven't", 'needn', 'and', 'by', 'such', 'y', 'shan', 'it', 'mustn', 'when', 'under', 'yours', "aren't", 'below', 'or', 'how', 'mightn', "doesn't", 'themselves', 'those', 'being', 'she', 'any', 'other', "mightn't", 'ain', 'these', 'as', 'my', 'about', 'will', "don't", 'down', 'hasn', 'before', 're', 'had', 'their', 'o', "wasn't", 'if', 'both', 'her', 'until', 'of', "didn't", "you're", 'here', 'isn', 'his', 'in', 'again', 'so', 'at', "mustn't", 'was', 'too', 'haven', 'herself', 'they', 'above', 'against', "should've", "wouldn't", 'all', 'an', 'only', 'wasn', 'what', 'we', 'should', 'to', 't

In [31]:
# conda install pip

In [32]:
# pip install --upgrade tensorflow

In [33]:
# pip install keras

# LSTM Approach

In [2]:
import numpy as np
import pandas as pd
import matplotlib as plt
%matplotlib inline

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
np.random.seed(7)

Using TensorFlow backend.


In [3]:
# load the dataset but only keep the top n most frequent words; per the Keras docs, rarer
# words are replaced by the out-of-vocabulary index (oov_char, default 2) rather than zeroed
# NOTE: `imdb` here is the keras.datasets module imported above, not the pandas DataFrame
# of the same name used in the first half of the notebook
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

In [4]:
# truncate and pad input sequences
max_review_length = 500

# pad_sequences turns a list of integer sequences into a 2D numpy array of shape
# (num_samples, maxlen): shorter reviews are padded and longer ones truncated
# (at the start of the sequence by default, per the Keras docs).
# Here each array ends up with shape (25000, 500): 25,000 reviews per split and
# max_review_length = 500.
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

(25000, 500)

In [23]:
# The first layer is the Embedding layer, which uses length-32 vectors to represent each word. 
# The next layer is the LSTM layer with 100 memory units (smart neurons). Finally, because this 
# is a classification problem, we use a Dense output layer with a single neuron and a sigmoid 
# activation function to make 0 or 1 predictions for the two classes (positive and negative) in the problem.

# Because it is a binary classification problem, log loss is used as the loss function 
# (binary_crossentropy in Keras). The efficient ADAM optimization algorithm is used. The model 
# is fit for only 3 epochs because it quickly overfits the problem. A large batch size of 64 reviews 
# is used to space out weight updates.

In [24]:
# create the model

embedding_vector_length = 32
model = Sequential()

# Embedding(input_dim, output_dim, input_length):
#   input_dim    = top_words (vocabulary limited to the 5000 most frequent words),
#   output_dim   = embedding_vector_length (each word becomes a vector of size 32),
#   input_length = max_review_length (every review is padded/truncated to 500 tokens)
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
# LSTM layer with 100 units
model.add(LSTM(100))
# single densely-connected output unit with a sigmoid activation
model.add(Dense(1, activation='sigmoid'))
# binary_crossentropy because this is a two-class problem (positive or negative);
# adam adapts the learning rate per weight using momentum estimates; accuracy is the reported metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appends a stray "None" to the output
model.summary()
# train for 3 epochs (full passes over the training set) in batches of 64 reviews
# NOTE(review): the test split is also passed as validation data here and is reused
# for the final evaluation, so it is not a fully unseen hold-out set.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x1a611ea3448>

In [25]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 86.18%


In [26]:
# Recurrent Neural networks like LSTM generally have the problem of overfitting.

# Dropout can be applied between layers using the Dropout Keras layer. We can do 
# this easily by adding new Dropout layers between the Embedding and LSTM layers and 
# the LSTM and Dense output layers.

# LSTM with Dropout

In [27]:
from keras.layers import Dropout

# create the model
# (name spelled as in the first model cell; it was previously misspelled `embedding_vecor_length`)
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
# same architecture as the plain LSTM model, plus a dropout layer that randomly
# zeroes 20% of the embedding outputs during training
model.add(Dropout(0.2)) # added this
model.add(LSTM(100))
# second dropout layer, applied to the LSTM output before the classifier
model.add(Dropout(0.2)) # added this
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appends a stray "None" to the output
model.summary()
model.fit(X_train, y_train, epochs=3, batch_size=64)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 500, 32)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x1a614c683c8>

In [28]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 84.34%


# Comparing to CNN

In [29]:
# Conv1D and MaxPooling1D are exported from keras.layers like the other layers used in
# this notebook; the keras.layers.convolutional submodule path is not available in newer Keras releases
from keras.layers import Conv1D
from keras.layers import MaxPooling1D

# create the model
# (name spelled as in the first model cell; it was previously misspelled `embedding_vecor_length`)
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
# convolution layer: 32 filters, each sliding a window of 3 time steps (kernel_size) over the sequence;
# Conv1D expects input of shape (batch, steps, channels); padding='same' keeps the sequence length
# at 500 and relu is the activation function
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
# MaxPooling1D pools along the single (time) dimension: pool_size=2 keeps the maximum of every
# 2 consecutive steps, halving the sequence length from 500 to 250 (see the summary below)
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appends a stray "None" to the output
model.summary()
model.fit(X_train, y_train, epochs=3, batch_size=64)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 32)           3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 216,405
Trainable params: 216,405
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x1a615517708>

In [30]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 87.87%


# Conclusion

An LSTM is the natural first choice for a sequence task such as binary sentiment 
classification, and it performed well here (86.18% test accuracy). It was nevertheless outperformed by the CNN 
variant, which places a 1D convolution and max-pooling layer in front of the LSTM (87.87%). Interestingly, adding 
dropout made the model perform slightly worse (84.34%), even though it was meant to address the overfitting that RNN 
models typically have to deal with. If our goal were to improve accuracy, we could experiment with the activation 
function, the loss function, and the other hyperparameters.