# Sentiment analysis project

In [223]:
# General
%matplotlib inline
import matplotlib.pyplot as plt
import os
from collections import Counter
from typing import List

# Keras
import keras
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import Sequential, load_model
from keras.layers import (Embedding, LSTM, Dense, Dropout,
                          Flatten, GRU, Conv1D, MaxPooling1D)

# Other
import nltk
from nltk.corpus import stopwords
import pandas as pd

## Load and preprocess the data

In [224]:
VOCAB_SIZE = 10000

In [225]:
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=VOCAB_SIZE)

### Mappers

In [226]:
class Preprocessing:
    def __init__(self):
        self.word2ind = imdb.get_word_index()
        self.ind2word = {ind: word for word, ind in self.word2ind.items()}
        
    def map_inds_to_words(self, inds: List) -> List:
        return [self.ind2word.get(ind, '') for ind in inds]
    
    def map_words_to_inds(self, tokens: List) -> List:
        return [self.word2ind.get(token, '') for token in tokens]

In [227]:
preprocessing = Preprocessing()

In [228]:
# Example part of a review
preprocessing.map_inds_to_words(train_data[0])[:10]

['the',
 'as',
 'you',
 'with',
 'out',
 'themselves',
 'powerful',
 'lets',
 'loves',
 'their']

### Load the stopwords and delete them from the reviews

In [229]:
nltk.download('stopwords')
_stopwords = stopwords.words('english')

# Check some english stopwords
_stopwords[:5]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pozdrowiony/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we']

#### Map inds to words

In [230]:
_train_data = [preprocessing.map_inds_to_words(rev) for rev in train_data]
_test_data = [preprocessing.map_inds_to_words(rev) for rev in test_data]

#### Remove stopwords

In [231]:
_train_data = [[token for token in rev if token not in _stopwords] for rev in _train_data]
_test_data = [[token for token in rev if token not in _stopwords] for rev in _test_data]

#### Map words to index

In [232]:
_train_data = [preprocessing.map_words_to_inds(rev) for rev in _train_data]
_test_data = [preprocessing.map_words_to_inds(rev) for rev in _test_data]

### Pad the sentences

In [233]:
MAX_LEN = len(max((_train_data + _test_data), key=len))

In [234]:
_train_data = sequence.pad_sequences(_train_data, maxlen=MAX_LEN)
_test_data = sequence.pad_sequences(_test_data, maxlen=MAX_LEN)

### Batch constants

In [235]:
BATCH_SIZE = 64
N_EPOCHS = 10

## Test various NN architectures

### Define neccessery elements

#### Constants

In [236]:
EMBEDDING_SIZE=32

# Define model paths
DENSE_PATH = './data/dense_net.h5'
DENSE_EMB_NOT_TRAIN_PATH = './data/dense_net_emb_not_train.h5'
DENSE_1_HIDDEN_PATH = './data/dense_net_1_hidden.h5'

LSTM_PATH = './data/lstm_net.h5'
GRU_PATH = './data/gru_net.h5'

CONV_1_LAYER_PATH = './data/conv_net_1_layer.h5'
CONV_1_LAYER_EMB_NOT_TRAIN_PATH = './data/conv_net_emb_not_train.h5'
CONV_1_LAYER_1_HIDDEN_PATH = './data/conv_net_1_1_hidden.h5'
CONV_2_LAYERS_PATH = './data/conv_net_2.h5'
CONV_2_LAYERS_1_HIDDEN_PATH = './data/conv_net_2_1_hidden.h5'

In [237]:
def create_and_train_net(model: Sequential, file_path: str,
                         override: bool = False) -> None:
    """ Create or import the net if created """
    
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    
    # Try to load the pretrained model
    if not os.path.isfile(file_path) or override:     
        model.fit(_train_data, train_labels,
                  validation_split=0.2,
                  batch_size=BATCH_SIZE,
                  epochs=N_EPOCHS)
        
        model.save(file_path)
    else:
        model = load_model(file_path)
        
    print(model.summary())

#### Define embedding layer constant for each architecture

In [238]:
EMBEDDING = Embedding(VOCAB_SIZE, EMBEDDING_SIZE, input_length=MAX_LEN)

#### Define list of the names of below nns and coresponding accuracies

In [239]:
nns = [
    ['dense', 0, 'YES', 0],
    ['dense', 0, 'NO', 0],
    ['dense', 1, 'YES', 0],
    ['lstm', 0, 'YES', 0],
    ['gru', 0, 'YES', 0],
    ['conv_1', 0, 'YES', 1],
    ['conv_1', 0, 'NO', 1],
    ['conv_1', 1, 'YES', 1],
    ['conv_2', 0, 'YES', 2],
    ['conv_2', 1, 'YES', 2],
]
columns = ['kind', 'n_hidden', 'emb_backprop', 'n_conv']

In [240]:
all_scores = []

### Dense

#### Only input and output layer

In [241]:
dense_net = Sequential()
dense_net.add(EMBEDDING)
dense_net.add(Flatten())
dense_net.add(Dense(1, activation='sigmoid'))

In [242]:
create_and_train_net(dense_net, DENSE_PATH)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 1300, 32)          320000    
_________________________________________________________________
flatten_37 (Flatten)         (None, 41600)             0         
_________________________________________________________________
dense_42 (Dense)             (None, 1)                 41601     
Total params: 361,601
Trainable params: 361,601
Non-trainable params: 0
_________________________________________________________________
None


In [243]:
scores = dense_net.evaluate(_test_data, test_labels, verbose=1)[1]
all_scores.append(scores)
scores



0.8616

#### Only input and output layer without backpropagation on the embeddings

In [244]:
dense_net_emb_not_train = Sequential()
dense_net_emb_not_train.add(EMBEDDING)
dense_net_emb_not_train.add(Flatten())
dense_net_emb_not_train.add(Dense(1, activation='sigmoid'))

# Turn off the backprop on the embeddings
dense_net_emb_not_train.layers[0].trainable = False

In [245]:
create_and_train_net(dense_net_emb_not_train, DENSE_EMB_NOT_TRAIN_PATH)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 1300, 32)          320000    
_________________________________________________________________
flatten_38 (Flatten)         (None, 41600)             0         
_________________________________________________________________
dense_43 (Dense)             (None, 1)                 41601     
Total params: 361,601
Trainable params: 41,601
Non-trainable params: 320,000
_________________________________________________________________
None


In [246]:
scores = dense_net_emb_not_train.evaluate(_test_data, test_labels, verbose=1)[1]
all_scores.append(scores)
scores



0.86128

#### One hidden layer and dropout

In [247]:
OUTPUT_DENSE_HIDDEN = 100

In [248]:
dense_net_1_hidden = Sequential()
dense_net_1_hidden.add(EMBEDDING)
dense_net_1_hidden.add(Flatten())
dense_net_1_hidden.add(Dense(OUTPUT_DENSE_HIDDEN, activation='sigmoid'))
dense_net_1_hidden.add(Dropout(0.1))
dense_net_1_hidden.add(Dense(1, activation='sigmoid'))

In [249]:
create_and_train_net(dense_net_1_hidden, DENSE_1_HIDDEN_PATH)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 1300, 32)          320000    
_________________________________________________________________
flatten_39 (Flatten)         (None, 41600)             0         
_________________________________________________________________
dense_44 (Dense)             (None, 100)               4160100   
_________________________________________________________________
dropout_10 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_45 (Dense)             (None, 1)                 101       
Total params: 4,480,201
Trainable params: 4,160,201
Non-trainable params: 320,000
________________________________

In [250]:
scores = dense_net_1_hidden.evaluate(_test_data, test_labels, verbose=1)[1]
all_scores.append(scores)
scores



0.86116

### Recurrent

#### LSTM

In [251]:
lstm_net = Sequential()
lstm_net.add(EMBEDDING)
lstm_net.add(LSTM(100))
lstm_net.add(Dense(1, activation='sigmoid'))

In [252]:
create_and_train_net(lstm_net, LSTM_PATH)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 1300, 32)          320000    
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_46 (Dense)             (None, 1)                 101       
Total params: 373,301
Trainable params: 53,301
Non-trainable params: 320,000
_________________________________________________________________
None


In [253]:
scores = lstm_net.evaluate(_test_data, test_labels, verbose=1)[1]
all_scores.append(scores)
scores



0.85572

#### GRU

In [254]:
gru_net = Sequential()
gru_net.add(EMBEDDING)
gru_net.add(GRU(100))
gru_net.add(Dense(1, activation='sigmoid'))

In [255]:
create_and_train_net(gru_net, GRU_PATH)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 1300, 32)          320000    
_________________________________________________________________
gru_2 (GRU)                  (None, 100)               39900     
_________________________________________________________________
dense_47 (Dense)             (None, 1)                 101       
Total params: 360,001
Trainable params: 40,001
Non-trainable params: 320,000
_________________________________________________________________
None


In [256]:
scores = gru_net.evaluate(_test_data, test_labels, verbose=1)[1]
all_scores.append(scores)
scores



0.85868

### Conv

#### 1 Conv layer

In [257]:
conv_1_layer_net = Sequential()
conv_1_layer_net.add(EMBEDDING)
conv_1_layer_net.add(Conv1D(10, 5))
conv_1_layer_net.add(Flatten())
conv_1_layer_net.add(Dense(1, activation='sigmoid'))

In [258]:
create_and_train_net(conv_1_layer_net, CONV_1_LAYER_PATH)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 1300, 32)          320000    
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 1296, 10)          1610      
_________________________________________________________________
flatten_40 (Flatten)         (None, 12960)             0         
_________________________________________________________________
dense_48 (Dense)             (None, 1)                 12961     
Total params: 334,571
Trainable params: 14,571
Non-trainable params: 320,000
_________________________________________________________________
None


In [259]:
scores = conv_1_layer_net.evaluate(_test_data, test_labels, verbose=1)[1]
all_scores.append(scores)
scores



0.85812

#### 1 Conv layer without backpropagation on the embeddings

In [260]:
conv_1_layer_net_emb_not_train = Sequential()
conv_1_layer_net_emb_not_train.add(EMBEDDING)
conv_1_layer_net_emb_not_train.add(Conv1D(10, 5))
conv_1_layer_net_emb_not_train.add(Flatten())
conv_1_layer_net_emb_not_train.add(Dense(1, activation='sigmoid'))

# Turn off the backprop on the embeddings
conv_1_layer_net_emb_not_train.layers[0].trainable = False

In [261]:
create_and_train_net(conv_1_layer_net_emb_not_train, CONV_1_LAYER_EMB_NOT_TRAIN_PATH)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 1300, 32)          320000    
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 1296, 10)          1610      
_________________________________________________________________
flatten_41 (Flatten)         (None, 12960)             0         
_________________________________________________________________
dense_49 (Dense)             (None, 1)                 12961     
Total params: 334,571
Trainable params: 14,571
Non-trainable params: 320,000
_________________________________________________________________
None


In [262]:
scores = conv_1_layer_net_emb_not_train.evaluate(_test_data, test_labels, verbose=1)[1]
all_scores.append(scores)
scores



0.85812

#### 1 Conv layer + 1 hidden dense

In [263]:
conv_1_layer_1_hidden_net = Sequential()
conv_1_layer_1_hidden_net.add(EMBEDDING)
conv_1_layer_1_hidden_net.add(Conv1D(10, 5))
conv_1_layer_1_hidden_net.add(Flatten())
conv_1_layer_1_hidden_net.add(Dense(OUTPUT_DENSE_HIDDEN, activation='sigmoid'))
conv_1_layer_1_hidden_net.add(Dropout(0.1))
conv_1_layer_1_hidden_net.add(Dense(1, activation='sigmoid'))

In [264]:
create_and_train_net(conv_1_layer_1_hidden_net, CONV_1_LAYER_1_HIDDEN_PATH)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 1300, 32)          320000    
_________________________________________________________________
conv1d_25 (Conv1D)           (None, 1296, 10)          1610      
_________________________________________________________________
flatten_42 (Flatten)         (None, 12960)             0         
_________________________________________________________________
dense_50 (Dense)             (None, 100)               1296100   
_________________________________________________________________
dropout_11 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_51 (Dense)             (None, 1)          

In [265]:
scores = conv_1_layer_1_hidden_net.evaluate(_test_data, test_labels, verbose=1)[1]
all_scores.append(scores)
scores



0.85884

#### 2 Conv layers

In [266]:
conv_2_layers_net = Sequential()
conv_2_layers_net.add(EMBEDDING)
conv_2_layers_net.add(Conv1D(2, 20))
conv_2_layers_net.add(Conv1D(10, 5))
conv_2_layers_net.add(Flatten())
conv_2_layers_net.add(Dense(1, activation='sigmoid'))

In [267]:
create_and_train_net(conv_2_layers_net, CONV_2_LAYERS_PATH)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 1300, 32)          320000    
_________________________________________________________________
conv1d_26 (Conv1D)           (None, 1281, 2)           1282      
_________________________________________________________________
conv1d_27 (Conv1D)           (None, 1277, 10)          110       
_________________________________________________________________
flatten_43 (Flatten)         (None, 12770)             0         
_________________________________________________________________
dense_52 (Dense)             (None, 1)                 12771     
Total params: 334,163
Trainable params: 14,163
Non-trainable params: 320,000
_____________________________________

In [268]:
scores = conv_2_layers_net.evaluate(_test_data, test_labels, verbose=1)[1]
all_scores.append(scores)
scores



0.85684

#### 2 Conv Layers + 1 hidden dense

In [269]:
conv_2_layers_1_hidden_net = Sequential()
conv_2_layers_1_hidden_net.add(EMBEDDING)
conv_2_layers_1_hidden_net.add(Conv1D(2, 20))
conv_2_layers_1_hidden_net.add(Conv1D(10, 5))
conv_2_layers_1_hidden_net.add(Flatten())
conv_2_layers_1_hidden_net.add(Dense(OUTPUT_DENSE_HIDDEN, activation='sigmoid'))
conv_2_layers_1_hidden_net.add(Dropout(0.1))
conv_2_layers_1_hidden_net.add(Dense(1, activation='sigmoid'))

In [270]:
create_and_train_net(conv_2_layers_1_hidden_net, CONV_2_LAYERS_1_HIDDEN_PATH)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 1300, 32)          320000    
_________________________________________________________________
conv1d_28 (Conv1D)           (None, 1281, 2)           1282      
_________________________________________________________________
conv1d_29 (Conv1D)           (None, 1277, 10)          110       
_________________________________________________________________
flatten_44 (Flatten)         (None, 12770)             0         
_________________________________________________________________
dense_53 (Dense)             (None, 100)               1277100   
_________________________________________________________________
dropout_12 (Dropout)         (None, 100)        

In [271]:
scores = conv_2_layers_1_hidden_net.evaluate(_test_data, test_labels, verbose=1)[1]
all_scores.append(scores)
scores



0.85488

## Summary

In [272]:
results = pd.DataFrame(nns)
results.columns = columns
results['accuracy'] = pd.Series(all_scores)
results.set_index('kind')

Unnamed: 0_level_0,n_hidden,emb_backprop,n_conv,accuracy
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dense,0,YES,0,0.8616
dense,0,NO,0,0.86128
dense,1,YES,0,0.86116
lstm,0,YES,0,0.85572
gru,0,YES,0,0.85868
conv_1,0,YES,1,0.85812
conv_1,0,NO,1,0.85812
conv_1,1,YES,1,0.85884
conv_2,0,YES,2,0.85684
conv_2,1,YES,2,0.85488
