<h2>Capsule Network on the Classic IMDB Sentiment Analysis Dataset</h2>
<p>The goal of this notebook is to explore the Capsule Network (CapsNet) on the classic IMDB Sentiment Analysis dataset. I would like to investigate how well the CapsNet performs on an NLP task such as sentiment analysis, and to compare it against state-of-the-art approaches.</p>

In [1]:
### Import Libraries
import pandas as pd
import numpy as np

import gensim
from gensim.models import KeyedVectors

import torch
import torch.nn as nn

from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors

### Get Data and Prepare it for training

In [2]:
### Field definitions: reviews are tokenized with spaCy, labels are stored as torch floats
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

### Build the train/test split of the IMDB dataset using the two fields above
train, test = datasets.IMDB.splits(TEXT, LABEL)

In [3]:
# Report the size of each split; the second line describes the test split,
# so it must not be labelled as "training".
print('Number of training examples: {}'.format(len(train)))
print('Number of testing examples: {}'.format(len(test)))

Number of training examples: 25000
Number of training examples: 25000


In [4]:
### Example review and label: print the attribute dict of the training example at index 1
print(vars(train.examples[1]))

{'text': ['This', 'movie', 'had', 'me', 'smiling', 'from', 'beginning', 'to', 'end', ',', 'partly', 'at', 'the', 'humor', ',', 'partly', 'at', 'Meg', 'Ryan', '(', 'this', 'is', 'the', 'perfect', 'character', 'for', 'her', ')', ',', 'and', 'always', 'because', 'it', "'s", 'just', 'one', 'of', 'the', 'best', 'feel', '-', 'good', 'movies', 'I', "'ve", 'seen', '.', 'Hopefully', 'the', 'DVD', 'will', 'be', 'out', 'soon', '.'], 'label': 'pos'}


In [5]:
# Build both vocabularies from the training split only.
TEXT.build_vocab(train)
LABEL.build_vocab(train)

# Plain-dict copy of the text vocabulary's string -> index mapping.
vocab_dict = {token: index for token, index in TEXT.vocab.stoi.items()}

In [8]:
def create_wv_matrix(vocab_dict, word_vectors=None,
                     path="./models/GoogleNews-vectors-negative300.bin"):
    """Build an embedding matrix for a vocabulary from pretrained word vectors.

    Rows are emitted in ascending order of the vocabulary index, so when the
    indices are the contiguous range ``0..len(vocab_dict)-1`` row ``i`` holds
    the vector of the word whose index is ``i`` regardless of the iteration
    order of ``vocab_dict``. Words missing from the pretrained model get a
    small random vector drawn from U(-0.01, 0.01).

    Two extra rows are appended after the vocabulary rows: a random
    "unknown" vector and an all-zero "pad" vector.

    Args:
        vocab_dict: mapping of word -> integer index.
        word_vectors: optional preloaded vectors supporting ``word in wv``,
            ``wv[word]`` and a ``vector_size`` attribute (e.g. a gensim
            ``KeyedVectors``). When ``None`` the model is loaded from ``path``.
        path: location of a binary word2vec file, used only when
            ``word_vectors`` is ``None``.

    Returns:
        ``np.ndarray`` of shape ``(len(vocab_dict) + 2, vector_size)`` with
        dtype float32.
    """
    if word_vectors is None:
        print ('... Loading Word Vectors')
        word_vectors = KeyedVectors.load_word2vec_format(path, binary=True)

    # Take the dimensionality from the model instead of hard-coding 300.
    dim = word_vectors.vector_size

    def _random_vector():
        return np.random.uniform(-0.01, 0.01, dim).astype("float32")

    # Sort by index so the row position does not depend on dict ordering.
    ordered_items = sorted(vocab_dict.items(), key=lambda item: int(item[1]))

    wv_matrix = []
    for count, (word, _index) in enumerate(ordered_items, start=1):
        word = str(word)

        # `in` / `[]` work on gensim 3.x and 4.x alike; the older
        # `.vocab` / `.word_vec` accessors were removed in gensim 4.
        if word in word_vectors:
            wv_matrix.append(word_vectors[word])
        else:
            wv_matrix.append(_random_vector())

        if count % 10000 == 0:
            print ("On Index {}".format(count))

    ### Add Unknown Token
    wv_matrix.append(_random_vector())
    ### Add Pad Token
    wv_matrix.append(np.zeros(dim).astype("float32"))
    print ('... Finished Creating Matrix')
    return np.array(wv_matrix)


In [9]:
# Build the embedding matrix for the text vocabulary; this loads the pretrained
# word2vec file from disk and prints a progress line every 10000 words.
wv_matrix = create_wv_matrix(vocab_dict)

... Loading Word Vectors
On Index 10000
On Index 20000
On Index 30000
On Index 40000
On Index 50000
On Index 60000
On Index 70000
On Index 80000
On Index 90000
On Index 100000
On Index 110000
On Index 120000
... Finished Creating Matrix


In [29]:
### Sanity check: does a row of wv_matrix match Google's word2vec vector for the same word?
word = "check"
# Row of the embedding matrix selected through the vocabulary index of `word`.
word_array1 = wv_matrix[vocab_dict[word]]

In [30]:
# Load the reference model in this cell: `create_wv_matrix` keeps its copy in a
# local variable, so without this line `word_vectors` is undefined on a fresh
# kernel and the lookup below raises NameError.
word_vectors = KeyedVectors.load_word2vec_format("./models/GoogleNews-vectors-negative300.bin", binary=True)
# Index lookup works on both gensim 3.x and 4.x.
word_array2 = word_vectors[word]

In [33]:
# Count the positions where the two vectors are exactly equal; a count equal to
# the vector length (300 here) means the matrix row matches the pretrained vector.
(word_array1 == word_array2).sum()

300

### Create CNN Model

In [40]:
class CNN(nn.Module):
    """Two conv/ReLU/max-pool stages followed by three linear layers.

    The network takes a single-channel square input of shape
    ``(batch, 1, input_size, input_size)`` and produces one score per sample.

    Args:
        input_size: height/width of the square input (default 300).
        fc1_units: output features of the first fully connected layer.
        fc2_units: output features of the second fully connected layer.

    NOTE(review): with the default sizes FC1 and FC2 together hold roughly
    6.5e10 weights (360000*150000 + 150000*75000), which will not fit in
    memory on typical hardware; pass smaller ``fc1_units`` / ``fc2_units``.
    NOTE(review): there is no non-linearity between FC1, FC2 and ``out``, so
    the three layers compose to a single affine map - confirm this is intended.
    """

    def __init__(self, input_size=300, fc1_units=150000, fc2_units=75000):
        super(CNN, self).__init__()

        # Each MaxPool2d(2) halves the spatial size (rounding down); the
        # convolutions are "same"-padded and leave it unchanged.
        pooled_size = (input_size // 2) // 2
        if pooled_size < 1:
            raise ValueError(
                "input_size must be at least 4, got {}".format(input_size))

        ### Convolution Layer 1
        self.conv1 = nn.Sequential(         # input shape (1, input_size, input_size)
            nn.Conv2d(
                in_channels=1,              # number of input channels
                out_channels=32,            # n_filters
                kernel_size=7,              # filter size
                stride=1,                   # filter movement/step
                padding=3,                  # (kernel_size - 1) / 2 keeps height/width when stride=1
            ),                              # output shape (32, input_size, input_size)
            nn.ReLU(),                      # activation
            nn.MaxPool2d(kernel_size=2),    # max over 2x2 areas, e.g. (32, 150, 150) for input_size=300
        )

        ### Convolution Layer 2
        self.conv2 = nn.Sequential(         # e.g. input shape (32, 150, 150)
            nn.Conv2d(32, 64, 7, 1, 3),     # e.g. output shape (64, 150, 150)
            nn.ReLU(),                      # activation
            nn.MaxPool2d(2),                # e.g. output shape (64, 75, 75)
        )

        ### Fully Connected Layer 3
        self.FC1 = nn.Linear(64 * pooled_size * pooled_size, fc1_units)

        ### Fully Connected Layer 4
        self.FC2 = nn.Linear(fc1_units, fc2_units)

        # Single output unit per sample
        self.out = nn.Linear(fc2_units, 1)

    def forward(self, x):
        """Run the network.

        ``nn.Module.__call__`` dispatches to ``forward``, so defining it here
        is what makes ``model(x)`` work.

        Args:
            x: tensor of shape ``(batch, 1, input_size, input_size)``.

        Returns:
            Tuple ``(output, features)`` where ``output`` has shape
            ``(batch, 1)`` and ``features`` is the FC2 activation of shape
            ``(batch, fc2_units)``, returned for visualization.
        """
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size(0), -1)           # flatten the output of conv2
        x = self.FC1(x)
        x = self.FC2(x)
        output = self.out(x)

        return output, x    # return x for visualization

    def forward_pass(self, x):
        """Backward-compatible alias for :meth:`forward`."""
        return self.forward(x)

### Train CNN Model

### Evaluate CNN Model