<h2>Transfer Learning with CNN on the classic IMDB Sentiment</h2>

In [1]:
### Import Libraries
import pandas as pd
import numpy as np
import time
import gc
import re

import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 

import gensim
from gensim.models import KeyedVectors

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as data_utils


from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors

### Get Data and Prepare it for training

In [2]:
df = pd.read_csv('./data/imdb_master.csv', encoding="latin-1")
df = df.drop(['Unnamed: 0', 'file'], axis=1)

df = df[df['label'] != 'unsup']
df['label'] = df['label'].replace('neg', 0)
df['label'] = df['label'].replace('pos', 1)

In [3]:
df['label'] = df['label'].replace('neg', 0)
df['label'] = df['label'].replace('pos', 1)

In [4]:
### Contractions Dictionary
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": " he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I had",
"I'd've": "I would have",
"I'll": "I will",
"i'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that had",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"We're": "we are",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who has",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

contractions_re = re.compile('(%s)' % '|'.join(contractions.keys()))

In [5]:
### function to clean the comment_text data
def expand_contractions(s, contractions_dict=contractions):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, s)


def clean_data(text):
    text = expand_contractions(text, contractions)
    text = str(text).lower()
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' url ', text)
    text = re.sub('(!+)', '!', text)
    text = re.sub('(\?+)', '?', text)
    text = re.sub('(\s+)', ' ', text)
    text = re.sub('(\"+)', ' \" ', text)
    text = re.sub('(\.+)', '\.', text)
    text = re.sub('(<+)', ' < .', text)
    text = re.sub('(>+)', ' > .', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\t', ' ', text)
    text = text.replace("\\", " ")
    text = text.replace("-", " ")
    text = text.replace("—", " ")
    text = text.replace("/", " ")
    text = text.replace("_", " ")
    text = re.sub('([.,!?()])', r' \1 ', text)
    text = re.sub('\s{2,}', ' ', text)
    text = re.sub('[^A-Za-z0-9\?\!\.<>,\s]+', '', text)
    return text

In [6]:
def tokenize_pad_slice_column_to_int(df, row_name, vocab_dict, max_sequence_length, stop_words):
    processed_row = []
    
    for i, row in df.iterrows():
        text = clean_data(str(row[row_name]))
        text_arr = text.split(" ")
        text_arr = [w for w in text_arr if not w in stop_words] 
        
        if len(text_arr) > max_sequence_length:
            text_arr = text_arr[:max_sequence_length]
        elif len(text_arr) < max_sequence_length:
            pad_to_add = max_sequence_length-len(text_arr)
            text_arr.extend(['<pad>' for i in range(pad_to_add)])
        
        for i in range(len(text_arr)):
            if text_arr[i] in vocab_dict:
                text_arr[i] = vocab_dict[text_arr[i]]
            else:
                text_arr[i] = vocab_dict['<unk>']
            
        processed_row.append(np.array(text_arr))
        
    df["processed_row"] = processed_row
    return df

def create_vocab_from_df_column(df, row_name):
    vocab_dict = {'<pad>' : 0,
                  '<unk>' : 1}
    index = 2
    
    for i, row in df.iterrows():
        text = clean_data(str(row[row_name]))
        text_arr = text.split(" ")
        for word in text_arr:
            if word in vocab_dict:
                continue
            else:
                vocab_dict[word] = index
                index += 1
                
    return vocab_dict
                

In [7]:
vocab_dict = create_vocab_from_df_column(df, 'review')

In [8]:
df = tokenize_pad_slice_column_to_int(df, 'review', vocab_dict, 300, stop_words)

In [9]:
df.head()

Unnamed: 0,type,review,label,processed_row
0,test,Once again Mr. Costner has dragged out a movie...,0,"[4, 5, 6, 8, 11, 13, 14, 16, 5, 17, 20, 21, 22..."
1,test,This is an example of why the majority of acti...,0,"[123, 124, 125, 126, 5, 128, 129, 24, 60, 130,..."
2,test,"First of all I hate those moronic rappers, who...",0,"[236, 237, 239, 240, 24, 241, 242, 244, 245, 2..."
3,test,Not even the Beatles could write songs everyon...,0,"[324, 325, 116, 326, 327, 328, 329, 24, 330, 2..."
4,test,Brass pictures (movies is not a fitting word f...,0,"[441, 442, 122, 336, 443, 444, 122, 60, 201, 4..."


In [10]:
### Create batch generator
df_train = df[df['type'] == 'train'].sample(frac=1).reset_index(drop=True).drop(['type'], axis = 1)
df_train.label = df_train.label.astype('float64')

df_test = df[df['type'] == 'test'].sample(frac=1).reset_index(drop=True).drop(['type'], axis = 1)
df_test.label = df_test.label.astype('float64')

In [11]:
X_train = df_train.processed_row.tolist()
X_train = torch.from_numpy(np.vstack(X_train))

X_test = df_test.processed_row.tolist()
X_test = torch.from_numpy(np.vstack(X_test))

y_train = df_train.label.tolist()
#y_train = torch.from_numpy(np.vstack(y_train), dtype=torch.long)
y_train = torch.tensor(np.vstack(y_train), dtype=torch.float)

y_test = df_test.label.to_list()
#y_test = torch.from_numpy(np.vstack(y_test))
y_test = torch.tensor(np.vstack(y_train), dtype=torch.float)

print("Train Input Shape: {} Train Target Shape: {}".format(X_train.shape, y_train.shape))
print("Test Input Shape: {} Test Target Shape: {}".format(X_test.shape, y_test.shape))

Train Input Shape: torch.Size([25000, 300]) Train Target Shape: torch.Size([25000, 1])
Test Input Shape: torch.Size([25000, 300]) Test Target Shape: torch.Size([25000, 1])


In [12]:
train = data_utils.TensorDataset(X_train, y_train)
train_loader = data_utils.DataLoader(train, batch_size=50, shuffle=True)

In [13]:
test = data_utils.TensorDataset(X_test, y_test)
test_loader = data_utils.DataLoader(train, batch_size=50, shuffle=True)

In [14]:
def create_wv_matrix(vocab_dict):
    print ('... Loading Word Vectors')
    word_vectors = KeyedVectors.load_word2vec_format("./models/GoogleNews-vectors-negative300.bin", binary=True)
    wv_matrix = []
    count = 0
    print ('... Finish Loading Word Vectors')
    
    for each in vocab_dict.items():
        count += 1
        
        word = str(each[0]).lower()
        index = int(each[1])
        
        if word in word_vectors.vocab:
            wv_matrix.append(word_vectors.word_vec(word))
        else:
            wv_matrix.append(np.random.uniform(-0.01, 0.01, 300).astype("float32"))
        
        if count %10000 == 0:
            print ("On Index {}".format(count))
            
    ### Add Unknown Token
    wv_matrix.append(np.random.uniform(-0.01, 0.01, 300).astype("float32"))
    ### Add Pad Token
    wv_matrix.append(np.zeros(300).astype("float32"))
    print ('... Finished Creating Matrix')
    
    del(word_vectors)
    
    return np.array(wv_matrix)

def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.weight.data.copy_(torch.from_numpy(weights_matrix))
    
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

In [15]:
### Get Word Vector Matrix
wv_matrix = create_wv_matrix(vocab_dict)

... Loading Word Vectors
... Finish Loading Word Vectors
On Index 10000
On Index 20000
On Index 30000
On Index 40000
On Index 50000
On Index 60000
On Index 70000
On Index 80000
On Index 90000
On Index 100000
... Finished Creating Matrix


In [16]:
gc.collect()

22

In [17]:
### Test the Embedding Layer
emb_layer, num_embeddings, embedding_dim = create_emb_layer(wv_matrix)
batch = next(iter(train_loader))
emb_layer(batch[0]).shape

torch.Size([50, 300, 300])

### Create CNN Model

In [18]:
class CNN(nn.Module):
    def __init__(self, weights_matrix):
        super(CNN, self).__init__()
        
        ### Embedding Layer
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
       
        ### Convolution Layer 1
        self.conv1 = nn.Sequential(         # input shape (1, 300, 300)
            nn.Conv2d(
                in_channels=1,              # input height
                out_channels=3,             # n_filters
                kernel_size=10,             # filter size
                stride=2,                   # filter movement/step
                padding=0,                  # if want same width and length of this image after Conv2d, padding=(kernel_size-1)/2 if stride=1
            ),                              # output shape (3, 146, 146)
            nn.ReLU(),                      # activation
            nn.MaxPool2d(kernel_size=2),    # (146-2 / 2) choose max value in 2x2 area, output shape (3, 73, 73)
        )
        
        ### Convolution Layer 2
        self.conv2 = nn.Sequential(        # input shape (3, 73, 73)
            nn.Conv2d(3, 3, 5, 2, 1),      # output shape (3, 36, 36)
            nn.ReLU(),                     # activation
            nn.MaxPool2d(2),               # output shape (3, 18, 18)
        )
            
        ### Fully Connected Layer 3
        self.FC1 = nn.Linear(3 * 18 * 18, 800)
        
        ### Fully Connected Layer 4
        self.FC2 = nn.Linear(800, 200)
        
        # Output 2 classes
        self.out = nn.Linear(200, 1)
        
    def forward(self, x):
        x = self.embedding(x)
        #print (x.shape)
        x = x.unsqueeze(1)
        #print (x.shape)
        x = self.conv1(x)
        #print (x.shape)
        x = self.conv2(x)
        #print (x.shape)
        x = x.view(x.size(0), -1)           # flatten the output of conv2
        x = F.relu(self.FC1(x))
        #print (x.shape)
        x = F.relu(self.FC2(x))
        #print (x.shape)
        output =  torch.sigmoid(self.out(x))
        #print (output.shape)
        
        return output

### Train CNN Model

In [19]:
### Hyperparameters
num_epochs = 30
cnn_classifier = CNN(weights_matrix=wv_matrix)

optimizer = optim.Adam(cnn_classifier.parameters())
criterion = nn.MSELoss()


In [20]:
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [21]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    count = 1
    model.train()
    
    for batch in iterator:
        count +=1
        optimizer.zero_grad()
        predictions = model(batch[0])
        loss = criterion(predictions, batch[1])
        acc = binary_accuracy(predictions, batch[1])
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [22]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch[0])
            loss = criterion(predictions, batch[1])
            acc = binary_accuracy(predictions, batch[1])
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [23]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [24]:
### Training Loop
best_test_loss = float('inf')

for epoch in range(num_epochs):

    start_time = time.time()
    
    train_loss, train_acc = train(cnn_classifier, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(cnn_classifier, test_loader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        torch.save(cnn_classifier.state_dict(), 'tut4-model.pt')
    
    print("EPOCH: {}, TIME: {}:{}".format(epoch, epoch_mins, epoch_secs))
    print("Train Loss: {}. Train Acc: {}".format(train_loss, train_acc*100))
    print("Test Loss: {}. Test Acc: {}".format(test_loss, test_acc*100))

EPOCH: 0, TIME: 1:46
Train Loss: 0.2065124011039734. Train Acc: 65.8360002875328
Test Loss: 0.16567027720808983. Test Acc: 75.89199985265732
EPOCH: 1, TIME: 1:47
Train Loss: 0.16196294552087784. Train Acc: 76.4119998574257
Test Loss: 0.1575906427949667. Test Acc: 77.41199966669082
EPOCH: 2, TIME: 1:46
Train Loss: 0.15299007812142373. Train Acc: 77.94399963617325
Test Loss: 0.1421374605000019. Test Acc: 79.69199942350387
EPOCH: 3, TIME: 1:46
Train Loss: 0.14726545952260495. Train Acc: 78.7439996600151
Test Loss: 0.13765579323470592. Test Acc: 80.35999960899353
EPOCH: 4, TIME: 1:49
Train Loss: 0.14309378159046174. Train Acc: 79.46399945020676
Test Loss: 0.13289373557269574. Test Acc: 81.19999957084656
EPOCH: 5, TIME: 1:53
Train Loss: 0.14042587108165025. Train Acc: 79.82799965143204
Test Loss: 0.1405327669903636. Test Acc: 80.16399949789047
EPOCH: 6, TIME: 1:57
Train Loss: 0.13708722899854184. Train Acc: 80.17599952220917
Test Loss: 0.13045398165285588. Test Acc: 81.69199948310852
EPOCH:

### Test Model

In [25]:
cnn_classifier.load_state_dict(torch.load('./tut4-model.pt'))
cnn_classifier.eval()

CNN(
  (embedding): Embedding(108875, 300)
  (conv1): Sequential(
    (0): Conv2d(1, 3, kernel_size=(10, 10), stride=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(3, 3, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (FC1): Linear(in_features=972, out_features=800, bias=True)
  (FC2): Linear(in_features=800, out_features=200, bias=True)
  (out): Linear(in_features=200, out_features=1, bias=True)
)

In [26]:
def predict_new_text(model, text):
    predict_df = pd.DataFrame( {'Message' : [text_to_classify]})
    predict_df = tokenize_pad_slice_column_to_int(predict_df, 'Message', vocab_dict, 300, stop_words)

    predict = predict_df.processed_row.tolist()
    predict = torch.from_numpy(np.vstack(predict))
    predict = np.round((model(predict)[0][0]).detach().numpy())
    
    if predict == 0:
        print ("Negative Review")
    if predict == 1:
        print ("Positive Review")


In [27]:
text_to_classify = """This movie is the beginning of the culmination of Marvel's masterfully woven cinematic universe. 
                    Beginning back in 2008 with iron man, we are finally seeing the results of all the movies have been pointing to; 
                    and it did not disappoint. Thanos is a complex villain, with deeper and more interesting desires than just "world domination." 
                    The dilemmas all the characters face in this movie (both the heroes and the villains) are truly thought provoking and 
                    leave you on the edge of your seat. No other set of movies has beeen so involved, so expanded, and encompassed so many story
                    lines/characters and previous movies. The sheer amount of star power alone in this film is insane; and they do a masterful
                    job of weaving all these unique and various characters into a common storyline."""

predict_new_text(cnn_classifier, text_to_classify)

Positive Review
