In [323]:
# This neural network performs sentiment analysis on text 
# Classifies text as positive or negative 

In [324]:
# load and preprocess the data
import numpy as np
from string import punctuation 

# read the raw posts and their labels from disk
with open('data/posts.txt', 'r') as f:
    posts = f.read()
with open('data/labels.txt', 'r') as f:
    labels = f.read()

# standardize posts: lower-case everything, then delete every punctuation character
posts = posts.lower()
all_text = posts.translate(str.maketrans('', '', punctuation))

# one post per line
posts = all_text.split('\n')

# one label per whitespace-separated token
labels = labels.split()

In [325]:
# Preview the first 3 cleaned posts (lower-cased, punctuation removed) to sanity-check preprocessing.
print(posts[:3])

['bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   ', 'story of a man who has unnatural feelings for a pig  starts out with a opening scene that is a terrific example of absurd comedy  a formal orchestra audience is turned into an insane  viole

In [326]:
# first 10 words
# NOTE: the word list is derived from `posts` right here. The `words` variable is
# only created by the encoding cell further down, so referencing it at this point
# fails on a fresh kernel (Restart & Run All).
print(' '.join(posts).split()[:10])

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the']


In [327]:
# encode the words (map words to integers)
from collections import Counter 

# flatten every post into a single list of words
all_posts = ' '.join(posts)
words = all_posts.split()

# word -> number of occurrences
counts = Counter(words)

# vocabulary ordered from most to least frequent word
vocab = [word for word, _ in counts.most_common()]

# word -> integer id; ids start at 1 so that 0 stays free for padding
vocab_encoded = dict(zip(vocab, range(1, len(vocab) + 1)))

# replace every word of every post by its integer id
tokenized_posts = [[vocab_encoded[word] for word in post.split()] for post in posts]

In [328]:
# Show the first post as its sequence of integer word ids.
print(tokenized_posts[0])

[21025, 308, 6, 3, 1050, 207, 8, 2138, 32, 1, 171, 57, 15, 49, 81, 5785, 44, 382, 110, 140, 15, 5194, 60, 154, 9, 1, 4975, 5852, 475, 71, 5, 260, 12, 21025, 308, 13, 1978, 6, 74, 2395, 5, 613, 73, 6, 5194, 1, 24103, 5, 1983, 10166, 1, 5786, 1499, 36, 51, 66, 204, 145, 67, 1199, 5194, 19869, 1, 37442, 4, 1, 221, 883, 31, 2988, 71, 4, 1, 5787, 10, 686, 2, 67, 1499, 54, 10, 216, 1, 383, 9, 62, 3, 1406, 3686, 783, 5, 3483, 180, 1, 382, 10, 1212, 13583, 32, 308, 3, 349, 341, 2913, 10, 143, 127, 5, 7690, 30, 4, 129, 5194, 1406, 2326, 5, 21025, 308, 10, 528, 12, 109, 1448, 4, 60, 543, 102, 12, 21025, 308, 6, 227, 4146, 48, 3, 2211, 12, 8, 215, 23]


In [329]:
# encode the labels: 'positive' becomes 1, every other label becomes 0
encoded_labels = np.array([int(label == 'positive') for label in labels])

In [330]:
# First 3 encoded labels (1 = 'positive', 0 = anything else).
print(encoded_labels[:3])

[1 0 1]


In [331]:
# outlier stats: histogram of post lengths (number of tokens -> number of posts)
post_lengths = Counter(map(len, tokenized_posts))
print('Zero length reviews: {}'.format(post_lengths[0]))
print('Maximum review length: {}'.format(max(post_lengths.keys())))

Zero length reviews: 1
Maximum review length: 2514


In [332]:
# keep only the posts that contain at least one token, together with their labels
non_zero_idx = [idx for idx, tokens in enumerate(tokenized_posts) if tokens]
tokenized_posts = [tokenized_posts[idx] for idx in non_zero_idx]
encoded_labels = np.array([encoded_labels[idx] for idx in non_zero_idx])

In [333]:
# number of posts remaining after the empty ones were dropped
print('Number of posts with non-zero length: ', len(tokenized_posts))

Number of posts with non-zero length:  25000


In [334]:
# pad or truncate all posts to a specific length (to pass to neural net)

# pad/truncate all posts so they can be inputs to the neural network
def mod_features(tokenized_posts, seq_length):
    """Left-pad with zeros or truncate every tokenized post to ``seq_length``.

    Args:
        tokenized_posts: sequence of posts, each a sequence of integer word ids.
        seq_length: number of columns every output row must have.

    Returns:
        Integer array of shape ``(len(tokenized_posts), seq_length)``. Posts
        shorter than ``seq_length`` are right-aligned with leading zeros; longer
        posts keep only their first ``seq_length`` ids. An empty post yields an
        all-zero row.
    """
    # start from an all-zero matrix so untouched cells act as padding
    features = np.zeros((len(tokenized_posts), seq_length), dtype=int)

    for i, post in enumerate(tokenized_posts):
        tokens = np.asarray(post)[:seq_length]
        # An empty post must be skipped: `-0:` would select the whole row and the
        # assignment of a zero-length array would fail to broadcast.
        if tokens.size == 0:
            continue
        features[i, -tokens.size:] = tokens

    return features

# every post is padded/truncated to 200 tokens (integer word ids), not characters
seq_length = 200

# get padded/truncated posts as a (number of posts, seq_length) array
features = mod_features(tokenized_posts, seq_length)

In [335]:
# sanity check: exactly one feature row per tokenized post
assert len(features) == len(tokenized_posts), 'Features and reviews do not have one-to-one mapping.'

# sanity check: the first row (and, the array being rectangular, every row) has seq_length columns
assert len(features[0]) == seq_length, 'Features are not of specified length.'

# print the first three padded/truncated posts
print(features[:3])

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
  21025   308     6     3  1050   207     8  2138    32     1   171    57
     15    49    81  5785    44   382   110   140    15  5194    60   154
      9     1  4975  5852   475    71     5   260    12 21025   308    13
   1978     6    74  2395     5   613    73     6  5194     1 24103     5
   1983 10166     1  5786  1499    36    51    66   204   145    67  1199
   5194 19869     1 37442     4     1   221   883    31  2988    71     4
      1  5787    10   686     2    67  1499    54    10   216     1   383
      9    62     3  1406  3686   783     5  3483   180     1   382    10
   1212 13583    32   308     3   349 

In [336]:
# split data into training, validation, and testing data sets
# x: features, y: labels

# fraction of the data set dedicated to training
split_frac = 0.8

# the first `split_frac` of the rows is the training set, the rest is held out
split_idx = int(len(features) * split_frac)
train_x, train_y = features[:split_idx], encoded_labels[:split_idx]
remaining_x, remaining_y = features[split_idx:], encoded_labels[split_idx:]

# the held-out rows are halved: first half validation, second half testing
split_idx = int(len(remaining_x) * 0.5)
val_x, val_y = remaining_x[:split_idx], remaining_y[:split_idx]
test_x, test_y = remaining_x[split_idx:], remaining_y[split_idx:]

In [337]:
# print the (rows, columns) shape of the feature matrix of each split
print('\t\t\tFeature Shapes:')
print('Training set: \t\t{}'.format(train_x.shape))
print('Validation set: \t{}'.format(val_x.shape))
print('Test set: \t\t{}'.format(test_x.shape))

			Feature Shapes:
Training set: 		(20000, 200)
Validation set: 	(2500, 200)
Test set: 		(2500, 200)


In [338]:
# create data loaders for the data 

import torch
from torch.utils.data import TensorDataset, DataLoader

# mini-batch size shared by all three loaders
batch_size = 50

# wrap each numpy split as a TensorDataset of (features, labels) pairs
train_data = TensorDataset(*map(torch.from_numpy, (train_x, train_y)))
valid_data = TensorDataset(*map(torch.from_numpy, (val_x, val_y)))
test_data = TensorDataset(*map(torch.from_numpy, (test_x, test_y)))

# shuffling loaders over each data set
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [339]:
# obtain one batch of training data
dataiter = iter(train_loader)
# Use the built-in next(): the Python-2-style `.next()` method is not available on
# DataLoader iterators in current PyTorch releases, while next() works everywhere.
sample_x, sample_y = next(dataiter)

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 200])
Sample input: 
 tensor([[   0,    0,    0,  ...,  304,   16,  457],
        [  10,   14, 2802,  ...,   76,  479,   69],
        [   0,    0,    0,  ..., 1629,    2,  659],
        ...,
        [1238,   22,    1,  ...,   10,   89,   23],
        [   0,    0,    0,  ...,  568,    7,    7],
        [ 530,    3,   20,  ...,  558,   32, 6953]])

Sample label size:  torch.Size([50])
Sample label: 
 tensor([0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
        0, 1])


In [340]:
# use the GPU whenever CUDA reports one, otherwise fall back to the CPU
train_on_gpu = torch.cuda.is_available()

print('GPU available. Training on GPU.' if train_on_gpu
      else 'No GPU available. Training on CPU.')

No GPU available. Training on CPU.


In [341]:
# define sentiment analysis recurrent neural network class
import torch.nn as nn 

class SentimentAnalysisRNN(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear -> sigmoid classifier.

    Args:
        vocab_size: number of rows of the embedding table.
        output_size: number of output features of the final linear layer.
        embedding_dim: size of each word embedding (LSTM input size).
        hidden_dim: size of the LSTM hidden state.
        n_layers: number of stacked LSTM layers.
        dropout_prob: dropout probability passed to ``nn.LSTM``.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim,
                 n_layers, dropout_prob=0.5):
        super(SentimentAnalysisRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # embedding and LSTM layers (inputs are batch-first: batch, seq, feature)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=dropout_prob, batch_first=True)

        # dropout applied to the LSTM output before the linear layer
        self.dropout = nn.Dropout(0.3)

        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """Score a batch of token-id sequences.

        Args:
            x: integer tensor of word ids, batch dimension first.
            hidden: ``(h_0, c_0)`` tuple as produced by ``init_hidden``.

        Returns:
            ``(sig_out, hidden)``: ``sig_out`` holds one sigmoid score per
            sample (shape ``(batch,)``), ``hidden`` is the LSTM's final state.
        """
        # embeddings and lstm_out
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the final time step contributes to the prediction, so select it
        # before dropout/linear/sigmoid instead of scoring every step and
        # discarding all but the last one.
        last_step = lstm_out[:, -1, :]

        # dropout and fully-connected layer
        out = self.fc(self.dropout(last_step))

        # sigmoid function, then keep the last output unit of each sample
        sig_out = self.sig(out)[:, -1]

        # return last sigmoid output and hidden state
        return sig_out, hidden

    def init_hidden(self, batch_size):
        """Return zeroed ``(h_0, c_0)`` of shape ``(n_layers, batch_size, hidden_dim)``.

        The tensors take their dtype and device from the model's own parameters,
        so the state always lives where the model lives and no global GPU flag
        is needed.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))
        

In [342]:
# instantiate the model with hyperparameters

# the embedding table needs one row more than the word dict has entries:
# id 0 is used for padding and word ids start at 1
vocab_size = len(vocab_encoded) + 1
output_size = 1
embedding_dim = 1000
hidden_dim = 128
n_layers = 2

net = SentimentAnalysisRNN(
    vocab_size=vocab_size,
    output_size=output_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
)

In [343]:
# print the model summary (its sub-modules and their configuration)
print(net)

SentimentAnalysisRNN(
  (embedding): Embedding(74073, 1000)
  (lstm): LSTM(1000, 128, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [344]:
# binary cross-entropy loss on the model's sigmoid outputs, optimized with Adam
lr = 1e-3
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=net.parameters(), lr=lr)

In [345]:
# train the model

# training params
epochs = 1
counter = 0        # number of training batches processed so far
print_every = 100  # run validation and print stats every this many batches
clip = 5           # maximum gradient norm

# move model to GPU, if available
if train_on_gpu:
    net.cuda()

# train for the set number of epochs
net.train()
for e in range(epochs):

    # batch loop
    # (the loop variable is `targets`, not `labels`, so the notebook-level
    # `labels` list loaded from disk is not overwritten by a tensor)
    for inputs, targets in train_loader:
        counter += 1

        if train_on_gpu:
            inputs, targets = inputs.cuda(), targets.cuda()

        # Every batch holds independent, shuffled posts, so each one starts from
        # a fresh zero state. Sizing it from the batch itself also keeps a
        # smaller final batch from failing.
        h = net.init_hidden(inputs.size(0))

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model (one score per sample)
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        # (no squeeze: it would turn a batch of one into a 0-d tensor that no
        # longer matches the target shape)
        loss = criterion(output, targets.float())
        loss.backward()

        # prevent the exploding gradient problem in RNNs and LSTMs
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # get validation loss
            val_losses = []
            net.eval()
            # no gradients are needed for validation; skipping them saves
            # memory and time
            with torch.no_grad():
                for val_inputs, val_targets in valid_loader:

                    if train_on_gpu:
                        val_inputs, val_targets = val_inputs.cuda(), val_targets.cuda()

                    val_h = net.init_hidden(val_inputs.size(0))
                    val_output, val_h = net(val_inputs, val_h)
                    val_loss = criterion(val_output, val_targets.float())

                    val_losses.append(val_loss.item())

            net.train()

            print('Epoch: {}/{}'.format(e + 1, epochs))
            print('Step: {}'.format(counter))
            print('Loss: {:.6f}'.format(loss.item()))
            print('Validation Loss: {:.6f}'.format(np.mean(val_losses)))

Epoch: 1/1
Step: 100
Loss: 0.642240
Validation Loss: 0.648161
Epoch: 1/1
Step: 200
Loss: 0.612374
Validation Loss: 0.646742
Epoch: 1/1
Step: 300
Loss: 0.586558
Validation Loss: 0.578224
Epoch: 1/1
Step: 400
Loss: 0.580955
Validation Loss: 0.505853


In [346]:
# get test data loss and accuracy

test_losses = []  # per-batch losses
num_correct = 0   # number of correctly classified test posts

net.eval()

# evaluate test data; gradients are not needed for evaluation
with torch.no_grad():
    # (the loop variable is `targets`, not `labels`, so the notebook-level
    # `labels` list is not overwritten by a tensor)
    for inputs, targets in test_loader:

        if train_on_gpu:
            inputs, targets = inputs.cuda(), targets.cuda()

        # fresh zero state sized to this batch (posts are independent, and the
        # final batch may be smaller than batch_size)
        h = net.init_hidden(inputs.size(0))

        # get predicted outputs (one score per sample)
        output, h = net(inputs, h)

        # calculate loss
        test_loss = criterion(output, targets.float())
        test_losses.append(test_loss.item())

        # convert output probabilities to prediction (0 or 1)
        # by rounding to the nearest integer
        pred = torch.round(output)

        # compare prediction to true label; summing on the tensor and reading
        # the scalar with .item() works on CPU and GPU alike (calling .numpy()
        # directly on a CUDA tensor raises)
        correct_tensor = pred.eq(targets.float().view_as(pred))
        num_correct += int(correct_tensor.sum().item())

# print stats
print('Test loss: {:.3f}'.format(np.mean(test_losses)))
test_acc = num_correct / len(test_loader.dataset)
print('Test accuracy: {:.3f}'.format(test_acc))

Test loss: 0.513
Test accuracy: 0.757


In [347]:
# perform inference on a post 
from string import punctuation 

def tokenize_post(post, vocab=None):
    """Convert raw text into a one-element batch of integer word ids.

    Args:
        post: raw text of a single post.
        vocab: optional word -> id mapping; defaults to the notebook-level
            ``vocab_encoded`` built from the training data.

    Returns:
        A list containing one list of word ids. Words missing from the
        vocabulary are skipped instead of raising ``KeyError``, so arbitrary
        unseen text can be scored.
    """
    if vocab is None:
        vocab = vocab_encoded

    # same normalization as the training data: lower-case, strip punctuation,
    # split on whitespace
    post = post.lower()
    text = ''.join([c for c in post if c not in punctuation])
    words = text.split()

    # keep only the words the model has an embedding for
    return [[vocab[word] for word in words if word in vocab]]

def mod_features(tokenized_posts, seq_length):
    """Left-pad with zeros or truncate every tokenized post to ``seq_length``.

    NOTE(review): this re-definition shadows the ``mod_features`` defined in the
    earlier padding cell; consider keeping a single definition.

    Args:
        tokenized_posts: sequence of posts, each a sequence of integer word ids.
        seq_length: number of columns every output row must have.

    Returns:
        Integer array of shape ``(len(tokenized_posts), seq_length)``. Posts
        shorter than ``seq_length`` are right-aligned with leading zeros; longer
        posts keep only their first ``seq_length`` ids. An empty post yields an
        all-zero row.
    """
    # start from an all-zero matrix so untouched cells act as padding
    features = np.zeros((len(tokenized_posts), seq_length), dtype=int)

    for i, post in enumerate(tokenized_posts):
        tokens = np.asarray(post)[:seq_length]
        # An empty post must be skipped: `-0:` would select the whole row and the
        # assignment of a zero-length array would fail to broadcast.
        if tokens.size == 0:
            continue
        features[i, -tokens.size:] = tokens

    return features

def predict(net, post, seq_length=200):
    """Classify a single post and print whether it is positive or negative.

    Args:
        net: trained model exposing ``init_hidden(batch_size)`` and returning
            ``(scores, hidden)`` when called with ``(features, hidden)``.
        post: raw text to classify.
        seq_length: length the tokenized post is padded/truncated to.

    Returns:
        None. The verdict is printed.
    """
    net.eval()

    # tokenize post
    tokenized_post = tokenize_post(post)

    # pad/truncate tokenized post
    features = mod_features(tokenized_post, seq_length)

    # convert to tensor to pass into model
    feature_tensor = torch.from_numpy(features)
    batch_size = feature_tensor.size(0)

    # Put the input and the initial state on whatever device the model itself
    # lives on, rather than trusting a global GPU flag to match it.
    device = next(net.parameters()).device
    feature_tensor = feature_tensor.to(device)
    h = tuple(each.to(device) for each in net.init_hidden(batch_size))

    # inference only: no autograd graph is needed
    with torch.no_grad():
        output, h = net(feature_tensor, h)

    # convert output probabilities to prediction (0 or 1)
    pred = torch.round(output.squeeze())

    # describe result
    if pred.item() == 1:
        print('Positive post detected!')
    else:
        print('Negative post detected!')

In [393]:
# one clearly negative and one clearly positive example post
post_neg = (
    'Today was easily the worst day of my life. '
    'Everything was so disorganized and nothing got done!'
)
post_pos = (
    'Today was without a doubt the best day of my life. '
    'I was so energized and productive!'
)

In [394]:
# classify the negative example; predict() prints its verdict
predict(net, post_neg)

Negative post detected!


In [395]:
# classify the positive example; predict() prints its verdict
predict(net, post_pos)

Positive post detected!
