### Import Packages

In [72]:
import numpy as np
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk import download as nltk_download
nltk_download('punkt')
from collections import Counter
from IPython.display import clear_output
from gensim.models import Word2Vec
from collections import defaultdict
import random

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Create Necessary Functions

In [0]:
def softmax(z, extra_param = None):
    sums = np.sum(np.exp(z))
    return np.exp(z) / sums

def one_hot(y, total_cats = None):
    if total_cats == None:
        y_onehot = np.zeros([len(y), len(np.unique(y))])
    else:
        y_onehot = np.zeros([len(y), total_cats])

    for i in range(len(y)):
        y_onehot[i, y[i]] = 1
        
    return y_onehot

def embed(sentence, hidden_size):
    embedding = np.zeros([len(sentence), hidden_size])
    for i, word in enumerate(sentence):
        embedding[i] = w2v[word]
    return embedding

### Create RNN Class

In [0]:
class RNN():
    
    def __init__(self, input_dim, output_dim, hidden_dim, bptt_truncate = 4):
        
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.bptt_truncate = bptt_truncate
        
        self.U = np.random.uniform(-1 / np.sqrt(self.input_dim), 1 / np.sqrt(self.input_dim),
                                   [self.hidden_dim, self.input_dim])
        self.V = np.random.uniform(-1 / np.sqrt(self.hidden_dim), 1 / np.sqrt(self.hidden_dim),
                                   [self.output_dim, self.hidden_dim])
        self.W = np.random.uniform(-1 / np.sqrt(self.hidden_dim), 1 / np.sqrt(self.hidden_dim),
                                   [self.hidden_dim, self.hidden_dim])
    
    def forward_propagation(self, X):
        # Total number of time steps
        T = X.shape[0]

        # Matrix to hold the hidden state at each step from 0 to T
        S = np.zeros([T + 1, self.hidden_dim])
        # Initialize to zero
        S[-1, :] = np.zeros([self.hidden_dim])
        
        # Matrix to hold output at each time step
        O = np.zeros([T, self.output_dim])
        
        for t in range(T):
            S[t, :] = np.tanh(np.dot(self.U, X[t, :]) + np.dot(self.W, S[t - 1, :]))
            O[t, :] = softmax(np.dot(self.V, S[t, :]))
        return (O, S)
    
    def predict(self, X):
        O, S = self.forward_propagation(X)
        return np.argmax(O[X.shape[0] - 1, :])
    
    def calc_total_loss(self, X, y):
        O, S = self.forward_propagation(X)
        return -1 * np.sum(y * np.log(O[X.shape[0] - 1, :]))
    
    def random_loss(self):
        return np.log(self.input_dim)
    
    def bptt(self, X, y):
        T = X.shape[0]
        
        O, S = self.forward_propagation(X)
        
        dL_dU = np.zeros(self.U.shape)
        dL_dV = np.zeros(self.V.shape)
        dL_dW = np.zeros(self.W.shape)
        
        for t in range(T):
            
#             a_t = np.dot((O - y), self.V) * (1 - S[t, :] ** 2)
            a_t = np.dot((O - y)[t, :], self.V) * (1 - S[t, :] ** 2)

            for step in range(t, max(t - self.bptt_truncate, -1), -1):
                
                dL_dU += np.outer(a_t, X[step, :])
                dL_dW += np.outer(a_t, S[step, :])
                a_t = np.dot(a_t, self.W) * (1 - S[step - 1, :] ** 2)
            
#             dL_dV += np.outer((O - y), S[t, :])
            dL_dV += np.outer((O - y)[t, :], S[t, :])
            
        return (dL_dU, dL_dV, dL_dW)
    
    def sgd_step(self, X, y, lr, eta):
        (dL_dU, dL_dV, dL_dW) = self.bptt(X, y)
        
        # Gradient Clipping
        if np.linalg.norm(dL_dU) > eta:
            dL_dU = dL_dU * eta / np.linalg.norm(dL_dU)
        if np.linalg.norm(dL_dV) > eta:
            dL_dV = dL_dV * eta / np.linalg.norm(dL_dV)
        if np.linalg.norm(dL_dW) > eta:
            dL_dW = dL_dW * eta / np.linalg.norm(dL_dW)
            
        self.U = self.U - lr * dL_dU
        self.V = self.V - lr * dL_dV
        self.W = self.W - lr * dL_dW

### Install Kaggle

In [6]:
# Colab library to upload files to notebook
from google.colab import files

# Install Kaggle library
!pip install -q kaggle

# Upload kaggle API key file
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [7]:
!mkdir /root/.kaggle
!mv kaggle.json /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


### Create Dataset

In [8]:
!kaggle competitions download -c movie-review-sentiment-analysis-kernels-only
!unzip train.tsv.zip
!unzip test.tsv.zip

test.tsv.zip: Skipping, found more recently modified local copy (use --force to force download)
sampleSubmission.csv: Skipping, found more recently modified local copy (use --force to force download)
train.tsv.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  train.tsv.zip
replace train.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
Archive:  test.tsv.zip
replace test.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [9]:
!head train.tsv

PhraseId	SentenceId	Phrase	Sentiment
1	1	A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .	1
2	1	A series of escapades demonstrating the adage that what is good for the goose	2
3	1	A series	2
4	1	A	2
5	1	series	2
6	1	of escapades demonstrating the adage that what is good for the goose	2
7	1	of	2
8	1	escapades demonstrating the adage that what is good for the goose	2
9	1	escapades	2


In [0]:
with open('./train.tsv', 'r') as file:
    lines = [line.rstrip().split('\t')[2:] for line in file if line != 'PhraseId\tSentenceId\tPhrase\tSentiment\n']
    
sentiments = np.array([int(line[1]) for line in lines])
y = one_hot(sentiments)

large_corpus = [[word.lower() for word in word_tokenize(line[0])] for line in lines if len([word.lower() for word in word_tokenize(line[0])]) > 0]

In [0]:
corpus = [sentence for sentence in large_corpus if sentence[-1] == '.']

word_counts = defaultdict(int)
for row in corpus:
    for word in row:
        word_counts[word] += 1
        
vocab = sorted(list(word_counts.keys()), reverse = False)
word_index = {word: i for i, word in enumerate(vocab)}
index_word = {i: word for i, word in enumerate(vocab)}
vocab_size = len(vocab)

hidden_size = 128
w2v = Word2Vec(corpus, min_count = 1, size = hidden_size, workers = 12, window = 5, iter = 500, sg = 1)

# X = [[word_index[word] for word in sentence] for sentence in corpus]

### Train Model

In [119]:
rnn = RNN(hidden_size, y.shape[1], hidden_size, 6)
total_losses = []
epoch_size = 5000

for i in range(1):
    if i % 10 == 9:
        clear_output()
        plt.plot(total_losses)
        plt.show()
        print('Epoch: ' + str(i) + ' - Current Loss: ' + str(total_losses[-1]))
    elif i > 0:
        print('Epoch: ' + str(i) + ' - Current Loss: ' + str(total_losses[-1]))
    losses = 0
    idx = random.sample([i for i in range(epoch_size)], epoch_size)
    for non_random_idx, i in enumerate(idx):
        if non_random_idx % (epoch_size / 10) == 0:
            print('    Batches Finished: ' + str(non_random_idx))
        rnn.sgd_step(embed(corpus[i], hidden_size), y[i], .0001, 2)
        losses += rnn.calc_total_loss(embed(corpus[i], hidden_size), y[i])
    total_losses.append(losses / epoch_size)



# plt.plot(losses)

    Batches Finished: 0




    Batches Finished: 500
    Batches Finished: 1000
    Batches Finished: 1500
    Batches Finished: 2000
    Batches Finished: 2500
    Batches Finished: 3000
    Batches Finished: 3500
    Batches Finished: 4000
    Batches Finished: 4500


### Predict Sentiment From Trained Model

In [121]:
print(y[50:60])
[rnn.forward_propagation(embed(sentence, hidden_size))[0][len(sentence) -1, :] for sentence in corpus[50:60]]


[[0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]]




[array([0.09560893, 0.14054741, 0.47282912, 0.1690501 , 0.12196444]),
 array([0.11832074, 0.1841777 , 0.39476113, 0.1680638 , 0.13467662]),
 array([0.11832306, 0.18418106, 0.39475416, 0.16806421, 0.13467751]),
 array([0.06855058, 0.12728227, 0.57020102, 0.13676098, 0.09720515]),
 array([0.06864851, 0.12735809, 0.5697736 , 0.1369038 , 0.097316  ]),
 array([0.10114049, 0.18027122, 0.46662251, 0.14763339, 0.1043324 ]),
 array([0.10114205, 0.18027319, 0.46661781, 0.14763326, 0.10433369]),
 array([0.11294759, 0.1635729 , 0.42299351, 0.17485072, 0.12563527]),
 array([0.11464125, 0.16469108, 0.41864752, 0.17575831, 0.12626185]),
 array([0.09826658, 0.15512971, 0.47978986, 0.15405778, 0.11275607])]