In [None]:
from IPython.display import Image
from IPython.core.display import HTML
import getpass

from traitlets.config.manager import BaseJSONConfigManager
# Directory holding the notebook front-end configuration of the
# "rise_latest" conda environment for the current user.
# NOTE(review): absolute, macOS-style path -- adjust it if the environment
# lives somewhere else on your machine.
path = "/Users/{}/anaconda3/envs/rise_latest/etc/jupyter/nbconfig".format(getpass.getuser())
cm = BaseJSONConfigManager(config_dir=path)

# Slideshow settings to store in the "livereveal" (RISE) config section
livereveal_settings = {
    "theme": "sky",
    "transition": "fade",
    "start_slideshow_at": "selected",
}
o = cm.update("livereveal", livereveal_settings)

# Import the display helpers from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Stretch the notebook container to the full width of the browser window
display(HTML("<style>.container { width:100% !important; }</style>"))

# Vectorization and Classification with RNNs

## Topics
* RNN Encoding for Text Classification
* Recurrent Neural Networks
* Word Embeddings
* Example

## Multi-class Document Classification
<br>
<br>
<center>
<img src="src/Shape_of_NLP_Problems_2.png?" alt="Multi-class document classification" style="width:968px">
</center>   

## Text Vectorization with Recurrent Neural Networks
<br>
<br>
<center>
<img src="src/Shape_of_NLP_Problems_6.png?" alt="Text vectorization with recurrent neural networks" style="width:968px">
</center>   

## Recurrent Neural Networks
<center>
<img src="src/0_rnn.png?" alt="Recurrent neural network" style="height:400px">
</center> 
<font size="-1">
Images graciously sourced from <a href="http://colah.github.io/posts/2015-08-Understanding-LSTMs/">Understanding LSTM Networks</a> by Christopher Olah
</font>

## Recurrent Neural Networks (unrolled)

<br>
<br>
<center>
<img src="src/1_rnn.png?" alt="Unrolled recurrent neural network" style="height:300px">
</center> 

## Vanilla Recurrent Neural Networks

<br>
<br>
<center>
<img src="src/0_lstm.png?" alt="Vanilla recurrent neural network" style="height:300px">
</center> 

## Long Short-term Memory (LSTM) Networks

<br>
<br>
<center>
<img src="src/1_lstm.png?" alt="Long short-term memory (LSTM) network" style="height:300px">
</center> 

<font size="-1">
For a deeper dive into the necessity for and implementation of LSTM networks, see <a href="http://colah.github.io/posts/2015-08-Understanding-LSTMs/">Understanding LSTM Networks</a> by Christopher Olah
</font>

## Encoding a Sequence to a Single Vector

<br>
<br>
<center>
<img src="src/0_rnn_encoding.png?" alt="Encoding a sequence to a single vector with an RNN" style="height:300px">
</center> 

## Sequence Classification

<br>
<br>
<center>
<img src="src/0a_rnn_encoding.png?" alt="Sequence classification with an RNN encoding" style="height:300px">
</center> 

## Sequence Classification

<br>
<br>
<center>
<img src="src/0b_rnn_encoding.png?" alt="Sequence classification with an RNN encoding (continued)" style="height:300px">
</center> 

<center>
<h2> Wait, what about the inputs to the RNN??</h2>
</center>

## Word Embeddings: Bag-of-Words vs. Dense Representations 
<br>
<center>
<img src="src/0_bow_vs_dense.png?" alt="bow_vs_dense" style="height:400px">
</center> 

## LSTM Text Encoding and Classification Example

In [None]:
# Required imports
import torch
import numpy as np
import pandas as pd
from torch.nn import Linear, Embedding, RNN, GRU, LSTM
from torch.nn import Sigmoid, LogSoftmax
from torch.optim import SGD
from torch.nn import BCELoss, NLLLoss, CrossEntropyLoss
from string import punctuation
import itertools
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Load the data into a DataFrame
# NOTE(review): read_pickle unpickles the file, and unpickling can execute
# arbitrary code -- only load this file if it comes from a trusted source.
data = pd.read_pickle('../data/2_r8.pkl')

# Define a simple convenience function for cleaning the strings
def clean_text(text):
    """Lowercase ``text`` and drop every character found in ``string.punctuation``.

    Punctuation is removed outright (not replaced by a space), so
    ``"money-fx"`` becomes ``"moneyfx"``. Whitespace is left untouched.
    """
    lowered = text.lower()
    # A translation table that maps each punctuation character to "deleted"
    strip_punctuation = str.maketrans("", "", punctuation)
    return lowered.translate(strip_punctuation)

# Clean the raw document text (lowercase, punctuation removed) and store
# the result in a new 'text_cleaned' column; the 'text' column is kept as-is
data['text_cleaned'] = data['text'].map(clean_text)
# Show the first few rows as the cell output
data.head()

In [None]:
# first start by splitting the strings on whitespace,
# then determine all of the unique words in the corpus
text_split = data['text_cleaned'].map(lambda x: x.split())
all_words = set(itertools.chain.from_iterable(text_split))
vocab_size = len(all_words)

# next, determine all of the unique labels
# (pandas returns them in order of first appearance)
all_labels = list(data['label'].unique())
label_size = len(all_labels)

# create two lookups that translate word <-> integer index
# note that this is similar to the underlying representation
# of a simple count vectorizer.
# The iteration order of a set of strings is not stable between
# kernel sessions, so the vocabulary is sorted first to make the
# word -> index assignment reproducible.
word2idx = {word: idx for idx, word in enumerate(sorted(all_words))}
idx2word = {idx: word for word, idx in word2idx.items()}

# create a similar lookup for the labels: label <-> integer
label2idx = {label: idx for idx, label in enumerate(all_labels)}
idx2label = {idx: label for label, idx in label2idx.items()}

# encode the text as a list of integer word indices,
# reusing the tokens that were already split above
data['text_encoded'] = text_split.map(lambda words: [word2idx[word] for word in words])

# encode each label as a one-element list holding its class index.
# The list wrapper is deliberate: torch.LongTensor([idx]) yields a
# target of shape (1,), matching a batch size of one. The label is looked
# up whole (not split on whitespace), so labels containing spaces work too.
data['label_encoded'] = data['label'].map(lambda label: [label2idx[label]])

# grab the labels and features
# and create training and testing sets
labels = data['label_encoded'].values
features = data['text_encoded']
# fixed random_state so the train/test split is identical on every run
train_data, test_data = train_test_split(list(zip(features, labels)), random_state=42)

In [None]:
# let's define the pieces that we'll
# need for the model

# the two sizes below are shared by several layers, so they are
# named once here instead of being repeated as bare numbers
EMBEDDING_DIM = 100  # dimension of each word vector
HIDDEN_DIM = 50      # dimension of the LSTM hidden state

# we'll start with an embedding layer
# the input size is the size of our vocabulary
# (we'll need a row for every word in the input)
# and the output size is the dimension that
# we'll want for our word vectors
embedding = Embedding(num_embeddings=vocab_size, embedding_dim=EMBEDDING_DIM)

# once we've converted our tokens to
# vectors via an embedding layer, we'll
# want to run a sequence of these vectors
# through an LSTM layer. The input size of
# the LSTM is our embedding dimension,
# and the hidden dimension can be chosen by us

lstm = LSTM(input_size=EMBEDDING_DIM, hidden_size=HIDDEN_DIM)

# because the forward pass of the LSTM
# requires the hidden state from the previous
# step as input, we'll have to initialize
# the hidden state vectors. this will
# need to be done at the beginning of each iteration
# before we run any new sequence through the LSTM.
# Both tensors have shape (num_layers, batch_size, hidden_size) = (1, 1, HIDDEN_DIM)

h0 = torch.zeros(1, 1, HIDDEN_DIM)
c0 = torch.zeros(1, 1, HIDDEN_DIM)
lstm_hidden = h0, c0

# we'll be taking the last output of
# the LSTM sequence which will be the
# same dimension as the hidden layer.
# We'll then need a single linear layer
# to act as a classifier. The input size
# should then be the same as the hidden dim
# of the LSTM, and the output size should be
# the same as our number of classes for the
# classification task

linear = Linear(HIDDEN_DIM, label_size)

# lastly, we'll want to normalize the final output
# to a (log-)softmax distribution over the classes
softmax = LogSoftmax(dim=1)

# NLLLoss expects log-probabilities, which is exactly
# what LogSoftmax produces
criterion = NLLLoss()

In [None]:
# start by taking a sample feature and sample target.
# `features` is a pandas Series, so .iloc is used to pick the first row
# by position -- consistent with the positional labels[0] lookup and
# independent of what the DataFrame index happens to contain
f = features.iloc[0]
t = labels[0]

# cast them to torch tensors
X = torch.LongTensor(f)
y = torch.LongTensor(t)
print("Integer Feature Sequence Shape:", X.shape)
print("Integer Target Shape:", y.shape)

# pass the sequence through the embedding layer
embedded_sequence = embedding(X)
print("Embedding Sequence Shape:", embedded_sequence.shape)

# the LSTM takes input tensors of shape:
# (seq_len, batch_size, input_dimension)
# so we'll use the .unsqueeze() method
# of the torch tensor to insert a batch
# dimension of size one at position 1
embedded_sequence = embedded_sequence.unsqueeze(1)

# Pass the embedded sequence through the lstm layer.
# The freshly initialized (h0, c0) state is passed in explicitly so that
# re-running this cell always starts from zeros instead of silently
# continuing from the state left behind by the previous run
lstm_output, lstm_hidden = lstm(embedded_sequence, (h0, c0))
print("LSTM Output Shape:", lstm_output.shape)

# retain only the final output state
# of the lstm
final_output = lstm_output[-1]
print("Linear Layer Input Shape:", final_output.shape)

# run the final output through the linear layer
linear_output = linear(final_output)
print("Linear Output Shape:", linear_output.shape)

# run the linear output through the softmax activation
softmax_output = softmax(linear_output)
print("Softmax Output Shape:", softmax_output.shape)
print("Target Shape:", y.shape)

# calculate the loss w.r.t. the target
loss = criterion(softmax_output, y)
# .item() extracts the scalar loss as a plain Python float
print("Loss Value:", loss.item())

In [None]:
# NOTE(review): star import kept as-is because other cells may rely on names
# it brings in; prefer `from modules.classification import rnn_classifier`
# once that is confirmed to be the only name needed.
from modules.classification import *

# we only feed the first MAX_SEQ_LEN tokens of every document to the model
MAX_SEQ_LEN = 32

model = rnn_classifier(vocab_size = vocab_size,
                       embedding_dim=100,
                       hidden_dim=50,
                       output_dim=label_size,
                       batch_size=1)

optim = SGD(params=model.parameters(), lr=0.01)
criterion = NLLLoss()

for i in range(10):
    # ---- training pass: one SGD step per document ----
    total_loss = 0
    model.train()
    for f, t in train_data:
        X = torch.LongTensor(f[:MAX_SEQ_LEN])
        y = torch.LongTensor(t)

        # reset the recurrent state and the accumulated gradients
        # before processing a new sequence
        model.hidden = model.init_hidden()
        optim.zero_grad()

        output = model.forward(X)
        loss = criterion(output, y)
        total_loss += loss.item()

        loss.backward()
        optim.step()

    # ---- evaluation pass: no gradients are needed ----
    model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for f, t in test_data:
            X = torch.LongTensor(f[:MAX_SEQ_LEN])
            y = torch.LongTensor(t)

            model.hidden = model.init_hidden()
            output = model.forward(X)

            y_true.append(y[0].item())
            y_pred.append(torch.argmax(output).item())

    # score once per epoch, after all predictions have been collected
    a = accuracy_score(y_true, y_pred)

    # mean training loss per document (guarded against an empty training set)
    total_loss /= max(len(train_data), 1)

    print("Loss: {:.2f}, Validation Accuracy: {:.2f}".format(total_loss, a))