In [None]:
import getpass
from pathlib import Path

from IPython.display import HTML, Image, display
from traitlets.config.manager import BaseJSONConfigManager

# Slideshow settings are written to the "livereveal" section of the
# notebook config that lives inside the `rise_latest` conda environment.
# The directory is resolved from the current user's home directory
# instead of a hard-coded "/Users/<name>" prefix, so the cell also works
# when the home directory is not under /Users.
# NOTE(review): assumes anaconda3 is installed directly in the home
# directory, as the original path did -- confirm for your machine.
path = str(
    Path.home() / "anaconda3" / "envs" / "rise_latest" / "etc" / "jupyter" / "nbconfig"
)
cm = BaseJSONConfigManager(config_dir=path)
o = cm.update("livereveal", {
              "theme": "sky",
              "transition": "fade",
              "start_slideshow_at": "selected",
})

# Stretch the notebook container to the full width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))

# LSTMs for Sequence to Sequence Modeling

## Fixed Length Sequence to Sequence
<br>
<br>
<center>
<img src="src/Shape_of_NLP_Problems_7.png?" alt="Fixed-length sequence-to-sequence problem shape" style="width:968px">
</center>   

## Use Cases

* POS Tagging

* Named Entity Recognition

* Extractive Summarization

## Use Cases
* **POS Tagging**
* Named Entity Recognition
* Extractive Summarization

## Before: Encoding a Sequence to a single Vector

<br>
<br>
<center>
<img src="src/0_rnn_encoding.png?" alt="RNN encoding a sequence into a single vector" style="height:300px">
</center> 

## Now: Encoding a Sequence to a _Sequence_ of Vectors

<br>
<br>
<center>
<img src="src/1_rnn_encoding.png?" alt="RNN encoding a sequence into a sequence of vectors" style="height:300px">
</center> 

## Implementing Part-of-Speech Tagging

<br>
<br>
<center>
<img src="src/1a_rnn_encoding.png?" alt="RNN encoding used for part-of-speech tagging" style="height:300px">
</center> 

In [None]:
# Required imports
import torch
import numpy as np
import pandas as pd
import pickle
from torch.nn import Linear, Embedding, RNN, GRU, LSTM
from torch.nn import Sigmoid, LogSoftmax, Softmax
from torch.optim import SGD
from torch.nn import BCELoss, NLLLoss, CrossEntropyLoss
from string import punctuation
import itertools
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

In [None]:
# Location of the pickled Penn Treebank POS-tagging frame, relative to
# the notebook. NOTE: unpickling executes code embedded in the file, so
# only load pickles that come from a trusted source.
treebank_pickle = '../data/3_penn_treebank_pos.pkl'
data = pd.read_pickle(treebank_pickle)

# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Collect every distinct token and every distinct tag that occurs in the
# corpus; each row of `data['text']` / `data['label']` is an iterable.
all_words = set(itertools.chain.from_iterable(data['text']))
all_labels = set(itertools.chain.from_iterable(data['label']))

# A set of strings iterates in a hash-dependent order that changes from
# one interpreter run to the next, so enumerating the set directly would
# give every kernel restart a different token -> id mapping. Sorting
# first makes the ids stable across runs.
# NOTE(review): assumes all tokens (and all tags) are mutually
# comparable, e.g. all strings -- confirm against the dataset.
word2idx = {word: idx for idx, word in enumerate(sorted(all_words))}
idx2word = {idx: word for word, idx in word2idx.items()}

label2idx = {label: idx for idx, label in enumerate(sorted(all_labels))}
idx2label = {idx: label for label, idx in label2idx.items()}

# Sizes used later for the embedding input and the classifier output.
vocab_size = len(all_words)
label_size = len(all_labels)

In [None]:
# Replace every token / tag by its integer id, one list per sentence.
features = [[word2idx[token] for token in tokens] for tokens in data['text']]
labels = [[label2idx[tag] for tag in tags] for tags in data['label']]

# Keep each sentence paired with its tag sequence while splitting.
# A fixed random_state makes the train/test partition identical on
# every run, so reported accuracies are comparable between runs.
train_data, test_data = train_test_split(list(zip(features, labels)),
                                         random_state=42)

In [None]:
# let's define the pieces that we'll 
# need for the model

# we'll start with an embedding layer
# the input size is the size of our vocabulary 
# (we'll need a row for every word in the input)
# and the output size is the dimension that
# we'll want for our word vectors
# TODO: CREATE AN EMBEDDING LAYER

# once we've converted our tokens to 
# vectors via an embedding layer, we'll
# want to run a sequence of these vectors
# through an LSTM layer. The input size of
# the LSTM is our embedding dimension, 
# and the hidden dimension can be chosen by us
# TODO: CREATE AN LSTM

# because the forward pass of the LSTM
# requires the hidden state from the previous
# step as input, we'll have to initialize
# the hidden state vectors. this will
# need to be done at the beginning of each iteration
# before we run any new sequence through the LSTM
# TODO: CREATE A TUPLE CONTAINING THE HIDDEN
# AND CELL STATES INITIALIZED TO ZEROS

# we'll be keeping the output of the LSTM at
# every step of the sequence (one per token),
# each of which will be the same dimension as
# the hidden layer. We'll then need a single
# linear layer to act as a classifier. The input
# size should then be the same as the hidden dim
# of the LSTM, and the output size should be
# the same as our number of classes for the
# classification task
# TODO: CREATE A LINEAR LAYER TO TRANSFORM THE LSTM OUTPUT

# lastly, we'll want to normalize the final output
# to a log-softmax distribution
# TODO: CREATE A LOGSOFTMAX TRANSFORMER TO CONVERT THE
# LINEAR OUTPUT TO A LOGSOFTMAX DISTRIBUTION

# we'll want to use NLLLoss for this again
# TODO: CREATE AN INSTANCE OF THE NLLLOSS FUNCTION

In [None]:
# Exercise: walk a single training example through every piece of the
# model by hand, printing the tensor shape after each step. The cell is
# a template and will not run until each `#TODO` has been filled in.

# start by taking a sample feature and sample target
f = #TODO
t = #TODO

# cast them to torch tensors
X = #TODO
y = #TODO
print("Integer Feature Sequence Shape:", X.shape)
print("Integer Target Shape:", y.shape)

# pass the sequence through the embedding layer
embedded_sequence = #TODO
print("Embedding Sequence Shape:", embedded_sequence.shape)

# the LSTM takes input tensors of shape:
# (seq_len, batch_size, input_dimension)
# so we'll use the .view() method
# of the torch tensor to reshape the embedding
# and insert an additional (batch) dimension
embedded_sequence = #TODO

# pass the embedded sequence through the LSTM layer
lstm_output, lstm_hidden = #TODO
print("LSTM Output Shape:", lstm_output.shape)

# this time, we want to retain the entire sequence
# of outputs from the LSTM (one output per token)
final_output = #TODO
print("Linear Layer Input Shape:", final_output.shape)

# run the final output through the linear layer
linear_output = #TODO
print("Linear Output Shape:", linear_output.shape)

# run the linear output through the log-softmax activation
softmax_output = #TODO
print("Softmax Output Shape:", softmax_output.shape)

# We need to squeeze out the batch dimension of the
# softmax output before we run it through our loss function
softmax_squeezed = #TODO
print("Softmax Squeezed Shape:", softmax_squeezed.shape)
print("Target Shape:", y.shape)

# calculate the loss w.r.t. the target
loss = #TODO
print("Loss Value:", loss.data.numpy())

In [None]:
from modules.tagging import pos_tagger

# Seed torch's RNG before the model is built so that weight
# initialisation, and therefore the numbers printed below, repeat
# across kernel restarts.
torch.manual_seed(42)

model = pos_tagger(vocab_size=vocab_size,
                   embedding_dim=100,
                   hidden_dim=50,
                   output_dim=label_size,
                   batch_size=1)

optim = SGD(params=model.parameters(), lr=0.01)
criterion = NLLLoss()

# Train for 10 epochs, one sentence per optimisation step, and report
# the mean training loss plus token-level accuracy on both splits after
# every epoch.
for i in range(10):
    # ---- training pass -------------------------------------------------
    total_loss = 0.0
    model.train()
    y_true_train = []
    y_pred_train = []
    for f, t in train_data:
        X = torch.LongTensor(f)
        y = torch.LongTensor(t)

        optim.zero_grad()
        # Reset the recurrent state so that nothing carries over from
        # the previous sentence.
        model.hidden = model.init_hidden()
        # squeeze(1) drops the size-1 axis at position 1 so the output
        # lines up with the 1-D target expected by the loss.
        output = model.forward(X).squeeze(1)
        loss = criterion(output, y)
        loss.backward()
        optim.step()

        # .item() yields a plain Python float and keeps no reference to
        # the autograd graph.
        total_loss += loss.item()
        prediction = torch.argmax(output, dim=1)
        y_true_train.extend(y.tolist())
        y_pred_train.extend(prediction.tolist())

    # ---- validation pass -----------------------------------------------
    model.eval()
    y_pred = []
    y_true = []
    # No gradients are needed for evaluation; no_grad() avoids building
    # an autograd graph for every held-out sentence.
    with torch.no_grad():
        for f, t in test_data:
            X = torch.LongTensor(f)
            y = torch.LongTensor(t)

            model.hidden = model.init_hidden()
            output = model.forward(X).squeeze(1)
            prediction = torch.argmax(output, dim=1)

            y_true.extend(y.tolist())
            y_pred.extend(prediction.tolist())

    a = accuracy_score(y_true, y_pred)
    a_train = accuracy_score(y_true_train, y_pred_train)
    # Average over the number of training sentences; max(..., 1) keeps
    # the division defined when the training split is empty.
    total_loss /= max(len(train_data), 1)

    print("Loss: {:.2f}, Training Accuracy: {:.2f}, Validation Accuracy: {:.2f}".format(total_loss, a_train, a))

In [None]:
# Put the trained model into evaluation mode before tagging new
# sentences. As the last expression of the cell, the call's return value
# is displayed.
# NOTE(review): presumably `model` is a torch nn.Module, in which case
# this prints the layer summary -- confirm against modules.tagging.
model.eval()

In [None]:

sentence = "we run home ."
words = sentence.lower().split()

# Fail with a readable message when a word was never seen in the corpus
# (the bare dictionary lookup would only report the first missing key).
unknown_words = [w for w in words if w not in word2idx]
if unknown_words:
    raise KeyError("Words not in the vocabulary: {}".format(unknown_words))
sample = [word2idx[w] for w in words]

# Mirror the training loop: reset the recurrent state, then drop the
# size-1 axis at position 1 so that argmax(dim=1) runs over the tag
# scores of each token rather than over that singleton axis.
model.hidden = model.init_hidden()
with torch.no_grad():  # inference only, no autograd graph needed
    tag_scores = model.forward(torch.LongTensor(sample)).squeeze(1)
preds = [idx2label[i] for i in torch.argmax(tag_scores, dim=1).tolist()]

for word, pred in zip(words, preds):
    print(word, pred)

In [None]:
sentence = "I went for a run today"
words = sentence.lower().split()

# Fail with a readable message when a word was never seen in the corpus
# (the bare dictionary lookup would only report the first missing key).
unknown_words = [w for w in words if w not in word2idx]
if unknown_words:
    raise KeyError("Words not in the vocabulary: {}".format(unknown_words))
sample = [word2idx[w] for w in words]

# Mirror the training loop: reset the recurrent state, then drop the
# size-1 axis at position 1 so that argmax(dim=1) runs over the tag
# scores of each token rather than over that singleton axis.
model.hidden = model.init_hidden()
with torch.no_grad():  # inference only, no autograd graph needed
    tag_scores = model.forward(torch.LongTensor(sample)).squeeze(1)
preds = [idx2label[i] for i in torch.argmax(tag_scores, dim=1).tolist()]

for word, pred in zip(words, preds):
    print(word, pred)