In [None]:
# Notebook / slideshow setup: display helpers, slideshow configuration, page width.
import getpass

# `display`, `HTML` and `Image` are all imported from the public `IPython.display`
# module; importing them from `IPython.core.display` is deprecated.
from IPython.display import HTML, Image, display
from traitlets.config.manager import BaseJSONConfigManager

# NOTE(review): this path is machine-specific (a `/Users/<name>` home directory and
# a conda environment named `rise_latest`) -- adjust it when running elsewhere.
path = "/Users/{}/anaconda3/envs/rise_latest/etc/jupyter/nbconfig".format(getpass.getuser())

# Update the "livereveal" section of the notebook configuration with the
# slideshow theme, transition and start position.
cm = BaseJSONConfigManager(config_dir=path)
o = cm.update("livereveal", {
    "theme": "sky",
    "transition": "fade",
    "start_slideshow_at": "selected",
})

# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

<center>
<h1> Deep Learning and Modern Natural Language Processing </h1>
<br>
Zachary S. Brown
<br>
</center>

## Outline
0. NLP Problem Structure
1. Text Classification and the Perceptron
2. Vectorization and Classification with RNNs
3. POS Tagging with RNNs
4. Sequence to Sequence Modeling

## NLP Problem Structure

## General Problem Structure
<br>
<br>
<center>
<img src="src/Shape_of_NLP_Problems_0.png?" alt="perceptron" style="width:968px">
</center>   

## Binary Document Classification
<br>
<br>
<center>
<img src="src/Shape_of_NLP_Problems_1.png?" alt="perceptron" style="width:968px">
</center>   

## Multi-class Document Classification
<br>
<br>
<center>
<img src="src/Shape_of_NLP_Problems_2.png?" alt="perceptron" style="width:968px">
</center>   

## Multi-class _Sequence_ Classification
<br>
<br>
<center>
<img src="src/Shape_of_NLP_Problems_4.png?" alt="perceptron" style="width:968px">
</center>   

## Starting Easy: Neural Net with Traditional Vectorization
<br>
<br>
<center>
<img src="src/Shape_of_NLP_Problems_5.png?" alt="perceptron" style="width:968px">
</center>   

## Text Classification and the Perceptron

## Topics
* The perceptron and neural network optimization
* Example

## The Perceptron
<center>
<img src="src/0_Perceptron.png?" alt="perceptron" style="height:400px">
</center>   

## Weights
<center>
<img src="src/1_Perceptron.png?" alt="perceptron" style="height:400px">
</center>   

## Forward Pass
<center>
<img src="src/2_Perceptron.png?" alt="perceptron" style="height:400px">
</center>   

## Loss
<center>
<img src="src/3_Perceptron.png?" alt="perceptron" style="height:400px">
</center>   

## Calculate Gradients
<center>
<img src="src/4_Perceptron.png?" alt="perceptron" style="height:400px">
</center>   

## Update Weights
<center>
<img src="src/5_Perceptron.png?" alt="perceptron" style="height:400px">
</center>   

## Perceptron Example

In [None]:
# Required imports
import torch
import numpy as np
import pandas as pd
from torch.nn import Linear
from torch.nn import Sigmoid, LogSoftmax
from torch.optim import SGD
from torch.nn import BCELoss, NLLLoss
from string import punctuation
import itertools
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

## 0. Dataset Loading and Cleaning

We'll begin by loading a prepared version of the [Stanford Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), on which we'll train a binary classifier. 

This dataset contains 50k highly polarized movie reviews from IMDB, labeled with positive or negative sentiment. 

We'll perform some minimal preprocessing on the text itself: simply case-normalization and removal of punctuation.

In [None]:
# Load the prepared IMDB review data into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
data = pd.read_pickle('../data/1a_acl_imdb.pkl')

# Define a simple convenience function for cleaning the strings
def clean_text(text):
    """Lower-case `text` and drop every character that appears in `string.punctuation`."""
    return text.lower().translate(str.maketrans("", "", punctuation))

# Clean the review text (not the labels) and store the result in a new column
data['text_cleaned'] = data['text'].map(clean_text)
# Preview the first rows, including the new `text_cleaned` column
data.head()

# 1. Text Vectorization

Once we have some (somewhat) clean data, we can then vectorize the corpus using the standard term frequency–inverse document frequency (TF-IDF) weighting. For the sake of time, we'll limit the overall input feature space to the top 1k tokens, based on their term frequency across the corpus.

In [None]:
# First pass: fit an unrestricted vectorizer, purely to report
# how many distinct tokens the cleaned corpus contains
tfidf = TfidfVectorizer()
tfidf.fit(data['text_cleaned'])
print("Total tokens in input corpus: {}".format(len(tfidf.vocabulary_)))

# Second pass: rebuild the vectorizer with a capped vocabulary size
max_features = 1000
tfidf = TfidfVectorizer(max_features=max_features)

# Vectorize the cleaned text, then densify the result
tfidf_sparse = tfidf.fit_transform(data['text_cleaned'])
features = tfidf_sparse.todense()

# Targets reshaped into a column, one row per document
labels = data.label.values.reshape(-1, 1)

# Pair each feature row with its label,
# then perform a stratified train/test split
all_data = list(zip(features, labels))
train_data, test_data = train_test_split(
    all_data,
    stratify=labels,
    random_state=42,
)

# 2a. Perceptron Classifier

For the simplest perceptron, we'll only need a single linear layer as well as a sigmoid transformation to map the output space from our linear layer into the proper probability distribution. 

Two other things that need to be considered are the choice of loss function and the optimization algorithm. We'll use binary cross entropy for the loss function, and stochastic gradient descent for the optimization.

In [None]:
# A single linear layer: the input size is our feature space,
# the output size is 1 (binary classification)
linear = Linear(in_features=max_features, out_features=1, bias=True)

# Sigmoid instance used to squash the linear output
# into the range [0,1]
sigmoid = Sigmoid()

# Binary cross entropy is an appropriate loss function
# for this type of problem; PyTorch implements it
# as the `BCELoss` class
criterion = BCELoss()

# Basic stochastic gradient descent over the parameters
# of the linear layer (the sigmoid is a transformation
# with no parameters)
optim = SGD(linear.parameters(), lr=0.01)

In [None]:
# Inspect the type of a single feature row. Index `features` directly so this
# cell does not depend on `f`, which is only assigned further down the notebook.
type(features[0])

In [None]:
# let's see what it looks like
# to pass a single example through
# the objects above

# grab a single example
f = features[0]
t = labels[0]

# convert the example to tensors
X = torch.FloatTensor(f)
y = torch.FloatTensor(t)
print("Shape of feature tensor:", X.shape)

# pass the input tensor through the linear layer
# (call the module itself rather than `.forward` so any hooks run)
linear_output = linear(X)
print("Shape of linear_output:", linear_output.shape)

# take the sigmoid of the linear output
sigmoid_output = sigmoid(linear_output)
print("Value of sigmoid_output:", sigmoid_output)

# calculate the loss w.r.t. the expected value;
# BCELoss requires the prediction and the target to have the same
# shape, so flatten the prediction to match `y`
loss = criterion(sigmoid_output.view(-1), y)
print("Value of loss:", loss)

# clear any gradients left over from a previous run of this cell,
# then calculate the gradients
optim.zero_grad()
loss.backward()

# check the current value and gradient
# of the bias
weights, bias = list(linear.parameters())
print("Bias:", bias.data)
print("Bias gradient:", bias.grad)

# take a step with the optimizer
# to update the parameters
optim.step()

# check the value of the bias
weights, bias = list(linear.parameters())
print("Bias:", bias.data)

# 2b. Training the Perceptron

In [None]:
# running sum of the per-example losses for this pass over the training data
total_loss = 0.0
for f, t in tqdm(train_data):

    # zero out our gradients for each weight
    optim.zero_grad()

    # cast the feature and target to
    # the appropriate torch types
    X = torch.FloatTensor(f)
    y = torch.FloatTensor(t)

    # start the forward pass
    X_prime = linear(X)
    output = sigmoid(X_prime)

    # calculate the loss; `.item()` extracts it as a plain Python float
    loss = criterion(output.view(-1), y)
    total_loss += loss.item()

    # calculate the gradients of the
    # loss w.r.t. each of the parameters
    loss.backward()

    # update the weights in the
    # linear layer
    optim.step()

# report the average loss over the pass (guarding against an empty training set)
print("Mean training loss: {:.2f}".format(total_loss / max(len(train_data), 1)))

# 2c. Evaluating the Perceptron

In [None]:
# initialize lists to keep track
# of predicted and actual values
y_pred = []
y_true = []

# set a probability threshold
# for calculating the accuracy
threshold = 0.5

# loop through the test examples; no gradients are needed for evaluation
with torch.no_grad():
    for f, t in test_data:
        X = torch.FloatTensor(f)
        y = torch.FloatTensor(t)
        output = sigmoid(linear(X))
        # `.item()` returns plain Python floats, so the thresholding below
        # compares scalars rather than one-element arrays
        y_true.append(y.item())
        y_pred.append(output.item())

# calculate a prediction,
# then compute the accuracy
y_pred = [int(p >= threshold) for p in y_pred]
a = accuracy_score(y_true, y_pred)

print("Validation Accuracy: {:.2f}".format(a))

# 2d. Creating a Model Class

In [None]:
from modules.perceptron import *

# 2e. Putting it All Together

In [None]:
from modules.perceptron import *


# Build the model, the loss function and the optimizer
model = perceptron(max_features)
criterion = BCELoss()
optim = SGD(params=model.parameters(), lr=0.01)

epochs = 5
# probability threshold used to turn model outputs into class predictions
threshold = 0.5

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0.0
    for f, t in train_data:
        optim.zero_grad()
        X = torch.FloatTensor(f)
        y = torch.FloatTensor(t)
        output = model(X)
        loss = criterion(output.view(-1), y)
        # `.item()` extracts the loss as a plain Python float
        total_loss += loss.item()
        loss.backward()
        optim.step()

    # ---- evaluation pass (no gradients needed) ----
    model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for f, t in test_data:
            X = torch.FloatTensor(f)
            y = torch.FloatTensor(t)
            output = model(X)
            # store scalars so the thresholding below compares plain floats
            y_true.append(y.item())
            y_pred.append(output.item())

    y_pred = [int(p >= threshold) for p in y_pred]
    a = accuracy_score(y_true, y_pred)

    # average the accumulated loss over the number of training examples
    total_loss /= len(train_data)
    print("Epoch Loss: {:.2f}, Validation Accuracy: {:.2f}".format(total_loss, a))


# 3. Multi-class Model

In [None]:
# Load the data into a DataFrame
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
data = pd.read_pickle('../data/1b_stackoverflow_qna.pkl')
# Preview the first rows
data.head()

In [None]:
# Define a simple convenience function for cleaning the strings
def clean_text(text):
    """Lower-case `text` and drop every character that appears in `string.punctuation`."""
    return text.lower().translate(str.maketrans("", "", punctuation))

# Apply the cleaning function to the raw text column
data['text_cleaned'] = data['text'].map(clean_text)

# First pass: fit an unrestricted vectorizer, purely to report
# how many distinct tokens the cleaned corpus contains
tfidf = TfidfVectorizer()
tfidf.fit(data['text_cleaned'])
print("Total tokens in input corpus: {}".format(len(tfidf.vocabulary_)))

# Second pass: rebuild the vectorizer with a capped vocabulary size
max_features = 1000
tfidf = TfidfVectorizer(max_features=max_features)

# Vectorize the cleaned text, then densify the result
tfidf_sparse = tfidf.fit_transform(data['text_cleaned'])
features = tfidf_sparse.todense()

# Encode the class labels as integer ids (reshaped into a column)
# and record how many distinct classes there are
le = LabelEncoder()
encoded_labels = le.fit_transform(data.label.values)
labels = encoded_labels.reshape(-1, 1)
label_size = len(le.classes_)

# Pair each feature row with its label,
# then perform a stratified train/test split
all_data = list(zip(features, labels))
train_data, test_data = train_test_split(
    all_data,
    stratify=labels,
    random_state=42,
)

In [None]:
# A single linear layer: the input size is our feature space,
# the output size is the number of classes (multi-class classification)
linear = Linear(in_features=max_features, out_features=label_size, bias=True)

# Log-softmax over dimension 1 normalizes the linear output
# into log-probabilities
softmax = LogSoftmax(dim=1)

# Negative log-likelihood is the loss that pairs with
# log-softmax outputs; PyTorch implements it
# as the `NLLLoss` class
criterion = NLLLoss()

# Basic stochastic gradient descent over the parameters
# of the linear layer (the log-softmax is a transformation
# with no parameters)
optim = SGD(linear.parameters(), lr=0.01)

In [None]:
# take an example from features and labels
f = features[0]
t = labels[0]

# cast the features and targets
# as torch tensors (the target is an integer
# class index, hence LongTensor)
X = torch.FloatTensor(f)
y = torch.LongTensor(t)
print("Features Tensor Shape:", X.shape)
# label fixed: this line reports the target's shape, not the features'
print("Target Tensor Shape:", y.shape)

# pass the features tensor through the linear layer
linear_output = linear(X)
print("Linear Output Shape:", linear_output.shape)

# pass the output from the linear
# layer through the softmax
softmax_output = softmax(linear_output)
print("Softmax Output Shape:", softmax_output.shape)

# verify that the softmax actually sums to 1
# HINT: use torch.exp (this is LogSoftmax) and torch.sum
softmax_normalization = torch.exp(softmax_output).sum()
print("Softmax Normalization:", softmax_normalization)

# calculate the loss
loss = criterion(softmax_output, y)
print("Loss:", loss.data)

In [None]:
# Hyperparameters, defined before they are used
LEARNING_RATE = 0.01
EPOCHS = 10

# Build the model, the optimizer and the loss function
model = multi_class_perceptron(max_features, label_size)
optim = SGD(params=model.parameters(), lr=LEARNING_RATE)
criterion = NLLLoss()

for epoch in range(EPOCHS):
    # ---- training pass ----
    # switch the model being trained (not the standalone `linear` layer)
    # back to training mode, since `model.eval()` is called at the end of every epoch
    model.train()
    total_loss = 0.0
    for f, t in train_data:
        optim.zero_grad()
        X = torch.FloatTensor(f)
        y = torch.LongTensor(t)
        output = model(X)
        loss = criterion(output, y)
        # `.item()` extracts the loss as a plain Python float
        total_loss += loss.item()
        loss.backward()
        optim.step()

    # ---- evaluation pass (no gradients needed) ----
    model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for f, t in test_data:
            X = torch.FloatTensor(f)
            output = model(X)
            # compare integer class ids: the true label is the single entry of `t`,
            # the prediction is the index of the highest-scoring class
            y_true.append(int(t[0]))
            y_pred.append(int(torch.argmax(output)))

    a = accuracy_score(y_true, y_pred)

    # average the accumulated loss over the number of training examples
    total_loss /= len(train_data)

    print("Loss: {:.2f}, Validation Accuracy: {:.2f}".format(total_loss, a))