# RNN

Given a sequence, predict the topic distribution for it

In [250]:
import pandas as pd
import numpy as np

# the dists are saved as comma separated values in a string (because pandas can't save datatype as int)
# so to read as numeric-values, you must split on spaces and convert to float.
# Michael says there is an efficient way to do this.

Here is our dataset

In [262]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,sentences,dists
0,At least 100 families receiving housing benefi...,-6.63052740918379 -7.265516111874736 -6.913763...
1,"More 30 families given staggering # 1,500 week...",-7.830457309882772 -7.835212219711822 -5.67915...
2,"Of 100 families , 60 rent paid state value # 5...",-7.872029012645258 -6.913763006152177 -5.67915...
3,At time millions people struggling get housing...,-6.63052740918379 -6.443461174803561 -7.839491...
4,"Luxury : The kind upmarket homes Kensington , ...",-6.899740172840747 -inf -4.813336632402851 -7....


In [263]:
# convert back to original distribution
df['dists'] = df['dists'].str.split(" ")
df['dists'] = df['dists'].apply(lambda x: [np.e ** float(i) for i in x])

Let the ML begin

In [264]:
import unicodedata
import string
import re
import random
import time
import math
import csv

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="6,7"

In [265]:
USE_CUDA = False

Define our vocabulary class

In [266]:
SOS_token = 0
EOS_token = 1

class Voc:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2 # Count SOS and EOS
      
    def index_words(self, sentence):
        for word in sentence.split(' '):
            self.index_word(word)

    def index_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

Methods for cleaning up data

In [267]:
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

Lets actually use the data now

In [280]:
MAX_SENTENCES = 200

# subset of the data for training
training_df = df[:MAX_SENTENCES]

# choose a random sentence and its corresponding distribution (label)
def choose_random_training_pair():
    training_pair = training_df.loc[random.randint(0, training_df.shape[0])]

    sentence = training_pair['sentences']
    distribution = training_pair['dists']

    return sentence, distribution

def generate_vocabulary(sentences):
    # create the vocabulary
    vocabulary = Voc('source_identification')
    for sentence in sentences:
        vocabulary.index_words(sentence)

    return vocabulary

In [281]:
vocabulary = generate_vocabulary(training_df['sentences'].tolist())
n_words = vocabulary.n_words

Lets actually do some ML now.

Here are some sentences --> variable functions

In [323]:
# Return a list of indexes, one for each word in the sentence
def indexes_from_sentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')]

def variable_from_sentence(voc, sentence):
    indexes = indexes_from_sentence(voc, sentence)
    indexes.append(EOS_token)
    var = Variable(torch.LongTensor(indexes).view(-1, 1))
    if USE_CUDA: var = var.cuda()
    return var

def tensor_from_distribution(distribution):
    return torch.FloatTensor(distribution, requires_grad=True).view(-1, 1)

And now here's the RNN

In [283]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(RNN, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        # embedding contains input_size vectors of size hidden_size.
        # this performs the word embeddings
        self.embedding = nn.Embedding(input_size, hidden_size)

        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        
    def forward(self, word_inputs, hidden):
        # Note: we run this all at once (over the whole input sequence)
        seq_len = len(word_inputs)
        
        # embedding: (sentence length, batch size, word embedding length)
        embedded = self.embedding(word_inputs).view(seq_len, 1, -1)
        
        # GRU:
        #    input: (length of sequence, batch size, size of var in sequence)
        #    output: 
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def init_hidden(self):
        hidden = Variable(torch.zeros(self.n_layers, 1, self.hidden_size))
        if USE_CUDA: hidden = hidden.cuda()
        return hidden

In [284]:
# 10 vectors of size 3
embedding = nn.Embedding(10, 3)

# a batch of 2 samples of 4 indices each
input_var = Variable(torch.LongTensor([1,2,4,5,4,3,2,9]))
print(input_var.size())
print(input_var)

output_var = embedding(input_var)
print(output_var.size())
print(output_var.view(len(input_var), 1, -1))

torch.Size([8])
Variable containing:
 1
 2
 4
 5
 4
 3
 2
 9
[torch.LongTensor of size 8]

torch.Size([8, 3])
Variable containing:
(0 ,.,.) = 
  1.3681 -1.6186 -0.7392

(1 ,.,.) = 
 -0.2321  0.6264  0.2668

(2 ,.,.) = 
 -0.0493  0.6912  0.7097

(3 ,.,.) = 
  1.3556 -0.5066 -1.4677

(4 ,.,.) = 
 -0.0493  0.6912  0.7097

(5 ,.,.) = 
 -0.4713 -0.0734  1.0438

(6 ,.,.) = 
 -0.2321  0.6264  0.2668

(7 ,.,.) = 
  0.1863  0.8530 -0.4346
[torch.FloatTensor of size 8x1x3]



Here's our training function

In [318]:
teacher_forcing_ratio = 0.5
clip = 5.0

def train(input_var, target, rnn, optimizer, criterion):

    # Zero gradients of both optimizers
    optimizer.zero_grad()
    loss = 0 # Added onto for each word

    # Get size of input and target sentences
    # input_length = input_variable.size()[0]

    # Run sentence through rnn
    hidden_var = rnn.init_hidden()
    output_var, hidden_var = rnn(input_var, hidden_var)
    
    # compute loss
    loss = criterion(output_var[-1], target)

    # run backprop
    loss.backward()
    torch.nn.utils.clip_grad_norm(rnn.parameters(), clip)
    optimizer.step()
    
    return loss.data[0]

Helper functions for time

In [319]:
def as_minutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (as_minutes(s), as_minutes(rs))

This is where we actually train the RNN  
.  
.  
.  
.  
.  
.  
.  
.  
.  
after we initialize stuff

In [320]:
hidden_size = 500 # size of each word embedding
n_layers = 2 # number of layers for the RNN
dropout_p = 0.05 # we never use this

# Initialize the RNN
rnn = RNN(n_words, hidden_size, n_layers)

# Move models to GPU
if USE_CUDA:
    rnn.cuda()

# Initialize optimizers and criterion
learning_rate = 0.0001
optimizer = optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

In [321]:
# Configuring training
n_epochs = 1000
plot_every = 20
print_every = 10

# Keep track of time elapsed and running averages
plot_losses = []
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every

Aaaaand now we actually start training

In [324]:
start = time.time()

for epoch in range(1, n_epochs + 1):
    
    # Get training data for this cycle
    sentence, distribution = choose_random_training_pair()
    
    training_sentence = variable_from_sentence(vocabulary, sentence)
    target_distribution = tensor_from_distribution(distribution)

    # Run the train function
    loss = train(training_sentence, target_distribution, rnn, optimizer, criterion)

    # Keep track of loss
    print_loss_total += loss
    plot_loss_total += loss

    if epoch % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, epoch / n_epochs * 100, print_loss_avg)
        print(print_summary)

    if epoch % plot_every == 0:
        plot_loss_avg = plot_loss_total / plot_every
        plot_losses.append(plot_loss_avg)
        plot_loss_total = 0

RuntimeError: torch.FloatTensor constructor doesn't accept any keyword arguments

Plot losses

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline

def show_plot(points):
    plt.figure()
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2) # put ticks at regular intervals
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

show_plot(plot_losses)