# RNN

Given a sequence, predict the topic distribution for it

In [109]:
import pandas as pd
import numpy as np

# the dists are saved as space-separated values in a string (because pandas can't save datatype as int)
# so to read them as numeric values, you must split on spaces and convert to float.
# Michael says there is an efficient way to do this.

Here is our dataset

In [110]:
# Load the sentence / topic-distribution table from disk and preview the first rows.
# At this point the 'dists' column still holds each distribution as one string.
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,sentences,dists
0,By,0.000215766137942 0.000241668597233 0.00020097...
1,Beth Stebner Tara Brady,0.000117660461174 0.000123021350745 0.00012870...
2,PUBLISHED :,0.000574640958915 0.000137269205565 0.00015039...
3,"13:57 EST , 21 January 2013",0.000117660461174 0.000123021350745 0.00012870...
4,|,0.000215766137942 0.000241668597233 0.00020097...


In [111]:
# Convert each 'dists' cell from its serialized text form back to a list of floats.
def _parse_distribution(raw):
    """Parse one whitespace-separated string of numbers into a list of floats.

    A value that is already a list (for example because this cell was run
    before) is returned unchanged, so re-running the cell is harmless.
    Anything else that is not a string still fails loudly.
    """
    if isinstance(raw, list):
        return raw
    # split() with no argument tolerates repeated or trailing whitespace, which
    # split(" ") would turn into empty tokens that float() rejects.
    return [float(token) for token in raw.split()]

df['dists'] = df['dists'].apply(_parse_distribution)

In [148]:
# Number of entries in the first row's distribution; used further down as the
# output width of the classifier.
OUTPUT_SIZE = len(df['dists'][0])
OUTPUT_SIZE

4406

Let the ML begin

In [None]:
import unicodedata
import string
import re
import random
import time
import math
import csv

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

import os
# Configure CUDA device ordering and which device ids this process may see.
# NOTE(review): these are set after `import torch` above -- confirm they still
# take effect before CUDA is initialised.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="6,7"

In [None]:
# Global switch read by the helpers and setup cells below to decide whether to
# call .cuda() on the tensors / models they create.
USE_CUDA = False

Define our vocabulary class

In [None]:
# Reserved vocabulary ids for the start-of-sequence and end-of-sequence markers
SOS_token = 0
EOS_token = 1

class Voc:
    """Word <-> integer-id vocabulary that also tracks word frequencies.

    Attributes:
        name: label given to this vocabulary.
        word2index: maps each seen word to its integer id.
        word2count: maps each seen word to how many times it was indexed.
        index2word: reverse mapping from id to word, pre-filled with SOS/EOS.
        n_words: the next free id (ids 0 and 1 are taken by SOS and EOS).
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # SOS and EOS already occupy the first two ids
        self.n_words = 2

    def index_words(self, sentence):
        """Register every token of `sentence`, splitting on single spaces."""
        for token in sentence.split(' '):
            self.index_word(token)

    def index_word(self, word):
        """Count `word`, assigning it the next free id the first time it is seen."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_id = self.n_words
        self.word2index[word] = new_id
        self.word2count[word] = 1
        self.index2word[new_id] = word
        self.n_words = new_id + 1

Methods for cleaning up data

In [197]:
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    """Return the NFD form of `s` with all combining marks (category 'Mn') dropped."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    """Lowercase and trim `s`, strip accents, and reduce it to letters and . ! ?"""
    cleaned = unicode_to_ascii(s.lower().strip())
    # Put a space in front of sentence punctuation so it becomes its own token
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    # Collapse each run of anything other than letters and . ! ? into one space
    cleaned = re.sub(r"[^a-zA-Z.!?]+", r" ", cleaned)
    return cleaned

Let's actually use the data now

In [198]:
# Number of leading rows of `df` kept for training
MAX_SENTENCES = 200

# subset of the data for training
training_df = df[:MAX_SENTENCES]

# choose a random sentence and its corresponding distribution (label)
def choose_random_training_pair():
    """Pick one row of `training_df` uniformly at random.

    Returns:
        (sentence, distribution): the row's 'sentences' and 'dists' values.
    """
    # Select by position (.iloc), not by label (.loc): a random integer is only
    # a valid label while the frame still carries its default 0..n-1 index,
    # which stops being true once the training subset is filtered, shuffled or
    # sliced from anywhere other than the top of `df`.
    row_position = random.randrange(len(training_df))
    training_pair = training_df.iloc[row_position]

    return training_pair['sentences'], training_pair['dists']

def generate_vocabulary(sentences):
    """Build a `Voc` named 'source_identification' covering every given sentence."""
    voc = Voc('source_identification')
    for text in sentences:
        # index_words registers each token of the sentence
        voc.index_words(text)
    return voc

In [199]:
# Build the vocabulary from the training subset's sentences only
vocabulary = generate_vocabulary(training_df['sentences'].tolist())
# Vocabulary size (the count starts at 2 for SOS/EOS); passed to the RNN as its input size
n_words = vocabulary.n_words

Let's actually do some ML now.

Here are some functions that convert sentences (and distributions) into Variables

In [200]:
# Return a list of indexes, one for each word in the sentence
def indexes_from_sentence(voc, sentence):
    """Look up the vocabulary id of every space-separated token in `sentence`.

    Raises:
        KeyError: if a token is missing from `voc.word2index`.
    """
    lookup = voc.word2index
    ids = []
    for token in sentence.split(' '):
        ids.append(lookup[token])
    return ids

def variable_from_sentence(voc, sentence):
    """Encode `sentence` as a column of word ids terminated by EOS_token.

    Returns:
        A LongTensor Variable of shape (number of tokens + 1, 1), moved to the
        GPU when USE_CUDA is set.
    """
    token_ids = indexes_from_sentence(voc, sentence) + [EOS_token]
    column = torch.LongTensor(token_ids).view(-1, 1)
    var = Variable(column)
    if USE_CUDA:
        var = var.cuda()
    return var

def tensor_from_distribution(distribution):
    """Wrap a distribution (sequence of floats) as a column-shaped FloatTensor Variable.

    Returns:
        A Variable of shape (len(distribution), 1), moved to the GPU when
        USE_CUDA is set.
    """
    var = Variable(torch.FloatTensor(distribution).view(-1, 1))
    # Honour USE_CUDA the same way variable_from_sentence does; otherwise the
    # target would stay on the CPU while the model inputs live on the GPU.
    if USE_CUDA:
        var = var.cuda()
    return var

And now here's the RNN

In [201]:
class RNN(nn.Module):
    """Word embedding followed by a (possibly multi-layer) GRU, run with batch size 1."""

    def __init__(self, input_size, hidden_size, n_layers=1):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # Lookup table of `input_size` word vectors, each `hidden_size` wide
        self.embedding = nn.Embedding(input_size, hidden_size)

        # The GRU consumes the embeddings directly, so input and hidden widths match
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)

    def forward(self, word_inputs, hidden):
        """Run the whole token sequence through the GRU in a single call.

        Args:
            word_inputs: tensor of word ids; its length is taken as the sequence length.
            hidden: initial hidden state, e.g. the result of `init_hidden()`.

        Returns:
            The (output, hidden) pair produced by the GRU.
        """
        n_steps = len(word_inputs)

        # Reshape to (sequence length, batch size = 1, embedding width) for the GRU
        gru_input = self.embedding(word_inputs).view(n_steps, 1, -1)

        return self.gru(gru_input, hidden)

    def init_hidden(self):
        """Return an all-zero hidden state of shape (n_layers, 1, hidden_size)."""
        zeros = torch.zeros(self.n_layers, 1, self.hidden_size)
        hidden = Variable(zeros)
        if USE_CUDA:
            hidden = hidden.cuda()
        return hidden

In [208]:
class classifierNN(nn.Module):
    """Single linear layer followed by log-softmax, returned as a column vector."""

    def __init__(self, input_size, output_size):
        super().__init__()

        self.input_size = input_size
        self.output_size = output_size

        # Linear projection from the input features to the output scores
        self.i2o = nn.Linear(input_size, output_size)
        # NOTE(review): LogSoftmax is built without an explicit `dim`, so the
        # normalised axis is whatever the installed PyTorch picks by default.
        self.softmax = nn.LogSoftmax()

    def forward(self, result_input):
        """Project `result_input`, apply log-softmax, and reshape the result to (-1, 1)."""
        scores = self.i2o(result_input)
        log_probs = self.softmax(scores)
        return log_probs.view(-1, 1)

Here's our training function

In [209]:
teacher_forcing_ratio = 0.5  # not referenced by train() below
clip = 5.0  # max gradient norm passed to the clipping call in train()

def train(input_var, target, rnn, rnn_optimizer, classifier_optimizer, criterion):
    """Run one optimisation step on a single (sentence, distribution) example.

    Args:
        input_var: word-id tensor for the sentence (see variable_from_sentence).
        target: target distribution tensor (see tensor_from_distribution).
        rnn: the sequence encoder; must provide init_hidden().
        rnn_optimizer: optimizer over the encoder's parameters.
        classifier_optimizer: optimizer over the classifier's parameters.
        criterion: loss, called as criterion(prediction, target).

    Returns:
        The loss for this example as a plain Python number.

    Note:
        The classifier is not a parameter; the module-level `classifier` is used.
    """
    # Reset the gradients of BOTH optimizers. The previous code called
    # zero_grad() on an undefined name `optimizer`, which raises NameError on a
    # fresh kernel and never cleared the gradients of the optimizers passed in.
    rnn_optimizer.zero_grad()
    classifier_optimizer.zero_grad()

    # Run sentence through rnn
    hidden_var = rnn.init_hidden()
    output_var, hidden_var = rnn(input_var, hidden_var)

    # Classify from the encoder output at the last time step only
    output_distribution = classifier(output_var[-1])

    # compute loss
    loss = criterion(output_distribution, target)

    # run backprop
    loss.backward()
    # Only the encoder's gradients are clipped; the classifier's are left as-is.
    # Prefer the in-place-named API where it exists, fall back to the old name.
    clip_fn = getattr(torch.nn.utils, 'clip_grad_norm_', None) or torch.nn.utils.clip_grad_norm
    clip_fn(rnn.parameters(), clip)

    rnn_optimizer.step()
    classifier_optimizer.step()

    # Newer PyTorch returns a 0-dim loss tensor that cannot be indexed with [0];
    # use .item() when available and keep the old accessor as a fallback.
    return loss.item() if hasattr(loss, 'item') else loss.data[0]

Helper functions for time

In [210]:
def as_minutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def time_since(since, percent):
    """Report the time elapsed since `since` and the estimated time remaining.

    Args:
        since: start timestamp, as returned by time.time().
        percent: fraction of the work completed so far; it is used as a
            divisor, so it must be non-zero.

    Returns:
        A string of the form '<elapsed> (- <remaining>)'.
    """
    elapsed = time.time() - since
    # Linear extrapolation: projected total = elapsed / fraction done
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (- %s)' % (as_minutes(elapsed), as_minutes(remaining))

This is where we actually train the RNN  
.  
.  
.  
.  
.  
.  
.  
.  
.  
after we initialize stuff

In [211]:
hidden_size = 500 # size of each word embedding (also used as the GRU hidden width)
n_layers = 2 # number of layers for the RNN
dropout_p = 0.05 # we never use this

# Initialize the RNN
rnn = RNN(n_words, hidden_size, n_layers)

num_topics = 24 # not referenced in this cell; the classifier width comes from OUTPUT_SIZE

# Initialize the classifier NN
classifier = classifierNN(hidden_size, OUTPUT_SIZE)

# Move models to GPU
# Both models must live on the same device: train() feeds the RNN's output
# straight into the classifier, so moving only the RNN would leave them split.
if USE_CUDA:
    rnn.cuda()
    classifier.cuda()

# Initialize optimizers and criterion
learning_rate = 0.0001
rnn_optimizer = optim.Adam(rnn.parameters(), lr=learning_rate)
classifier_optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)
criterion = nn.KLDivLoss()

In [212]:
# Configuring training
# Number of iterations of the training loop. Each iteration trains on a single
# randomly drawn sentence, not on a full pass over the data.
n_epochs = 1000
# Append an averaged loss to `plot_losses` every this many iterations
plot_every = 20
# Print a progress line every this many iterations
print_every = 10

# Keep track of time elapsed and running averages
plot_losses = []
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every

Aaaaand now we actually start training

In [None]:
# Wall-clock start, used by time_since() for the elapsed / remaining estimate
start = time.time()

# Despite the name, one "epoch" here is one training step on one random example.
for epoch in range(1, n_epochs + 1):
    
    # Get training data for this cycle
    sentence, distribution = choose_random_training_pair()
    
    # Convert the raw sentence and its distribution into model-ready Variables
    training_sentence = variable_from_sentence(vocabulary, sentence)
    target_distribution = tensor_from_distribution(distribution)

    # Run the train function
    loss = train(training_sentence, target_distribution, rnn, rnn_optimizer, classifier_optimizer, criterion)

    # Keep track of loss
    print_loss_total += loss
    plot_loss_total += loss

    # Progress line: elapsed (- remaining) (step, percent done) average loss since last print
    if epoch % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print_summary = '%s (%d %d%%) %.10f' % (time_since(start, epoch / n_epochs), epoch, epoch / n_epochs * 100, print_loss_avg)
        print(print_summary)

    # Record the average loss over the last `plot_every` steps for the plot below
    if epoch % plot_every == 0:
        plot_loss_avg = plot_loss_total / plot_every
        plot_losses.append(plot_loss_avg)
        plot_loss_total = 0

0m 3s (- 5m 9s) (10 1%) 0.0000755474
0m 5s (- 4m 49s) (20 2%) 0.0000798607
0m 8s (- 4m 42s) (30 3%) 0.0000636696
0m 12s (- 4m 56s) (40 4%) 0.0000633480
0m 16s (- 5m 5s) (50 5%) 0.0000676324
0m 18s (- 4m 55s) (60 6%) 0.0000457608
0m 22s (- 4m 55s) (70 7%) 0.0000536902
0m 25s (- 4m 47s) (80 8%) 0.0000583254
0m 28s (- 4m 48s) (90 9%) 0.0000520081
0m 31s (- 4m 47s) (100 10%) 0.0000705582
0m 35s (- 4m 44s) (110 11%) 0.0000678794
0m 39s (- 4m 47s) (120 12%) 0.0000659909
0m 41s (- 4m 38s) (130 13%) 0.0000591950
0m 43s (- 4m 28s) (140 14%) 0.0000503005
0m 45s (- 4m 20s) (150 15%) 0.0000415671
0m 48s (- 4m 15s) (160 16%) 0.0000662836
0m 51s (- 4m 9s) (170 17%) 0.0001037342
0m 53s (- 4m 5s) (180 18%) 0.0000800503
0m 56s (- 3m 59s) (190 19%) 0.0000534334
0m 59s (- 3m 59s) (200 20%) 0.0000549219
1m 1s (- 3m 52s) (210 21%) 0.0000554445
1m 5s (- 3m 51s) (220 22%) 0.0000827069
1m 7s (- 3m 45s) (230 23%) 0.0000428872
1m 9s (- 3m 39s) (240 24%) 0.0000889584
1m 12s (- 3m 37s) (250 25%) 0.0000956379
1m 1

Plot losses

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline

def show_plot(points, tick_base=0.2):
    """Plot a sequence of values against their position in the sequence.

    Args:
        points: the values to plot, in order.
        tick_base: spacing between major ticks on the y axis (default 0.2).
    """
    # plt.subplots() already creates the figure; calling plt.figure() first
    # only left behind an extra, empty figure.
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=tick_base) # put ticks at regular intervals
    ax.yaxis.set_major_locator(loc)
    # Draw on the axes created above rather than on pyplot's implicit current axes
    ax.plot(points)

show_plot(plot_losses)