# RNN Classifier

**Note: There are some random processes within this notebook, so different runs of the notebook may result in different outcomes.**

**Note: This notebook assumes the data being loaded has already been randomly shuffled.**

In [95]:
import data_utils
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.nn.functional as F

from data_utils import BOWEncoding, WordEmbeddingEncoding, WordTokenDataset
from torch.utils.data import DataLoader

## Load the Dataset

In [96]:
# TODO: Circle back and add support for an embedding layer (pretrained GloVe vectors).
# embeddings = data_utils.load_embeddings('./data/glove.6B/glove.6B.100d.txt',
#                                         embedding_dim=100)

In [97]:
# Read the training records, shuffle every row (frac=1 keeps all of them),
# then keep only the first 10,000 rows of the shuffled frame.
data = (
    pd.read_json('./data/train_data.json', orient='records')
    .sample(frac=1)
    .iloc[:10000]
)


In [98]:
# Fraction of the rows that goes to the training set; the remainder is
# held out for validation.
train_test_split = 0.95

# Index of the first validation row.
split_idx = math.floor(train_test_split * len(data))

train_data, valid_data = data.iloc[:split_idx], data.iloc[split_idx:]


In [99]:
# Build the bag-of-words encoding from the full sampled frame, i.e. from the
# training and validation rows together.
# NOTE(review): presumably `min_word_freq=5` is the minimum number of
# occurrences a word needs to enter the vocabulary -- confirm in data_utils.
bow_encoding = BOWEncoding(data, min_word_freq=5)
bow_encoding.prepare()


In [100]:
# Wrap the training rows in a dataset that uses the shared BOW encoding.
# NOTE(review): `prepare()` presumably tokenizes/encodes the rows up front --
# confirm in data_utils.
bow_train_dataset = WordTokenDataset(train_data, bow_encoding)
bow_train_dataset.prepare()


In [101]:
# Wrap the validation rows in a dataset that uses the same BOW encoding as the
# training set, so both share one vocabulary.
# NOTE(review): `prepare()` presumably tokenizes/encodes the rows up front --
# confirm in data_utils.
bow_valid_dataset = WordTokenDataset(valid_data, bow_encoding)
bow_valid_dataset.prepare()


In [102]:
# TODO: Should find ways to make this faster. Vectorization? Caching?

def create_bow_batched_sequences(samples):
    """Turn a batch of token sequences into per-time-step one-hot matrices.

    Args:
        samples: Batch object exposing
            * ``vocab_size``: number of token ids in the vocabulary,
            * ``longest_sequence``: length of the longest sequence in the batch,
            * ``label``: one label per example (only its length is used here),
            * ``offset``: start position of each example inside ``sequence``,
            * ``sequence``: all token ids of the batch, concatenated.

    Returns:
        A list of ``longest_sequence + 1`` float tensors, each of shape
        ``(batch_size, vocab_size + 1)``. Entry ``t`` holds the one-hot
        encoding of the ``t``-th token of every example. Positions past the
        end of an example are filled with the EOS one-hot (last column), so
        every example ends with at least one EOS step.
    """
    encoding_dim = samples.vocab_size + 1  # Add 1 for EOS token.
    eos_idx = encoding_dim - 1  # EOS occupies the extra, last column.
    sequence_len = samples.longest_sequence + 1  # Add 1 for EOS token.
    batch_size = len(samples.label)
    n_offsets = len(samples.offset)

    # One zeroed (batch, encoding) matrix per time step.
    sequences = [
        torch.zeros(size=(batch_size, encoding_dim), dtype=torch.float)
        for _ in range(sequence_len)
    ]

    # Looping through each token in each example. This is slow.
    for i, start_offset in enumerate(samples.offset):
        # The last example runs to the end of the concatenated sequence.
        end_offset = samples.offset[i + 1] if (i + 1) < n_offsets else None
        sequence_slice = samples.sequence[start_offset:end_offset]

        for j, token_idx in enumerate(sequence_slice):
            # jth token in ith example.
            sequences[j][i, token_idx] = 1.

        # Pad with EOS from the first position after the example's own tokens.
        # The start is taken from the slice length rather than from the loop
        # variable above, so an empty example is padded from step 0 instead of
        # failing or reusing a stale index from the previous example.
        for j in range(len(sequence_slice), sequence_len):
            sequences[j][i, eos_idx] = 1.

    return sequences
    

## Defining the Model

In [103]:
class Model(nn.Module):
    """Single-step recurrent classifier over one-hot token inputs.

    Each call to ``forward`` consumes one time step: the input and the
    previous hidden state are concatenated and fed through two linear layers,
    one producing log-probabilities over the classes and one producing the
    next hidden state.
    """

    def __init__(self, encoding, hidden_size):
        """Size the layers from ``encoding`` and ``hidden_size``.

        Args:
            encoding: Object providing ``n_inputs()`` and ``n_classes()``.
            hidden_size: Width of the recurrent hidden state.
        """
        super().__init__()
        self.encoding = encoding
        self.hidden_size = hidden_size

        # One extra input slot is reserved for the EOS token.
        self.input_size = encoding.n_inputs() + 1
        self.output_size = encoding.n_classes()

        combined_size = self.input_size + hidden_size
        self.i2o = nn.Linear(combined_size, self.output_size)
        self.i2h = nn.Linear(combined_size, hidden_size)

        # Log-probabilities over the class dimension.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one time step.

        Returns:
            ``(output, hidden)``: class log-probabilities and the next hidden
            state, both computed from the concatenated input and hidden state.
        """
        combined = torch.cat((input, hidden), dim=1)
        log_probs = self.softmax(self.i2o(combined))
        next_hidden = self.i2h(combined)
        return log_probs, next_hidden

    def init_hidden(self, batch_size):
        """Return an all-zero float32 hidden state of shape (batch_size, hidden_size)."""
        return torch.zeros(size=(batch_size, self.hidden_size), dtype=torch.float32)


## Training the Model

In [130]:
def calculate_accuracy(model, samples):
    """Compute the classification accuracy of ``model`` on ``samples``.

    The batch is unrolled one time step at a time through the model, and the
    prediction is the arg-max of the output produced at the final step.

    Args:
        model: Model exposing ``init_hidden(batch_size)`` and a forward call
            taking ``(input, hidden)`` and returning ``(output, hidden)``.
        samples: Batch accepted by ``create_bow_batched_sequences`` whose
            ``label`` holds the true class index of every example.

    Returns:
        Fraction of examples classified correctly, as a Python float.
    """
    # The batch size is taken from the labels, the same way the input
    # matrices are sized, so the hidden state always matches the inputs.
    batch_size = len(samples.label)
    bow_sequences = create_bow_batched_sequences(samples)

    with torch.no_grad():
        hidden = model.init_hidden(batch_size=batch_size)
        for input in bow_sequences:
            output, hidden = model(input, hidden)

        # Only the output of the last time step is used for the prediction.
        _, max_idx = torch.max(output, dim=1)

    correct = torch.sum(samples.label == max_idx)
    return float(correct) / batch_size
    

In [104]:
def train(model, criterion, optimizer, dataset, data_loader, epochs, logs=True):
    """Train ``model`` for ``epochs`` passes over ``data_loader``.

    Args:
        model: Model exposing ``init_hidden(batch_size)`` and a forward call
            taking ``(input, hidden)`` and returning ``(output, hidden)``.
        criterion: Loss applied to the final-step output and the labels.
        optimizer: Optimizer over the model's parameters.
        dataset: Unused; kept so existing callers keep working.
        data_loader: Iterable of batches with a ``batch_size`` attribute.
        epochs: Number of passes over ``data_loader``.
        logs: When true (the default), print epoch headers and the running
            loss; when false, train silently.

    Returns:
        ``(model, training_losses)`` where ``training_losses`` holds the loss
        value of every optimisation step, in order.
    """
    training_losses = []

    full_batch_size = data_loader.batch_size

    running_loss = 0
    print_every = 50
    step = 0

    for epoch in range(epochs):
        if logs:
            print(f'Epoch {epoch + 1}')

        for samples in data_loader:
            # Note: At the end of the epoch, the final set of samples could be
            # smaller than a full batch. That partial batch is skipped, which
            # ends the epoch.
            batch_size = len(samples.label)
            if batch_size != full_batch_size:
                break

            step += 1

            bow_sequences = create_bow_batched_sequences(samples)

            # Fresh hidden state for every batch; batches are independent.
            hidden = model.init_hidden(batch_size=batch_size)
            optimizer.zero_grad()

            for input in bow_sequences:
                output, hidden = model(input, hidden)

            # Only the output of the last time step contributes to the loss.
            loss = criterion(output, samples.label)
            loss.backward()
            optimizer.step()

            loss_value = loss.item()
            training_losses.append(loss_value)
            running_loss += loss_value

            # Reports the loss summed since the previous report. The first
            # report fires at step print_every - 1, then every print_every.
            if step % print_every == print_every - 1:
                if logs:
                    print(f'Loss: {running_loss}')
                running_loss = 0

    return model, training_losses


In [111]:
# Full grid over hidden size, batch size and learning rate. The learning rate
# varies fastest, then the batch size, then the hidden size.
hyperparams_list = [
    {'batch_size': batch_size, 'lr': lr, 'hidden_dim': hidden_dim}
    for hidden_dim in (128, 512)
    for batch_size in (100, 1000)
    for lr in (0.001, 0.0001)
]


In [136]:
# Train one fresh model per hyperparameter setting and keep them all, in the
# same order as `hyperparams_list`.
epochs = 1
models = []

for i, hyperparams in enumerate(hyperparams_list):
    start_time = time.time()

    print(f'Trainin Model {i+1} / {len(hyperparams_list)}...')

    batch_size = hyperparams['batch_size']
    hidden_dim = hyperparams['hidden_dim']
    lr = hyperparams['lr']

    # Batches are drawn in dataset order (no reshuffling between runs).
    bow_train_data_loader = DataLoader(
        dataset=bow_train_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=data_utils.collate_samples,
    )

    model = Model(bow_encoding, hidden_size=hidden_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.NLLLoss()

    model, training_losses = train(
        model,
        criterion,
        optimizer,
        bow_train_dataset,
        bow_train_data_loader,
        epochs,
    )
    models.append(model)

    end_time = time.time()

    # Wall-clock time for this configuration, in minutes.
    print(f'Ran in {(end_time - start_time)/60:.02f}m')


Trainin Model 1 / 8...
Epoch 1
Loss: 176.13814783096313
Ran in 2.19m
Trainin Model 2 / 8...
Epoch 1
Loss: 181.73610067367554
Ran in 3.51m
Trainin Model 3 / 8...
Epoch 1
Ran in 7.68m
Trainin Model 4 / 8...
Epoch 1
Ran in 6.13m
Trainin Model 5 / 8...
Epoch 1
Loss: 202.9548192024231
Ran in 5.14m
Trainin Model 6 / 8...
Epoch 1
Loss: 175.26620054244995
Ran in 6.99m
Trainin Model 7 / 8...
Epoch 1
Ran in 25.98m
Trainin Model 8 / 8...
Epoch 1
Ran in 26.06m


In [137]:
# Score every trained model on the whole validation set; `accuracies[k]`
# belongs to `models[k]`.
accuracies = []

start_time = time.time()

for model in models:
    # Slice the full validation dataset into a single batch of samples.
    samples = bow_valid_dataset[:]
    accuracies.append(calculate_accuracy(model, samples))

end_time = time.time()

# Wall-clock time for the whole evaluation, in minutes.
print(f'Ran in {(end_time - start_time)/60:.02f}m.')


Ran in 0.57m.


In [138]:
# Pick the model with the highest validation accuracy; on a tie the earliest
# model wins.
best_acc_idx = max(range(len(accuracies)), key=accuracies.__getitem__)
best_acc = accuracies[best_acc_idx]
best_model = models[best_acc_idx]

print(f'Best model is #{best_acc_idx+1} with validation accuracy of {best_acc*100:.02f}%')


Best model is #1 with validation accuracy of 14.60%
