# LSTM Classifier

**Note: There are some random processes within this notebook, so different runs of the notebook may result in different outcomes.**

**Note: This notebook assumes the data being loaded has already been randomly shuffled.**

In [1]:
import data_utils
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.nn.functional as F

from data_utils import BOWEncoding, WordEmbeddingEncoding, WordTokenDataset
from torch.utils.data import DataLoader

## Load and Setup the Dataset

In [2]:
# TODO: Circle back and add support for an embedding layer.
# embeddings = data_utils.load_embeddings('./data/glove.6B/glove.6B.100d.txt',
#                                         embedding_dim=100)

In [3]:
# Read the record-oriented JSON training file and shuffle every row in one
# chained expression. The shuffle is unseeded, so row order differs per run.
data = (
    pd.read_json('./data/train_data.json', orient='records')
    .sample(frac=1)
)


In [4]:
# Fraction of rows kept for training; the remaining rows form the validation set.
train_test_split = 0.95
split_idx = math.floor(train_test_split * len(data))

# Rows before the split index are used for training, rows from it onwards for validation.
train_data, valid_data = data.iloc[:split_idx], data.iloc[split_idx:]


In [5]:
# Build the bag-of-words encoding from the full frame (training and validation
# rows together), passing a minimum word frequency of 5.
# NOTE(review): the encoding sees the validation rows as well - confirm this is
# intended rather than building it from `train_data` only.
bow_encoding = BOWEncoding(data, min_word_freq=5)
# presumably builds the vocabulary / label mappings; verify against data_utils.
bow_encoding.prepare()


In [6]:
# Wrap the training rows in a token dataset that shares the encoding built above.
bow_train_dataset = WordTokenDataset(train_data, bow_encoding)
# presumably tokenises / encodes the rows up front; verify against data_utils.
bow_train_dataset.prepare()


In [7]:
# Wrap the validation rows in a token dataset, reusing the same encoding object
# as the training dataset.
bow_valid_dataset = WordTokenDataset(valid_data, bow_encoding)
# presumably tokenises / encodes the rows up front; verify against data_utils.
bow_valid_dataset.prepare()


In [8]:
def create_bow_batched_sequences(samples):
    """Turn a batch of flattened token sequences into a padded one-hot tensor.

    Args:
        samples: Batch object exposing
            ``vocab_size`` (int), ``longest_sequence`` (int),
            ``label`` (sized; one entry per example),
            ``offset`` (start index of each example inside ``sequence``) and
            ``sequence`` (all token indices of the batch, concatenated).

    Returns:
        A float tensor of shape ``(longest_sequence + 1, batch_size,
        vocab_size + 1)``. Position ``[j, i, k]`` is 1 when the jth token of
        the ith example has index ``k``. Every time step after the end of an
        example is filled with the EOS one-hot (last index of the encoding
        dimension), so each example ends with at least one EOS step.
    """
    encoding_dim = samples.vocab_size + 1 # Add 1 for EOS token.
    sequence_len = samples.longest_sequence + 1 # Add 1 for EOS token.
    batch_size = len(samples.label)
    eos_idx = encoding_dim - 1
    n_offsets = len(samples.offset)

    data = torch.zeros(size=(sequence_len, batch_size, encoding_dim), dtype=torch.float)

    # Looping through each token in each example. This is slow.
    # TODO: Should find ways to make this faster. Vectorization? Caching?
    for i, start_offset in enumerate(samples.offset):
        # The last example runs to the end of the flattened sequence.
        if i + 1 < n_offsets:
            sequence_slice = samples.sequence[start_offset:samples.offset[i + 1]]
        else:
            sequence_slice = samples.sequence[start_offset:]

        for j, token_idx in enumerate(sequence_slice):
            # jth token in ith example.
            data[j, i, token_idx] = 1.

        # Pad the remainder with EOS one-hot encodings. The start position is
        # taken from the slice length rather than from the loop variable, so
        # an empty example is padded from step 0 instead of reusing a stale
        # (or undefined) index.
        data[len(sequence_slice):, i, eos_idx] = 1.

    return data


## Define the Model

In [22]:
# Scratch check of nn.LSTM's calling convention using small toy dimensions.
inp_size = 3
hid_size = 5
out_size = 3
lay_size = 2
batch_size = 8

lstm = nn.LSTM(input_size=inp_size, hidden_size=hid_size, num_layers=lay_size)

# Random input spanning 7 time steps, then a random initial state pair whose
# two tensors share the same (layers, batch, hidden) shape.
inp = torch.rand(7, batch_size, inp_size)
state_shape = (lay_size, batch_size, hid_size)
hid = (torch.rand(state_shape), torch.rand(state_shape))
a, b = lstm(inp, hid)

# a.size() # (seq_len x batch_size x hid_size)

# The second return value is a pair of state tensors.
len(b)

2

In [None]:
class Model(nn.Module):
    """LSTM sequence classifier over one-hot encoded token sequences.

    The sequence is fed through a (possibly multi-layer) LSTM; the final
    hidden state of the top layer is projected to class scores and
    normalised with log-softmax.

    Args:
        encoding: Object exposing ``n_inputs()`` (vocabulary size, without the
            EOS tag) and ``n_classes()`` (number of output classes).
        lstm_hidden_size: Size of the LSTM hidden state.
        lstm_num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, encoding, lstm_hidden_size, lstm_num_layers):
        # nn.Module must be initialised before sub-modules are assigned,
        # otherwise registering `self.lstm` raises AttributeError.
        super().__init__()
        self.encoding = encoding

        input_size = encoding.n_inputs() + 1 # Add EOS tag to vocab.
        output_size = encoding.n_classes()

        self.lstm = nn.LSTM(input_size, lstm_hidden_size, lstm_num_layers)
        self.hidden2tag = nn.Linear(lstm_hidden_size, output_size)

    def forward(self, input, hidden=None):
        """Classify a batch of sequences.

        Args:
            input: Float tensor of shape ``(seq_len, batch, n_inputs + 1)``.
            hidden: Optional ``(h_0, c_0)`` pair, each of shape
                ``(num_layers, batch, hidden_size)``. When ``None`` the LSTM
                starts from zero states.

        Returns:
            Log-probabilities of shape ``(batch, n_classes)``.
        """
        # The LSTM returns (output, (h_n, c_n)); only the hidden state of the
        # last layer at the final time step is used for classification.
        _, (h_n, _) = self.lstm(input, hidden)
        output = self.hidden2tag(h_n[-1])
        # dim=1 is the class dimension of the (batch, n_classes) scores.
        output = F.log_softmax(output, dim=1)

        return output

    def init_hidden(self, batch_size=1):
        """Return zero-initialised ``(h_0, c_0)`` states for the LSTM.

        Args:
            batch_size: Number of sequences in the batch (defaults to 1).

        Returns:
            Tuple of two float tensors, each of shape
            ``(num_layers, batch_size, hidden_size)``, placed on the same
            device as the model parameters.
        """
        shape = (self.lstm.num_layers, batch_size, self.lstm.hidden_size)
        device = next(self.parameters()).device
        return (torch.zeros(size=shape, dtype=torch.float, device=device),
                torch.zeros(size=shape, dtype=torch.float, device=device))

