In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from data_transform_pipeline import DataTransformPipeline
from data import Data
from sklearn.model_selection import train_test_split
from tensorboard_logger import Logger

In [3]:
# Load the saved "f_sents_prod" pipeline output; its .data unpacks into three objects.
embeds, idx_word_map, embedding_vocab = DataTransformPipeline.load("f_sents_prod").data
# embedding_vocab is loaded as a mapping; stack its values into a single tensor
# (later handed to the RNN class as the pretrained embedding matrix).
# NOTE(review): row order follows the mapping's iteration order - presumably
# row i is the vector for token id i; confirm against the pipeline.
embedding_vocab = torch.stack(list(embedding_vocab.values()))

# NOTE(review): the key below contains a space ("genre2_label _prod"). The
# recorded outputs show the load succeeded, so it is kept - confirm it is intended.
y, mapping = DataTransformPipeline.load("genre2_label _prod").data
# Convert the labels to a plain Python list (presumably y is a pandas object - TODO confirm).
y = y.values.tolist()

In [4]:
# Sanity check: there must be exactly one label per input example.
len(y), len(embeds)

(24603, 24603)

In [5]:
# Hold out 15% of the examples for validation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(embeds, y, test_size=0.15, random_state = 42)

# Data is a project-local wrapper; get_loader builds the iterable that train() consumes.
train_data = Data(X_train, y_train)
train_loader = train_data.get_loader(batch_size = 15)

# The validation loader relies on get_loader's defaults (batch size not visible from this notebook).
val_data = Data(X_test, y_test)
val_loader = val_data.get_loader()

In [6]:
# Sanity check: the training inputs and labels stayed aligned after the split.
len(X_train), len(y_train)

(20912, 20912)

In [7]:
class RNN(nn.Module):
    """
    Multi-layer vanilla RNN classifier on top of pretrained embeddings.

    Token ids are embedded, run through nn.RNN, the per-timestep outputs are
    summed over the time axis, and a linear layer maps the sum to class logits.
    """

    def __init__(self, embedding, embedding_size, hidden_size, num_layers, num_classes):
        """
        @param: embedding - 2-D float tensor of pretrained vectors, handed to
                nn.Embedding.from_pretrained (which freezes the weights by default)
        @param: embedding_size - size of one embedding vector (input size of the RNN)
        @param: hidden_size - hidden size of each RNN layer
        @param: num_layers - number of stacked RNN layers
        @param: num_classes - number of output classes
        """
        super(RNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding.from_pretrained(embedding)
        self.rnn = nn.RNN(embedding_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, num_classes)

    def init_hidden(self, batch_size):
        """
        Build the activation of the recurrent net at timestep 0.

        The state is drawn from a standard normal distribution and has shape
        (num_layers, batch_size, hidden_size). It is created on the same device
        and with the same dtype as the model's weights, so the module keeps
        working after .to(device) / .double() / .half().

        @param: batch_size - number of sequences in the batch
        @return: tensor of shape (num_layers, batch_size, hidden_size)
        """
        reference = self.linear.weight
        return torch.randn(
            self.num_layers, batch_size, self.hidden_size,
            device=reference.device, dtype=reference.dtype,
        )

    def forward(self, x, lengths):
        """
        Compute class logits for a batch of token-id sequences.

        @param: x - 2-D tensor (batch_size, seq_len) of token ids; cast to long here
        @param: lengths - accepted for interface compatibility but currently unused
        @return: tensor of shape (batch_size, num_classes) with unnormalised logits
        """
        # Unpacking enforces that x is 2-D; only the batch size is needed below.
        batch_size, _ = x.size()

        # A fresh random hidden state is drawn for every forward pass.
        self.hidden = self.init_hidden(batch_size)
        x = x.long()

        # look up the pretrained vector of every token id
        embed = self.embedding(x)
        # NOTE(review): sequences are not packed, so `lengths` is ignored and
        # every timestep - including any padding the loader adds - contributes
        # to the sum below. Packing with pack_padded_sequence would fix that
        # but needs verified lengths from the loader first.
        rnn_out, self.hidden = self.rnn(embed, self.hidden)
        # sum hidden activations of RNN across time
        rnn_out = torch.sum(rnn_out, dim=1)

        logits = self.linear(rnn_out)
        return logits


In [8]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for lengths, labels, data in loader:
        data_batch, lengths_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, lengths_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [9]:
def train(train_loader, val_loader, model, logger = None,
          learning_rate = 3e-5, num_epochs = 8, eval_every = 50):
    """
    Train the model with Adam and cross-entropy loss.
    @param: train_loader - iterable yielding (lengths, labels, data) training batches
    @param: val_loader - loader passed to test_model for periodic validation
    @param: model - module called as model(data, lengths) that returns class logits
    @param: logger - optional object with log_value(name, value, step); when
            None nothing is logged and the periodic validation is skipped,
            because its result would have nowhere to go
    @param: learning_rate - Adam learning rate
    @param: num_epochs - number of passes over train_loader
    @param: eval_every - validate every this many batches within an epoch
            (never on the first batch of an epoch)

    Prints "Epoch N" at the start of every epoch. Returns nothing; the model
    is updated in place.
    """
    # Criterion and Optimizer
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Global step across epochs, used as the x-axis of every logged value.
    step_counter = 0
    for epoch in range(num_epochs):
        print("Epoch {}".format(epoch+1))
        for i, (lengths, labels, data) in enumerate(train_loader):
            # test_model() leaves the model in eval mode, so switch back each step.
            model.train()
            optimizer.zero_grad()
            # Forward pass
            outputs = model(data, lengths)
            loss = loss_fn(outputs, labels)
            if logger is not None:
                logger.log_value("loss", loss.item(), step_counter)

            # Backward and optimize
            loss.backward()
            optimizer.step()

            if logger is not None and i > 0 and i % eval_every == 0:
                val_acc = test_model(val_loader, model)
                logger.log_value("validation accuracy", val_acc, step_counter)
            step_counter += 1

In [10]:
def train_rnn(train_loader, val_loader, hidden_size, num_layers, logger = None):
    """
    Build an RNN classifier for the given hyperparameters and train it.
    @param: train_loader - loader with the training batches
    @param: val_loader - loader with the validation batches
    @param: hidden_size - hidden size of each RNN layer
    @param: num_layers - number of stacked RNN layers
    @param: logger - optional logger forwarded to train()
    @return: the trained model (previously it was discarded after training)

    Reads the notebook globals embedding_vocab (pretrained embedding matrix)
    and y (all labels, used to count the output classes).
    """
    num_classes = np.unique(y).shape[0]
    model = RNN(embedding_vocab, embedding_vocab.shape[1], hidden_size, num_layers, num_classes)
    train(train_loader, val_loader, model, logger)
    return model

In [21]:
# Embed the TensorBoard UI in the notebook so the runs can be watched while training.
# NOTE(review): this import sits outside the notebook's top import cell, and the
# cell's execution count is out of sequence with its neighbours.
import IPython
# Assumes a TensorBoard server is already listening on this address - TODO confirm.
url = "http://127.0.0.1:6006/"
iframe = "<iframe src='{}' width=1100 height=800></iframe>".format(url)
IPython.display.HTML(iframe)

In [None]:
# Hyperparameter sweep: the two lists are parallel, so position k pairs
# hidden_size[k] with num_layers[k] (every hidden size is tried with 3-6 layers).
hidden_size = [350, 350, 350, 350, 400, 400, 400, 400, 500, 500, 500, 500]
num_layers =  [  3,   4,   5,   6,   3,   4,   5,   6,   3,   4,   5,   6]

for params in zip(hidden_size, num_layers):
    # Re-seed torch before every run so each configuration starts from the same
    # RNG state (weight init, random initial hidden state) and the runs are
    # comparable and repeatable. Shuffling done inside the loaders through
    # other RNGs, if any, is not covered by this seed.
    torch.manual_seed(42)
    t = ["{}: {}".format(name, val) for name, val in [("hidden size", params[0]), ("num layers", params[1])]]
    print("-------------------\nTraining Parameters\n-------------------\n\n{}\n-------------------".format("\n".join(t)))
    # One log directory per configuration, named after its parameters.
    train_rnn(train_loader, val_loader, *params, Logger("runs/{}".format("_".join(t))))
    print()

-------------------
Training Parameters
-------------------

hidden size: 350
num layers: 3
-------------------
Epoch 1
