In [2]:
%matplotlib inline

from gensim import models
import numpy as np
import matplotlib.pyplot as plt
import text_utils as tu
import multiprocessing
import os

import torch
import torch.utils.data
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms

import torchtext
import torchtext.vocab as vocab

# Datasets and Dataloaders

In [37]:
def create_report_examples(path,
                           glove_path='/home/rohanmirchandani/glove/glove.6B.50d.w2vformat.txt',
                           output_dir='/home/rohanmirchandani/maxwell-pt-test/examples/',
                           embedding_dim=50,
                           min_length=10,
                           max_length=300):
    """Convert raw reports into padded index sequences, saved one ``.npy`` file per report.

    Each report body is cleaned and tokenised with ``tu.clean_report``, wrapped in
    ``<SOS>`` / ``<EOS>`` markers, mapped to vocabulary indices (unknown tokens map
    to ``<UNK>``) and zero-padded to ``max_length`` rows. Reports whose wrapped
    length falls outside ``[min_length, max_length]`` are skipped.

    Args:
        path: ``.npy`` file holding a sequence of records, each with a ``'body'`` entry.
        glove_path: GloVe vectors file handed to ``tu.load_glove``.
        output_dir: existing directory that receives ``example_<i>.npy`` files,
            where ``<i>`` is the report's position in the input (skipped reports
            leave gaps in the numbering).
        embedding_dim: width of the GloVe vectors.
        min_length: minimum token count (including ``<SOS>``/``<EOS>``) to keep a report.
        max_length: maximum token count; also the padded sequence length.

    Returns:
        None. Results are written to ``output_dir``.
    """
    # The records are indexed by 'body', so the file presumably stores pickled
    # Python objects; current NumPy refuses to load those unless allow_pickle is set.
    # NOTE(security): unpickling executes arbitrary code -- only load trusted files.
    raw_reports = np.load(path, allow_pickle=True)
    dirty_reports = [report['body'] for report in raw_reports]
    clean_reports, _ = tu.clean_report(dirty_reports, clean=1)  # first pass removes \n's and weird characters
    tokenised_reports, report_vocab = tu.clean_report(clean_reports, clean=2)  # second pass tokenises and builds vocab
    word_to_index, embeddings = tu.load_glove(glove_path, report_vocab, embedding_dim)

    # Append one embedding row per special token; each token's index is the row it
    # occupies. NOTE(review): the extended matrix is neither returned nor saved, so
    # downstream code must rebuild it in the same order -- confirm against tu.create_dataloader.
    for special_token, fill_value in (('<SOS>', 0.0), ('<EOS>', 1.0), ('<UNK>', -1.0)):
        word_to_index[special_token] = embeddings.shape[0]
        embeddings = np.vstack((embeddings, np.full((1, embedding_dim), fill_value)))
    unk_index = word_to_index['<UNK>']

    for i, tokens in enumerate(tokenised_reports):  # should multithread this at some point
        tokens = ['<SOS>'] + tokens + ['<EOS>']
        length = len(tokens)
        if length > max_length or length < min_length:
            continue
        # Column vector of vocabulary indices, shape (length, 1).
        indices = np.array([[word_to_index.get(token, unk_index) for token in tokens]]).transpose()
        # Padding value is 0. NOTE(review): index 0 is presumably also a real
        # vocabulary entry, so padding is indistinguishable from that word -- verify.
        padding = np.zeros((max_length - length, 1))
        vecs = np.vstack((indices, padding))
        data = {'tokens': tokens, "vectors": vecs}
        name = "example_{}".format(i)
        np.save(os.path.join(output_dir, name), data)

In [None]:
# Build the per-report example files from the raw report dump (writes to disk; returns nothing).
create_report_examples('/home/rohanmirchandani/maxwell-pt-test/points.npy')

# GRU Encoder/Decoder Testing

In [3]:
class EncoderGRU(nn.Module):
    """Single-layer GRU encoder.

    Args:
        input_dim: feature width of each input step.
        hidden_dim: width of the GRU hidden state.
    """

    def __init__(self, input_dim, hidden_dim):
        super(EncoderGRU, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        self.gru = nn.GRU(self.input_dim, self.hidden_dim)

    def forward(self, x, hidden):
        """Run the GRU over ``x`` starting from ``hidden``.

        Args:
            x: input in nn.GRU's default layout, (seq_len, batch, input_dim).
            hidden: initial hidden state, (1, batch, hidden_dim).

        Returns:
            Tuple ``(output, hidden)`` exactly as returned by ``nn.GRU``.
        """
        output, hidden = self.gru(x, hidden)
        return output, hidden

    def init_hidden(self, bs):
        """Return an all-zero initial hidden state of shape (1, bs, hidden_dim).

        The state is cast to the same tensor type as the module's parameters, so
        it stays usable after the module has been moved with ``.cuda()`` or
        converted with ``.double()``.
        """
        reference = next(self.parameters()).data
        result = Variable(torch.zeros(1, bs, self.hidden_dim).type_as(reference))
        return result

In [4]:
class DecoderGRU(nn.Module):
    """Single-layer GRU decoder followed by a linear projection and log-softmax.

    Args:
        output_dim: width of the projected output.
        hidden_dim: width of the GRU hidden state; also the expected input width,
            because the GRU is built as ``nn.GRU(hidden_dim, hidden_dim)``.
    """

    def __init__(self, output_dim, hidden_dim):
        super(DecoderGRU, self).__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim

        self.gru = nn.GRU(self.hidden_dim, self.hidden_dim)
        self.out = nn.Linear(self.hidden_dim, self.output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        """Decode from ``x`` and return log-probabilities for the first time step.

        Args:
            x: input in nn.GRU's default layout, (seq_len, batch, hidden_dim).
            hidden: initial hidden state, (1, batch, hidden_dim).

        Returns:
            Tuple ``(output, hidden)`` where ``output`` is (batch, output_dim)
            log-probabilities and ``hidden`` is the GRU's final hidden state.
        """
        output, hidden = self.gru(x, hidden)
        # Only time step 0 of the GRU output is projected; later steps are dropped.
        output = self.out(output[0])
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self, bs):
        """Return an all-zero initial hidden state of shape (1, bs, hidden_dim).

        The state is cast to the same tensor type as the module's parameters, so
        it stays usable after the module has been moved with ``.cuda()`` or
        converted with ``.double()``.
        """
        reference = next(self.parameters()).data
        result = Variable(torch.zeros(1, bs, self.hidden_dim).type_as(reference))
        return result

In [6]:
# Loader over the directory that create_report_examples writes its example files to.
# batch_size=1: presumably one report per batch -- confirm in tu.create_dataloader.
dataloader = tu.create_dataloader('/home/rohanmirchandani/maxwell-pt-test/examples/', batch_size=1)

In [88]:
# Explicit iterator so a single batch can be pulled with next() for inspection.
iterator = iter(dataloader)

In [89]:
# Pull one batch and check the token count and the tensor shape.
tokens, vectors = next(iterator)
print(len(tokens))
print(vectors.shape)

53
torch.Size([1, 35, 50])


In [108]:
# CPU instances for shape testing: 50-wide inputs/outputs with a 50-wide hidden state.
E = EncoderGRU(input_dim=50, hidden_dim=50)
D = DecoderGRU(output_dim=50, hidden_dim=50)

In [109]:
# Wrap the batch as a float Variable and create zero hidden states for both networks.
# The hidden-state batch size is taken from dim 1 of `vectors`.
# NOTE(review): the shape printed for this batch is [1, 35, 50]; with nn.GRU's
# default (seq_len, batch, feature) layout this presumably treats every token as
# its own length-1 sequence -- confirm that is intended.
inputs = Variable(vectors.float())
e_hidden = E.init_hidden(vectors.size(1))
d_hidden = D.init_hidden(vectors.size(1))

In [111]:
# Encode the batch; z_hidden is the GRU's final hidden state, which the next cell feeds to the decoder.
output, z_hidden = E(inputs, e_hidden)

In [114]:
# Decode using the encoder's hidden state as the decoder's input sequence.
# Note: this rebinds `output`, discarding the encoder output from the previous cell.
output, final_hidden = D(z_hidden, d_hidden)

In [115]:
# Display the decoder output (log-softmax values) from the previous cell.
output

Variable containing:
-3.8827 -3.9737 -3.8523  ...  -4.0064 -3.8536 -3.9493
-3.8884 -3.9655 -3.8116  ...  -3.9746 -3.8851 -3.9170
-3.8900 -3.9792 -3.8736  ...  -3.9952 -3.8475 -3.9095
          ...             ⋱             ...          
-3.9041 -3.9987 -3.8122  ...  -4.0027 -3.8208 -3.9384
-3.8993 -3.9575 -3.8215  ...  -3.9901 -3.8706 -3.9202
-3.9041 -3.9987 -3.8122  ...  -4.0027 -3.8208 -3.9384
[torch.FloatTensor of size 35x50]

# Training

In [None]:
# Train the encoder/decoder pair end-to-end: the loss compares the decoder output
# with the encoder input, i.e. the pair is trained as an autoencoder.
epochs = 1
# NOTE(review): DecoderGRU ends in LogSoftmax, so `outputs` are log-probabilities
# while `inputs` are raw float vectors -- confirm MSE between them is the intended objective.
criterion = nn.MSELoss()

# Used below as the hidden-state width; the models are built with literal 50s,
# so the two must be kept in sync by hand.
embedding_dim = 50

# Fresh GPU models wrapped in DataParallel; these rebind the E and D names.
E = nn.DataParallel(EncoderGRU(50, 50).cuda())
D = nn.DataParallel(DecoderGRU(50, 50).cuda())

# One Adam optimiser (default hyperparameters) over both networks' parameters.
optm = optim.Adam(list(E.parameters()) + list(D.parameters()))

dataloader = tu.create_dataloader('/home/rohanmirchandani/maxwell-pt-test/examples/', batch_size=1)

for epoch in range(epochs):
    
    for tokens, vectors in dataloader:
        
        # Zero hidden states are built by hand here (not via init_hidden) and moved to the GPU.
        # The batch size is read from dim 1 of `vectors`.
        e_hidden = Variable(torch.zeros(1, vectors.shape[1], embedding_dim)).cuda()
        d_hidden = Variable(torch.zeros(1, vectors.shape[1], embedding_dim)).cuda()
        
        optm.zero_grad()
        
        inputs = Variable(vectors.float()).cuda()
        # z_output is not used afterwards; only the final hidden state is passed on.
        z_output, e_hidden = E(inputs, e_hidden)
        # The encoder's final hidden state serves as the decoder's input sequence.
        outputs, d_hidden = D(e_hidden, d_hidden)
        loss = criterion(outputs, inputs)
        print(loss)  # printed on every step
        loss.backward()
        optm.step()