In [1]:
from data_generators import get_iterator, get_dataset
from classifiers import theme_classifier

In [2]:
import torch
import torch.nn as nn

In [3]:
from torchtext.vocab import GloVe 
GLOVE_EMBEDDING = GloVe(name="6B", dim=300)

In [4]:
train_dataset, val_dataset, test_dataset, review_text_FIELD, theme_FIELD = get_dataset(vectors = 
                                                                                       GLOVE_EMBEDDING)

In [5]:
batch_size = 32
train_iter = get_iterator(train_dataset, batch_size, train=True, shuffle=False, repeat=False)
val_iter = get_iterator(val_dataset, batch_size, train=False, shuffle=False, repeat=False)
test_iter = get_iterator(test_dataset, batch_size, train=False, shuffle=False, repeat=False)

In [6]:
list_train = list(train_iter)

In [7]:
batch = list_train[5600]
x = batch.review_text.transpose(1, 0).int()[:10]
y = batch.theme.int()

for idx in range(x.shape[0]):
    #print(x.shape, y.shape)
    print("{} | {}".format(' '.join([train_dataset.fields['review_text'].vocab.itos[_] for _ in x[idx]]),
         train_dataset.fields['theme'].vocab.itos[y[idx]]))

great micro budget film with decent writing and acting . | plot
vince vaughn 's acting was really good as well . | acting
mark hamil returning as joker saves this mildly disappointing film | other
for those in doubt , locke is the proof . | other
great mixture of epic battles and individual character development . | plot
i would give it 0 stars if i could . | other
i could n't take my eyes off the screen . | other
a slightly odd cast in the very strange film . | acting
the overall feel is boredom interspersed with nasty violence . | other
too many boobs and too many repeated voice overs . | effect


In [8]:
review_text_FIELD.vocab.vectors.shape, len(review_text_FIELD.vocab.itos)

(torch.Size([30002, 300]), 30002)

In [9]:
vocab_size = review_text_FIELD.vocab.vectors.shape[0]
emb_dim = review_text_FIELD.vocab.vectors.shape[1]
vectors = train_dataset.fields["review_text"].vocab.vectors
hidden_dim = 500
layers = 2
dropout = .2

In [10]:
class BaseModel(nn.Module):

    def __init__(self, 
                 ninp = vocab_size, 
                 emb_dim = emb_dim, 
                 nhid = hidden_dim, 
                 nout = vocab_size, 
                 nlayers = layers, 
                 dropout = dropout, 
                 vectors = vectors,
                 pretrained = False):
        super().__init__()
        
        self.ninp = ninp
        self.emb_dim = emb_dim
        self.nhid = nhid
        self.nout = nout
        self.nlayers = nlayers
        self.drop = nn.Dropout(dropout)

        self.embedding = nn.Embedding(ninp, emb_dim)
        self.rnn = nn.LSTM(emb_dim, nhid, nlayers, dropout=dropout)
        self.rnn.flatten_parameters()
        self.decoder = nn.Linear(nhid, nout)
        self.softmax = nn.LogSoftmax(dim=-1)

        if pretrained:
            self.encoder.weight.data = vectors
            
    def init_weights(self):
        initrange = .1
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, X, hidden):
        X = self.embedding(X)
        X = self.drop(X)
        X, hidden = self.rnn(X, hidden)
        X = self.drop(X)
        X = X.view(X.size(0)*X.size(1), X.size(2))
        X = self.decoder(X)
        log_probs = self.softmax(X)
        return log_probs, hidden
      
    def init_hidden(self, bsz):
        weight = next(self.parameters())
        output = (weight.new_zeros(self.nlayers, bsz, self.nhid).cuda(),
            weight.new_zeros(self.nlayers, bsz, self.nhid).cuda())
        return output

In [11]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [59]:
model = BaseModel().cuda()

In [60]:
# implement PPL
learning_rate = 0.001
criterion = nn.NLLLoss(reduction='sum', 
                       ignore_index=train_dataset.fields["review_text"].vocab.stoi['<pad>']).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [73]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss_e = 0
    
    total_number_of_words = 0
    with torch.no_grad():
        for i, batch in enumerate(data_source):
            if i == 50: break
            batch = batch.review_text.cuda().long()
            hidden = model.init_hidden(batch.shape[1])
            if batch.shape[0] > 1:
                
                data, targets = batch[:-1,:], batch[1:,:]
                number_of_words = data.shape[0]*data.shape[1]
                
                output, hidden = model(data, hidden)
                output_flat = output.contiguous().view(-1, vocab_size)

                total_loss_e += criterion(output_flat, targets.contiguous().view(-1)).data.float()
                total_number_of_words += number_of_words
                hidden = repackage_hidden(hidden)
            
    return (total_loss_e.item() / total_number_of_words)

In [74]:
#for batch in val_iter:
#    print(batch.review_text.shape)

In [78]:
import numpy as np
np.exp(evaluate(val_iter))

29992.535056669723

In [56]:
batch.review_text[:-1,:].shape

torch.Size([598, 3])

In [57]:
batch.review_text.shape

torch.Size([599, 3])