In [1]:
from data_generators import get_iterator, get_dataset
from classifiers import theme_classifier

In [2]:
from torchtext.vocab import GloVe 
# Pretrained GloVe word vectors (name="6B", 300 dimensions); handed to
# get_dataset() as `vectors` in a later cell.
GLOVE_EMBEDDING = GloVe(name="6B", dim=300)

In [6]:
# get_dataset() hands back five objects: the three dataset splits followed by
# the two field objects (review text and theme). The GloVe vectors are passed
# in as `vectors`.
(
    train_dataset,
    val_dataset,
    test_dataset,
    review_text_FIELD,
    theme_FIELD,
) = get_dataset(vectors=GLOVE_EMBEDDING)

In [7]:
# One iterator per split, all built with the same settings (second argument 32,
# shuffle=False, repeat=False); only the training split is created with
# train=True.
# NOTE(review): shuffle=False on the training split presumably keeps batch
# order fixed between runs — confirm this is intended for real training.
train_iter, val_iter, test_iter = [
    get_iterator(split, 32, train=is_train, shuffle=False, repeat=False)
    for split, is_train in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
]

In [8]:
# Exhaust the training iterator once and keep every batch in a list so that
# individual batches can be indexed in later cells.
list_train = list(train_iter)

In [9]:
# Sanity check: take one training batch and print every review in it next to
# its theme label, mapping integer ids back to strings through the vocabularies.
batch = list_train[5600]
# Swap the two axes so that x[idx] is a single review.
# (presumably (seq_len, batch) -> (batch, seq_len) — TODO confirm)
x = batch.review_text.transpose(1, 0).int()
y = batch.theme.int()

# index -> string lookup tables of the two fields
text_itos = train_dataset.fields['review_text'].vocab.itos
theme_itos = train_dataset.fields['theme'].vocab.itos

for idx in range(x.shape[0]):
    words = [text_itos[token] for token in x[idx]]
    line = "{} | {}".format(' '.join(words), theme_itos[y[idx]])
    print(line)

great micro budget film with decent writing and acting . | plot
vince vaughn 's acting was really good as well . | acting
mark hamil returning as joker saves this mildly disappointing film | other
for those in doubt , locke is the proof . | other
great mixture of epic battles and individual character development . | plot
i would give it 0 stars if i could . | other
i could n't take my eyes off the screen . | other
a slightly odd cast in the very strange film . | acting
the overall feel is boredom interspersed with nasty violence . | other
too many boobs and too many repeated voice overs . | effect
and daniel radcliffe and zoe <unk> prove just that . | other
the story seemed to be one big loop hole . | plot
the trailer was more suspenseful than the actual movie . | effect
i 'd give it 0.5 stars if i could . | other
the visuals are tremendous and the acting very good . | acting
now , that 's how you make an entrance . | other
no plot , no acting and no reason to see | plot
there are a lo

In [10]:
# Display the shape of the vocabulary's vector matrix alongside the number of
# entries in its index-to-string table, to compare row count with vocab size.
review_text_FIELD.vocab.vectors.shape, len(review_text_FIELD.vocab.itos)

(torch.Size([30002, 300]), 30002)

In [20]:
# Model hyperparameters. The vocabulary size and embedding width are read off
# the (rows, columns) shape of the vocabulary's vector matrix; the rest are
# fixed by hand.
vocab_size, emb_dim = review_text_FIELD.vocab.vectors.shape
hidden_dim = 500   # LSTM hidden size
layers = 2         # number of stacked LSTM layers
dropout = 0.2      # dropout probability

In [21]:
import torch.nn as nn

class BaseModel(nn.Module):
    """LSTM language model.

    Pipeline: embedding -> dropout -> multi-layer LSTM -> dropout -> linear
    projection onto the vocabulary -> log-softmax.

    Args:
        nhid: Hidden size of every LSTM layer.
        emb_dim: Width of the embedding vectors; this is also the LSTM input
            size, because the embeddings are fed straight into the LSTM.
        ninp: Accepted for backward compatibility only. It used to size the
            LSTM input, which did not match the embedding width whenever
            ``ninp != emb_dim``; the LSTM input size is now always ``emb_dim``.
        nlayers: Number of stacked LSTM layers.
        ntoken: Vocabulary size (rows of the embedding table and width of the
            decoder output).
        dropout: Dropout probability used on the embeddings, between LSTM
            layers and on the LSTM output.
        pretrained: If True, the embedding table is initialised with a copy of
            ``train_dataset.fields["review_text"].vocab.vectors``; otherwise
            it keeps ``nn.Embedding``'s default initialisation.
    """

    def __init__(self, nhid = hidden_dim, emb_dim = emb_dim, ninp = hidden_dim,
                 nlayers = layers, ntoken = vocab_size, dropout = dropout, pretrained = False):
        super().__init__()

        self.drop = nn.Dropout(dropout)

        self.pretrained = pretrained
        # The encoder is always needed (forward() and init_weights() use it),
        # so it is created regardless of `pretrained`.
        self.encoder = nn.Embedding(ntoken, emb_dim)

        self.softmax = nn.LogSoftmax(dim=-1)
        # The LSTM consumes the embedding output, so its input size must be
        # emb_dim rather than ninp.
        self.rnn = nn.LSTM(emb_dim, nhid, nlayers, dropout=dropout)
        self.rnn.flatten_parameters()
        self.decoder = nn.Linear(nhid, ntoken)
        self.nhid = nhid
        self.nlayers = nlayers

        if pretrained:
            # Copy instead of re-binding `.data`: re-binding would make the
            # embedding share storage with the vocabulary's vectors (training
            # would then modify them in place) and would silently accept a
            # tensor of the wrong shape. copy_() raises on a shape mismatch.
            vectors = train_dataset.fields["review_text"].vocab.vectors
            self.encoder.weight.data.copy_(vectors)

    def init_weights(self):
        """Re-initialise encoder/decoder weights uniformly in [-0.1, 0.1] and zero the decoder bias.

        Note: this overwrites the embedding table, including pretrained
        vectors loaded in ``__init__``.
        """
        initrange = .1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        """Run one forward pass.

        Args:
            input: Integer token indices. The LSTM is not batch-first, so this
                is expected to be laid out as (seq_len, batch).
            hidden: ``(h_0, c_0)`` tuple as produced by ``init_hidden``.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` has shape
            ``(seq_len * batch, ntoken)`` and ``hidden`` is the LSTM's final
            ``(h_n, c_n)`` state.
        """
        emb = self.encoder(input)
        emb = self.drop(emb)
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        # Merge the first two axes so the decoder scores every position at once.
        output = output.view(output.size(0)*output.size(1), output.size(2))
        decoded = self.decoder(output)
        log_probs = self.softmax(decoded)
        return log_probs, hidden

    def init_hidden(self, bsz):
        """Return zero-filled ``(h_0, c_0)`` states for a batch of size ``bsz``.

        Each tensor has shape ``(nlayers, bsz, nhid)``. ``new_zeros`` inherits
        dtype and device from the model's parameters, so the states land on
        whatever device the model lives on (CPU or GPU) without a hard-coded
        ``.cuda()`` call.
        """
        weight = next(self.parameters())
        return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                weight.new_zeros(self.nlayers, bsz, self.nhid))