In [1]:
# use a baseline trained only with PPL, which has already converged and will be used as our baseline.
# they do teacher forcing all the way.

In [2]:
from data_generators import get_iterator, get_dataset
from classifiers import theme_classifier

In [3]:
import torch
import torch.nn as nn

In [4]:
from torchtext.vocab import GloVe 
GLOVE_EMBEDDING = GloVe(name="6B", dim=300)

In [5]:
train_dataset, val_dataset, test_dataset, review_text_FIELD, theme_FIELD = get_dataset(vectors = GLOVE_EMBEDDING)

In [6]:
batch_size = 20
train_iter = get_iterator(train_dataset, batch_size, train=True, shuffle=True, repeat=False)
val_iter = get_iterator(val_dataset, batch_size, train=False, shuffle=True, repeat=False)
test_iter = get_iterator(test_dataset, batch_size, train=False, shuffle=True, repeat=False)

In [7]:
val_list = list(val_iter)

In [8]:
batch = val_list[50]
x = batch.review_text.transpose(1, 0).int()[:10]
y = batch.theme.int()

for idx in range(x.shape[0]):
    #print(x.shape, y.shape)
    print("{} | {}".format(' '.join([train_dataset.fields['review_text'].vocab.itos[_] for _ in x[idx]]),
         train_dataset.fields['theme'].vocab.itos[y[idx]]))

<sos> awesome sauce <eos> | other
<sos> totally recommend <eos> | other
<sos> <unk> . <eos> | other
<sos> truly amazing <eos> | other
<sos> talky . <eos> | other
<sos> must see <eos> | other
<sos> 3.5 5 <eos> | other
<sos> 5 stars <eos> | other
<sos> movie ? <eos> | other
<sos> brutal . <eos> | other


In [9]:
theme_FIELD.vocab.itos

['<unk>', 'other', 'plot', 'acting', 'effect', 'production']

In [10]:
vocab_size = review_text_FIELD.vocab.vectors.shape[0]
label_size = len(theme_FIELD.vocab) - 1
emb_dim = review_text_FIELD.vocab.vectors.shape[1]
vectors = train_dataset.fields["review_text"].vocab.vectors
hidden_dim = 1024
layers = 2
dropout = .5

vocab_size, label_size, emb_dim, vectors.shape

(12304, 5, 300, torch.Size([12304, 300]))

In [11]:
from baseline_model import BaseModel, repackage_hidden

In [13]:
import numpy as np
import gc

In [14]:
EOS_token = review_text_FIELD.vocab.stoi['<eos>']
EOS_token

3

In [15]:
def forward_pass(model, data, labels, i):
    
    split_tf = data.shape[0] - (i % data.shape[0])
    #print(split_tf)
    
    seq_len = data.shape[0]
    data_tf = data[:split_tf,:]
    data_nf = data[split_tf:,:]

    output_flat = None
    hidden = None
    
    if split_tf > 0:

        data = data_tf
        output_tf, hidden = model(data, labels, hidden)
        repackage_hidden(hidden)
        output_flat = output_tf.contiguous().view(-1, vocab_size)

    if split_tf < seq_len:

        data = data_nf
        shape = tuple((*data_nf.shape, vocab_size))
        output_nf = torch.zeros(shape).cuda()
        hidden_i = None
        data_i = data[0,:]    

        for di in range(data_nf.shape[0]):
            params = data_i.unsqueeze(0), labels, hidden_i
            output_i, hidden_i = model(*params)
            hidden_i = repackage_hidden(hidden_i)
            topv, topi = output_i.topk(1)
            data_i = topi.squeeze().detach()
            output_nf[di,:] = output_i

        temp_output_flat = output_nf.contiguous().view(-1, vocab_size)
        if output_flat is None:
            output_flat = temp_output_flat
        else:
            output_flat = torch.cat([output_flat, temp_output_flat], 0)
            
    return output_flat

In [16]:
def evaluate(model, data_source, criterion, teacher_forcing = False):
    model.eval()
    total_loss_e = 0
    total_number_of_words = 0
    
    with torch.no_grad():
        for i, batch in enumerate(data_source):
            labels = batch.theme.cuda().long() - 1
            batch = batch.review_text.cuda().long()
            
            if batch.shape[0] > 3:
                data, targets = batch[1:-1,:], batch[2:,:]
                target_flat = targets.contiguous().view(-1)
                
                tf = i if not teacher_forcing else 0
                output_flat = forward_pass(model, data, labels, tf)
                
                batch_loss = criterion(output_flat, target_flat).detach().item()
                number_of_words = data.shape[0] * data.shape[1]
                total_loss_e += batch_loss * number_of_words
                total_number_of_words += number_of_words
            
    return total_loss_e / total_number_of_words

In [17]:
def train(model, ep0, epN, train_iter, dev_iter, optimizer, criterion, 
          max_grad_norm, model_name, best_ppl = float('inf'), teacher_forcing = False):
    
    best_ppl = best_ppl
    
    len_train_iter = len(train_iter)
    for epoch in range(ep0, epN):
        model.train()
        total_loss_e = 0
        total_number_of_words = 0 
        
        for i, batch in enumerate(train_iter):

            labels = batch.theme.cuda().long() - 1
            batch = batch.review_text.cuda().long()
            hidden = None
            
            if batch.shape[0] > 3:
                data, targets = batch[1:-1,:], batch[2:,:]
                
                tf = i if not teacher_forcing else 0
                output_flat = forward_pass(model, data, labels, tf)
                
                target_flat = targets.contiguous().view(-1)
                batch_loss = criterion(output_flat, target_flat)
                
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()

                number_of_words = data.shape[0] * data.shape[1]
                total_loss_e += batch_loss.detach().item() * number_of_words
                total_number_of_words += number_of_words
            
                
                if i % 500 == 0:
                    cur_loss = batch_loss.detach().item() 
                    tr_ppl_print = np.exp(cur_loss)
                    print("| epoch {:3d} | batch {} / {} | train_loss {} | train_ppl {}".format(
                            epoch, i, len_train_iter, 
                            np.round(cur_loss, 3), np.round(tr_ppl_print, 3)))

                
                if i % 4999 == 1: #len_train_iter - 1:
                    cur_loss = batch_loss.detach().item()
                    tr_ppl_print = np.exp(cur_loss)
                    gc.collect()
                    val_loss_eval = evaluate(model, dev_iter, criterion)
                    val_ppl_print = np.exp(val_loss_eval)
                    
                    template = "| epoch {:3d} | batch {} / {} | train_loss {} | train_ppl {} | val_loss {} | val_ppl {}"
                    print(template.format(
                            epoch, i, len_train_iter, 
                            np.round(cur_loss, 3), np.round(tr_ppl_print, 3), 
                            np.round(val_loss_eval, 3), np.round(val_ppl_print, 3)))

                    if val_ppl_print < best_ppl :
                        print('old best ppl {} new best ppl {}'.format(best_ppl, val_ppl_print))
                        best_ppl = val_ppl_print
                        best_model_name = '{}_{}.model'.format(model_name, best_ppl)
                        print('save model...', best_model_name)
                        with open(best_model_name, 'wb') as file:
                            torch.save(model, file) 

                    gc.collect()
                    model.train()
                    
                if i == 40000: break

In [31]:
with open('./no-teacher-forcing/model_ppl_188.3817233517709.model', 'rb') as file:
    model = torch.load(file)
    model.eval()

In [32]:
model

BaseModel(
  (drop): Dropout(p=0.5)
  (word_embedding): Embedding(12304, 300)
  (label_embedding): Embedding(5, 20)
  (rnn): LSTM(320, 1024, num_layers=2, dropout=0.5)
  (decoder): Linear(in_features=1024, out_features=12304, bias=True)
)

In [33]:
learning_rate = 0.001
criterion = nn.CrossEntropyLoss(reduction='mean',
                       ignore_index=train_dataset.fields["review_text"].vocab.stoi['<pad>']).cuda()
optimizer = torch.optim.Adadelta(model.parameters(), lr=learning_rate)#, betas=(.9999, .9999))

In [34]:
#evaluate(model, val_iter, criterion, teacher_forcing=False)

In [35]:
np.exp(5.300473668434095)

200.43172567537505

In [None]:
train(model,
      ep0 = 3,
      epN = 4,
      train_iter = train_iter,
      dev_iter = val_iter,
      optimizer = optimizer,
      criterion = criterion,
      max_grad_norm = 10,
      model_name = 'no-teacher-forcing/model_ppl',
      best_ppl = 188.382)

| epoch   3 | batch 500 / 35138 | train_loss 3.841 | train_ppl 46.573
| epoch   3 | batch 1000 / 35138 | train_loss 3.067 | train_ppl 21.467
| epoch   3 | batch 1500 / 35138 | train_loss 3.672 | train_ppl 39.318
| epoch   3 | batch 2000 / 35138 | train_loss 4.235 | train_ppl 69.031
| epoch   3 | batch 2500 / 35138 | train_loss 4.057 | train_ppl 57.773
| epoch   3 | batch 3000 / 35138 | train_loss 4.128 | train_ppl 62.074
| epoch   3 | batch 3500 / 35138 | train_loss 4.615 | train_ppl 100.944
| epoch   3 | batch 4000 / 35138 | train_loss 4.548 | train_ppl 94.432
| epoch   3 | batch 4500 / 35138 | train_loss 4.481 | train_ppl 88.351
| epoch   3 | batch 5000 / 35138 | train_loss 4.839 | train_ppl 126.329
| epoch   3 | batch 5000 / 35138 | train_loss 4.839 | train_ppl 126.329 | val_loss 5.238 | val_ppl 188.208
old best ppl 188.382 new best ppl 188.20771813579665
save model... no-teacher-forcing/model_ppl_188.20771813579665.model


  "type " + obj.__name__ + ". It won't be checked "


| epoch   3 | batch 5500 / 35138 | train_loss 5.043 | train_ppl 154.944
| epoch   3 | batch 6000 / 35138 | train_loss 4.176 | train_ppl 65.094
| epoch   3 | batch 6500 / 35138 | train_loss 4.643 | train_ppl 103.83
| epoch   3 | batch 7000 / 35138 | train_loss 5.17 | train_ppl 175.931
| epoch   3 | batch 7500 / 35138 | train_loss 4.384 | train_ppl 80.171
| epoch   3 | batch 8000 / 35138 | train_loss 5.119 | train_ppl 167.205
| epoch   3 | batch 8500 / 35138 | train_loss 3.956 | train_ppl 52.264
| epoch   3 | batch 9000 / 35138 | train_loss 4.272 | train_ppl 71.662
| epoch   3 | batch 9500 / 35138 | train_loss 5.216 | train_ppl 184.212


| epoch   2 | batch 500 / 35138 | train_loss 1.877 | train_ppl 6.534
| epoch   2 | batch 1000 / 35138 | train_loss 1.729 | train_ppl 5.632
| epoch   2 | batch 1500 / 35138 | train_loss 1.975 | train_ppl 7.205
| epoch   2 | batch 2000 / 35138 | train_loss 3.559 | train_ppl 35.135
| epoch   2 | batch 2500 / 35138 | train_loss 3.516 | train_ppl 33.64
| epoch   2 | batch 3000 / 35138 | train_loss 3.418 | train_ppl 30.512
| epoch   2 | batch 3500 / 35138 | train_loss 3.915 | train_ppl 50.149
| epoch   2 | batch 4000 / 35138 | train_loss 4.437 | train_ppl 84.486
| epoch   2 | batch 4500 / 35138 | train_loss 3.952 | train_ppl 52.045
| epoch   2 | batch 5000 / 35138 | train_loss 4.573 | train_ppl 96.801
| epoch   2 | batch 5000 / 35138 | train_loss 4.573 | train_ppl 96.801 | val_loss 5.813 | val_ppl 334.701
| epoch   2 | batch 5500 / 35138 | train_loss 4.864 | train_ppl 129.486
| epoch   2 | batch 6000 / 35138 | train_loss 3.992 | train_ppl 54.139
| epoch   2 | batch 6500 / 35138 | train_loss 

  "type " + obj.__name__ + ". It won't be checked "


| epoch   2 | batch 20000 / 35138 | train_loss 4.591 | train_ppl 98.579
| epoch   2 | batch 20500 / 35138 | train_loss 4.357 | train_ppl 77.992
| epoch   2 | batch 21000 / 35138 | train_loss 4.41 | train_ppl 82.289
| epoch   2 | batch 21500 / 35138 | train_loss 6.286 | train_ppl 537.266
| epoch   2 | batch 22000 / 35138 | train_loss 4.862 | train_ppl 129.255
| epoch   2 | batch 22500 / 35138 | train_loss 5.501 | train_ppl 244.908
| epoch   2 | batch 23000 / 35138 | train_loss 4.533 | train_ppl 93.076
| epoch   2 | batch 23500 / 35138 | train_loss 4.756 | train_ppl 116.317
| epoch   2 | batch 24000 / 35138 | train_loss 4.855 | train_ppl 128.354
| epoch   2 | batch 24500 / 35138 | train_loss 4.63 | train_ppl 102.533


KeyboardInterrupt: 

In [None]:
train(model,
      ep0 = 1,
      epN = 2,
      train_iter = train_iter,
      dev_iter = val_iter,
      optimizer = optimizer,
      criterion = criterion,
      max_grad_norm = 10,
      model_name = 'no-teacher-forcing/model_ppl',
      best_ppl = 200.43172567537505)

| epoch   0 | batch 500 / 35138 | train_loss 2.503 | train_ppl 12.218
| epoch   0 | batch 1000 / 35138 | train_loss 2.95 | train_ppl 19.103
| epoch   0 | batch 1500 / 35138 | train_loss 2.853 | train_ppl 17.337
| epoch   0 | batch 2000 / 35138 | train_loss 3.617 | train_ppl 37.235
| epoch   0 | batch 2500 / 35138 | train_loss 3.536 | train_ppl 34.321
| epoch   0 | batch 3000 / 35138 | train_loss 3.504 | train_ppl 33.265
| epoch   0 | batch 3500 / 35138 | train_loss 4.776 | train_ppl 118.588
| epoch   0 | batch 4000 / 35138 | train_loss 5.511 | train_ppl 247.355
| epoch   0 | batch 4500 / 35138 | train_loss 4.247 | train_ppl 69.926
| epoch   0 | batch 5000 / 35138 | train_loss 4.605 | train_ppl 100.01
| epoch   0 | batch 5000 / 35138 | train_loss 4.605 | train_ppl 100.01 | val_loss 6.354 | val_ppl 574.88
old best ppl inf new best ppl 574.8795707969175
save model... no-teacher-forcing/model_ppl_574.8795707969175.model


  "type " + obj.__name__ + ". It won't be checked "


| epoch   0 | batch 5500 / 35138 | train_loss 6.005 | train_ppl 405.364
| epoch   0 | batch 6000 / 35138 | train_loss 3.74 | train_ppl 42.103
| epoch   0 | batch 6500 / 35138 | train_loss 4.746 | train_ppl 115.148
| epoch   0 | batch 7000 / 35138 | train_loss 5.954 | train_ppl 385.253
| epoch   0 | batch 7500 / 35138 | train_loss 4.211 | train_ppl 67.427
| epoch   0 | batch 8000 / 35138 | train_loss 6.44 | train_ppl 626.429
| epoch   0 | batch 8500 / 35138 | train_loss 4.041 | train_ppl 56.855
| epoch   0 | batch 9000 / 35138 | train_loss 4.318 | train_ppl 75.062
| epoch   0 | batch 9500 / 35138 | train_loss 5.659 | train_ppl 286.859
| epoch   0 | batch 9999 / 35138 | train_loss 4.001 | train_ppl 54.663 | val_loss 5.534 | val_ppl 253.173
old best ppl 574.8795707969175 new best ppl 253.172912038308
save model... no-teacher-forcing/model_ppl_253.172912038308.model
| epoch   0 | batch 10000 / 35138 | train_loss 4.209 | train_ppl 67.273
| epoch   0 | batch 10500 / 35138 | train_loss 5.15 |

KeyboardInterrupt: 