In [1]:
import os
import time
import torch
import torch.nn as nn
from torch.autograd import Variable
import math
from data_utils import * 
import model

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Batch sizes (second argument to `batchify`) for the training split and for
# the validation/test splits respectively.
train_batch_size, eval_batch_size = 32, 10

In [4]:
# Tokenize the data. Give the correct path. For ptb must have ptb in the path.
# The directory can be overridden with the PTB_DATA_DIR environment variable so
# the notebook is not tied to one machine; when the variable is unset, the
# original hard-coded location is used.
ptb_data_dir = os.environ.get(
    'PTB_DATA_DIR', '/Users/arijitsehanobish/simple-examples/ptb_data'
)
corpus = Corpus(ptb_data_dir)

In [5]:
# Batchify every split. Per the original note, each result has
# size (total_len // bsz, bsz).
train_data, val_data, test_data = (
    batchify(split, bsz)
    for split, bsz in (
        (corpus.train, train_batch_size),
        (corpus.valid, eval_batch_size),
        (corpus.test, eval_batch_size),
    )
)

In [6]:
# Build the model
interval = 200  # interval to report
ntokens = len(corpus.dictionary)  # number of entries in the corpus dictionary

# Model hyperparameters. `directions = 2` selects the bidirectional model
# (presumably 1 would give the unidirectional one -- TODO confirm in model.py).
# NOTE(review): going by the module repr printed below this cell, `hidden_size`
# ends up as the embedding width and `output_dim` as the LSTM hidden width.
directions = 2
hidden_size = 200
output_dim = 128
n_layers = 2

net = model.RNNModel(
    ntokens,
    hidden_size,
    output_dim,
    n_layers,
    directions=directions,
    dropout=0.5,
)

# Move the parameters to the selected device; as the cell's last expression
# this also displays the module summary.
net.to(device)

# Load checkpoint (disabled: `args` does not exist in this notebook, and
# assigning to `model` would shadow the imported `model` module).
# if args.checkpoint != '':
#     model = torch.load(args.checkpoint, map_location=lambda storage, loc: storage)

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(10000, 200)
  (rnn): LSTM(200, 128, num_layers=2, dropout=0.5, bidirectional=True)
  (decoder): Linear(in_features=256, out_features=10000, bias=True)
)


In [None]:
# Optimisation setup: cross-entropy loss, and Adam over all of `net`'s
# parameters with the learning rate and weight decay below.
lr, weight_decay = 1e-3, 1e-4
criterion = nn.CrossEntropyLoss()
opt = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)

In [None]:
def evaluate(data_source):
    """Return the length-weighted average loss of ``net`` over ``data_source``.

    The global ``net`` is switched to evaluation mode (which disables dropout)
    and run without gradient tracking. ``data_source`` is walked in steps of
    ``seq_len`` rows; each chunk's loss is weighted by the chunk length and the
    weighted sum is divided by ``len(data_source)``.

    Relies on the module-level ``net``, ``criterion``, ``device``,
    ``directions`` and ``eval_batch_size``, plus ``get_batch`` and
    ``repackage_hidden``.

    Args:
        data_source: Batchified data; it must have been built with
            ``eval_batch_size`` because the initial hidden state is created
            for that batch size.

    Returns:
        float: The average loss as a plain Python number, usable directly with
        ``math.exp`` and string formatting.
    """
    # Step between consecutive chunks.
    # NOTE(review): presumably equal to the window length used inside
    # get_batch -- confirm against data_utils.
    seq_len = 64

    net.eval()
    total_loss = 0.0
    with torch.no_grad():
        hidden = net.init_hidden(eval_batch_size, directions=directions)
        for i in range(0, data_source.size(0) - 1, seq_len):
            data, targets = get_batch(data_source, i)
            data, targets = data.to(device), targets.to(device)
            output, hidden = net(data, hidden)
            # .item() pulls out the scalar loss so the accumulator stays a
            # Python float instead of a tensor.
            total_loss += len(data) * criterion(output, targets).item()
            # Detach the hidden state from the chunk that produced it.
            hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)

In [None]:
### Main trainer 
n_epochs = 200


def train():
    """Run one pass over ``train_data``, updating ``net`` in place.

    Works entirely on module-level state: ``net``, ``opt``, ``criterion``,
    ``train_data``, ``train_batch_size``, ``directions``, ``device`` and
    ``interval``. The progress message also reads the global ``epoch``, so the
    function must be called from a loop that has already bound ``epoch``.

    Every ``interval`` batches it prints the mean loss, its exponential
    (perplexity) and the time per batch since the previous report.

    Returns:
        None
    """
    # Step between consecutive chunks.
    # NOTE(review): presumably equal to the window length used inside
    # get_batch -- confirm against data_utils.
    seq_len = 64

    net.train()
    running_loss = 0.0
    start_time = time.time()

    hidden = net.init_hidden(train_batch_size, directions=directions)

    # train_data size(batchcnt, bsz)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, seq_len)):
        data, targets = get_batch(train_data, i)
        data, targets = data.to(device), targets.to(device)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)

        output, hidden = net(data, hidden)
        loss = criterion(output, targets)
        opt.zero_grad()
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0)
        opt.step()

        # .item() keeps the running total a Python float rather than a tensor.
        running_loss += loss.item()

        if batch % interval == 0 and batch > 0:
            cur_loss = running_loss / interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // seq_len,
                elapsed * 1000 / interval, cur_loss, math.exp(cur_loss)))
            # Start a fresh reporting window.
            running_loss = 0.0
            start_time = time.time()

# Loop over epochs: train, validate, and checkpoint the network whenever the
# validation loss improves. Ctrl-C (KeyboardInterrupt) stops training early
# without raising.

# Lowest validation loss seen so far; None until the first epoch finishes.
best_val_loss = None

try:
    for epoch in range(1, n_epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        # `is None` (rather than truthiness) so that a loss of exactly zero is
        # not mistaken for "no best loss yet".
        if best_val_loss is None or val_loss < best_val_loss:
            ###Fix path
            with open('/home/as3837/ptb/lstm', 'wb') as f:
                # Save the trained network `net`; `model` is the imported
                # Python module and cannot be pickled.
                torch.save(net, f)
            best_val_loss = val_loss
        # TODO: anneal the learning rate when validation loss does not improve.
        # The script version did this for SGD / Momentum only:
        #     lr /= 4.0
        #     for group in opt.param_groups:
        #         group['lr'] = lr
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')


In [None]:
## TODO: There are issues with LSTMs trained with PyTorch 1.7 vs 1.8. Debug later and work out the mapping between versions.

In [None]:
# Score the network on the held-out test split and print the loss together
# with its exponential (perplexity) between two separator lines.
test_loss = evaluate(test_data)
separator = '=' * 89
summary_line = '| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss))
print(separator)
print(summary_line)
print(separator)