In [9]:
%load_ext autoreload
%autoreload 2



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from torchtext.data.utils import get_tokenizer
import numpy as np
from torch.autograd import Variable
from encoderdecoder import make_model
from encoderdecoder import subsequent_mask

In [11]:
TEXT = torchtext.data.Field(tokenize=get_tokenizer('basic_english'), init_token='<sos>', eos_token='<eos>', lower=True)

train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT)

TEXT.build_vocab(train_txt)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
def batchify(data, bsz):
    data = TEXT.numericalize([data.examples[0].text])

    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)


batch_size = 20
eval_batch_size = 10

train_data = batchify(train_txt, batch_size)
val_data = batchify(val_txt, eval_batch_size)
test_data = batchify(test_txt, batch_size)

bptt = 35

def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    src = source[i:i+seq_len].t().contiguous()
    target = source[i+1:i+seq_len+1].t().contiguous()
    return src, target

In [None]:
ntokens = len(TEXT.vocab.stoi)
emsize = 200
nhid = 200
nlayers = 2
nhead = 2
dropout = 0.2
lr = 0.1
model = make_model(source_vocab=ntokens, target_vocab=ntokens, N=2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=1, gamma=0.95)


In [23]:
import time

def train():
    model.train()
    total_loss = 0.
    start_time = time.time()

    for batch, i in enumerate(range(0, train_data.size(0)-1, bptt)):
        source, target = get_batch(train_data, i)
        mask = subsequent_mask(source.size(1))
        optimizer.zero_grad()
        output = model(source, target, mask, mask)
        loss = criterion(output.view(-1, ntokens), target.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),0.5)
        optimizer.step()
        total_loss += loss.item()
        log_interval = 200

        if batch % log_interval ==0 and batch > 0:
            elapsed = time.time() - start_time
            print("epochs:",epoch,"\t","batches:",batch,"\t","loss:",loss,"\t",elapsed * 1000 / log_interval,"ms/batch","\n")
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source):
    eval_model.eval()
    total_loss = 0
    with torch.no_grad():
        for i in range(0, data_source.size(0), bptt):
            source, target = get_batch(data_source, i)
            mask = subsequent_mask(source.size(1))
            output = eval_model(source, target, mask, mask)
            loss = criterion(output.view(-1, ntokens), target.view(-1))
            total_loss += loss.item()

    return total_loss

In [24]:
best_val_loss = float("inf")

epochs = 1

best_model = None

for epoch in range(1, epochs + 1):

    train()
    val_loss = evaluate(model, val_data)

    print('#' * 89)
    print("valid loss:", val_loss)
    print('#' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model
    
    scheduler.step()

epochs: 1 	 batches: 200 	 loss: tensor(6.5790, grad_fn=<NllLossBackward0>) 	 535.3096842765808 ms/batch 

epochs: 1 	 batches: 400 	 loss: tensor(7.0774, grad_fn=<NllLossBackward0>) 	 539.1224372386932 ms/batch 

epochs: 1 	 batches: 600 	 loss: tensor(7.2337, grad_fn=<NllLossBackward0>) 	 528.6331367492676 ms/batch 

epochs: 1 	 batches: 800 	 loss: tensor(16.1844, grad_fn=<NllLossBackward0>) 	 528.3575987815857 ms/batch 

epochs: 1 	 batches: 1000 	 loss: tensor(7.6496, grad_fn=<NllLossBackward0>) 	 531.2740540504456 ms/batch 

epochs: 1 	 batches: 1200 	 loss: tensor(7.6510, grad_fn=<NllLossBackward0>) 	 533.8312685489655 ms/batch 

epochs: 1 	 batches: 1400 	 loss: tensor(6.8532, grad_fn=<NllLossBackward0>) 	 532.6292955875397 ms/batch 

epochs: 1 	 batches: 1600 	 loss: tensor(7.1578, grad_fn=<NllLossBackward0>) 	 530.5762588977814 ms/batch 

epochs: 1 	 batches: 1800 	 loss: tensor(6.9524, grad_fn=<NllLossBackward0>) 	 529.5039236545563 ms/batch 

epochs: 1 	 batches: 2000 	 los

In [22]:
best_val_loss

3630.100763320923