In [1]:
import spacy
import numpy as np
from spacy.symbols import ORTH
from torchtext import data, datasets
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
print("Use CUDA:", USE_CUDA)

Use CUDA: True


# Hyperparameters

In [3]:
nhid = 200
embed_dim = 300
lr = 10
NUM_EPOCHS = 20
bptt_len = 60
batch_size = 32

# Read Data

In [4]:
train_file = 'train.txt'
dev_file = 'dev.txt'

In [5]:
lm_tok = spacy.load('en')
def spacy_tok(x):
    return [tok.text for tok in lm_tok.tokenizer(x)]

TEXT = data.ReversibleField(sequential=True, tokenize=spacy_tok,
                            lower=True, include_lengths=False)

In [6]:
train_dataset = datasets.LanguageModelingDataset(train_file, TEXT, newline_eos=True)
dev_dataset = datasets.LanguageModelingDataset(dev_file, TEXT, newline_eos=True)

In [7]:
vectors = "glove.840B.300d"
TEXT.build_vocab(train_dataset, dev_dataset, vectors=vectors)

In [8]:
# iterators
train_iter = data.BPTTIterator(train_dataset, batch_size=batch_size, bptt_len=bptt_len, repeat=False, shuffle=True)
dev_iter = data.BPTTIterator(dev_dataset, batch_size=batch_size, bptt_len=bptt_len, repeat=False)

In [9]:
len(TEXT.vocab)

26246

In [10]:
# Create embeddings
embedding = nn.Embedding(len(TEXT.vocab), embed_dim)
embedding.weight.data.copy_(TEXT.vocab.vectors)
embedding.weight.requires_grad = False
embedding = embedding.to(device)

# Define Model

In [11]:
class LM(nn.Module):
    def __init__(self, ntoken, ninp, nhid, embedding, dropout=0.5):
        super(LM, self).__init__()
        self.nhid = nhid
        self.encoder = embedding
        self.rnn = nn.LSTM(ninp, nhid, batch_first=True)
        self.decoder = nn.Linear(nhid, ntoken)
        self.embed_drop = nn.Dropout(dropout)
        self.output_drop = nn.Dropout(dropout)
#         self.embed_drop = LockedDropout(dropout)
#         self.output_drop = LockedDropout(dropout)

        # # tie weights
        # self.decoder.weight = self.encoder.weight

    def forward(self, inputs, hidden=None):
        """

        :param inputs: (batch_size, max_len)
        :param hidden: ((1, batch_size, nhid), (1, batch_size, nhid))
        :return:
        """
        emb = self.embed_drop(self.encoder(inputs))
        if hidden:
            outputs, hidden = self.rnn(emb, hidden)
        else:
            outputs, hidden = self.rnn(emb)
        outputs = self.output_drop(outputs)
        decoded = self.decoder(outputs)
        return decoded, outputs, hidden

In [12]:
lm = LM(len(TEXT.vocab), embed_dim, nhid, embedding).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(lm.parameters(), lr=lr)

# Training and Evaluation

In [13]:
def train_epoch():
    losses = []
    for batch in train_iter:
        x, y = batch.text.transpose(0, 1).contiguous().to(device), \
                   batch.target.transpose(0, 1).contiguous().to(device)
        
        out, _, _ = lm(x)
        
        out = out.contiguous().view(-1, len(TEXT.vocab))
        y = y.view(-1)

        loss = criterion(out, y).to(device)
        losses.append(loss.item())

        # update model
        optimizer.zero_grad()
        loss.backward()
        # _ = torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_clipping)
        optimizer.step()

    return np.mean(losses)

In [14]:
def eval_epoch():
    losses = []
    for batch in dev_iter:
        x, y = batch.text.transpose(0, 1).contiguous().to(device), \
                   batch.target.transpose(0, 1).contiguous().to(device)
        
        with torch.no_grad():
            out, _, _ = lm(x)
        
        out = out.contiguous().view(-1, len(TEXT.vocab))
        y = y.view(-1)

        loss = criterion(out, y).to(device)
        losses.append(loss.item())
            
    return np.mean(losses)

In [15]:
train_losses = []
dev_losses = []
for epoch in range(NUM_EPOCHS):
    loss_train = train_epoch()
    loss_dev = eval_epoch()
    
    print('train loss: %.4f, dev loss: %.4f' % (loss_train, loss_dev))
    
    train_losses.append(loss_train)
    dev_losses.append(loss_dev)

train loss: 5.0529, dev loss: 4.5787
train loss: 4.4433, dev loss: 4.3347
train loss: 4.2720, dev loss: 4.2306
train loss: 4.1831, dev loss: 4.1583
train loss: 4.1194, dev loss: 4.1019


KeyboardInterrupt: 