In [1]:
import os

import itertools
import pickle
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math 

import sys
sys.path.append('../')
import utils
import wiki_utils
%matplotlib inline

In [2]:
import torchtext
from torchtext import data
from torchtext.datasets import WikiText2

tokenizer = lambda x: list(x)

TEXT = data.Field(lower=True, tokenize=tokenizer)

In [3]:
path = './wikitext/'
train, valid, test = torchtext.datasets.LanguageModelingDataset.splits(text_field=TEXT, root=".", path=path, 
                                                   train="train.txt", validation="valid.txt", test="test.txt")

In [4]:
TEXT.build_vocab(train)

In [5]:
batch_size = 128
sequence_length = 30
grad_clip = 0.1
lr = 4.
best_val_loss = None
log_interval = 100
eval_batch_size = 128

In [6]:
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=batch_size,
    bptt_len=sequence_length, # this is where we specify the sequence length
    device=0,
    repeat=False)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [7]:
class CustomLoader:
    def __init__(self, data_iter, batch_size):
        self.data_iter = data_iter
        self.iter = iter(data_iter)
        self.batch_size = batch_size
        
    def __iter__(self):
        for d in self.data_iter:
#             d = next(self.data_iter)
            yield d.text, d.target.view(-1)
        
    def __len__(self):
        return int(len(train_iter.dataset.examples[0].text)/self.batch_size)

In [8]:
train_loader = CustomLoader(train_iter, batch_size)
val_loader = CustomLoader(valid_iter, batch_size)
test_loader = CustomLoader(test_ite, batch_size)

In [9]:
# corpus = wiki_utils.Texts('./wikitext/')

In [10]:
# eval_batch_size = 128
# train_loader = wiki_utils.TextLoader(corpus.train, batch_size=batch_size)
# val_loader = wiki_utils.TextLoader(corpus.valid, batch_size=eval_batch_size)
# test_loader = wiki_utils.TextLoader(corpus.test, batch_size=eval_batch_size)

In [9]:
class RNNModel(nn.Module):

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type == 'LSTM':
            self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, x, hidden=None):
        emb = self.drop(self.encoder(x))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        if self.rnn_type == 'LSTM':
            return (weight.new(self.nlayers, bsz, self.nhid).zero_(),
                    weight.new(self.nlayers, bsz, self.nhid).zero_())
        else:
            return weight.new(self.nlayers, bsz, self.nhid).zero_()

In [10]:
def evaluate(data_loader):
    model.eval()
    total_loss = 0
    ntokens = len(TEXT.vocab.stoi) # len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    for i, (data, targets) in enumerate(data_loader):
        output, hidden = model(data)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / len(data_loader)

In [11]:
def train():
    model.train()
    total_loss = 0
    ntokens = len(TEXT.vocab.stoi) # len(corpus.dictionary)
    for batch, (data, targets) in enumerate(train_loader):
        model.zero_grad()
        output, hidden = model(data)
#         print(targets.size(), output.view(-1, ntokens).size())
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_loader) // sequence_length, lr, cur_loss, math.exp(cur_loss)))
            total_loss = 0

In [12]:
ntokens = len(TEXT.vocab.stoi) # len(corpus.dictionary)
model = RNNModel('LSTM', ntokens, 128, 128, 2, 0.3)
criterion = nn.CrossEntropyLoss()

In [13]:
def generate(n=50, temp=1.):
    model.eval()
    x = torch.rand(1, 1).mul(ntokens).long()
    hidden = None
    out = []
    for i in range(n):
        output, hidden = model(x, hidden)
        s_weights = output.squeeze().data.div(temp).exp()
        s_idx = torch.multinomial(s_weights, 1)[0]
        x.data.fill_(s_idx)
        # s = corpus.dictionary.idx2symbol[s_idx]
        s = TEXT.vocab.itos[s_idx]
        out.append(s)
    return ''.join(out)

In [None]:
with torch.no_grad():
    print('sample:\n', generate(50), '\n')

for epoch in range(1, 101):
    train()
    val_loss = evaluate(val_loader)
    print('-' * 89)
    print('| end of epoch {:3d} | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
        epoch, val_loss, math.exp(val_loss)))
    print('-' * 89)
    if not best_val_loss or val_loss < best_val_loss:
        best_val_loss = val_loss
    else:
        # Anneal the learning rate if no improvement has been seen in the validation dataset.
        lr /= 4.0
    with torch.no_grad():
        print('sample:\n', generate(50), '\n')


sample:
 トキ–ōκėβ0×½ā#²ü⁄yỳò礮”ง 4ลî隊еァ¥隊≤[ñt火ū჻,大á隊оớβ±ิト<unk>ê7 

| epoch   1 |   100/ 2807 batches | lr 4.00 | loss  3.47 | ppl    32.13
| epoch   1 |   200/ 2807 batches | lr 4.00 | loss  3.15 | ppl    23.42
| epoch   1 |   300/ 2807 batches | lr 4.00 | loss  3.13 | ppl    22.77
| epoch   1 |   400/ 2807 batches | lr 4.00 | loss  3.10 | ppl    22.26
| epoch   1 |   500/ 2807 batches | lr 4.00 | loss  3.10 | ppl    22.28
| epoch   1 |   600/ 2807 batches | lr 4.00 | loss  3.08 | ppl    21.74
| epoch   1 |   700/ 2807 batches | lr 4.00 | loss  2.97 | ppl    19.45
| epoch   1 |   800/ 2807 batches | lr 4.00 | loss  2.84 | ppl    17.19
| epoch   1 |   900/ 2807 batches | lr 4.00 | loss  2.71 | ppl    14.96
| epoch   1 |  1000/ 2807 batches | lr 4.00 | loss  2.61 | ppl    13.54
| epoch   1 |  1100/ 2807 batches | lr 4.00 | loss  2.51 | ppl    12.32
| epoch   1 |  1200/ 2807 batches | lr 4.00 | loss  2.46 | ppl    11.67
| epoch   1 |  1300/ 2807 batches | lr 4.00 | loss  2.41 | ppl    11.1

| epoch   4 |  1800/ 2807 batches | lr 4.00 | loss  1.72 | ppl     5.56
| epoch   4 |  1900/ 2807 batches | lr 4.00 | loss  1.72 | ppl     5.60
| epoch   4 |  2000/ 2807 batches | lr 4.00 | loss  1.71 | ppl     5.54
| epoch   4 |  2100/ 2807 batches | lr 4.00 | loss  1.72 | ppl     5.58
| epoch   4 |  2200/ 2807 batches | lr 4.00 | loss  1.72 | ppl     5.56
| epoch   4 |  2300/ 2807 batches | lr 4.00 | loss  1.72 | ppl     5.57
| epoch   4 |  2400/ 2807 batches | lr 4.00 | loss  1.71 | ppl     5.51
| epoch   4 |  2500/ 2807 batches | lr 4.00 | loss  1.71 | ppl     5.51
| epoch   4 |  2600/ 2807 batches | lr 4.00 | loss  1.72 | ppl     5.56
| epoch   4 |  2700/ 2807 batches | lr 4.00 | loss  1.71 | ppl     5.52
| epoch   4 |  2800/ 2807 batches | lr 4.00 | loss  1.70 | ppl     5.46
-----------------------------------------------------------------------------------------
| end of epoch   4 | valid loss  0.16 | valid ppl     1.17
-----------------------------------------------------------

| epoch   8 |   300/ 2807 batches | lr 4.00 | loss  1.64 | ppl     5.14
| epoch   8 |   400/ 2807 batches | lr 4.00 | loss  1.64 | ppl     5.15
| epoch   8 |   500/ 2807 batches | lr 4.00 | loss  1.64 | ppl     5.13
| epoch   8 |   600/ 2807 batches | lr 4.00 | loss  1.63 | ppl     5.11
| epoch   8 |   700/ 2807 batches | lr 4.00 | loss  1.64 | ppl     5.14
| epoch   8 |   800/ 2807 batches | lr 4.00 | loss  1.63 | ppl     5.11
| epoch   8 |   900/ 2807 batches | lr 4.00 | loss  1.63 | ppl     5.13
| epoch   8 |  1000/ 2807 batches | lr 4.00 | loss  1.63 | ppl     5.12
| epoch   8 |  1100/ 2807 batches | lr 4.00 | loss  1.63 | ppl     5.11
| epoch   8 |  1200/ 2807 batches | lr 4.00 | loss  1.64 | ppl     5.14
| epoch   8 |  1300/ 2807 batches | lr 4.00 | loss  1.63 | ppl     5.11
| epoch   8 |  1400/ 2807 batches | lr 4.00 | loss  1.62 | ppl     5.04
| epoch   8 |  1500/ 2807 batches | lr 4.00 | loss  1.63 | ppl     5.09
| epoch   8 |  1600/ 2807 batches | lr 4.00 | loss  1.63 | ppl  

| epoch  11 |  2100/ 2807 batches | lr 4.00 | loss  1.61 | ppl     5.00
| epoch  11 |  2200/ 2807 batches | lr 4.00 | loss  1.61 | ppl     4.98
| epoch  11 |  2300/ 2807 batches | lr 4.00 | loss  1.61 | ppl     5.00
| epoch  11 |  2400/ 2807 batches | lr 4.00 | loss  1.59 | ppl     4.92
| epoch  11 |  2500/ 2807 batches | lr 4.00 | loss  1.60 | ppl     4.94
| epoch  11 |  2600/ 2807 batches | lr 4.00 | loss  1.61 | ppl     5.00
| epoch  11 |  2700/ 2807 batches | lr 4.00 | loss  1.60 | ppl     4.96
| epoch  11 |  2800/ 2807 batches | lr 4.00 | loss  1.60 | ppl     4.93
-----------------------------------------------------------------------------------------
| end of epoch  11 | valid loss  0.15 | valid ppl     1.16
-----------------------------------------------------------------------------------------
sample:
  . in 6 variets suite for from the working the fir 

| epoch  12 |   100/ 2807 batches | lr 4.00 | loss  1.62 | ppl     5.03
| epoch  12 |   200/ 2807 batches | lr 4.00 | loss 

| epoch  15 |   700/ 2807 batches | lr 4.00 | loss  1.58 | ppl     4.88
| epoch  15 |   800/ 2807 batches | lr 4.00 | loss  1.58 | ppl     4.87
| epoch  15 |   900/ 2807 batches | lr 4.00 | loss  1.58 | ppl     4.87
| epoch  15 |  1000/ 2807 batches | lr 4.00 | loss  1.58 | ppl     4.86
| epoch  15 |  1100/ 2807 batches | lr 4.00 | loss  1.58 | ppl     4.86
| epoch  15 |  1200/ 2807 batches | lr 4.00 | loss  1.59 | ppl     4.89
| epoch  15 |  1300/ 2807 batches | lr 4.00 | loss  1.58 | ppl     4.86
| epoch  15 |  1400/ 2807 batches | lr 4.00 | loss  1.57 | ppl     4.80
| epoch  15 |  1500/ 2807 batches | lr 4.00 | loss  1.58 | ppl     4.84
| epoch  15 |  1600/ 2807 batches | lr 4.00 | loss  1.58 | ppl     4.86
| epoch  15 |  1700/ 2807 batches | lr 4.00 | loss  1.58 | ppl     4.84
| epoch  15 |  1800/ 2807 batches | lr 4.00 | loss  1.58 | ppl     4.86
| epoch  15 |  1900/ 2807 batches | lr 4.00 | loss  1.59 | ppl     4.90
| epoch  15 |  2000/ 2807 batches | lr 4.00 | loss  1.58 | ppl  

| epoch  18 |  2500/ 2807 batches | lr 4.00 | loss  1.56 | ppl     4.78
| epoch  18 |  2600/ 2807 batches | lr 4.00 | loss  1.58 | ppl     4.83
| epoch  18 |  2700/ 2807 batches | lr 4.00 | loss  1.57 | ppl     4.81
| epoch  18 |  2800/ 2807 batches | lr 4.00 | loss  1.56 | ppl     4.78
-----------------------------------------------------------------------------------------
| end of epoch  18 | valid loss  0.14 | valid ppl     1.15
-----------------------------------------------------------------------------------------
sample:
 ulia and amustrale . the 1982 , imperiers in her s 

| epoch  19 |   100/ 2807 batches | lr 4.00 | loss  1.58 | ppl     4.87
| epoch  19 |   200/ 2807 batches | lr 4.00 | loss  1.56 | ppl     4.77
| epoch  19 |   300/ 2807 batches | lr 4.00 | loss  1.57 | ppl     4.80
| epoch  19 |   400/ 2807 batches | lr 4.00 | loss  1.57 | ppl     4.81
| epoch  19 |   500/ 2807 batches | lr 4.00 | loss  1.57 | ppl     4.80
| epoch  19 |   600/ 2807 batches | lr 4.00 | loss 

| epoch  22 |  1100/ 2807 batches | lr 4.00 | loss  1.56 | ppl     4.75
| epoch  22 |  1200/ 2807 batches | lr 4.00 | loss  1.57 | ppl     4.78
| epoch  22 |  1300/ 2807 batches | lr 4.00 | loss  1.56 | ppl     4.75
| epoch  22 |  1400/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.69
| epoch  22 |  1500/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.73
| epoch  22 |  1600/ 2807 batches | lr 4.00 | loss  1.56 | ppl     4.75
| epoch  22 |  1700/ 2807 batches | lr 4.00 | loss  1.56 | ppl     4.74
| epoch  22 |  1800/ 2807 batches | lr 4.00 | loss  1.56 | ppl     4.75
| epoch  22 |  1900/ 2807 batches | lr 4.00 | loss  1.57 | ppl     4.80
| epoch  22 |  2000/ 2807 batches | lr 4.00 | loss  1.56 | ppl     4.74
| epoch  22 |  2100/ 2807 batches | lr 4.00 | loss  1.56 | ppl     4.78
| epoch  22 |  2200/ 2807 batches | lr 4.00 | loss  1.56 | ppl     4.77
| epoch  22 |  2300/ 2807 batches | lr 4.00 | loss  1.56 | ppl     4.78
| epoch  22 |  2400/ 2807 batches | lr 4.00 | loss  1.55 | ppl  

-----------------------------------------------------------------------------------------
| end of epoch  25 | valid loss  0.14 | valid ppl     1.15
-----------------------------------------------------------------------------------------
sample:
 n virtual disch open the new ! increasing the few  

| epoch  26 |   100/ 2807 batches | lr 4.00 | loss  1.57 | ppl     4.79
| epoch  26 |   200/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.69
| epoch  26 |   300/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.73
| epoch  26 |   400/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.73
| epoch  26 |   500/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.71
| epoch  26 |   600/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.70
| epoch  26 |   700/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.73
| epoch  26 |   800/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.72
| epoch  26 |   900/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.71
| epoch  26 |  1000/ 2807 batches | lr 4.00 | loss 

| epoch  29 |  1500/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.67
| epoch  29 |  1600/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.69
| epoch  29 |  1700/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.67
| epoch  29 |  1800/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.68
| epoch  29 |  1900/ 2807 batches | lr 4.00 | loss  1.56 | ppl     4.74
| epoch  29 |  2000/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.69
| epoch  29 |  2100/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.72
| epoch  29 |  2200/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.71
| epoch  29 |  2300/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.72
| epoch  29 |  2400/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.65
| epoch  29 |  2500/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.67
| epoch  29 |  2600/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.73
| epoch  29 |  2700/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.69
| epoch  29 |  2800/ 2807 batches | lr 4.00 | loss  1.54 | ppl  

| epoch  33 |   100/ 2807 batches | lr 4.00 | loss  1.56 | ppl     4.74
| epoch  33 |   200/ 2807 batches | lr 4.00 | loss  1.53 | ppl     4.64
| epoch  33 |   300/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.67
| epoch  33 |   400/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.68
| epoch  33 |   500/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.67
| epoch  33 |   600/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.65
| epoch  33 |   700/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.69
| epoch  33 |   800/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.66
| epoch  33 |   900/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.66
| epoch  33 |  1000/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.66
| epoch  33 |  1100/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.66
| epoch  33 |  1200/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.69
| epoch  33 |  1300/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.67
| epoch  33 |  1400/ 2807 batches | lr 4.00 | loss  1.53 | ppl  

| epoch  36 |  1900/ 2807 batches | lr 4.00 | loss  1.55 | ppl     4.69
| epoch  36 |  2000/ 2807 batches | lr 4.00 | loss  1.53 | ppl     4.64
| epoch  36 |  2100/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.68
| epoch  36 |  2200/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.67
| epoch  36 |  2300/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.68
| epoch  36 |  2400/ 2807 batches | lr 4.00 | loss  1.53 | ppl     4.61
| epoch  36 |  2500/ 2807 batches | lr 4.00 | loss  1.53 | ppl     4.63
| epoch  36 |  2600/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.68
| epoch  36 |  2700/ 2807 batches | lr 4.00 | loss  1.54 | ppl     4.66
| epoch  36 |  2800/ 2807 batches | lr 4.00 | loss  1.53 | ppl     4.62
-----------------------------------------------------------------------------------------
| end of epoch  36 | valid loss  0.14 | valid ppl     1.15
-----------------------------------------------------------------------------------------
sample:
  2003 during the first killed fi

| epoch  40 |   500/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.61
| epoch  40 |   600/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.61
| epoch  40 |   700/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.63
| epoch  40 |   800/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.61
| epoch  40 |   900/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.62
| epoch  40 |  1000/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.61
| epoch  40 |  1100/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.61
| epoch  40 |  1200/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.64
| epoch  40 |  1300/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.62
| epoch  40 |  1400/ 2807 batches | lr 1.00 | loss  1.51 | ppl     4.54
| epoch  40 |  1500/ 2807 batches | lr 1.00 | loss  1.52 | ppl     4.59
| epoch  40 |  1600/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.61
| epoch  40 |  1700/ 2807 batches | lr 1.00 | loss  1.52 | ppl     4.59
| epoch  40 |  1800/ 2807 batches | lr 1.00 | loss  1.53 | ppl  

| epoch  43 |  2300/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.61
| epoch  43 |  2400/ 2807 batches | lr 1.00 | loss  1.52 | ppl     4.55
| epoch  43 |  2500/ 2807 batches | lr 1.00 | loss  1.52 | ppl     4.57
| epoch  43 |  2600/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.62
| epoch  43 |  2700/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.60
| epoch  43 |  2800/ 2807 batches | lr 1.00 | loss  1.52 | ppl     4.56
-----------------------------------------------------------------------------------------
| end of epoch  43 | valid loss  0.14 | valid ppl     1.15
-----------------------------------------------------------------------------------------
sample:
  <unk> , merce , phillive grandis international to 

| epoch  44 |   100/ 2807 batches | lr 1.00 | loss  1.54 | ppl     4.67
| epoch  44 |   200/ 2807 batches | lr 1.00 | loss  1.52 | ppl     4.56
| epoch  44 |   300/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.60
| epoch  44 |   400/ 2807 batches | lr 1.00 | loss 

| epoch  47 |   900/ 2807 batches | lr 1.00 | loss  1.52 | ppl     4.59
| epoch  47 |  1000/ 2807 batches | lr 1.00 | loss  1.52 | ppl     4.58
| epoch  47 |  1100/ 2807 batches | lr 1.00 | loss  1.52 | ppl     4.59
| epoch  47 |  1200/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.62
| epoch  47 |  1300/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.60
| epoch  47 |  1400/ 2807 batches | lr 1.00 | loss  1.51 | ppl     4.53
| epoch  47 |  1500/ 2807 batches | lr 1.00 | loss  1.52 | ppl     4.57
| epoch  47 |  1600/ 2807 batches | lr 1.00 | loss  1.52 | ppl     4.59
| epoch  47 |  1700/ 2807 batches | lr 1.00 | loss  1.52 | ppl     4.57
| epoch  47 |  1800/ 2807 batches | lr 1.00 | loss  1.52 | ppl     4.59
| epoch  47 |  1900/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.63
| epoch  47 |  2000/ 2807 batches | lr 1.00 | loss  1.52 | ppl     4.59
| epoch  47 |  2100/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.62
| epoch  47 |  2200/ 2807 batches | lr 1.00 | loss  1.53 | ppl  

| epoch  50 |  2700/ 2807 batches | lr 1.00 | loss  1.53 | ppl     4.60
| epoch  50 |  2800/ 2807 batches | lr 1.00 | loss  1.52 | ppl     4.56
-----------------------------------------------------------------------------------------
| end of epoch  50 | valid loss  0.14 | valid ppl     1.15
-----------------------------------------------------------------------------------------
sample:
  wrepher batters williamstor <unk> " vious was the 



In [18]:
t1 = generate(10000, 1.)
t15 = generate(10000, 1.5)
t075 = generate(10000, 0.75)
with open('./generated075.txt', 'w') as outf:
    outf.write(t075)
with open('./generated1.txt', 'w') as outf:
    outf.write(t1)
with open('./generated15.txt', 'w') as outf:
    outf.write(t15)