In [None]:
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# coding: utf-8
import sys
sys.path.append('..')
import time
# import cupy
import matplotlib.pyplot as plt
# Set GPU to True to run on the GPU through cupy (cupy must be installed);
# set it to False to run on the CPU with numpy.
# ==============================================
GPU = True
# ==============================================
from tqdm.auto import tqdm

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): no visible cell reads or writes under /content/drive -- confirm
# whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Bind the name `np` to cupy or numpy depending on the GPU flag; later cells
# call array functions through `np`, so this decides where the arrays live.
if GPU:
    import cupy as np
    import cupyx  # provides scatter_add, used by Embedding.backward
    # Route cupy allocations through a memory pool.
    np.cuda.set_allocator(np.cuda.MemoryPool().malloc)

    # Green banner (ANSI colour escape codes) announcing GPU mode.
    print('\033[92m' + '-' * 60 + '\033[0m')
    print(' ' * 23 + '\033[92mGPU Mode (cupy)\033[0m')
    print('\033[92m' + '-' * 60 + '\033[0m\n')
else:
    import numpy as np
    # NOTE(review): 'dasd' looks like a leftover debug message -- confirm and
    # replace with a proper "CPU mode" notice.
    print('dasd')

[92m------------------------------------------------------------[0m
                       [92mGPU Mode (cupy)[0m
[92m------------------------------------------------------------[0m



In [None]:
# from common.optimizer import SGD
class SGD:
    '''
    Stochastic Gradient Descent optimizer.

    Every parameter is moved against its gradient by a fixed learning rate.
    '''
    def __init__(self, lr=0.01):
        # lr: step size applied to every gradient
        self.lr = lr

    def update(self, params, grads):
        '''
        Subtract ``lr * grads[k]`` from ``params[k]`` for every position k.

        The subtraction is written back into ``params`` slot by slot, so array
        parameters are modified in place.
        '''
        for idx, _ in enumerate(params):
            step = self.lr * grads[idx]
            params[idx] -= step
            

In [None]:
# from common.trainer import RnnlmTrainer
class RnnlmTrainer:
    '''
    Mini-batch trainer for a recurrent language model.

    The corpus is read as ``batch_size`` parallel streams that start at evenly
    spaced positions; each step feeds the model the next ``time_size`` tokens
    of every stream, applies the optimizer, and tracks the training perplexity.
    '''
    def __init__(self, model, optimizer):
        self.model = model
        self.optimizer = optimizer
        self.time_idx = None       # read cursor into the corpus; reset by fit()
        self.ppl_list = None       # perplexities recorded by the latest fit() call
        self.eval_interval = None  # reporting interval of the latest fit() call
        self.current_epoch = 0     # epochs completed across all fit() calls

    def get_batch(self, x, t, batch_size, time_size):
        '''
        Build the next (inputs, targets) mini-batch and advance the cursor.

        Row ``i`` of the result holds ``time_size`` consecutive entries of
        ``x`` / ``t`` starting at ``i * (len(x) // batch_size) + self.time_idx``,
        wrapping around at the end of the data. ``self.time_idx`` is advanced
        by ``time_size``.

        Returns
        -------
        (batch_x, batch_t): two arrays of shape (batch_size, time_size), dtype 'i'.
        '''
        data_size = len(x)
        jump = data_size // batch_size
        # start position of each row of the mini-batch
        offsets = np.asarray([i * jump for i in range(batch_size)])

        # All indices are computed at once and gathered in a single indexing
        # operation instead of one scalar assignment per element (under cupy
        # every scalar assignment is a separate device operation).
        steps = self.time_idx + np.arange(time_size)
        idx = (offsets[:, None] + steps[None, :]) % data_size

        batch_x = np.asarray(x)[idx].astype('i')
        batch_t = np.asarray(t)[idx].astype('i')
        self.time_idx += time_size
        return batch_x, batch_t

    def fit(self, xs, ts, max_epoch=10, batch_size=20, time_size=35,
            max_grad=None, eval_interval=20):
        '''
        Train the model for ``max_epoch`` passes over ``xs`` / ``ts``.

        Parameters
        ----------
        xs, ts: input and target sequences of equal length
        max_epoch: number of passes over the data
        batch_size, time_size: shape of every mini-batch
        max_grad: if not None, gradients are clipped to this norm via clip_grads
        eval_interval: print/record perplexity every this many iterations
            (None disables reporting)

        Note: every call resets ``time_idx`` to 0 and ``ppl_list`` to an empty
        list, so ``ppl_list`` only covers the most recent call.
        '''
        data_size = len(xs)
        max_iters = data_size // (batch_size * time_size)
        self.time_idx = 0
        self.ppl_list = []
        self.eval_interval = eval_interval
        model, optimizer = self.model, self.optimizer
        total_loss = 0
        loss_count = 0

        start_time = time.time()
        for epoch in range(max_epoch):
            for iters in tqdm(range(max_iters)):
                batch_x, batch_t = self.get_batch(xs, ts, batch_size, time_size)

                # compute gradients and update the parameters
                loss = model.forward(batch_x, batch_t)
                model.backward()
                # shared weights are merged into a single entry
                params, grads = remove_duplicate(model.params, model.grads)
                if max_grad is not None:
                    clip_grads(grads, max_grad)
                optimizer.update(params, grads)
                total_loss += loss
                loss_count += 1

                # report perplexity averaged since the previous report
                if (eval_interval is not None) and (iters % eval_interval) == 0:
                    ppl = np.exp(total_loss / loss_count)
                    elapsed_time = time.time() - start_time
                    print('| epoch %d |  iter %d / %d | time %d[s] | perplexity %.2f'
                          % (self.current_epoch + 1, iters + 1, max_iters, elapsed_time, ppl))
                    self.ppl_list.append(float(ppl))
                    total_loss, loss_count = 0, 0

            self.current_epoch += 1

    def plot(self, ylim=None):
        '''
        Plot the perplexities recorded by the latest fit() call.

        ylim: optional (low, high) pair forwarded to ``plt.ylim``.
        '''
        # A plain Python sequence is used for the x axis: the previous
        # ``numpy.arange`` call referenced a module that is never imported at
        # module level (``np`` may be cupy), which raised NameError.
        x = list(range(len(self.ppl_list)))
        if ylim is not None:
            plt.ylim(*ylim)
        plt.plot(x, self.ppl_list, label='train')
        plt.xlabel('iterations (x' + str(self.eval_interval) + ')')
        plt.ylabel('perplexity')
        plt.show()


In [None]:
# from common.util import eval_perplexity, to_gpu
def eval_perplexity(model, corpus, batch_size=10, time_size=35):
    '''
    Compute the perplexity of ``model`` on ``corpus``.

    The corpus is split into ``batch_size`` streams; each iteration feeds
    ``time_size`` consecutive tokens per stream as inputs and the tokens one
    position later as targets (indices wrap around the corpus).

    Parameters
    ----------
    model: object whose ``forward(xs, ts)`` returns the mean loss; it is called
        with ``train_flg=False`` when the model accepts that keyword
    corpus: 1-D sequence of word ids
    batch_size, time_size: shape of every evaluation mini-batch

    Returns
    -------
    exp(mean loss over all iterations)

    Raises
    ------
    ZeroDivisionError: when the corpus is too short for a single iteration
        (``(len(corpus) - 1) // (batch_size * time_size) == 0``).
    '''
    print('evaluating perplexity ...')
    corpus_size = len(corpus)
    total_loss = 0
    max_iters = (corpus_size - 1) // (batch_size * time_size)
    jump = (corpus_size - 1) // batch_size

    data = np.asarray(corpus)
    row_starts = np.asarray([i * jump for i in range(batch_size)])
    steps = np.arange(time_size)

    for iters in range(max_iters):
        # Index matrix for the whole mini-batch; a single gather replaces the
        # per-element copy loop (one device operation per element under cupy).
        idx = iters * time_size + row_starts[:, None] + steps[None, :]
        xs = data[idx % corpus_size].astype(np.int32)
        ts = data[(idx + 1) % corpus_size].astype(np.int32)

        try:
            loss = model.forward(xs, ts, train_flg=False)
        except TypeError:
            # Model without a train_flg keyword.
            # NOTE(review): this also retries when forward() itself raises a
            # TypeError for another reason.
            loss = model.forward(xs, ts)
        total_loss += loss

        sys.stdout.write('\r%d / %d' % (iters, max_iters))
        sys.stdout.flush()

    print('')
    ppl = np.exp(total_loss / max_iters)
    return ppl


def to_cpu(x):
    '''
    Return ``x`` as a numpy array.

    Objects whose type is exactly ``numpy.ndarray`` are returned unchanged;
    anything else is converted with ``np.asnumpy`` (``np`` is the module-level
    array backend, which provides ``asnumpy`` when it is cupy).
    '''
    import numpy
    if type(x) != numpy.ndarray:
        return np.asnumpy(x)
    return x

def to_gpu(x):
    '''
    Return ``x`` as a cupy array.

    Objects whose type is exactly ``cupy.ndarray`` are returned unchanged;
    anything else is converted with ``cupy.asarray``.
    '''
    import cupy
    already_on_device = type(x) == cupy.ndarray
    return x if already_on_device else cupy.asarray(x)



In [None]:
import os
import pickle
class BaseModel:
    '''
    Base class for models: holds ``params`` / ``grads`` and knows how to save
    and restore the parameters with pickle. Subclasses implement forward() and
    backward().
    '''
    def __init__(self):
        self.params = None
        self.grads = None

    def forward(self, *args):
        raise NotImplementedError

    def backward(self, *args):
        raise NotImplementedError

    def save_params(self, file_name=None):
        '''
        Pickle the parameters to ``file_name``.

        The file name defaults to ``<ClassName>.pkl``. Parameters are cast to
        float16 before saving and, in GPU mode, moved to the CPU first.
        '''
        target = file_name if file_name is not None else self.__class__.__name__ + '.pkl'

        half_params = [p.astype(np.float16) for p in self.params]
        if GPU:
            half_params = [to_cpu(p) for p in half_params]

        with open(target, 'wb') as f:
            pickle.dump(half_params, f)

    def load_params(self, file_name=None):
        '''
        Load parameters written by save_params() into the existing arrays.

        The file name defaults to ``<ClassName>.pkl``; '/' separators are
        replaced with ``os.sep``. Values are cast to float32 ('f'), moved to
        the GPU in GPU mode, and copied element-wise into ``self.params``.

        Raises IOError when the file does not exist.

        NOTE: pickle.load executes arbitrary code from the file; only load
        files you created yourself.
        '''
        source = self.__class__.__name__ + '.pkl' if file_name is None else file_name
        # replace() is a no-op when the name holds no '/'
        source = source.replace('/', os.sep)

        if not os.path.exists(source):
            raise IOError('No file: ' + source)

        with open(source, 'rb') as f:
            loaded = pickle.load(f)

        loaded = [p.astype('f') for p in loaded]
        if GPU:
            loaded = [to_gpu(p) for p in loaded]

        # copy into the existing buffers so that layers sharing them see the values
        for k in range(len(self.params)):
            self.params[k][...] = loaded[k]


In [None]:
def softmax(x):
    '''
    Numerically stable softmax along the last axis.

    The maximum along the last axis is subtracted before exponentiating so
    large inputs do not overflow. Works for arrays of any rank >= 1; 1-D and
    2-D inputs give the same result as before, and higher-rank inputs (which
    used to be returned unnormalised) are now normalised too. 0-d input is
    returned unchanged. The input array is not modified.
    '''
    if x.ndim == 0:
        return x

    x = x - x.max(axis=-1, keepdims=True)
    x = np.exp(x)
    x /= x.sum(axis=-1, keepdims=True)
    return x

def sigmoid(x):
    '''Logistic function 1 / (1 + exp(-x)), applied element-wise.'''
    denom = 1 + np.exp(-x)
    return 1 / denom

In [None]:
class Dropout:
    '''
    Dropout layer (http://arxiv.org/abs/1207.0580).

    During training, elements are zeroed where a uniform random draw is not
    greater than ``dropout_ratio``; outside training the input is scaled by
    ``1 - dropout_ratio`` instead.
    '''
    def __init__(self, dropout_ratio=0.5):
        self.params = []
        self.grads = []
        self.dropout_ratio = dropout_ratio
        self.mask = None  # boolean keep-mask of the latest training forward pass

    def forward(self, x, train_flg=True):
        if not train_flg:
            return x * (1.0 - self.dropout_ratio)

        self.mask = np.random.rand(*x.shape) > self.dropout_ratio
        return x * self.mask

    def backward(self, dout):
        # gradients flow only through the elements kept in forward()
        return dout * self.mask
    
class Embedding:
    '''
    Lookup layer: forward() selects rows of the weight matrix ``W`` by index,
    backward() accumulates the upstream gradient into the matching rows.
    '''
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None  # indices used by the latest forward pass

    def forward(self, idx):
        self.idx = idx
        weights, = self.params
        return weights[idx]

    def backward(self, dout):
        grad_W, = self.grads
        grad_W[...] = 0
        # Scatter-add so that rows selected more than once accumulate every
        # contribution; cupy and numpy expose this under different names.
        scatter = cupyx.scatter_add if GPU else np.add.at
        scatter(grad_W, self.idx, dout)
        return None
    
class LSTM:
    '''Single time step of an LSTM cell with its backward pass.'''
    def __init__(self, Wx, Wh, b):
        '''

        Parameters
        ----------
        Wx: weights applied to the input `x` (the 4 gate weights concatenated)
        Wh: weights applied to the hidden state `h` (the 4 gate weights concatenated)
        b: biases (the 4 gate biases concatenated)
        '''
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.cache = None  # values saved by forward() for backward()

    def forward(self, x, h_prev, c_prev):
        '''Return (h_next, c_next) for input ``x`` and the previous state.'''
        Wx, Wh, b = self.params
        _, H = h_prev.shape

        # one affine transform yields the pre-activations of all four gates,
        # laid out column-wise as [forget | candidate | input | output]
        A = np.dot(x, Wx) + np.dot(h_prev, Wh) + b

        f = sigmoid(A[:, :H])
        g = np.tanh(A[:, H:2*H])
        i = sigmoid(A[:, 2*H:3*H])
        o = sigmoid(A[:, 3*H:])

        c_next = f * c_prev + g * i
        h_next = o * np.tanh(c_next)

        self.cache = (x, h_prev, c_prev, i, f, g, o, c_next)
        return h_next, c_next

    def backward(self, dh_next, dc_next):
        '''
        Backpropagate through the step computed by the latest forward().

        Writes the weight/bias gradients into ``self.grads`` and returns
        (dx, dh_prev, dc_prev).
        '''
        Wx, Wh, b = self.params
        x, h_prev, c_prev, i, f, g, o, c_next = self.cache

        tanh_c = np.tanh(c_next)

        # total gradient reaching the cell state
        dc_total = dc_next + (dh_next * o) * (1 - tanh_c ** 2)
        dc_prev = dc_total * f

        # gradients w.r.t. the gate outputs ...
        df = dc_total * c_prev
        dg = dc_total * i
        di = dc_total * g
        do = dh_next * tanh_c

        # ... pushed back through the sigmoid / tanh non-linearities
        df *= f * (1 - f)
        dg *= (1 - g ** 2)
        di *= i * (1 - i)
        do *= o * (1 - o)

        # same column layout as the pre-activations in forward()
        dA = np.hstack((df, dg, di, do))

        param_grads = (np.dot(x.T, dA), np.dot(h_prev.T, dA), dA.sum(axis=0))
        for buf, value in zip(self.grads, param_grads):
            buf[...] = value

        dx = np.dot(dA, Wx.T)
        dh_prev = np.dot(dA, Wh.T)

        return dx, dh_prev, dc_prev


In [None]:


class TimeEmbedding:
    '''
    Applies an Embedding lookup at every time step of an (N, T) index array,
    using one Embedding layer per step that shares the weight matrix ``W``.
    '''
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.layers = None  # per-step Embedding layers of the latest forward()
        self.W = W

    def forward(self, xs):
        '''Return the (N, T, D) float32 array of embeddings for ``xs``.'''
        N, T = xs.shape
        V, D = self.W.shape

        self.layers = []
        out = np.empty((N, T, D), dtype='f')

        for t in range(T):
            step = Embedding(self.W)
            out[:, t, :] = step.forward(xs[:, t])
            self.layers.append(step)

        return out

    def backward(self, dout):
        '''Sum the per-step weight gradients into ``self.grads[0]``.'''
        _, T, _ = dout.shape

        total = 0
        for t in range(T):
            step = self.layers[t]
            step.backward(dout[:, t, :])
            total += step.grads[0]

        self.grads[0][...] = total
        return None
    
class TimeDropout:
    '''
    Inverted dropout for time-series data.

    While ``train_flg`` is True, elements are zeroed where a uniform random
    draw is not greater than ``dropout_ratio`` and the survivors are scaled by
    ``1 / (1 - dropout_ratio)``; otherwise the input passes through unchanged.
    '''
    def __init__(self, dropout_ratio=0.5):
        self.params = []
        self.grads = []
        self.dropout_ratio = dropout_ratio
        self.mask = None      # scaled float32 keep-mask of the latest training forward
        self.train_flg = True

    def forward(self, xs):
        if not self.train_flg:
            return xs

        keep = np.random.rand(*xs.shape) > self.dropout_ratio
        scale = 1 / (1.0 - self.dropout_ratio)
        self.mask = keep.astype(np.float32) * scale
        return xs * self.mask

    def backward(self, dout):
        # reuses the mask (including its scaling) from forward()
        return dout * self.mask
    
class TimeLSTM:
    '''
    Runs an LSTM cell over the T steps of an (N, T, D) input.

    With ``stateful=True`` the hidden and cell states are carried over between
    forward() calls; otherwise they start from zeros on every call.
    '''
    def __init__(self, Wx, Wh, b, stateful=False):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.layers = None  # per-step LSTM cells of the latest forward()

        self.h = None
        self.c = None
        self.dh = None      # gradient w.r.t. the initial hidden state, set by backward()
        self.stateful = stateful

    def forward(self, xs):
        '''Return the (N, T, H) float32 array of hidden states.'''
        Wx, Wh, b = self.params
        N, T, D = xs.shape
        H = Wh.shape[0]

        hs = np.empty((N, T, H), dtype='f')
        self.layers = []

        start_fresh = not self.stateful
        if start_fresh or self.h is None:
            self.h = np.zeros((N, H), dtype='f')
        if start_fresh or self.c is None:
            self.c = np.zeros((N, H), dtype='f')

        for t in range(T):
            cell = LSTM(*self.params)
            self.h, self.c = cell.forward(xs[:, t, :], self.h, self.c)
            hs[:, t, :] = self.h

            self.layers.append(cell)

        return hs

    def backward(self, dhs):
        '''
        Backpropagate through time over the cells of the latest forward().

        Sums the per-step parameter gradients into ``self.grads``, stores the
        gradient flowing out of the first step in ``self.dh`` and returns the
        (N, T, D) float32 gradient w.r.t. the inputs.
        '''
        Wx, Wh, b = self.params
        N, T, H = dhs.shape
        D = Wx.shape[0]

        dxs = np.empty((N, T, D), dtype='f')
        totals = [0, 0, 0]
        dh, dc = 0, 0

        for t in range(T - 1, -1, -1):
            cell = self.layers[t]
            # gradient from the output at step t plus the one from step t + 1
            dx, dh, dc = cell.backward(dhs[:, t, :] + dh, dc)
            dxs[:, t, :] = dx
            for k, step_grad in enumerate(cell.grads):
                totals[k] += step_grad

        for k, total in enumerate(totals):
            self.grads[k][...] = total
        self.dh = dh
        return dxs

    def set_state(self, h, c=None):
        self.h = h
        self.c = c

    def reset_state(self):
        self.h = None
        self.c = None

class TimeAffine:
    '''
    Affine transform ``x @ W + b`` applied to every time step of an (N, T, D)
    input by flattening the batch and time axes into one.
    '''
    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(W), np.zeros_like(b)]
        self.x = None  # input of the latest forward pass

    def forward(self, x):
        N, T, D = x.shape
        W, b = self.params

        flat_x = x.reshape(N*T, -1)
        self.x = x
        flat_out = np.dot(flat_x, W) + b
        return flat_out.reshape(N, T, -1)

    def backward(self, dout):
        x = self.x
        N, T, D = x.shape
        W, b = self.params

        flat_dout = dout.reshape(N*T, -1)
        flat_x = x.reshape(N*T, -1)

        dx = np.dot(flat_dout, W.T).reshape(*x.shape)

        # write into the existing buffers so references held elsewhere stay valid
        self.grads[0][...] = np.dot(flat_x.T, flat_dout)
        self.grads[1][...] = np.sum(flat_dout, axis=0)

        return dx
class TimeSoftmaxWithLoss:
    '''
    Softmax followed by cross-entropy loss over every (batch, time) position,
    averaged over the positions whose label differs from ``ignore_label``.
    '''
    def __init__(self):
        self.params = []
        self.grads = []
        self.cache = None       # values saved by forward() for backward()
        self.ignore_label = -1  # positions with this label are left out of the loss

    def forward(self, xs, ts):
        '''Return the mean cross-entropy of scores ``xs`` (N, T, V) against labels ``ts``.'''
        N, T, V = xs.shape

        # labels given as one-hot vectors are reduced to class indices
        if ts.ndim == 3:
            ts = ts.argmax(axis=2)

        mask = (ts != self.ignore_label)

        # merge the batch and time axes
        flat_xs = xs.reshape(N * T, V)
        ts = ts.reshape(N * T)
        mask = mask.reshape(N * T)

        ys = softmax(flat_xs)
        ls = np.log(ys[np.arange(N * T), ts])
        ls *= mask  # positions labelled ignore_label contribute zero loss
        loss = -np.sum(ls)
        loss /= mask.sum()

        self.cache = (ts, ys, mask, (N, T, V))
        return loss

    def backward(self, dout=1):
        '''
        Return the gradient w.r.t. the scores, shaped (N, T, V).

        Note: the cached softmax output is modified in place, so backward()
        gives a valid result only once per forward() call.
        '''
        ts, ys, mask, (N, T, V) = self.cache
        rows = np.arange(N * T)

        dx = ys
        dx[rows, ts] -= 1
        dx *= dout
        dx /= mask.sum()
        dx *= mask[:, np.newaxis]  # positions labelled ignore_label get zero gradient

        return dx.reshape((N, T, V))


In [None]:
# from better_rnnlm import BetterRnnlm
class BetterRnnlm(BaseModel):
    '''
     Language model with two LSTM layers and Dropout applied around each layer.
     Based on the model proposed in [1], with weight tying [2][3].

     [1] Recurrent Neural Network Regularization (https://arxiv.org/abs/1409.2329)
     [2] Using the Output Embedding to Improve Language Models (https://arxiv.org/abs/1608.05859)
     [3] Tying Word Vectors and Word Classifiers (https://arxiv.org/pdf/1611.01462.pdf)
    '''
    def __init__(self, vocab_size=10000, wordvec_size=650,
                 hidden_size=650, dropout_ratio=0.5):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        def scaled(rows, cols):
            # random float32 matrix scaled by 1 / sqrt(number of input rows)
            return (rn(rows, cols) / np.sqrt(rows)).astype('f')

        def zero_bias(size):
            return np.zeros(size).astype('f')

        # random draws happen in this order: embedding, layer-1 Wx/Wh, layer-2 Wx/Wh
        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx1 = scaled(D, 4 * H)
        lstm_Wh1 = scaled(H, 4 * H)
        lstm_b1 = zero_bias(4 * H)
        lstm_Wx2 = scaled(H, 4 * H)
        lstm_Wh2 = scaled(H, 4 * H)
        lstm_b2 = zero_bias(4 * H)
        affine_b = zero_bias(V)

        embed = TimeEmbedding(embed_W)
        drop_in = TimeDropout(dropout_ratio)
        lstm_1 = TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, stateful=True)
        drop_mid = TimeDropout(dropout_ratio)
        lstm_2 = TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, stateful=True)
        drop_out = TimeDropout(dropout_ratio)
        # weight tying: the output projection is the transposed embedding matrix
        affine = TimeAffine(embed_W.T, affine_b)

        self.layers = [embed, drop_in, lstm_1, drop_mid, lstm_2, drop_out, affine]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layers = [lstm_1, lstm_2]
        self.drop_layers = [drop_in, drop_mid, drop_out]

        # flat lists of every layer's parameters and gradients, in layer order
        self.params = [p for layer in self.layers for p in layer.params]
        self.grads = [g for layer in self.layers for g in layer.grads]

    def predict(self, xs, train_flg=False):
        '''Return the output scores for ``xs``; ``train_flg`` switches the dropout layers.'''
        for drop in self.drop_layers:
            drop.train_flg = train_flg

        out = xs
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def forward(self, xs, ts, train_flg=True):
        '''Return the loss of the model's scores for ``xs`` against targets ``ts``.'''
        return self.loss_layer.forward(self.predict(xs, train_flg), ts)

    def backward(self, dout=1):
        '''Backpropagate from the loss through every layer in reverse order.'''
        grad = self.loss_layer.backward(dout)
        for layer in self.layers[::-1]:
            grad = layer.backward(grad)
        return grad

    def reset_state(self):
        '''Clear the carried-over state of both LSTM layers.'''
        for lstm in self.lstm_layers:
            lstm.reset_state()


In [None]:
# from dataset import ptb
# import sys 
# sys.path.append('..')
import os
try:
    import urllib.request
except ImportError:
    raise ImportError('Use Python3!')
import pickle


# Base URL the PTB text files are downloaded from.
url_base = 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/'
# Split name -> name of the raw text file under url_base.
key_file = {
    'train':'ptb.train.txt',
    'test':'ptb.test.txt',
    'valid':'ptb.valid.txt'
}
# Split name -> file name of the cached corpus array written by load_data().
save_file = {
    'train':'ptb.train.npy',
    'test':'ptb.test.npy',
    'valid':'ptb.valid.npy'
}
# File name of the pickled (word_to_id, id_to_word) cache written by load_vocab().
vocab_file = 'ptb.vocab.pkl'

# Downloads and caches are stored in the current working directory.
dataset_dir = os.getcwd()


def _download(file_name):
    '''
    Download ``file_name`` from ``url_base`` into ``dataset_dir`` unless the
    file already exists there.

    The data is first written to ``<file_name>.part`` and renamed only after
    the transfer has finished, so an interrupted download never leaves a
    truncated file that later calls would mistake for a complete one.
    '''
    file_path = dataset_dir + '/' + file_name
    if os.path.exists(file_path):
        return

    print('Downloading ' + file_name + ' ... ')

    url = url_base + file_name
    tmp_path = file_path + '.part'
    try:
        try:
            urllib.request.urlretrieve(url, tmp_path)
        except urllib.error.URLError:
            # NOTE(security): this fallback turns off TLS certificate
            # verification for the whole process (it replaces the default
            # HTTPS context factory) and retries. It is kept for backward
            # compatibility; data fetched this way is not authenticated.
            import ssl
            ssl._create_default_https_context = ssl._create_unverified_context
            urllib.request.urlretrieve(url, tmp_path)
        os.replace(tmp_path, file_path)
    finally:
        # after a successful rename the temp file no longer exists; after a
        # failure this removes the partial download
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    print('Done')


def load_vocab():
    '''
    Return the vocabulary as ``(word_to_id, id_to_word)``.

    If the pickled cache ``vocab_file`` exists in ``dataset_dir`` it is loaded;
    otherwise the training text is downloaded if necessary, ids are assigned to
    words in order of first appearance (newlines are replaced by '<eos>'), and
    the result is written to the cache.

    NOTE: pickle.load executes arbitrary code from the file; the cache is
    expected to be one written by this function.
    '''
    vocab_path = dataset_dir + '/' + vocab_file

    if os.path.exists(vocab_path):
        with open(vocab_path, 'rb') as f:
            word_to_id, id_to_word = pickle.load(f)
        return word_to_id, id_to_word

    word_to_id = {}
    id_to_word = {}
    data_type = 'train'
    file_name = key_file[data_type]
    file_path = dataset_dir + '/' + file_name

    _download(file_name)

    # read through a context manager so the file handle is closed promptly
    with open(file_path) as f:
        words = f.read().replace('\n', '<eos>').strip().split()

    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word

    with open(vocab_path, 'wb') as f:
        pickle.dump((word_to_id, id_to_word), f)

    return word_to_id, id_to_word


def load_data(data_type='train'):
    '''
    Load one split of the PTB corpus as an array of word ids.

    The cached array in ``dataset_dir`` is used when present; otherwise the
    text file is downloaded if necessary, converted to ids with the vocabulary
    from load_vocab() (newlines are replaced by '<eos>'), and cached.

        :param data_type: kind of data: 'train' or 'test' or 'valid' ('val' is
            accepted as an alias for 'valid')
        :return: (corpus, word_to_id, id_to_word); corpus is an array created
            through the module-level ``np`` backend
    '''
    if data_type == 'val':
        data_type = 'valid'
    save_path = dataset_dir + '/' + save_file[data_type]

    word_to_id, id_to_word = load_vocab()

    if os.path.exists(save_path):
        corpus = np.load(save_path)
        return corpus, word_to_id, id_to_word

    file_name = key_file[data_type]
    file_path = dataset_dir + '/' + file_name
    _download(file_name)

    # read through a context manager so the file handle is closed promptly
    with open(file_path) as f:
        words = f.read().replace('\n', '<eos>').strip().split()
    corpus = np.array([word_to_id[w] for w in words])

    np.save(save_path, corpus)
    return corpus, word_to_id, id_to_word

In [None]:
# for data_type in ('train', 'val', 'test'):
#         load_data(data_type)

In [None]:

# Hyperparameters
batch_size = 20      # rows per training mini-batch
wordvec_size = 650   # width of the word embedding
hidden_size = 650    # width of the LSTM hidden state
time_size = 35       # time steps per training mini-batch
lr = 20.0            # initial SGD learning rate (lowered by the training loop)
max_epoch = 40       # number of training epochs
max_grad = 0.25      # gradient-norm threshold passed to clip_grads
dropout = 0.5        # dropout ratio passed to the model

In [None]:
# Load the training, validation and test corpora (the vocabulary comes with
# the training split).
corpus, word_to_id, id_to_word = load_data('train')
corpus_val, _, _ = load_data('val')
corpus_test, _, _ = load_data('test')

In [None]:
# In GPU mode make sure all three corpora are cupy arrays (to_gpu returns
# objects that already are cupy arrays unchanged).
if GPU:
    corpus = to_gpu(corpus)
    corpus_val = to_gpu(corpus_val)
    corpus_test = to_gpu(corpus_test)

In [None]:

vocab_size = len(word_to_id)
# Next-word prediction: the target for each position is the following token,
# so the inputs drop the last token and the targets drop the first.
xs = corpus[:-1]
ts = corpus[1:]

In [None]:
# Build the model, a plain SGD optimizer with the initial learning rate, and
# the trainer that ties the two together.
model = BetterRnnlm(vocab_size, wordvec_size, hidden_size, dropout)
optimizer = SGD(lr)
trainer = RnnlmTrainer(model, optimizer)

这个改动是，针对每个 epoch 使用验证数据评价困惑度，在值变差时，降低学习率。

In [None]:
def remove_duplicate(params, grads):
    '''
    Merge weights that appear more than once in the parameter list into a
    single entry and add the corresponding gradients together.

    Two entries count as the same weight when they are the same object, or
    when both are 2-D and one equals the transpose of the other (weight
    tying). The input lists are copied, but the gradient array that is kept
    is accumulated into in place.
    '''
    params, grads = params[:], grads[:]  # work on copies of the lists

    merged = True
    while merged:
        merged = False
        count = len(params)

        for i in range(0, count - 1):
            for j in range(i + 1, count):
                first, second = params[i], params[j]
                if first is second:
                    # the very same weight object is shared
                    grads[i] += grads[j]
                elif first.ndim == 2 and second.ndim == 2 and \
                        first.T.shape == second.shape and np.all(first.T == second):
                    # shared as a transposed matrix (weight tying)
                    grads[i] += grads[j].T
                else:
                    continue

                params.pop(j)
                grads.pop(j)
                merged = True
                break
            if merged:
                # the lists changed, so restart the scan from the beginning
                break

    return params, grads

def clip_grads(grads, max_norm):
    '''
    Scale ``grads`` in place so that their combined L2 norm does not exceed
    ``max_norm``; gradients whose norm is already within the limit are left
    untouched.
    '''
    total_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))

    # the small constant keeps the division defined when all gradients are zero
    rate = max_norm / (total_norm + 1e-6)
    if not rate < 1:
        return

    for g in grads:
        g *= rate

In [None]:
# Train one epoch at a time, evaluate on the validation corpus after each
# epoch, checkpoint on improvement and lower the learning rate otherwise.
# Note: `lr` is a module-level variable modified here, so re-running this cell
# continues from the lowered value.
best_ppl = float('inf')
for epoch in range(max_epoch):
    trainer.fit(xs, ts, max_epoch=1, batch_size=batch_size,
                time_size=time_size, max_grad=max_grad)

    # Clear the LSTM state carried over from training before evaluating.
    model.reset_state()
    ppl = eval_perplexity(model, corpus_val)
    print('valid perplexity: ', ppl)

    if best_ppl > ppl:
        # Validation perplexity improved: remember it and save the parameters
        # (default file name <ClassName>.pkl in the working directory).
        best_ppl = ppl
        model.save_params()
    else:
        # No improvement: divide the learning rate by 4.
        lr /= 4.0
        optimizer.lr = lr

    # Clear the state left behind by the evaluation before training resumes.
    model.reset_state()
    print('-' * 50)

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 1 |  iter 1 / 1327 | time 7[s] | perplexity 10000.01
| epoch 1 |  iter 21 / 1327 | time 12[s] | perplexity 3720.74
| epoch 1 |  iter 41 / 1327 | time 16[s] | perplexity 1913.80
| epoch 1 |  iter 61 / 1327 | time 20[s] | perplexity 1346.81
| epoch 1 |  iter 81 / 1327 | time 24[s] | perplexity 1045.03
| epoch 1 |  iter 101 / 1327 | time 28[s] | perplexity 830.97
| epoch 1 |  iter 121 / 1327 | time 32[s] | perplexity 785.13
| epoch 1 |  iter 141 / 1327 | time 35[s] | perplexity 729.70
| epoch 1 |  iter 161 / 1327 | time 39[s] | perplexity 679.97
| epoch 1 |  iter 181 / 1327 | time 43[s] | perplexity 687.21
| epoch 1 |  iter 201 / 1327 | time 47[s] | perplexity 597.58
| epoch 1 |  iter 221 / 1327 | time 51[s] | perplexity 586.93
| epoch 1 |  iter 241 / 1327 | time 55[s] | perplexity 526.03
| epoch 1 |  iter 261 / 1327 | time 59[s] | perplexity 542.85
| epoch 1 |  iter 281 / 1327 | time 63[s] | perplexity 523.71
| epoch 1 |  iter 301 / 1327 | time 67[s] | perplexity 454.92
| epoch 1

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 2 |  iter 1 / 1327 | time 0[s] | perplexity 287.56
| epoch 2 |  iter 21 / 1327 | time 4[s] | perplexity 228.99
| epoch 2 |  iter 41 / 1327 | time 8[s] | perplexity 213.35
| epoch 2 |  iter 61 / 1327 | time 12[s] | perplexity 197.24
| epoch 2 |  iter 81 / 1327 | time 16[s] | perplexity 179.84
| epoch 2 |  iter 101 / 1327 | time 20[s] | perplexity 168.04
| epoch 2 |  iter 121 / 1327 | time 23[s] | perplexity 181.95
| epoch 2 |  iter 141 / 1327 | time 28[s] | perplexity 200.68
| epoch 2 |  iter 161 / 1327 | time 32[s] | perplexity 215.93
| epoch 2 |  iter 181 / 1327 | time 35[s] | perplexity 223.75
| epoch 2 |  iter 201 / 1327 | time 40[s] | perplexity 206.12
| epoch 2 |  iter 221 / 1327 | time 43[s] | perplexity 205.44
| epoch 2 |  iter 241 / 1327 | time 47[s] | perplexity 197.61
| epoch 2 |  iter 261 / 1327 | time 51[s] | perplexity 211.89
| epoch 2 |  iter 281 / 1327 | time 55[s] | perplexity 206.06
| epoch 2 |  iter 301 / 1327 | time 59[s] | perplexity 188.52
| epoch 2 |  iter

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 3 |  iter 1 / 1327 | time 0[s] | perplexity 216.56
| epoch 3 |  iter 21 / 1327 | time 4[s] | perplexity 160.60
| epoch 3 |  iter 41 / 1327 | time 7[s] | perplexity 151.42
| epoch 3 |  iter 61 / 1327 | time 12[s] | perplexity 142.97
| epoch 3 |  iter 81 / 1327 | time 16[s] | perplexity 127.64
| epoch 3 |  iter 101 / 1327 | time 19[s] | perplexity 120.93
| epoch 3 |  iter 121 / 1327 | time 24[s] | perplexity 132.56
| epoch 3 |  iter 141 / 1327 | time 28[s] | perplexity 145.33
| epoch 3 |  iter 161 / 1327 | time 31[s] | perplexity 160.57
| epoch 3 |  iter 181 / 1327 | time 36[s] | perplexity 169.42
| epoch 3 |  iter 201 / 1327 | time 40[s] | perplexity 159.00
| epoch 3 |  iter 221 / 1327 | time 44[s] | perplexity 155.29
| epoch 3 |  iter 241 / 1327 | time 48[s] | perplexity 149.94
| epoch 3 |  iter 261 / 1327 | time 52[s] | perplexity 162.42
| epoch 3 |  iter 281 / 1327 | time 56[s] | perplexity 158.69
| epoch 3 |  iter 301 / 1327 | time 60[s] | perplexity 139.90
| epoch 3 |  iter

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 4 |  iter 1 / 1327 | time 0[s] | perplexity 185.54
| epoch 4 |  iter 21 / 1327 | time 4[s] | perplexity 128.99
| epoch 4 |  iter 41 / 1327 | time 8[s] | perplexity 123.87
| epoch 4 |  iter 61 / 1327 | time 12[s] | perplexity 117.25
| epoch 4 |  iter 81 / 1327 | time 15[s] | perplexity 104.20
| epoch 4 |  iter 101 / 1327 | time 20[s] | perplexity 99.13
| epoch 4 |  iter 121 / 1327 | time 23[s] | perplexity 109.79
| epoch 4 |  iter 141 / 1327 | time 27[s] | perplexity 120.21
| epoch 4 |  iter 161 / 1327 | time 32[s] | perplexity 136.41
| epoch 4 |  iter 181 / 1327 | time 36[s] | perplexity 143.96
| epoch 4 |  iter 201 / 1327 | time 39[s] | perplexity 136.82
| epoch 4 |  iter 221 / 1327 | time 44[s] | perplexity 132.29
| epoch 4 |  iter 241 / 1327 | time 47[s] | perplexity 126.67
| epoch 4 |  iter 261 / 1327 | time 51[s] | perplexity 139.01
| epoch 4 |  iter 281 / 1327 | time 56[s] | perplexity 133.16
| epoch 4 |  iter 301 / 1327 | time 59[s] | perplexity 115.41
| epoch 4 |  iter 

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 5 |  iter 1 / 1327 | time 0[s] | perplexity 175.20
| epoch 5 |  iter 21 / 1327 | time 4[s] | perplexity 113.08
| epoch 5 |  iter 41 / 1327 | time 8[s] | perplexity 108.83
| epoch 5 |  iter 61 / 1327 | time 12[s] | perplexity 103.60
| epoch 5 |  iter 81 / 1327 | time 16[s] | perplexity 93.15
| epoch 5 |  iter 101 / 1327 | time 19[s] | perplexity 86.11
| epoch 5 |  iter 121 / 1327 | time 23[s] | perplexity 97.22
| epoch 5 |  iter 141 / 1327 | time 28[s] | perplexity 104.80
| epoch 5 |  iter 161 / 1327 | time 31[s] | perplexity 119.11
| epoch 5 |  iter 181 / 1327 | time 35[s] | perplexity 126.46
| epoch 5 |  iter 201 / 1327 | time 39[s] | perplexity 121.99
| epoch 5 |  iter 221 / 1327 | time 43[s] | perplexity 119.66
| epoch 5 |  iter 241 / 1327 | time 47[s] | perplexity 112.48
| epoch 5 |  iter 261 / 1327 | time 51[s] | perplexity 122.20
| epoch 5 |  iter 281 / 1327 | time 55[s] | perplexity 118.63
| epoch 5 |  iter 301 / 1327 | time 59[s] | perplexity 100.47
| epoch 5 |  iter 32

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 6 |  iter 1 / 1327 | time 0[s] | perplexity 149.11
| epoch 6 |  iter 21 / 1327 | time 3[s] | perplexity 101.29
| epoch 6 |  iter 41 / 1327 | time 8[s] | perplexity 99.08
| epoch 6 |  iter 61 / 1327 | time 11[s] | perplexity 95.55
| epoch 6 |  iter 81 / 1327 | time 15[s] | perplexity 83.80
| epoch 6 |  iter 101 / 1327 | time 19[s] | perplexity 79.51
| epoch 6 |  iter 121 / 1327 | time 23[s] | perplexity 87.23
| epoch 6 |  iter 141 / 1327 | time 27[s] | perplexity 95.28
| epoch 6 |  iter 161 / 1327 | time 31[s] | perplexity 110.08
| epoch 6 |  iter 181 / 1327 | time 35[s] | perplexity 117.92
| epoch 6 |  iter 201 / 1327 | time 39[s] | perplexity 113.34
| epoch 6 |  iter 221 / 1327 | time 43[s] | perplexity 109.24
| epoch 6 |  iter 241 / 1327 | time 47[s] | perplexity 102.41
| epoch 6 |  iter 261 / 1327 | time 51[s] | perplexity 112.03
| epoch 6 |  iter 281 / 1327 | time 55[s] | perplexity 110.30
| epoch 6 |  iter 301 / 1327 | time 59[s] | perplexity 92.13
| epoch 6 |  iter 321 / 

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 7 |  iter 1 / 1327 | time 0[s] | perplexity 145.79
| epoch 7 |  iter 21 / 1327 | time 4[s] | perplexity 94.24
| epoch 7 |  iter 41 / 1327 | time 8[s] | perplexity 92.30
| epoch 7 |  iter 61 / 1327 | time 11[s] | perplexity 88.62
| epoch 7 |  iter 81 / 1327 | time 16[s] | perplexity 77.73
| epoch 7 |  iter 101 / 1327 | time 20[s] | perplexity 74.46
| epoch 7 |  iter 121 / 1327 | time 23[s] | perplexity 82.49
| epoch 7 |  iter 141 / 1327 | time 28[s] | perplexity 88.06
| epoch 7 |  iter 161 / 1327 | time 32[s] | perplexity 102.63
| epoch 7 |  iter 181 / 1327 | time 36[s] | perplexity 108.85
| epoch 7 |  iter 201 / 1327 | time 40[s] | perplexity 105.81
| epoch 7 |  iter 221 / 1327 | time 44[s] | perplexity 100.35
| epoch 7 |  iter 241 / 1327 | time 47[s] | perplexity 96.53
| epoch 7 |  iter 261 / 1327 | time 51[s] | perplexity 104.36
| epoch 7 |  iter 281 / 1327 | time 55[s] | perplexity 101.55
| epoch 7 |  iter 301 / 1327 | time 59[s] | perplexity 84.39
| epoch 7 |  iter 321 / 13

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 8 |  iter 1 / 1327 | time 0[s] | perplexity 140.51
| epoch 8 |  iter 21 / 1327 | time 3[s] | perplexity 88.54
| epoch 8 |  iter 41 / 1327 | time 7[s] | perplexity 86.13
| epoch 8 |  iter 61 / 1327 | time 12[s] | perplexity 84.51
| epoch 8 |  iter 81 / 1327 | time 15[s] | perplexity 73.92
| epoch 8 |  iter 101 / 1327 | time 19[s] | perplexity 70.09
| epoch 8 |  iter 121 / 1327 | time 24[s] | perplexity 77.43
| epoch 8 |  iter 141 / 1327 | time 27[s] | perplexity 82.99
| epoch 8 |  iter 161 / 1327 | time 31[s] | perplexity 96.24
| epoch 8 |  iter 181 / 1327 | time 35[s] | perplexity 103.77
| epoch 8 |  iter 201 / 1327 | time 39[s] | perplexity 99.71
| epoch 8 |  iter 221 / 1327 | time 43[s] | perplexity 97.96
| epoch 8 |  iter 241 / 1327 | time 47[s] | perplexity 91.34
| epoch 8 |  iter 261 / 1327 | time 51[s] | perplexity 98.46
| epoch 8 |  iter 281 / 1327 | time 55[s] | perplexity 95.58
| epoch 8 |  iter 301 / 1327 | time 59[s] | perplexity 80.14
| epoch 8 |  iter 321 / 1327 | 

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 9 |  iter 1 / 1327 | time 0[s] | perplexity 136.27
| epoch 9 |  iter 21 / 1327 | time 3[s] | perplexity 85.53
| epoch 9 |  iter 41 / 1327 | time 8[s] | perplexity 82.16
| epoch 9 |  iter 61 / 1327 | time 12[s] | perplexity 80.81
| epoch 9 |  iter 81 / 1327 | time 15[s] | perplexity 68.96
| epoch 9 |  iter 101 / 1327 | time 20[s] | perplexity 66.86
| epoch 9 |  iter 121 / 1327 | time 23[s] | perplexity 72.74
| epoch 9 |  iter 141 / 1327 | time 27[s] | perplexity 78.68
| epoch 9 |  iter 161 / 1327 | time 32[s] | perplexity 91.25
| epoch 9 |  iter 181 / 1327 | time 35[s] | perplexity 99.04
| epoch 9 |  iter 201 / 1327 | time 39[s] | perplexity 96.32
| epoch 9 |  iter 221 / 1327 | time 43[s] | perplexity 92.64
| epoch 9 |  iter 241 / 1327 | time 47[s] | perplexity 86.87
| epoch 9 |  iter 261 / 1327 | time 51[s] | perplexity 93.71
| epoch 9 |  iter 281 / 1327 | time 55[s] | perplexity 91.33
| epoch 9 |  iter 301 / 1327 | time 59[s] | perplexity 76.16
| epoch 9 |  iter 321 / 1327 | t

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 10 |  iter 1 / 1327 | time 0[s] | perplexity 136.01
| epoch 10 |  iter 21 / 1327 | time 4[s] | perplexity 81.29
| epoch 10 |  iter 41 / 1327 | time 8[s] | perplexity 77.43
| epoch 10 |  iter 61 / 1327 | time 12[s] | perplexity 77.22
| epoch 10 |  iter 81 / 1327 | time 16[s] | perplexity 66.30
| epoch 10 |  iter 101 / 1327 | time 20[s] | perplexity 63.92
| epoch 10 |  iter 121 / 1327 | time 23[s] | perplexity 71.09
| epoch 10 |  iter 141 / 1327 | time 28[s] | perplexity 75.00
| epoch 10 |  iter 161 / 1327 | time 32[s] | perplexity 88.32
| epoch 10 |  iter 181 / 1327 | time 35[s] | perplexity 94.69
| epoch 10 |  iter 201 / 1327 | time 39[s] | perplexity 92.03
| epoch 10 |  iter 221 / 1327 | time 44[s] | perplexity 88.07
| epoch 10 |  iter 241 / 1327 | time 47[s] | perplexity 83.57
| epoch 10 |  iter 261 / 1327 | time 51[s] | perplexity 89.14
| epoch 10 |  iter 281 / 1327 | time 55[s] | perplexity 88.07
| epoch 10 |  iter 301 / 1327 | time 59[s] | perplexity 72.78
| epoch 10 |  it

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 11 |  iter 1 / 1327 | time 0[s] | perplexity 133.17
| epoch 11 |  iter 21 / 1327 | time 4[s] | perplexity 77.69
| epoch 11 |  iter 41 / 1327 | time 7[s] | perplexity 75.38
| epoch 11 |  iter 61 / 1327 | time 12[s] | perplexity 73.73
| epoch 11 |  iter 81 / 1327 | time 15[s] | perplexity 64.13
| epoch 11 |  iter 101 / 1327 | time 19[s] | perplexity 61.40
| epoch 11 |  iter 121 / 1327 | time 24[s] | perplexity 68.35
| epoch 11 |  iter 141 / 1327 | time 27[s] | perplexity 72.42
| epoch 11 |  iter 161 / 1327 | time 31[s] | perplexity 85.27
| epoch 11 |  iter 181 / 1327 | time 35[s] | perplexity 90.22
| epoch 11 |  iter 201 / 1327 | time 39[s] | perplexity 89.32
| epoch 11 |  iter 221 / 1327 | time 43[s] | perplexity 85.45
| epoch 11 |  iter 241 / 1327 | time 47[s] | perplexity 80.39
| epoch 11 |  iter 261 / 1327 | time 51[s] | perplexity 86.63
| epoch 11 |  iter 281 / 1327 | time 55[s] | perplexity 84.44
| epoch 11 |  iter 301 / 1327 | time 58[s] | perplexity 70.41
| epoch 11 |  it

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 12 |  iter 1 / 1327 | time 0[s] | perplexity 126.63
| epoch 12 |  iter 21 / 1327 | time 3[s] | perplexity 76.54
| epoch 12 |  iter 41 / 1327 | time 8[s] | perplexity 72.66
| epoch 12 |  iter 61 / 1327 | time 12[s] | perplexity 71.15
| epoch 12 |  iter 81 / 1327 | time 15[s] | perplexity 62.14
| epoch 12 |  iter 101 / 1327 | time 19[s] | perplexity 61.07
| epoch 12 |  iter 121 / 1327 | time 23[s] | perplexity 66.34
| epoch 12 |  iter 141 / 1327 | time 27[s] | perplexity 70.94
| epoch 12 |  iter 161 / 1327 | time 31[s] | perplexity 81.59
| epoch 12 |  iter 181 / 1327 | time 35[s] | perplexity 88.57
| epoch 12 |  iter 201 / 1327 | time 39[s] | perplexity 86.41
| epoch 12 |  iter 221 / 1327 | time 43[s] | perplexity 82.92
| epoch 12 |  iter 241 / 1327 | time 47[s] | perplexity 78.40
| epoch 12 |  iter 261 / 1327 | time 51[s] | perplexity 83.87
| epoch 12 |  iter 281 / 1327 | time 55[s] | perplexity 82.70
| epoch 12 |  iter 301 / 1327 | time 59[s] | perplexity 67.12
| epoch 12 |  it

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 13 |  iter 1 / 1327 | time 0[s] | perplexity 121.00
| epoch 13 |  iter 21 / 1327 | time 4[s] | perplexity 72.42
| epoch 13 |  iter 41 / 1327 | time 8[s] | perplexity 70.87
| epoch 13 |  iter 61 / 1327 | time 12[s] | perplexity 68.90
| epoch 13 |  iter 81 / 1327 | time 16[s] | perplexity 59.64
| epoch 13 |  iter 101 / 1327 | time 20[s] | perplexity 59.08
| epoch 13 |  iter 121 / 1327 | time 24[s] | perplexity 63.99
| epoch 13 |  iter 141 / 1327 | time 28[s] | perplexity 67.89
| epoch 13 |  iter 161 / 1327 | time 32[s] | perplexity 79.37
| epoch 13 |  iter 181 / 1327 | time 36[s] | perplexity 85.95
| epoch 13 |  iter 201 / 1327 | time 39[s] | perplexity 83.71
| epoch 13 |  iter 221 / 1327 | time 44[s] | perplexity 80.35
| epoch 13 |  iter 241 / 1327 | time 47[s] | perplexity 75.48
| epoch 13 |  iter 261 / 1327 | time 51[s] | perplexity 80.76
| epoch 13 |  iter 281 / 1327 | time 56[s] | perplexity 78.85
| epoch 13 |  iter 301 / 1327 | time 59[s] | perplexity 66.09
| epoch 13 |  it

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 14 |  iter 1 / 1327 | time 0[s] | perplexity 119.55
| epoch 14 |  iter 21 / 1327 | time 4[s] | perplexity 71.95
| epoch 14 |  iter 41 / 1327 | time 7[s] | perplexity 69.60
| epoch 14 |  iter 61 / 1327 | time 11[s] | perplexity 66.97
| epoch 14 |  iter 81 / 1327 | time 15[s] | perplexity 58.12
| epoch 14 |  iter 101 / 1327 | time 19[s] | perplexity 57.73
| epoch 14 |  iter 121 / 1327 | time 23[s] | perplexity 62.84
| epoch 14 |  iter 141 / 1327 | time 27[s] | perplexity 67.82
| epoch 14 |  iter 161 / 1327 | time 31[s] | perplexity 76.91
| epoch 14 |  iter 181 / 1327 | time 35[s] | perplexity 82.52
| epoch 14 |  iter 201 / 1327 | time 39[s] | perplexity 81.65
| epoch 14 |  iter 221 / 1327 | time 43[s] | perplexity 78.76
| epoch 14 |  iter 241 / 1327 | time 47[s] | perplexity 74.61
| epoch 14 |  iter 261 / 1327 | time 51[s] | perplexity 79.39
| epoch 14 |  iter 281 / 1327 | time 55[s] | perplexity 77.25
| epoch 14 |  iter 301 / 1327 | time 58[s] | perplexity 64.35
| epoch 14 |  it

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 15 |  iter 1 / 1327 | time 0[s] | perplexity 126.84
| epoch 15 |  iter 21 / 1327 | time 3[s] | perplexity 70.48
| epoch 15 |  iter 41 / 1327 | time 8[s] | perplexity 67.44
| epoch 15 |  iter 61 / 1327 | time 12[s] | perplexity 65.73
| epoch 15 |  iter 81 / 1327 | time 15[s] | perplexity 57.39
| epoch 15 |  iter 101 / 1327 | time 19[s] | perplexity 56.06
| epoch 15 |  iter 121 / 1327 | time 24[s] | perplexity 60.39
| epoch 15 |  iter 141 / 1327 | time 27[s] | perplexity 64.95
| epoch 15 |  iter 161 / 1327 | time 31[s] | perplexity 75.68
| epoch 15 |  iter 181 / 1327 | time 35[s] | perplexity 79.89
| epoch 15 |  iter 201 / 1327 | time 39[s] | perplexity 80.25
| epoch 15 |  iter 221 / 1327 | time 43[s] | perplexity 76.88
| epoch 15 |  iter 241 / 1327 | time 47[s] | perplexity 71.35
| epoch 15 |  iter 261 / 1327 | time 51[s] | perplexity 75.65
| epoch 15 |  iter 281 / 1327 | time 55[s] | perplexity 76.48
| epoch 15 |  iter 301 / 1327 | time 59[s] | perplexity 62.36
| epoch 15 |  it

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 16 |  iter 1 / 1327 | time 0[s] | perplexity 113.25
| epoch 16 |  iter 21 / 1327 | time 4[s] | perplexity 68.41
| epoch 16 |  iter 41 / 1327 | time 8[s] | perplexity 66.54
| epoch 16 |  iter 61 / 1327 | time 12[s] | perplexity 64.57
| epoch 16 |  iter 81 / 1327 | time 16[s] | perplexity 56.03
| epoch 16 |  iter 101 / 1327 | time 20[s] | perplexity 55.70
| epoch 16 |  iter 121 / 1327 | time 24[s] | perplexity 60.25
| epoch 16 |  iter 141 / 1327 | time 27[s] | perplexity 64.38
| epoch 16 |  iter 161 / 1327 | time 32[s] | perplexity 73.54
| epoch 16 |  iter 181 / 1327 | time 35[s] | perplexity 79.32
| epoch 16 |  iter 201 / 1327 | time 39[s] | perplexity 78.33
| epoch 16 |  iter 221 / 1327 | time 44[s] | perplexity 75.10
| epoch 16 |  iter 241 / 1327 | time 47[s] | perplexity 70.94
| epoch 16 |  iter 261 / 1327 | time 51[s] | perplexity 76.53
| epoch 16 |  iter 281 / 1327 | time 56[s] | perplexity 73.21
| epoch 16 |  iter 301 / 1327 | time 59[s] | perplexity 60.89
| epoch 16 |  it

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 17 |  iter 1 / 1327 | time 0[s] | perplexity 111.43
| epoch 17 |  iter 21 / 1327 | time 4[s] | perplexity 67.65
| epoch 17 |  iter 41 / 1327 | time 8[s] | perplexity 64.37
| epoch 17 |  iter 61 / 1327 | time 12[s] | perplexity 63.04
| epoch 17 |  iter 81 / 1327 | time 16[s] | perplexity 54.59
| epoch 17 |  iter 101 / 1327 | time 20[s] | perplexity 54.38
| epoch 17 |  iter 121 / 1327 | time 23[s] | perplexity 58.50
| epoch 17 |  iter 141 / 1327 | time 28[s] | perplexity 62.41
| epoch 17 |  iter 161 / 1327 | time 32[s] | perplexity 71.54
| epoch 17 |  iter 181 / 1327 | time 35[s] | perplexity 76.46
| epoch 17 |  iter 201 / 1327 | time 40[s] | perplexity 76.92
| epoch 17 |  iter 221 / 1327 | time 44[s] | perplexity 73.39
| epoch 17 |  iter 241 / 1327 | time 47[s] | perplexity 67.93
| epoch 17 |  iter 261 / 1327 | time 52[s] | perplexity 73.31
| epoch 17 |  iter 281 / 1327 | time 55[s] | perplexity 72.96
| epoch 17 |  iter 301 / 1327 | time 59[s] | perplexity 60.52
| epoch 17 |  it

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 18 |  iter 1 / 1327 | time 0[s] | perplexity 106.21
| epoch 18 |  iter 21 / 1327 | time 3[s] | perplexity 66.23
| epoch 18 |  iter 41 / 1327 | time 7[s] | perplexity 64.00
| epoch 18 |  iter 61 / 1327 | time 12[s] | perplexity 61.96
| epoch 18 |  iter 81 / 1327 | time 15[s] | perplexity 53.80
| epoch 18 |  iter 101 / 1327 | time 19[s] | perplexity 52.50
| epoch 18 |  iter 121 / 1327 | time 24[s] | perplexity 57.03
| epoch 18 |  iter 141 / 1327 | time 28[s] | perplexity 60.11
| epoch 18 |  iter 161 / 1327 | time 31[s] | perplexity 68.59
| epoch 18 |  iter 181 / 1327 | time 36[s] | perplexity 72.19
| epoch 18 |  iter 201 / 1327 | time 40[s] | perplexity 71.41
| epoch 18 |  iter 221 / 1327 | time 44[s] | perplexity 69.51
| epoch 18 |  iter 241 / 1327 | time 48[s] | perplexity 64.56
| epoch 18 |  iter 261 / 1327 | time 52[s] | perplexity 69.67
| epoch 18 |  iter 281 / 1327 | time 56[s] | perplexity 66.45
| epoch 18 |  iter 301 / 1327 | time 60[s] | perplexity 55.06
| epoch 18 |  it

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 19 |  iter 1 / 1327 | time 0[s] | perplexity 97.85
| epoch 19 |  iter 21 / 1327 | time 4[s] | perplexity 59.19
| epoch 19 |  iter 41 / 1327 | time 8[s] | perplexity 57.69
| epoch 19 |  iter 61 / 1327 | time 12[s] | perplexity 56.11
| epoch 19 |  iter 81 / 1327 | time 16[s] | perplexity 48.51
| epoch 19 |  iter 101 / 1327 | time 20[s] | perplexity 47.44
| epoch 19 |  iter 121 / 1327 | time 24[s] | perplexity 52.16
| epoch 19 |  iter 141 / 1327 | time 28[s] | perplexity 54.58
| epoch 19 |  iter 161 / 1327 | time 32[s] | perplexity 63.46
| epoch 19 |  iter 181 / 1327 | time 36[s] | perplexity 66.91
| epoch 19 |  iter 201 / 1327 | time 40[s] | perplexity 67.48
| epoch 19 |  iter 221 / 1327 | time 44[s] | perplexity 63.55
| epoch 19 |  iter 241 / 1327 | time 48[s] | perplexity 60.11
| epoch 19 |  iter 261 / 1327 | time 52[s] | perplexity 63.69
| epoch 19 |  iter 281 / 1327 | time 56[s] | perplexity 61.49
| epoch 19 |  iter 301 / 1327 | time 60[s] | perplexity 51.58
| epoch 19 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 20 |  iter 1 / 1327 | time 0[s] | perplexity 89.57
| epoch 20 |  iter 21 / 1327 | time 4[s] | perplexity 56.05
| epoch 20 |  iter 41 / 1327 | time 8[s] | perplexity 54.21
| epoch 20 |  iter 61 / 1327 | time 12[s] | perplexity 52.78
| epoch 20 |  iter 81 / 1327 | time 16[s] | perplexity 46.35
| epoch 20 |  iter 101 / 1327 | time 20[s] | perplexity 45.26
| epoch 20 |  iter 121 / 1327 | time 24[s] | perplexity 49.42
| epoch 20 |  iter 141 / 1327 | time 28[s] | perplexity 51.81
| epoch 20 |  iter 161 / 1327 | time 32[s] | perplexity 60.49
| epoch 20 |  iter 181 / 1327 | time 36[s] | perplexity 64.71
| epoch 20 |  iter 201 / 1327 | time 40[s] | perplexity 63.97
| epoch 20 |  iter 221 / 1327 | time 44[s] | perplexity 60.92
| epoch 20 |  iter 241 / 1327 | time 48[s] | perplexity 57.47
| epoch 20 |  iter 261 / 1327 | time 52[s] | perplexity 61.39
| epoch 20 |  iter 281 / 1327 | time 56[s] | perplexity 60.13
| epoch 20 |  iter 301 / 1327 | time 60[s] | perplexity 49.19
| epoch 20 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 21 |  iter 1 / 1327 | time 0[s] | perplexity 94.43
| epoch 21 |  iter 21 / 1327 | time 4[s] | perplexity 55.30
| epoch 21 |  iter 41 / 1327 | time 7[s] | perplexity 52.89
| epoch 21 |  iter 61 / 1327 | time 12[s] | perplexity 51.65
| epoch 21 |  iter 81 / 1327 | time 16[s] | perplexity 45.80
| epoch 21 |  iter 101 / 1327 | time 20[s] | perplexity 44.17
| epoch 21 |  iter 121 / 1327 | time 24[s] | perplexity 48.22
| epoch 21 |  iter 141 / 1327 | time 28[s] | perplexity 50.45
| epoch 21 |  iter 161 / 1327 | time 32[s] | perplexity 58.72
| epoch 21 |  iter 181 / 1327 | time 36[s] | perplexity 61.77
| epoch 21 |  iter 201 / 1327 | time 40[s] | perplexity 61.52
| epoch 21 |  iter 221 / 1327 | time 44[s] | perplexity 60.77
| epoch 21 |  iter 241 / 1327 | time 48[s] | perplexity 56.15
| epoch 21 |  iter 261 / 1327 | time 52[s] | perplexity 60.50
| epoch 21 |  iter 281 / 1327 | time 56[s] | perplexity 57.90
| epoch 21 |  iter 301 / 1327 | time 60[s] | perplexity 47.52
| epoch 21 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 22 |  iter 1 / 1327 | time 0[s] | perplexity 91.00
| epoch 22 |  iter 21 / 1327 | time 4[s] | perplexity 53.37
| epoch 22 |  iter 41 / 1327 | time 8[s] | perplexity 51.65
| epoch 22 |  iter 61 / 1327 | time 12[s] | perplexity 50.37
| epoch 22 |  iter 81 / 1327 | time 16[s] | perplexity 43.96
| epoch 22 |  iter 101 / 1327 | time 20[s] | perplexity 43.17
| epoch 22 |  iter 121 / 1327 | time 24[s] | perplexity 46.32
| epoch 22 |  iter 141 / 1327 | time 28[s] | perplexity 48.98
| epoch 22 |  iter 161 / 1327 | time 32[s] | perplexity 57.18
| epoch 22 |  iter 181 / 1327 | time 36[s] | perplexity 61.63
| epoch 22 |  iter 201 / 1327 | time 40[s] | perplexity 61.05
| epoch 22 |  iter 221 / 1327 | time 44[s] | perplexity 59.00
| epoch 22 |  iter 241 / 1327 | time 48[s] | perplexity 54.29
| epoch 22 |  iter 261 / 1327 | time 52[s] | perplexity 58.80
| epoch 22 |  iter 281 / 1327 | time 56[s] | perplexity 56.63
| epoch 22 |  iter 301 / 1327 | time 60[s] | perplexity 46.78
| epoch 22 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 23 |  iter 1 / 1327 | time 0[s] | perplexity 82.26
| epoch 23 |  iter 21 / 1327 | time 4[s] | perplexity 52.47
| epoch 23 |  iter 41 / 1327 | time 8[s] | perplexity 50.14
| epoch 23 |  iter 61 / 1327 | time 12[s] | perplexity 49.87
| epoch 23 |  iter 81 / 1327 | time 16[s] | perplexity 43.68
| epoch 23 |  iter 101 / 1327 | time 20[s] | perplexity 42.26
| epoch 23 |  iter 121 / 1327 | time 24[s] | perplexity 45.12
| epoch 23 |  iter 141 / 1327 | time 28[s] | perplexity 48.75
| epoch 23 |  iter 161 / 1327 | time 32[s] | perplexity 55.90
| epoch 23 |  iter 181 / 1327 | time 36[s] | perplexity 60.19
| epoch 23 |  iter 201 / 1327 | time 40[s] | perplexity 58.96
| epoch 23 |  iter 221 / 1327 | time 45[s] | perplexity 57.12
| epoch 23 |  iter 241 / 1327 | time 49[s] | perplexity 53.22
| epoch 23 |  iter 261 / 1327 | time 52[s] | perplexity 57.52
| epoch 23 |  iter 281 / 1327 | time 57[s] | perplexity 55.76
| epoch 23 |  iter 301 / 1327 | time 61[s] | perplexity 45.61
| epoch 23 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 24 |  iter 1 / 1327 | time 0[s] | perplexity 82.92
| epoch 24 |  iter 21 / 1327 | time 3[s] | perplexity 51.27
| epoch 24 |  iter 41 / 1327 | time 7[s] | perplexity 49.96
| epoch 24 |  iter 61 / 1327 | time 11[s] | perplexity 48.69
| epoch 24 |  iter 81 / 1327 | time 15[s] | perplexity 41.68
| epoch 24 |  iter 101 / 1327 | time 19[s] | perplexity 41.42
| epoch 24 |  iter 121 / 1327 | time 23[s] | perplexity 45.52
| epoch 24 |  iter 141 / 1327 | time 27[s] | perplexity 46.71
| epoch 24 |  iter 161 / 1327 | time 31[s] | perplexity 54.81
| epoch 24 |  iter 181 / 1327 | time 35[s] | perplexity 58.36
| epoch 24 |  iter 201 / 1327 | time 39[s] | perplexity 58.61
| epoch 24 |  iter 221 / 1327 | time 43[s] | perplexity 56.61
| epoch 24 |  iter 241 / 1327 | time 47[s] | perplexity 52.17
| epoch 24 |  iter 261 / 1327 | time 51[s] | perplexity 55.97
| epoch 24 |  iter 281 / 1327 | time 55[s] | perplexity 55.12
| epoch 24 |  iter 301 / 1327 | time 58[s] | perplexity 44.84
| epoch 24 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 25 |  iter 1 / 1327 | time 0[s] | perplexity 83.28
| epoch 25 |  iter 21 / 1327 | time 3[s] | perplexity 51.05
| epoch 25 |  iter 41 / 1327 | time 8[s] | perplexity 49.23
| epoch 25 |  iter 61 / 1327 | time 12[s] | perplexity 47.84
| epoch 25 |  iter 81 / 1327 | time 15[s] | perplexity 41.69
| epoch 25 |  iter 101 / 1327 | time 20[s] | perplexity 40.67
| epoch 25 |  iter 121 / 1327 | time 24[s] | perplexity 44.68
| epoch 25 |  iter 141 / 1327 | time 27[s] | perplexity 46.49
| epoch 25 |  iter 161 / 1327 | time 32[s] | perplexity 55.05
| epoch 25 |  iter 181 / 1327 | time 36[s] | perplexity 58.76
| epoch 25 |  iter 201 / 1327 | time 39[s] | perplexity 58.13
| epoch 25 |  iter 221 / 1327 | time 43[s] | perplexity 55.69
| epoch 25 |  iter 241 / 1327 | time 47[s] | perplexity 51.76
| epoch 25 |  iter 261 / 1327 | time 51[s] | perplexity 54.43
| epoch 25 |  iter 281 / 1327 | time 55[s] | perplexity 54.38
| epoch 25 |  iter 301 / 1327 | time 59[s] | perplexity 44.54
| epoch 25 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 26 |  iter 1 / 1327 | time 0[s] | perplexity 86.10
| epoch 26 |  iter 21 / 1327 | time 4[s] | perplexity 50.11
| epoch 26 |  iter 41 / 1327 | time 8[s] | perplexity 48.03
| epoch 26 |  iter 61 / 1327 | time 12[s] | perplexity 47.15
| epoch 26 |  iter 81 / 1327 | time 16[s] | perplexity 41.08
| epoch 26 |  iter 101 / 1327 | time 20[s] | perplexity 39.97
| epoch 26 |  iter 121 / 1327 | time 24[s] | perplexity 43.57
| epoch 26 |  iter 141 / 1327 | time 28[s] | perplexity 45.52
| epoch 26 |  iter 161 / 1327 | time 32[s] | perplexity 53.68
| epoch 26 |  iter 181 / 1327 | time 35[s] | perplexity 56.25
| epoch 26 |  iter 201 / 1327 | time 40[s] | perplexity 56.88
| epoch 26 |  iter 221 / 1327 | time 44[s] | perplexity 53.39
| epoch 26 |  iter 241 / 1327 | time 47[s] | perplexity 50.77
| epoch 26 |  iter 261 / 1327 | time 51[s] | perplexity 53.89
| epoch 26 |  iter 281 / 1327 | time 55[s] | perplexity 52.49
| epoch 26 |  iter 301 / 1327 | time 59[s] | perplexity 42.77
| epoch 26 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 27 |  iter 1 / 1327 | time 0[s] | perplexity 81.48
| epoch 27 |  iter 21 / 1327 | time 3[s] | perplexity 50.00
| epoch 27 |  iter 41 / 1327 | time 7[s] | perplexity 47.67
| epoch 27 |  iter 61 / 1327 | time 12[s] | perplexity 45.67
| epoch 27 |  iter 81 / 1327 | time 15[s] | perplexity 40.14
| epoch 27 |  iter 101 / 1327 | time 19[s] | perplexity 39.30
| epoch 27 |  iter 121 / 1327 | time 24[s] | perplexity 42.76
| epoch 27 |  iter 141 / 1327 | time 28[s] | perplexity 44.15
| epoch 27 |  iter 161 / 1327 | time 31[s] | perplexity 53.14
| epoch 27 |  iter 181 / 1327 | time 36[s] | perplexity 55.78
| epoch 27 |  iter 201 / 1327 | time 40[s] | perplexity 55.34
| epoch 27 |  iter 221 / 1327 | time 43[s] | perplexity 53.60
| epoch 27 |  iter 241 / 1327 | time 48[s] | perplexity 49.27
| epoch 27 |  iter 261 / 1327 | time 51[s] | perplexity 53.90
| epoch 27 |  iter 281 / 1327 | time 55[s] | perplexity 52.59
| epoch 27 |  iter 301 / 1327 | time 60[s] | perplexity 42.64
| epoch 27 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 28 |  iter 1 / 1327 | time 0[s] | perplexity 85.48
| epoch 28 |  iter 21 / 1327 | time 4[s] | perplexity 49.44
| epoch 28 |  iter 41 / 1327 | time 8[s] | perplexity 48.02
| epoch 28 |  iter 61 / 1327 | time 12[s] | perplexity 46.23
| epoch 28 |  iter 81 / 1327 | time 15[s] | perplexity 39.68
| epoch 28 |  iter 101 / 1327 | time 20[s] | perplexity 38.81
| epoch 28 |  iter 121 / 1327 | time 23[s] | perplexity 43.16
| epoch 28 |  iter 141 / 1327 | time 27[s] | perplexity 45.71
| epoch 28 |  iter 161 / 1327 | time 32[s] | perplexity 52.48
| epoch 28 |  iter 181 / 1327 | time 35[s] | perplexity 55.57
| epoch 28 |  iter 201 / 1327 | time 39[s] | perplexity 55.45
| epoch 28 |  iter 221 / 1327 | time 44[s] | perplexity 52.91
| epoch 28 |  iter 241 / 1327 | time 47[s] | perplexity 49.19
| epoch 28 |  iter 261 / 1327 | time 51[s] | perplexity 52.70
| epoch 28 |  iter 281 / 1327 | time 56[s] | perplexity 52.37
| epoch 28 |  iter 301 / 1327 | time 59[s] | perplexity 42.33
| epoch 28 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 29 |  iter 1 / 1327 | time 0[s] | perplexity 81.27
| epoch 29 |  iter 21 / 1327 | time 4[s] | perplexity 48.68
| epoch 29 |  iter 41 / 1327 | time 7[s] | perplexity 46.40
| epoch 29 |  iter 61 / 1327 | time 12[s] | perplexity 45.12
| epoch 29 |  iter 81 / 1327 | time 16[s] | perplexity 39.28
| epoch 29 |  iter 101 / 1327 | time 19[s] | perplexity 38.69
| epoch 29 |  iter 121 / 1327 | time 24[s] | perplexity 42.78
| epoch 29 |  iter 141 / 1327 | time 28[s] | perplexity 44.54
| epoch 29 |  iter 161 / 1327 | time 31[s] | perplexity 52.24
| epoch 29 |  iter 181 / 1327 | time 35[s] | perplexity 55.12
| epoch 29 |  iter 201 / 1327 | time 40[s] | perplexity 55.57
| epoch 29 |  iter 221 / 1327 | time 43[s] | perplexity 52.99
| epoch 29 |  iter 241 / 1327 | time 47[s] | perplexity 49.46
| epoch 29 |  iter 261 / 1327 | time 52[s] | perplexity 52.63
| epoch 29 |  iter 281 / 1327 | time 55[s] | perplexity 51.59
| epoch 29 |  iter 301 / 1327 | time 59[s] | perplexity 42.28
| epoch 29 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 30 |  iter 1 / 1327 | time 0[s] | perplexity 81.03
| epoch 30 |  iter 21 / 1327 | time 4[s] | perplexity 49.16
| epoch 30 |  iter 41 / 1327 | time 8[s] | perplexity 47.22
| epoch 30 |  iter 61 / 1327 | time 12[s] | perplexity 45.27
| epoch 30 |  iter 81 / 1327 | time 16[s] | perplexity 39.41
| epoch 30 |  iter 101 / 1327 | time 20[s] | perplexity 38.92
| epoch 30 |  iter 121 / 1327 | time 24[s] | perplexity 42.71
| epoch 30 |  iter 141 / 1327 | time 28[s] | perplexity 44.42
| epoch 30 |  iter 161 / 1327 | time 32[s] | perplexity 52.72
| epoch 30 |  iter 181 / 1327 | time 36[s] | perplexity 54.53
| epoch 30 |  iter 201 / 1327 | time 40[s] | perplexity 54.62
| epoch 30 |  iter 221 / 1327 | time 44[s] | perplexity 52.58
| epoch 30 |  iter 241 / 1327 | time 48[s] | perplexity 49.69
| epoch 30 |  iter 261 / 1327 | time 52[s] | perplexity 53.77
| epoch 30 |  iter 281 / 1327 | time 56[s] | perplexity 50.60
| epoch 30 |  iter 301 / 1327 | time 60[s] | perplexity 41.66
| epoch 30 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 31 |  iter 1 / 1327 | time 0[s] | perplexity 79.25
| epoch 31 |  iter 21 / 1327 | time 4[s] | perplexity 48.70
| epoch 31 |  iter 41 / 1327 | time 8[s] | perplexity 47.24
| epoch 31 |  iter 61 / 1327 | time 12[s] | perplexity 44.74
| epoch 31 |  iter 81 / 1327 | time 16[s] | perplexity 39.12
| epoch 31 |  iter 101 / 1327 | time 20[s] | perplexity 38.51
| epoch 31 |  iter 121 / 1327 | time 24[s] | perplexity 42.14
| epoch 31 |  iter 141 / 1327 | time 28[s] | perplexity 43.74
| epoch 31 |  iter 161 / 1327 | time 32[s] | perplexity 51.73
| epoch 31 |  iter 181 / 1327 | time 36[s] | perplexity 55.20
| epoch 31 |  iter 201 / 1327 | time 40[s] | perplexity 55.91
| epoch 31 |  iter 221 / 1327 | time 44[s] | perplexity 52.56
| epoch 31 |  iter 241 / 1327 | time 48[s] | perplexity 48.75
| epoch 31 |  iter 261 / 1327 | time 52[s] | perplexity 52.39
| epoch 31 |  iter 281 / 1327 | time 56[s] | perplexity 50.62
| epoch 31 |  iter 301 / 1327 | time 59[s] | perplexity 42.23
| epoch 31 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 32 |  iter 1 / 1327 | time 0[s] | perplexity 84.42
| epoch 32 |  iter 21 / 1327 | time 4[s] | perplexity 49.32
| epoch 32 |  iter 41 / 1327 | time 8[s] | perplexity 46.59
| epoch 32 |  iter 61 / 1327 | time 12[s] | perplexity 45.12
| epoch 32 |  iter 81 / 1327 | time 16[s] | perplexity 40.13
| epoch 32 |  iter 101 / 1327 | time 21[s] | perplexity 39.04
| epoch 32 |  iter 121 / 1327 | time 25[s] | perplexity 41.95
| epoch 32 |  iter 141 / 1327 | time 28[s] | perplexity 43.83
| epoch 32 |  iter 161 / 1327 | time 33[s] | perplexity 51.42
| epoch 32 |  iter 181 / 1327 | time 37[s] | perplexity 54.83
| epoch 32 |  iter 201 / 1327 | time 41[s] | perplexity 55.28
| epoch 32 |  iter 221 / 1327 | time 45[s] | perplexity 52.18
| epoch 32 |  iter 241 / 1327 | time 49[s] | perplexity 48.96
| epoch 32 |  iter 261 / 1327 | time 53[s] | perplexity 52.76
| epoch 32 |  iter 281 / 1327 | time 57[s] | perplexity 52.39
| epoch 32 |  iter 301 / 1327 | time 61[s] | perplexity 41.84
| epoch 32 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 33 |  iter 1 / 1327 | time 0[s] | perplexity 75.67
| epoch 33 |  iter 21 / 1327 | time 4[s] | perplexity 49.01
| epoch 33 |  iter 41 / 1327 | time 8[s] | perplexity 46.58
| epoch 33 |  iter 61 / 1327 | time 12[s] | perplexity 45.18
| epoch 33 |  iter 81 / 1327 | time 16[s] | perplexity 39.36
| epoch 33 |  iter 101 / 1327 | time 20[s] | perplexity 38.88
| epoch 33 |  iter 121 / 1327 | time 25[s] | perplexity 42.93
| epoch 33 |  iter 141 / 1327 | time 29[s] | perplexity 44.07
| epoch 33 |  iter 161 / 1327 | time 33[s] | perplexity 51.61
| epoch 33 |  iter 181 / 1327 | time 37[s] | perplexity 54.33
| epoch 33 |  iter 201 / 1327 | time 41[s] | perplexity 55.16
| epoch 33 |  iter 221 / 1327 | time 45[s] | perplexity 53.11
| epoch 33 |  iter 241 / 1327 | time 49[s] | perplexity 48.63
| epoch 33 |  iter 261 / 1327 | time 53[s] | perplexity 52.07
| epoch 33 |  iter 281 / 1327 | time 57[s] | perplexity 51.91
| epoch 33 |  iter 301 / 1327 | time 62[s] | perplexity 41.48
| epoch 33 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 34 |  iter 1 / 1327 | time 0[s] | perplexity 83.22
| epoch 34 |  iter 21 / 1327 | time 4[s] | perplexity 48.92
| epoch 34 |  iter 41 / 1327 | time 7[s] | perplexity 46.46
| epoch 34 |  iter 61 / 1327 | time 12[s] | perplexity 45.32
| epoch 34 |  iter 81 / 1327 | time 16[s] | perplexity 39.48
| epoch 34 |  iter 101 / 1327 | time 20[s] | perplexity 38.56
| epoch 34 |  iter 121 / 1327 | time 24[s] | perplexity 42.40
| epoch 34 |  iter 141 / 1327 | time 28[s] | perplexity 43.72
| epoch 34 |  iter 161 / 1327 | time 32[s] | perplexity 51.53
| epoch 34 |  iter 181 / 1327 | time 36[s] | perplexity 55.69
| epoch 34 |  iter 201 / 1327 | time 40[s] | perplexity 54.79
| epoch 34 |  iter 221 / 1327 | time 44[s] | perplexity 51.67
| epoch 34 |  iter 241 / 1327 | time 48[s] | perplexity 49.26
| epoch 34 |  iter 261 / 1327 | time 53[s] | perplexity 52.43
| epoch 34 |  iter 281 / 1327 | time 57[s] | perplexity 51.93
| epoch 34 |  iter 301 / 1327 | time 61[s] | perplexity 41.87
| epoch 34 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 35 |  iter 1 / 1327 | time 0[s] | perplexity 78.99
| epoch 35 |  iter 21 / 1327 | time 4[s] | perplexity 47.92
| epoch 35 |  iter 41 / 1327 | time 8[s] | perplexity 46.79
| epoch 35 |  iter 61 / 1327 | time 12[s] | perplexity 45.67
| epoch 35 |  iter 81 / 1327 | time 16[s] | perplexity 39.80
| epoch 35 |  iter 101 / 1327 | time 20[s] | perplexity 38.43
| epoch 35 |  iter 121 / 1327 | time 24[s] | perplexity 41.36
| epoch 35 |  iter 141 / 1327 | time 29[s] | perplexity 43.17
| epoch 35 |  iter 161 / 1327 | time 33[s] | perplexity 52.26
| epoch 35 |  iter 181 / 1327 | time 36[s] | perplexity 53.93
| epoch 35 |  iter 201 / 1327 | time 41[s] | perplexity 54.55
| epoch 35 |  iter 221 / 1327 | time 45[s] | perplexity 52.35
| epoch 35 |  iter 241 / 1327 | time 48[s] | perplexity 48.60
| epoch 35 |  iter 261 / 1327 | time 53[s] | perplexity 52.77
| epoch 35 |  iter 281 / 1327 | time 57[s] | perplexity 50.68
| epoch 35 |  iter 301 / 1327 | time 61[s] | perplexity 41.19
| epoch 35 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 36 |  iter 1 / 1327 | time 0[s] | perplexity 79.75
| epoch 36 |  iter 21 / 1327 | time 4[s] | perplexity 48.37
| epoch 36 |  iter 41 / 1327 | time 8[s] | perplexity 46.15
| epoch 36 |  iter 61 / 1327 | time 12[s] | perplexity 45.03
| epoch 36 |  iter 81 / 1327 | time 16[s] | perplexity 39.34
| epoch 36 |  iter 101 / 1327 | time 20[s] | perplexity 38.66
| epoch 36 |  iter 121 / 1327 | time 24[s] | perplexity 42.34
| epoch 36 |  iter 141 / 1327 | time 28[s] | perplexity 44.23
| epoch 36 |  iter 161 / 1327 | time 32[s] | perplexity 51.16
| epoch 36 |  iter 181 / 1327 | time 36[s] | perplexity 54.18
| epoch 36 |  iter 201 / 1327 | time 40[s] | perplexity 54.80
| epoch 36 |  iter 221 / 1327 | time 44[s] | perplexity 53.16
| epoch 36 |  iter 241 / 1327 | time 48[s] | perplexity 49.26
| epoch 36 |  iter 261 / 1327 | time 52[s] | perplexity 52.33
| epoch 36 |  iter 281 / 1327 | time 56[s] | perplexity 52.14
| epoch 36 |  iter 301 / 1327 | time 60[s] | perplexity 40.86
| epoch 36 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 37 |  iter 1 / 1327 | time 0[s] | perplexity 80.00
| epoch 37 |  iter 21 / 1327 | time 4[s] | perplexity 49.20
| epoch 37 |  iter 41 / 1327 | time 7[s] | perplexity 46.13
| epoch 37 |  iter 61 / 1327 | time 12[s] | perplexity 44.96
| epoch 37 |  iter 81 / 1327 | time 16[s] | perplexity 39.18
| epoch 37 |  iter 101 / 1327 | time 19[s] | perplexity 38.99
| epoch 37 |  iter 121 / 1327 | time 24[s] | perplexity 42.28
| epoch 37 |  iter 141 / 1327 | time 28[s] | perplexity 44.23
| epoch 37 |  iter 161 / 1327 | time 31[s] | perplexity 51.76
| epoch 37 |  iter 181 / 1327 | time 36[s] | perplexity 55.13
| epoch 37 |  iter 201 / 1327 | time 40[s] | perplexity 55.01
| epoch 37 |  iter 221 / 1327 | time 44[s] | perplexity 52.14
| epoch 37 |  iter 241 / 1327 | time 48[s] | perplexity 49.24
| epoch 37 |  iter 261 / 1327 | time 52[s] | perplexity 52.22
| epoch 37 |  iter 281 / 1327 | time 56[s] | perplexity 51.01
| epoch 37 |  iter 301 / 1327 | time 60[s] | perplexity 41.61
| epoch 37 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 38 |  iter 1 / 1327 | time 0[s] | perplexity 76.28
| epoch 38 |  iter 21 / 1327 | time 4[s] | perplexity 47.69
| epoch 38 |  iter 41 / 1327 | time 8[s] | perplexity 46.55
| epoch 38 |  iter 61 / 1327 | time 12[s] | perplexity 45.16
| epoch 38 |  iter 81 / 1327 | time 16[s] | perplexity 38.88
| epoch 38 |  iter 101 / 1327 | time 20[s] | perplexity 38.47
| epoch 38 |  iter 121 / 1327 | time 24[s] | perplexity 42.53
| epoch 38 |  iter 141 / 1327 | time 27[s] | perplexity 43.90
| epoch 38 |  iter 161 / 1327 | time 32[s] | perplexity 51.52
| epoch 38 |  iter 181 / 1327 | time 36[s] | perplexity 54.35
| epoch 38 |  iter 201 / 1327 | time 39[s] | perplexity 55.24
| epoch 38 |  iter 221 / 1327 | time 44[s] | perplexity 51.95
| epoch 38 |  iter 241 / 1327 | time 48[s] | perplexity 48.47
| epoch 38 |  iter 261 / 1327 | time 52[s] | perplexity 52.35
| epoch 38 |  iter 281 / 1327 | time 56[s] | perplexity 51.24
| epoch 38 |  iter 301 / 1327 | time 60[s] | perplexity 41.97
| epoch 38 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 39 |  iter 1 / 1327 | time 0[s] | perplexity 73.60
| epoch 39 |  iter 21 / 1327 | time 4[s] | perplexity 48.13
| epoch 39 |  iter 41 / 1327 | time 8[s] | perplexity 46.99
| epoch 39 |  iter 61 / 1327 | time 12[s] | perplexity 44.53
| epoch 39 |  iter 81 / 1327 | time 16[s] | perplexity 38.95
| epoch 39 |  iter 101 / 1327 | time 19[s] | perplexity 38.70
| epoch 39 |  iter 121 / 1327 | time 24[s] | perplexity 42.43
| epoch 39 |  iter 141 / 1327 | time 28[s] | perplexity 43.57
| epoch 39 |  iter 161 / 1327 | time 32[s] | perplexity 51.72
| epoch 39 |  iter 181 / 1327 | time 36[s] | perplexity 54.64
| epoch 39 |  iter 201 / 1327 | time 40[s] | perplexity 54.73
| epoch 39 |  iter 221 / 1327 | time 44[s] | perplexity 52.53
| epoch 39 |  iter 241 / 1327 | time 48[s] | perplexity 48.40
| epoch 39 |  iter 261 / 1327 | time 52[s] | perplexity 52.42
| epoch 39 |  iter 281 / 1327 | time 56[s] | perplexity 51.21
| epoch 39 |  iter 301 / 1327 | time 60[s] | perplexity 41.28
| epoch 39 |  ite

  0%|          | 0/1327 [00:00<?, ?it/s]

| epoch 40 |  iter 1 / 1327 | time 0[s] | perplexity 78.07
| epoch 40 |  iter 21 / 1327 | time 4[s] | perplexity 48.24
| epoch 40 |  iter 41 / 1327 | time 8[s] | perplexity 46.57
| epoch 40 |  iter 61 / 1327 | time 12[s] | perplexity 44.58
| epoch 40 |  iter 81 / 1327 | time 16[s] | perplexity 39.30
| epoch 40 |  iter 101 / 1327 | time 20[s] | perplexity 38.32
| epoch 40 |  iter 121 / 1327 | time 24[s] | perplexity 41.53
| epoch 40 |  iter 141 / 1327 | time 28[s] | perplexity 43.73
| epoch 40 |  iter 161 / 1327 | time 32[s] | perplexity 52.00
| epoch 40 |  iter 181 / 1327 | time 36[s] | perplexity 54.10
| epoch 40 |  iter 201 / 1327 | time 40[s] | perplexity 54.56
| epoch 40 |  iter 221 / 1327 | time 44[s] | perplexity 53.16
| epoch 40 |  iter 241 / 1327 | time 48[s] | perplexity 48.47
| epoch 40 |  iter 261 / 1327 | time 52[s] | perplexity 53.26
| epoch 40 |  iter 281 / 1327 | time 56[s] | perplexity 50.70
| epoch 40 |  iter 301 / 1327 | time 60[s] | perplexity 41.38
| epoch 40 |  ite

In [None]:

# Evaluate the trained model on the held-out test corpus (corpus_test).
# reset_state() is called before evaluating — presumably to clear the recurrent
# state left over from training; confirm against the model class.
model.reset_state()
ppl_test = eval_perplexity(model, corpus_test)
print('test perplexity: ', ppl_test)


evaluating perplexity ...
234 / 235
test perplexity:  76.23469


CPU 需要两天，GPU（Tesla T4）运行了 3 个小时……

In [None]:
import torch

# Record which accelerator this run used. The query is guarded so the cell
# also runs on a CPU-only runtime (e.g. with GPU = False), where
# torch.cuda.get_device_name raises instead of returning a name.
if torch.cuda.is_available():
    gpu_device_name = torch.cuda.get_device_name(0)  # name of CUDA device 0
else:
    gpu_device_name = 'no CUDA device available'
gpu_device_name

'Tesla T4'