In [1]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import random
import time
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import wandb
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from IPython.display import display

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
WANDB_API_KEY = '27687968cab05dfe2e7fe4fb3b59772e1f7c167a'
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Preprocessing

In [3]:
def preprocess(data):
    data = re.sub(r'\n|\s+', ' ', data) #newline and multiple spaces -> single space
    data = re.sub(r'[’‘]', '\'', data) #apostrophes
    data = re.sub(r'[“”`\' ]|[–—-]', ' ', data) #quotes and dashes
    data = re.sub(r'(?<!\w)([.!?])(?!\w)', r' \1 ', data) #dont remove punctuation
    data = re.sub(r'[™•]', ' ', data) #remove other unwanted symbols
    return data.strip() #strip extra spaces

## Tokenization

In [4]:
def tokenize(data, min_length_sentences=6):
    sentences = sent_tokenize(data)
    sentences = [sentence for sentence in sentences if len(sentence.split()) >= min_length_sentences]
    print("Length of sentences after filtering:", len(sentences))

    words_sentences = []

    for sentence in sentences:
        words = word_tokenize(sentence)
        
        words = [word.lower() for word in words if word.lower() not in ['.', ',', '!', '?', ';', ':']]
        words = ['<s>'] + words + ['</s>']
        words_sentences.append(words)
    
    return sentences, words_sentences

## Data Preparation

In [5]:
def train_val_test_split(sentences, train_ratio=0.7, val_ratio=0.2, seed=None, num_shuffles=1):
    if seed is not None:
        random.seed(seed)
    
    for _ in range(num_shuffles):
        random.shuffle(sentences)
    
    total_sentences = len(sentences)
    
    train_size = int(total_sentences * train_ratio)
    val_size = int(total_sentences * val_ratio)
    test_size = total_sentences - train_size - val_size  # Remaining for test
    
    train_sentences = sentences[:train_size]
    val_sentences = sentences[train_size:train_size + val_size]
    test_sentences = sentences[train_size + val_size:]
    
    return train_sentences, val_sentences, test_sentences


## Loading Glove Embeddings

In [6]:
def create_glove_embeddings(glove_path):
    glove = {}
    embedding_dim = 0

    with open(glove_path, 'r') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = torch.tensor([float(val) for val in values[1:]])
            glove[word] = vector
            embedding_dim = len(values[1:])

    glove['<UNK>'] = torch.mean(torch.stack(list(glove.values())), dim=0)
    glove['<s>'] = torch.rand(embedding_dim)
    glove['</s>'] = torch.rand(embedding_dim)

    return glove

## Creation of Vocab and Encodings

In [7]:
def create_embeddings_and_encode(train_sentences, val_sentences, test_sentences, glove):
    embedding_dim = len(list(glove.values())[0])
    vocab = set()

    # create vocab from train
    vocab.update(['<s>', '</s>', '<UNK>',])

    for sentence in train_sentences:
        for word in sentence:
            if word in glove:
                vocab.add(word)
            else:
                sentence[sentence.index(word)] = '<UNK>'
                
    embeddings = np.zeros((len(vocab), embedding_dim))
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}

    for word in vocab:
        if word in glove:
            embeddings[word_to_idx[word]] = glove[word]
        else:
            embeddings[word_to_idx[word]] = np.random.rand(embedding_dim)  # Random for unknown words

    def encode_sentences(sentences, word_to_idx):
        encoded_sentences = []
        for sentence in sentences:
            encoded_sentence = [word_to_idx[word] if word in word_to_idx else word_to_idx['<UNK>'] for word in sentence]
            encoded_sentences.append(encoded_sentence)
        return encoded_sentences

    encoded_train_sentences = encode_sentences(train_sentences, word_to_idx)
    encoded_val_sentences = encode_sentences(val_sentences, word_to_idx)
    encoded_test_sentences = encode_sentences(test_sentences, word_to_idx)

    return torch.FloatTensor(embeddings), encoded_train_sentences, encoded_val_sentences, encoded_test_sentences, word_to_idx, list(vocab)


## Dataset for Training NNLM

In [8]:
class NGramDataset(Dataset):
    def __init__(self, data, embeddings, n=5):
        self.n = n
        self.ngrams = []
        self.labels = []
        self.embeddings = embeddings

        for sentence in data:
            for i in range(len(sentence) - self.n):
                context_indices = sentence[i:i + self.n]
                target_index = sentence[i + self.n]

                context_embeddings = torch.cat([self.embeddings[idx] for idx in context_indices], dim=0).float()
                
                self.ngrams.append(context_embeddings)
                self.labels.append(target_index)

    def __len__(self):
        return len(self.ngrams)

    def __getitem__(self, idx):
        return self.ngrams[idx], torch.tensor(self.labels[idx], dtype=torch.long)  # Convert labels to Long



## NNLM Model

In [9]:
class NNLM(nn.Module):
    def __init__(self, embeddings, hidden_dims, n_gram=5, dropout=0.5):
        super(NNLM, self).__init__()

        self.vocab_size = embeddings.shape[0]
        self.embeddings_dim = embeddings.shape[1]

        self.embeddings = nn.Embedding.from_pretrained(embeddings, freeze=True)
        self.fc1 = nn.Linear((self.embeddings_dim) * n_gram, hidden_dims[0])
        self.fc2 = nn.Linear(hidden_dims[0], hidden_dims[1])
        self.fc3 = nn.Linear(hidden_dims[1], self.vocab_size)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p = dropout)

    def forward(self, x):
        x = x.view(x.shape[0], -1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        return self.fc3(x)


## Model Testing

In [10]:
def test_model(model, eval_loader, criterion):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for i, (x, y) in enumerate(tqdm(eval_loader)):
            x, y = x.float().to(device), y.type(torch.LongTensor).to(device)
            outputs = model(x)
            loss = criterion(outputs, y)
            total_loss += loss.item()
            
        avg_loss = total_loss / len(eval_loader)
        perplexity = torch.exp(torch.tensor(avg_loss))

    return avg_loss, perplexity


## Model Training

In [14]:
def train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs, patience=2):
    model.to(device)
    early_stopping_counter = 0
    best_val_loss = float('inf')
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        train_data = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch')

        for x, y in train_data:
            x, y = x.float().to(device), y.type(torch.LongTensor).to(device)  # Ensure Float type

            optimizer.zero_grad()
            y_pred = model(x)
            loss = criterion(y_pred, y)
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        perplexity = torch.exp(torch.tensor(avg_train_loss))

        print(f'Train Loss: {avg_train_loss:.4f}')
        print(f'Train Perplexity: {perplexity:.4f}')

        avg_val_loss, val_perplexity = test_model(model, val_loader, criterion)
        print(f'Val Loss: {avg_val_loss:.4f}')
        print(f'Val Perplexity: {val_perplexity:.4f}')

        # check for early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            early_stopping_counter = 0
            torch.save(model.state_dict(), '2021101072_LM1.pt')
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

    return model


## Save Perplexities in files

In [15]:
def save_perplexities(model, sentences, embeddings, criterion, filename, idx_to_word, n=5):
    model.eval()
    total_loss = 0
    all_sentences = []
    perplexity_scores = []

    with torch.no_grad():
        for sentence in sentences:
            sentence_loss = 0
            sentence_length = 0
            ngrams = []
            targets = []

            for i in range(len(sentence) - n):
                context_indices = sentence[i:i + n]
                target_index = sentence[i + n]

                context_embeddings = torch.cat([embeddings[idx] for idx in context_indices], dim=0).float()

                ngrams.append(context_embeddings)
                targets.append(target_index)

            for j in range(len(ngrams)):
                ngram = ngrams[j].to(device) 
                target_word = torch.tensor(targets[j], dtype=torch.long).to(device)  

                outputs = model(ngram.unsqueeze(0))  
                loss = criterion(outputs, target_word.unsqueeze(0))  
                sentence_loss += loss.item()
                sentence_length += 1

            avg_loss_per_sentence = sentence_loss / sentence_length
            sentence_perplexity = torch.exp(torch.tensor(avg_loss_per_sentence)).item()
            perplexity_scores.append(sentence_perplexity)

            sentence_words = [idx_to_word[idx] for idx in sentence]
            full_sentence = " ".join(sentence_words)
            all_sentences.append(full_sentence)

        avg_perplexity = sum(perplexity_scores) / len(perplexity_scores)

    with open(filename, 'w') as f:
        for i, sentence in enumerate(all_sentences):
            f.write(f"{sentence}\t{perplexity_scores[i]}\n")
        
        f.write(f"Average\t{avg_perplexity}\n")

    return avg_perplexity


In [16]:
def save_model(model, path):
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")


## Running the Model

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [18]:
with open('/kaggle/input/auguste-maquet/Auguste_Maquet.txt', 'r') as f:
    corpus = f.read()

corpus = preprocess(corpus)
min_length_sentences = 6

sentences, word_sentences = tokenize(corpus)

train_sentences, val_sentences, test_sentences = train_val_test_split(word_sentences)

print("Train size:", len(train_sentences))
print("Validation size:", len(val_sentences))
print("Test size:", len(test_sentences))

Length of sentences after filtering: 45102
Train size: 31571
Validation size: 9020
Test size: 4511


In [19]:
glove = create_glove_embeddings('/kaggle/input/glove-300/glove.6B.300d.txt')

embeddings, encoded_train, encoded_val, encoded_test, word_to_idx, vocab = create_embeddings_and_encode(train_sentences, val_sentences, test_sentences, glove)


In [20]:
train_dataset = NGramDataset(encoded_train, embeddings)
val_dataset = NGramDataset(encoded_val, embeddings)
test_dataset = NGramDataset(encoded_test, embeddings)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

print(f'Train dataset size: {len(train_dataset)}')
print(f'Validation dataset size: {len(val_dataset)}')
print(f'Test dataset size: {len(test_dataset)}')

print(train_dataset[0][0].shape)

Train dataset size: 543593
Validation dataset size: 157712
Test dataset size: 77633
torch.Size([1500])


In [21]:
learning_rate = 0.01
model = NNLM(embeddings, [300, 300], 5, 0.1)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()


In [22]:
model = train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs=10)

Epoch 1/10: 100%|██████████| 8494/8494 [00:21<00:00, 392.77batch/s]


Train Loss: 6.7636
Train Perplexity: 865.7474


100%|██████████| 2465/2465 [00:03<00:00, 677.13it/s]


Val Loss: 6.2277
Val Perplexity: 506.5939


Epoch 2/10: 100%|██████████| 8494/8494 [00:21<00:00, 401.81batch/s]


Train Loss: 6.1738
Train Perplexity: 480.0053


100%|██████████| 2465/2465 [00:03<00:00, 673.03it/s]


Val Loss: 5.9725
Val Perplexity: 392.4860


Epoch 3/10: 100%|██████████| 8494/8494 [00:21<00:00, 398.86batch/s]


Train Loss: 5.9584
Train Perplexity: 386.9931


100%|██████████| 2465/2465 [00:03<00:00, 673.57it/s]


Val Loss: 5.8068
Val Perplexity: 332.5564


Epoch 4/10: 100%|██████████| 8494/8494 [00:21<00:00, 395.87batch/s]


Train Loss: 5.8170
Train Perplexity: 335.9633


100%|██████████| 2465/2465 [00:03<00:00, 662.70it/s]


Val Loss: 5.6875
Val Perplexity: 295.1544


Epoch 5/10: 100%|██████████| 8494/8494 [00:21<00:00, 403.65batch/s]


Train Loss: 5.7157
Train Perplexity: 303.5886


100%|██████████| 2465/2465 [00:03<00:00, 666.69it/s]


Val Loss: 5.6035
Val Perplexity: 271.3723


Epoch 6/10: 100%|██████████| 8494/8494 [00:21<00:00, 398.66batch/s]


Train Loss: 5.6340
Train Perplexity: 279.7773


100%|██████████| 2465/2465 [00:03<00:00, 660.41it/s]


Val Loss: 5.5395
Val Perplexity: 254.5395


Epoch 7/10: 100%|██████████| 8494/8494 [00:21<00:00, 392.17batch/s]


Train Loss: 5.5674
Train Perplexity: 261.7607


100%|██████████| 2465/2465 [00:03<00:00, 658.46it/s]


Val Loss: 5.4852
Val Perplexity: 241.0889


Epoch 8/10: 100%|██████████| 8494/8494 [00:21<00:00, 389.44batch/s]


Train Loss: 5.5112
Train Perplexity: 247.4529


100%|██████████| 2465/2465 [00:03<00:00, 634.34it/s]


Val Loss: 5.4401
Val Perplexity: 230.4714


Epoch 9/10: 100%|██████████| 8494/8494 [00:22<00:00, 383.90batch/s]


Train Loss: 5.4629
Train Perplexity: 235.7832


100%|██████████| 2465/2465 [00:03<00:00, 666.19it/s]


Val Loss: 5.4025
Val Perplexity: 221.9567


Epoch 10/10: 100%|██████████| 8494/8494 [00:20<00:00, 404.99batch/s]


Train Loss: 5.4197
Train Perplexity: 225.8150


100%|██████████| 2465/2465 [00:03<00:00, 672.43it/s]

Val Loss: 5.3616
Val Perplexity: 213.0745





## Perplexity Scores

In [23]:
loss, perplexity = test_model(model, train_loader, criterion)
print(f'\nTrain Loss: {loss}')
print(f'Train Perplexity: {perplexity}')

100%|██████████| 8494/8494 [00:13<00:00, 633.66it/s]


Train Loss: 5.323368948657344
Train Perplexity: 205.07362365722656





In [24]:
loss, perplexity = test_model(model, val_loader, criterion)
print(f'\nVal Loss: {loss}')
print(f'Val Perplexity: {perplexity}')

100%|██████████| 2465/2465 [00:03<00:00, 633.28it/s]


Val Loss: 5.361691077582493
Val Perplexity: 213.08497619628906





In [25]:
loss, perplexity = test_model(model, test_loader, criterion)
print(f'\nTest Loss: {loss}')
print(f'Test Perplexity: {perplexity}')

100%|██████████| 1214/1214 [00:01<00:00, 622.98it/s]


Test Loss: 5.343334847659216
Test Perplexity: 209.20919799804688





In [26]:
save_perplexities(model, encoded_train, embeddings, criterion, '2021101072_LM1_train_perplexity.txt', vocab)
save_perplexities(model, encoded_val, embeddings, criterion, '2021101072_LM1_val_perplexity.txt', vocab)
save_perplexities(model, encoded_test, embeddings, criterion, '2021101072_LM1_test_perplexity.txt', vocab)

264.90034703705214

In [33]:
# save_model(model, '2021101072_LM1.pt')

In [34]:
import pickle

with open('data_store_nnlm.pkl', 'wb') as f:
    pickle.dump({
        'embeddings': embeddings,
        'vocab': vocab,
        'word_to_idx': word_to_idx,
        'encoded_train': encoded_train,
        'encoded_val': encoded_val,
        'encoded_test': encoded_test,
    }, f)

print("Data saved successfully!")

Data saved successfully!


## Hyperparameter Tuning

In [19]:
results = []

dropout_rates = [0.1, 0.3, 0.5]
hidden_dims_options = [[100, 100], [200, 200], [300, 300]]
learning_rates = [0.001, 0.01]
optimizers = [optim.Adam, optim.SGD]

In [20]:
for dropout in dropout_rates:
    for hidden_dims in hidden_dims_options:
        for lr in learning_rates:
            for optimizer_class in optimizers:

                wandb.init(project='hyperparameter_tuning_nnlm_a1', config={
                    'dropout': dropout,
                    'hidden_dims': hidden_dims,
                    'learning_rate': lr,
                    'optimizer': optimizer_class.__name__
                })

                model = NNLM(embeddings, hidden_dims, n_gram=5, dropout=dropout)
                optimizer = optimizer_class(model.parameters(), lr=lr)
                criterion = nn.CrossEntropyLoss()

                model = train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs=10)

                avg_train_loss, train_perplexity = test_model(model, train_loader, criterion)
                avg_val_loss, val_perplexity = test_model(model, val_loader, criterion)
                avg_test_loss, test_perplexity = test_model(model, test_loader, criterion)

                wandb.log({
                    'train_loss': avg_train_loss,
                    'train_perplexity': train_perplexity,
                    'val_loss': avg_val_loss,
                    'val_perplexity': val_perplexity,
                    'test_loss': avg_test_loss,
                    'test_perplexity': test_perplexity
                })

                results.append({
                    'dropout': dropout,
                    'hidden_dims': hidden_dims,
                    'learning_rate': lr,
                    'optimizer': optimizer_class.__name__,
                    'train_loss': avg_train_loss,
                    'train_perplexity': train_perplexity,
                    'val_loss': avg_val_loss,
                    'val_perplexity': val_perplexity,
                    'test_loss': avg_test_loss,
                    'test_perplexity': test_perplexity
                })

                wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mashnadua[0m ([33mashna-dua[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/10: 100%|██████████| 8533/8533 [00:22<00:00, 385.74batch/s]


Train Loss: 5.9378
Train Perplexity: 379.1139


100%|██████████| 2428/2428 [00:03<00:00, 765.58it/s]


Val Loss: 5.5288
Val Perplexity: 251.8535


Epoch 2/10: 100%|██████████| 8533/8533 [00:21<00:00, 394.53batch/s]


Train Loss: 5.5059
Train Perplexity: 246.1484


100%|██████████| 2428/2428 [00:03<00:00, 764.35it/s]


Val Loss: 5.4848
Val Perplexity: 241.0032


Epoch 3/10: 100%|██████████| 8533/8533 [00:21<00:00, 393.43batch/s]


Train Loss: 5.3747
Train Perplexity: 215.8816


100%|██████████| 2428/2428 [00:03<00:00, 775.36it/s]


Val Loss: 5.4952
Val Perplexity: 243.5290


Epoch 4/10: 100%|██████████| 8533/8533 [00:21<00:00, 391.82batch/s]


Train Loss: 5.3105
Train Perplexity: 202.4590


100%|██████████| 2428/2428 [00:03<00:00, 769.30it/s]


Val Loss: 5.5141
Val Perplexity: 248.1787
Early stopping at epoch 4


100%|██████████| 8533/8533 [00:11<00:00, 736.05it/s]
100%|██████████| 2428/2428 [00:03<00:00, 743.34it/s]
100%|██████████| 1212/1212 [00:01<00:00, 714.63it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.5565
test_perplexity,258.91626
train_loss,5.12818
train_perplexity,168.70955
val_loss,5.51347
val_perplexity,248.01074


Epoch 1/10: 100%|██████████| 8533/8533 [00:20<00:00, 420.89batch/s]


Train Loss: 8.8993
Train Perplexity: 7326.9448


100%|██████████| 2428/2428 [00:03<00:00, 740.53it/s]


Val Loss: 7.4946
Val Perplexity: 1798.2776


Epoch 2/10: 100%|██████████| 8533/8533 [00:20<00:00, 423.64batch/s]


Train Loss: 7.3071
Train Perplexity: 1490.7733


100%|██████████| 2428/2428 [00:03<00:00, 733.85it/s]


Val Loss: 6.9709
Val Perplexity: 1065.1714


Epoch 3/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.93batch/s]


Train Loss: 6.9679
Train Perplexity: 1061.9890


100%|██████████| 2428/2428 [00:03<00:00, 737.25it/s]


Val Loss: 6.7369
Val Perplexity: 842.9651


Epoch 4/10: 100%|██████████| 8533/8533 [00:20<00:00, 421.19batch/s]


Train Loss: 6.7876
Train Perplexity: 886.7798


100%|██████████| 2428/2428 [00:03<00:00, 738.55it/s]


Val Loss: 6.5982
Val Perplexity: 733.7787


Epoch 5/10: 100%|██████████| 8533/8533 [00:20<00:00, 421.14batch/s]


Train Loss: 6.6759
Train Perplexity: 793.0472


100%|██████████| 2428/2428 [00:03<00:00, 738.11it/s]


Val Loss: 6.5095
Val Perplexity: 671.4705


Epoch 6/10: 100%|██████████| 8533/8533 [00:20<00:00, 419.70batch/s]


Train Loss: 6.6002
Train Perplexity: 735.2778


100%|██████████| 2428/2428 [00:03<00:00, 740.03it/s]


Val Loss: 6.4446
Val Perplexity: 629.2715


Epoch 7/10: 100%|██████████| 8533/8533 [00:20<00:00, 424.29batch/s]


Train Loss: 6.5411
Train Perplexity: 693.0376


100%|██████████| 2428/2428 [00:03<00:00, 732.32it/s]


Val Loss: 6.3915
Val Perplexity: 596.7249


Epoch 8/10: 100%|██████████| 8533/8533 [00:20<00:00, 418.22batch/s]


Train Loss: 6.4935
Train Perplexity: 660.8119


100%|██████████| 2428/2428 [00:03<00:00, 741.13it/s]


Val Loss: 6.3492
Val Perplexity: 572.0566


Epoch 9/10: 100%|██████████| 8533/8533 [00:20<00:00, 421.27batch/s]


Train Loss: 6.4506
Train Perplexity: 633.1117


100%|██████████| 2428/2428 [00:03<00:00, 733.19it/s]


Val Loss: 6.3098
Val Perplexity: 549.9578


Epoch 10/10: 100%|██████████| 8533/8533 [00:20<00:00, 421.75batch/s]


Train Loss: 6.4144
Train Perplexity: 610.5682


100%|██████████| 2428/2428 [00:03<00:00, 736.83it/s]


Val Loss: 6.2754
Val Perplexity: 531.3635


100%|██████████| 8533/8533 [00:11<00:00, 734.04it/s]
100%|██████████| 2428/2428 [00:03<00:00, 734.54it/s]
100%|██████████| 1212/1212 [00:01<00:00, 750.64it/s]


VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.30121
test_perplexity,545.22913
train_loss,6.33514
train_perplexity,564.04987
val_loss,6.27582
val_perplexity,531.5625


Epoch 1/10: 100%|██████████| 8533/8533 [00:21<00:00, 392.71batch/s]


Train Loss: 6.5491
Train Perplexity: 698.5938


100%|██████████| 2428/2428 [00:03<00:00, 749.09it/s]


Val Loss: 6.3179
Val Perplexity: 554.3807


Epoch 2/10: 100%|██████████| 8533/8533 [00:21<00:00, 390.88batch/s]


Train Loss: 6.4214
Train Perplexity: 614.8566


100%|██████████| 2428/2428 [00:03<00:00, 787.12it/s]


Val Loss: 6.2749
Val Perplexity: 531.0487


Epoch 3/10: 100%|██████████| 8533/8533 [00:21<00:00, 392.56batch/s]


Train Loss: 6.4027
Train Perplexity: 603.4713


100%|██████████| 2428/2428 [00:03<00:00, 771.86it/s]


Val Loss: 6.2827
Val Perplexity: 535.2332


Epoch 4/10: 100%|██████████| 8533/8533 [00:21<00:00, 391.26batch/s]


Train Loss: 6.4020
Train Perplexity: 603.0255


100%|██████████| 2428/2428 [00:03<00:00, 776.71it/s]


Val Loss: 6.2717
Val Perplexity: 529.3654


Epoch 5/10: 100%|██████████| 8533/8533 [00:21<00:00, 392.17batch/s]


Train Loss: 6.3889
Train Perplexity: 595.2247


100%|██████████| 2428/2428 [00:03<00:00, 777.71it/s]


Val Loss: 6.2702
Val Perplexity: 528.5974


Epoch 6/10: 100%|██████████| 8533/8533 [00:21<00:00, 394.99batch/s]


Train Loss: 6.3957
Train Perplexity: 599.2587


100%|██████████| 2428/2428 [00:03<00:00, 750.41it/s]


Val Loss: 6.2731
Val Perplexity: 530.1406


Epoch 7/10: 100%|██████████| 8533/8533 [00:21<00:00, 395.78batch/s]


Train Loss: 6.3913
Train Perplexity: 596.6569


100%|██████████| 2428/2428 [00:03<00:00, 771.74it/s]


Val Loss: 6.2702
Val Perplexity: 528.5910


Epoch 8/10: 100%|██████████| 8533/8533 [00:21<00:00, 393.23batch/s]


Train Loss: 6.3903
Train Perplexity: 596.0460


100%|██████████| 2428/2428 [00:03<00:00, 773.10it/s]


Val Loss: 6.2717
Val Perplexity: 529.3573


Epoch 9/10: 100%|██████████| 8533/8533 [00:21<00:00, 393.70batch/s]


Train Loss: 6.3879
Train Perplexity: 594.5977


100%|██████████| 2428/2428 [00:03<00:00, 770.19it/s]


Val Loss: 6.2634
Val Perplexity: 525.0107


Epoch 10/10: 100%|██████████| 8533/8533 [00:21<00:00, 391.49batch/s]


Train Loss: 6.3891
Train Perplexity: 595.3186


100%|██████████| 2428/2428 [00:03<00:00, 770.67it/s]


Val Loss: 6.2686
Val Perplexity: 527.7134


100%|██████████| 8533/8533 [00:11<00:00, 734.64it/s]
100%|██████████| 2428/2428 [00:03<00:00, 727.49it/s]
100%|██████████| 1212/1212 [00:01<00:00, 743.97it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.30287
test_perplexity,546.13617
train_loss,6.32281
train_perplexity,557.13696
val_loss,6.26885
val_perplexity,527.87115


Epoch 1/10: 100%|██████████| 8533/8533 [00:20<00:00, 420.21batch/s]


Train Loss: 6.9051
Train Perplexity: 997.3315


100%|██████████| 2428/2428 [00:03<00:00, 735.46it/s]


Val Loss: 6.2772
Val Perplexity: 532.3092


Epoch 2/10: 100%|██████████| 8533/8533 [00:20<00:00, 423.25batch/s]


Train Loss: 6.2741
Train Perplexity: 530.6247


100%|██████████| 2428/2428 [00:03<00:00, 732.22it/s]


Val Loss: 6.0409
Val Perplexity: 420.2906


Epoch 3/10: 100%|██████████| 8533/8533 [00:20<00:00, 418.09batch/s]


Train Loss: 6.0833
Train Perplexity: 438.4634


100%|██████████| 2428/2428 [00:03<00:00, 738.15it/s]


Val Loss: 5.8733
Val Perplexity: 355.4178


Epoch 4/10: 100%|██████████| 8533/8533 [00:20<00:00, 419.38batch/s]


Train Loss: 5.9495
Train Perplexity: 383.5524


100%|██████████| 2428/2428 [00:03<00:00, 729.64it/s]


Val Loss: 5.7637
Val Perplexity: 318.5239


Epoch 5/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.93batch/s]


Train Loss: 5.8537
Train Perplexity: 348.5126


100%|██████████| 2428/2428 [00:03<00:00, 732.06it/s]


Val Loss: 5.6816
Val Perplexity: 293.4106


Epoch 6/10: 100%|██████████| 8533/8533 [00:20<00:00, 425.86batch/s]


Train Loss: 5.7778
Train Perplexity: 323.0410


100%|██████████| 2428/2428 [00:03<00:00, 732.02it/s]


Val Loss: 5.6170
Val Perplexity: 275.0697


Epoch 7/10: 100%|██████████| 8533/8533 [00:20<00:00, 420.29batch/s]


Train Loss: 5.7161
Train Perplexity: 303.7211


100%|██████████| 2428/2428 [00:03<00:00, 730.37it/s]


Val Loss: 5.5616
Val Perplexity: 260.2350


Epoch 8/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.08batch/s]


Train Loss: 5.6659
Train Perplexity: 288.8535


100%|██████████| 2428/2428 [00:03<00:00, 734.86it/s]


Val Loss: 5.5225
Val Perplexity: 250.2541


Epoch 9/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.58batch/s]


Train Loss: 5.6209
Train Perplexity: 276.1486


100%|██████████| 2428/2428 [00:03<00:00, 731.24it/s]


Val Loss: 5.4797
Val Perplexity: 239.7770


Epoch 10/10: 100%|██████████| 8533/8533 [00:20<00:00, 425.11batch/s]


Train Loss: 5.5839
Train Perplexity: 266.1054


100%|██████████| 2428/2428 [00:03<00:00, 731.18it/s]


Val Loss: 5.4523
Val Perplexity: 233.2892


100%|██████████| 8533/8533 [00:11<00:00, 722.39it/s]
100%|██████████| 2428/2428 [00:03<00:00, 737.20it/s]
100%|██████████| 1212/1212 [00:01<00:00, 733.29it/s]


VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.49246
test_perplexity,242.85326
train_loss,5.46526
train_perplexity,236.33652
val_loss,5.45248
val_perplexity,233.33653


Epoch 1/10: 100%|██████████| 8533/8533 [00:23<00:00, 361.22batch/s]


Train Loss: 5.8981
Train Perplexity: 364.3534


100%|██████████| 2428/2428 [00:03<00:00, 746.17it/s]


Val Loss: 5.5116
Val Perplexity: 247.5587


Epoch 2/10: 100%|██████████| 8533/8533 [00:23<00:00, 360.40batch/s]


Train Loss: 5.4755
Train Perplexity: 238.7714


100%|██████████| 2428/2428 [00:03<00:00, 735.04it/s]


Val Loss: 5.4769
Val Perplexity: 239.1155


Epoch 3/10: 100%|██████████| 8533/8533 [00:23<00:00, 360.13batch/s]


Train Loss: 5.3387
Train Perplexity: 208.2368


100%|██████████| 2428/2428 [00:03<00:00, 746.71it/s]


Val Loss: 5.4969
Val Perplexity: 243.9243


Epoch 4/10: 100%|██████████| 8533/8533 [00:23<00:00, 359.42batch/s]


Train Loss: 5.2747
Train Perplexity: 195.3228


100%|██████████| 2428/2428 [00:03<00:00, 744.09it/s]


Val Loss: 5.5170
Val Perplexity: 248.8852
Early stopping at epoch 4


100%|██████████| 8533/8533 [00:12<00:00, 695.84it/s]
100%|██████████| 2428/2428 [00:03<00:00, 695.26it/s]
100%|██████████| 1212/1212 [00:01<00:00, 682.83it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.55659
test_perplexity,258.93912
train_loss,5.09463
train_perplexity,163.14413
val_loss,5.51713
val_perplexity,248.91924


Epoch 1/10: 100%|██████████| 8533/8533 [00:20<00:00, 419.62batch/s]


Train Loss: 8.6592
Train Perplexity: 5762.7944


100%|██████████| 2428/2428 [00:03<00:00, 721.24it/s]


Val Loss: 7.2389
Val Perplexity: 1392.5350


Epoch 2/10: 100%|██████████| 8533/8533 [00:20<00:00, 420.71batch/s]


Train Loss: 7.0940
Train Perplexity: 1204.7520


100%|██████████| 2428/2428 [00:03<00:00, 713.10it/s]


Val Loss: 6.8086
Val Perplexity: 905.5761


Epoch 3/10: 100%|██████████| 8533/8533 [00:20<00:00, 423.89batch/s]


Train Loss: 6.8159
Train Perplexity: 912.2709


100%|██████████| 2428/2428 [00:03<00:00, 726.38it/s]


Val Loss: 6.6174
Val Perplexity: 747.9844


Epoch 4/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.96batch/s]


Train Loss: 6.6626
Train Perplexity: 782.5961


100%|██████████| 2428/2428 [00:03<00:00, 723.62it/s]


Val Loss: 6.4973
Val Perplexity: 663.3761


Epoch 5/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.72batch/s]


Train Loss: 6.5624
Train Perplexity: 707.9980


100%|██████████| 2428/2428 [00:03<00:00, 724.40it/s]


Val Loss: 6.4163
Val Perplexity: 611.7092


Epoch 6/10: 100%|██████████| 8533/8533 [00:20<00:00, 421.14batch/s]


Train Loss: 6.4918
Train Perplexity: 659.7314


100%|██████████| 2428/2428 [00:03<00:00, 717.71it/s]


Val Loss: 6.3557
Val Perplexity: 575.7861


Epoch 7/10: 100%|██████████| 8533/8533 [00:19<00:00, 427.56batch/s]


Train Loss: 6.4372
Train Perplexity: 624.6643


100%|██████████| 2428/2428 [00:03<00:00, 720.08it/s]


Val Loss: 6.3070
Val Perplexity: 548.3932


Epoch 8/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.21batch/s]


Train Loss: 6.3930
Train Perplexity: 597.6532


100%|██████████| 2428/2428 [00:03<00:00, 706.13it/s]


Val Loss: 6.2663
Val Perplexity: 526.5125


Epoch 9/10: 100%|██████████| 8533/8533 [00:20<00:00, 421.66batch/s]


Train Loss: 6.3554
Train Perplexity: 575.6110


100%|██████████| 2428/2428 [00:03<00:00, 719.74it/s]


Val Loss: 6.2326
Val Perplexity: 509.0916


Epoch 10/10: 100%|██████████| 8533/8533 [00:20<00:00, 421.89batch/s]


Train Loss: 6.3210
Train Perplexity: 556.1021


100%|██████████| 2428/2428 [00:03<00:00, 723.70it/s]


Val Loss: 6.1990
Val Perplexity: 492.2457


100%|██████████| 8533/8533 [00:12<00:00, 691.94it/s]
100%|██████████| 2428/2428 [00:03<00:00, 697.66it/s]
100%|██████████| 1212/1212 [00:01<00:00, 686.48it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.22293
test_perplexity,504.18045
train_loss,6.25886
train_perplexity,522.62213
val_loss,6.19823
val_perplexity,491.87543


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111355625555714, max=1.0)…

Epoch 1/10: 100%|██████████| 8533/8533 [00:23<00:00, 360.44batch/s]


Train Loss: 6.5826
Train Perplexity: 722.4049


100%|██████████| 2428/2428 [00:03<00:00, 741.60it/s]


Val Loss: 6.3466
Val Perplexity: 570.5730


Epoch 2/10: 100%|██████████| 8533/8533 [00:23<00:00, 361.30batch/s]


Train Loss: 6.4572
Train Perplexity: 637.2770


100%|██████████| 2428/2428 [00:03<00:00, 727.00it/s]


Val Loss: 6.3564
Val Perplexity: 576.1654


Epoch 3/10: 100%|██████████| 8533/8533 [00:23<00:00, 362.55batch/s]


Train Loss: 6.4345
Train Perplexity: 622.9528


100%|██████████| 2428/2428 [00:03<00:00, 735.58it/s]


Val Loss: 6.3320
Val Perplexity: 562.2660


Epoch 4/10: 100%|██████████| 8533/8533 [00:23<00:00, 362.37batch/s]


Train Loss: 6.4329
Train Perplexity: 621.9522


100%|██████████| 2428/2428 [00:03<00:00, 747.80it/s]


Val Loss: 6.3344
Val Perplexity: 563.6277


Epoch 5/10: 100%|██████████| 8533/8533 [00:23<00:00, 360.65batch/s]


Train Loss: 6.4312
Train Perplexity: 620.9097


100%|██████████| 2428/2428 [00:03<00:00, 742.31it/s]


Val Loss: 6.3309
Val Perplexity: 561.6489


Epoch 6/10: 100%|██████████| 8533/8533 [00:23<00:00, 359.91batch/s]


Train Loss: 6.4212
Train Perplexity: 614.7496


100%|██████████| 2428/2428 [00:03<00:00, 739.57it/s]


Val Loss: 6.3637
Val Perplexity: 580.3754


Epoch 7/10: 100%|██████████| 8533/8533 [00:23<00:00, 361.44batch/s]


Train Loss: 6.4067
Train Perplexity: 605.9161


100%|██████████| 2428/2428 [00:03<00:00, 744.71it/s]


Val Loss: 6.3141
Val Perplexity: 552.2930


Epoch 8/10: 100%|██████████| 8533/8533 [00:23<00:00, 359.51batch/s]


Train Loss: 6.4204
Train Perplexity: 614.2506


100%|██████████| 2428/2428 [00:03<00:00, 745.86it/s]


Val Loss: 6.3326
Val Perplexity: 562.5905


Epoch 9/10: 100%|██████████| 8533/8533 [00:23<00:00, 361.27batch/s]


Train Loss: 6.4209
Train Perplexity: 614.5843


100%|██████████| 2428/2428 [00:03<00:00, 712.56it/s]


Val Loss: 6.3340
Val Perplexity: 563.3779
Early stopping at epoch 9


100%|██████████| 8533/8533 [00:12<00:00, 689.87it/s]
100%|██████████| 2428/2428 [00:03<00:00, 691.28it/s]
100%|██████████| 1212/1212 [00:01<00:00, 699.46it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.35973
test_perplexity,578.08899
train_loss,6.37232
train_perplexity,585.41425
val_loss,6.33375
val_perplexity,563.26447


Epoch 1/10: 100%|██████████| 8533/8533 [00:20<00:00, 423.05batch/s]


Train Loss: 6.7966
Train Perplexity: 894.8168


100%|██████████| 2428/2428 [00:03<00:00, 710.61it/s]


Val Loss: 6.2241
Val Perplexity: 504.7727


Epoch 2/10: 100%|██████████| 8533/8533 [00:20<00:00, 425.59batch/s]


Train Loss: 6.1987
Train Perplexity: 492.1208


100%|██████████| 2428/2428 [00:03<00:00, 722.05it/s]


Val Loss: 5.9785
Val Perplexity: 394.8642


Epoch 3/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.56batch/s]


Train Loss: 5.9943
Train Perplexity: 401.1239


100%|██████████| 2428/2428 [00:03<00:00, 722.67it/s]


Val Loss: 5.8067
Val Perplexity: 332.5275


Epoch 4/10: 100%|██████████| 8533/8533 [00:20<00:00, 418.59batch/s]


Train Loss: 5.8579
Train Perplexity: 349.9764


100%|██████████| 2428/2428 [00:03<00:00, 728.35it/s]


Val Loss: 5.6938
Val Perplexity: 297.0087


Epoch 5/10: 100%|██████████| 8533/8533 [00:20<00:00, 423.09batch/s]


Train Loss: 5.7590
Train Perplexity: 317.0287


100%|██████████| 2428/2428 [00:03<00:00, 706.17it/s]


Val Loss: 5.6104
Val Perplexity: 273.2631


Epoch 6/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.00batch/s]


Train Loss: 5.6809
Train Perplexity: 293.2166


100%|██████████| 2428/2428 [00:03<00:00, 714.78it/s]


Val Loss: 5.5421
Val Perplexity: 255.2192


Epoch 7/10: 100%|██████████| 8533/8533 [00:20<00:00, 418.32batch/s]


Train Loss: 5.6161
Train Perplexity: 274.8248


100%|██████████| 2428/2428 [00:03<00:00, 711.30it/s]


Val Loss: 5.5066
Val Perplexity: 246.3181


Epoch 8/10: 100%|██████████| 8533/8533 [00:20<00:00, 423.09batch/s]


Train Loss: 5.5628
Train Perplexity: 260.5616


100%|██████████| 2428/2428 [00:03<00:00, 715.53it/s]


Val Loss: 5.4458
Val Perplexity: 231.7809


Epoch 9/10: 100%|██████████| 8533/8533 [00:20<00:00, 424.29batch/s]


Train Loss: 5.5149
Train Perplexity: 248.3536


100%|██████████| 2428/2428 [00:03<00:00, 708.45it/s]


Val Loss: 5.4046
Val Perplexity: 222.4274


Epoch 10/10: 100%|██████████| 8533/8533 [00:20<00:00, 425.29batch/s]


Train Loss: 5.4727
Train Perplexity: 238.1064


100%|██████████| 2428/2428 [00:03<00:00, 731.53it/s]


Val Loss: 5.3725
Val Perplexity: 215.3946


100%|██████████| 8533/8533 [00:12<00:00, 688.11it/s]
100%|██████████| 2428/2428 [00:03<00:00, 687.81it/s]
100%|██████████| 1212/1212 [00:01<00:00, 695.24it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.41337
test_perplexity,224.38588
train_loss,5.3761
train_perplexity,216.17673
val_loss,5.37272
val_perplexity,215.44794


Epoch 1/10: 100%|██████████| 8533/8533 [00:30<00:00, 283.82batch/s]


Train Loss: 5.9004
Train Perplexity: 365.1703


100%|██████████| 2428/2428 [00:03<00:00, 672.38it/s]


Val Loss: 5.5361
Val Perplexity: 253.6828


Epoch 2/10: 100%|██████████| 8533/8533 [00:30<00:00, 279.64batch/s]


Train Loss: 5.4919
Train Perplexity: 242.7198


100%|██████████| 2428/2428 [00:03<00:00, 684.13it/s]


Val Loss: 5.5149
Val Perplexity: 248.3549


Epoch 3/10: 100%|██████████| 8533/8533 [00:30<00:00, 281.89batch/s]


Train Loss: 5.3579
Train Perplexity: 212.2776


100%|██████████| 2428/2428 [00:03<00:00, 686.73it/s]


Val Loss: 5.5193
Val Perplexity: 249.4588


Epoch 4/10: 100%|██████████| 8533/8533 [00:30<00:00, 281.22batch/s]


Train Loss: 5.2944
Train Perplexity: 199.2245


100%|██████████| 2428/2428 [00:03<00:00, 680.39it/s]


Val Loss: 5.5358
Val Perplexity: 253.6058
Early stopping at epoch 4


100%|██████████| 8533/8533 [00:13<00:00, 647.17it/s]
100%|██████████| 2428/2428 [00:03<00:00, 640.15it/s]
100%|██████████| 1212/1212 [00:01<00:00, 623.94it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.56977
test_perplexity,262.37473
train_loss,5.11041
train_perplexity,165.73872
val_loss,5.53592
val_perplexity,253.64156


Epoch 1/10: 100%|██████████| 8533/8533 [00:21<00:00, 402.55batch/s]


Train Loss: 8.4717
Train Perplexity: 4777.7114


100%|██████████| 2428/2428 [00:03<00:00, 677.56it/s]


Val Loss: 7.1222
Val Perplexity: 1239.1323


Epoch 2/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.51batch/s]


Train Loss: 6.9989
Train Perplexity: 1095.3856


100%|██████████| 2428/2428 [00:03<00:00, 682.50it/s]


Val Loss: 6.7450
Val Perplexity: 849.7609


Epoch 3/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.97batch/s]


Train Loss: 6.7591
Train Perplexity: 861.8394


100%|██████████| 2428/2428 [00:03<00:00, 670.30it/s]


Val Loss: 6.5775
Val Perplexity: 718.7233


Epoch 4/10: 100%|██████████| 8533/8533 [00:20<00:00, 407.62batch/s]


Train Loss: 6.6214
Train Perplexity: 750.9903


100%|██████████| 2428/2428 [00:03<00:00, 689.90it/s]


Val Loss: 6.4650
Val Perplexity: 642.2841


Epoch 5/10: 100%|██████████| 8533/8533 [00:21<00:00, 403.20batch/s]


Train Loss: 6.5261
Train Perplexity: 682.7366


100%|██████████| 2428/2428 [00:03<00:00, 681.86it/s]


Val Loss: 6.3868
Val Perplexity: 593.9258


Epoch 6/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.74batch/s]


Train Loss: 6.4564
Train Perplexity: 636.7363


100%|██████████| 2428/2428 [00:03<00:00, 688.56it/s]


Val Loss: 6.3263
Val Perplexity: 559.0984


Epoch 7/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.76batch/s]


Train Loss: 6.4041
Train Perplexity: 604.2943


100%|██████████| 2428/2428 [00:03<00:00, 685.51it/s]


Val Loss: 6.2811
Val Perplexity: 534.3852


Epoch 8/10: 100%|██████████| 8533/8533 [00:21<00:00, 405.45batch/s]


Train Loss: 6.3587
Train Perplexity: 577.4796


100%|██████████| 2428/2428 [00:03<00:00, 677.74it/s]


Val Loss: 6.2397
Val Perplexity: 512.6926


Epoch 9/10: 100%|██████████| 8533/8533 [00:20<00:00, 407.38batch/s]


Train Loss: 6.3208
Train Perplexity: 555.9960


100%|██████████| 2428/2428 [00:03<00:00, 688.07it/s]


Val Loss: 6.2028
Val Perplexity: 494.1320


Epoch 10/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.10batch/s]


Train Loss: 6.2856
Train Perplexity: 536.7610


100%|██████████| 2428/2428 [00:03<00:00, 683.30it/s]


Val Loss: 6.1727
Val Perplexity: 479.4766


100%|██████████| 8533/8533 [00:13<00:00, 643.95it/s]
100%|██████████| 2428/2428 [00:03<00:00, 633.19it/s]
100%|██████████| 1212/1212 [00:01<00:00, 639.01it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.19718
test_perplexity,491.36182
train_loss,6.22954
train_perplexity,507.52335
val_loss,6.17242
val_perplexity,479.34314


Epoch 1/10: 100%|██████████| 8533/8533 [00:30<00:00, 281.36batch/s]


Train Loss: 6.5953
Train Perplexity: 731.6523


100%|██████████| 2428/2428 [00:03<00:00, 683.52it/s]


Val Loss: 6.3723
Val Perplexity: 585.3827


Epoch 2/10: 100%|██████████| 8533/8533 [00:30<00:00, 280.52batch/s]


Train Loss: 6.4729
Train Perplexity: 647.3527


100%|██████████| 2428/2428 [00:03<00:00, 668.79it/s]


Val Loss: 6.3473
Val Perplexity: 570.9308


Epoch 3/10: 100%|██████████| 8533/8533 [00:30<00:00, 281.16batch/s]


Train Loss: 6.4465
Train Perplexity: 630.5223


100%|██████████| 2428/2428 [00:03<00:00, 672.74it/s]


Val Loss: 6.3333
Val Perplexity: 563.0244


Epoch 4/10: 100%|██████████| 8533/8533 [00:30<00:00, 281.18batch/s]


Train Loss: 6.4363
Train Perplexity: 624.0873


100%|██████████| 2428/2428 [00:03<00:00, 683.06it/s]


Val Loss: 6.3228
Val Perplexity: 557.1038


Epoch 5/10: 100%|██████████| 8533/8533 [00:30<00:00, 281.99batch/s]


Train Loss: 6.4249
Train Perplexity: 617.0206


100%|██████████| 2428/2428 [00:03<00:00, 685.08it/s]


Val Loss: 6.3316
Val Perplexity: 562.0764


Epoch 6/10: 100%|██████████| 8533/8533 [00:30<00:00, 281.03batch/s]


Train Loss: 6.4202
Train Perplexity: 614.1501


100%|██████████| 2428/2428 [00:03<00:00, 671.19it/s]


Val Loss: 6.3322
Val Perplexity: 562.4194
Early stopping at epoch 6


100%|██████████| 8533/8533 [00:13<00:00, 643.82it/s]
100%|██████████| 2428/2428 [00:03<00:00, 643.80it/s]
100%|██████████| 1212/1212 [00:01<00:00, 649.52it/s]


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.36123
test_perplexity,578.95551
train_loss,6.38081
train_perplexity,590.40448
val_loss,6.33234
val_perplexity,562.46869


Epoch 1/10: 100%|██████████| 8533/8533 [00:21<00:00, 399.40batch/s]


Train Loss: 6.7423
Train Perplexity: 847.4775


100%|██████████| 2428/2428 [00:03<00:00, 683.28it/s]


Val Loss: 6.2028
Val Perplexity: 494.1473


Epoch 2/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.05batch/s]


Train Loss: 6.1706
Train Perplexity: 478.4836


100%|██████████| 2428/2428 [00:03<00:00, 681.37it/s]


Val Loss: 5.9545
Val Perplexity: 385.4787


Epoch 3/10: 100%|██████████| 8533/8533 [00:21<00:00, 405.85batch/s]


Train Loss: 5.9614
Train Perplexity: 388.1432


100%|██████████| 2428/2428 [00:03<00:00, 677.47it/s]


Val Loss: 5.7791
Val Perplexity: 323.4582


Epoch 4/10: 100%|██████████| 8533/8533 [00:21<00:00, 400.56batch/s]


Train Loss: 5.8183
Train Perplexity: 336.4025


100%|██████████| 2428/2428 [00:03<00:00, 672.87it/s]


Val Loss: 5.6664
Val Perplexity: 288.9891


Epoch 5/10: 100%|██████████| 8533/8533 [00:21<00:00, 402.51batch/s]


Train Loss: 5.7158
Train Perplexity: 303.6339


100%|██████████| 2428/2428 [00:03<00:00, 676.97it/s]


Val Loss: 5.5815
Val Perplexity: 265.4590


Epoch 6/10: 100%|██████████| 8533/8533 [00:21<00:00, 403.39batch/s]


Train Loss: 5.6354
Train Perplexity: 280.1662


100%|██████████| 2428/2428 [00:03<00:00, 685.05it/s]


Val Loss: 5.5182
Val Perplexity: 249.1969


Epoch 7/10: 100%|██████████| 8533/8533 [00:21<00:00, 405.82batch/s]


Train Loss: 5.5695
Train Perplexity: 262.2920


100%|██████████| 2428/2428 [00:03<00:00, 669.36it/s]


Val Loss: 5.4572
Val Perplexity: 234.4458


Epoch 8/10: 100%|██████████| 8533/8533 [00:21<00:00, 406.31batch/s]


Train Loss: 5.5125
Train Perplexity: 247.7589


100%|██████████| 2428/2428 [00:03<00:00, 681.72it/s]


Val Loss: 5.4195
Val Perplexity: 225.7571


Epoch 9/10: 100%|██████████| 8533/8533 [00:21<00:00, 403.98batch/s]


Train Loss: 5.4628
Train Perplexity: 235.7669


100%|██████████| 2428/2428 [00:03<00:00, 688.84it/s]


Val Loss: 5.3707
Val Perplexity: 215.0077


Epoch 10/10: 100%|██████████| 8533/8533 [00:21<00:00, 401.17batch/s]


Train Loss: 5.4191
Train Perplexity: 225.6785


100%|██████████| 2428/2428 [00:03<00:00, 683.65it/s]


Val Loss: 5.3409
Val Perplexity: 208.7016


100%|██████████| 8533/8533 [00:13<00:00, 643.82it/s]
100%|██████████| 2428/2428 [00:03<00:00, 639.66it/s]
100%|██████████| 1212/1212 [00:01<00:00, 620.15it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.37866
test_perplexity,216.73056
train_loss,5.32523
train_perplexity,205.45631
val_loss,5.34142
val_perplexity,208.80864


Epoch 1/10: 100%|██████████| 8533/8533 [00:21<00:00, 390.89batch/s]


Train Loss: 6.1534
Train Perplexity: 470.3238


100%|██████████| 2428/2428 [00:03<00:00, 766.14it/s]


Val Loss: 5.6953
Val Perplexity: 297.4697


Epoch 2/10: 100%|██████████| 8533/8533 [00:21<00:00, 390.20batch/s]


Train Loss: 5.8276
Train Perplexity: 339.5363


100%|██████████| 2428/2428 [00:03<00:00, 777.87it/s]


Val Loss: 5.6340
Val Perplexity: 279.7908


Epoch 3/10: 100%|██████████| 8533/8533 [00:21<00:00, 393.48batch/s]


Train Loss: 5.7446
Train Perplexity: 312.4971


100%|██████████| 2428/2428 [00:03<00:00, 752.13it/s]


Val Loss: 5.6373
Val Perplexity: 280.7023


Epoch 4/10: 100%|██████████| 8533/8533 [00:21<00:00, 396.59batch/s]


Train Loss: 5.7106
Train Perplexity: 302.0561


100%|██████████| 2428/2428 [00:03<00:00, 770.52it/s]


Val Loss: 5.6408
Val Perplexity: 281.6798
Early stopping at epoch 4


100%|██████████| 8533/8533 [00:11<00:00, 728.10it/s]
100%|██████████| 2428/2428 [00:03<00:00, 735.92it/s]
100%|██████████| 1212/1212 [00:01<00:00, 722.80it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.68239
test_perplexity,293.64963
train_loss,5.50272
train_perplexity,245.35896
val_loss,5.64108
val_perplexity,281.76776


Epoch 1/10: 100%|██████████| 8533/8533 [00:20<00:00, 424.60batch/s]


Train Loss: 9.0364
Train Perplexity: 8403.0596


100%|██████████| 2428/2428 [00:03<00:00, 728.50it/s]


Val Loss: 7.5978
Val Perplexity: 1993.7482


Epoch 2/10: 100%|██████████| 8533/8533 [00:20<00:00, 423.14batch/s]


Train Loss: 7.5141
Train Perplexity: 1833.6981


100%|██████████| 2428/2428 [00:03<00:00, 735.85it/s]


Val Loss: 7.0393
Val Perplexity: 1140.5410


Epoch 3/10: 100%|██████████| 8533/8533 [00:20<00:00, 423.56batch/s]


Train Loss: 7.1478
Train Perplexity: 1271.3441


100%|██████████| 2428/2428 [00:03<00:00, 733.65it/s]


Val Loss: 6.8153
Val Perplexity: 911.6730


Epoch 4/10: 100%|██████████| 8533/8533 [00:20<00:00, 423.82batch/s]


Train Loss: 6.9639
Train Perplexity: 1057.7201


100%|██████████| 2428/2428 [00:03<00:00, 732.18it/s]


Val Loss: 6.6828
Val Perplexity: 798.5354


Epoch 5/10: 100%|██████████| 8533/8533 [00:20<00:00, 424.02batch/s]


Train Loss: 6.8493
Train Perplexity: 943.2059


100%|██████████| 2428/2428 [00:03<00:00, 725.96it/s]


Val Loss: 6.5901
Val Perplexity: 727.8622


Epoch 6/10: 100%|██████████| 8533/8533 [00:20<00:00, 420.52batch/s]


Train Loss: 6.7656
Train Perplexity: 867.4466


100%|██████████| 2428/2428 [00:03<00:00, 737.44it/s]


Val Loss: 6.5212
Val Perplexity: 679.3713


Epoch 7/10: 100%|██████████| 8533/8533 [00:20<00:00, 418.75batch/s]


Train Loss: 6.7014
Train Perplexity: 813.5209


100%|██████████| 2428/2428 [00:03<00:00, 729.15it/s]


Val Loss: 6.4671
Val Perplexity: 643.6417


Epoch 8/10: 100%|██████████| 8533/8533 [00:20<00:00, 424.15batch/s]


Train Loss: 6.6524
Train Perplexity: 774.6043


100%|██████████| 2428/2428 [00:03<00:00, 725.84it/s]


Val Loss: 6.4253
Val Perplexity: 617.2490


Epoch 9/10: 100%|██████████| 8533/8533 [00:20<00:00, 426.26batch/s]


Train Loss: 6.6086
Train Perplexity: 741.4161


100%|██████████| 2428/2428 [00:03<00:00, 739.26it/s]


Val Loss: 6.3862
Val Perplexity: 593.6209


Epoch 10/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.93batch/s]


Train Loss: 6.5736
Train Perplexity: 715.9612


100%|██████████| 2428/2428 [00:03<00:00, 741.13it/s]


Val Loss: 6.3543
Val Perplexity: 574.9389


100%|██████████| 8533/8533 [00:11<00:00, 727.98it/s]
100%|██████████| 2428/2428 [00:03<00:00, 731.25it/s]
100%|██████████| 1212/1212 [00:01<00:00, 735.89it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.37929
test_perplexity,589.50989
train_loss,6.41487
train_perplexity,610.86237
val_loss,6.35446
val_perplexity,575.04993


Epoch 1/10: 100%|██████████| 8533/8533 [00:21<00:00, 396.90batch/s]


Train Loss: 6.6137
Train Perplexity: 745.2122


100%|██████████| 2428/2428 [00:03<00:00, 762.25it/s]


Val Loss: 6.4146
Val Perplexity: 610.6725


Epoch 2/10: 100%|██████████| 8533/8533 [00:21<00:00, 395.95batch/s]


Train Loss: 6.5116
Train Perplexity: 672.8977


100%|██████████| 2428/2428 [00:03<00:00, 778.84it/s]


Val Loss: 6.4122
Val Perplexity: 609.2156


Epoch 3/10: 100%|██████████| 8533/8533 [00:21<00:00, 395.15batch/s]


Train Loss: 6.5050
Train Perplexity: 668.4534


100%|██████████| 2428/2428 [00:03<00:00, 767.24it/s]


Val Loss: 6.4154
Val Perplexity: 611.1628


Epoch 4/10: 100%|██████████| 8533/8533 [00:21<00:00, 398.61batch/s]


Train Loss: 6.5044
Train Perplexity: 668.0497


100%|██████████| 2428/2428 [00:03<00:00, 774.67it/s]


Val Loss: 6.4089
Val Perplexity: 607.2490


Epoch 5/10: 100%|██████████| 8533/8533 [00:21<00:00, 395.43batch/s]


Train Loss: 6.5043
Train Perplexity: 668.0143


100%|██████████| 2428/2428 [00:03<00:00, 768.84it/s]


Val Loss: 6.4067
Val Perplexity: 605.9191


Epoch 6/10: 100%|██████████| 8533/8533 [00:21<00:00, 397.03batch/s]


Train Loss: 6.5016
Train Perplexity: 666.1948


100%|██████████| 2428/2428 [00:03<00:00, 775.29it/s]


Val Loss: 6.4112
Val Perplexity: 608.5946


Epoch 7/10: 100%|██████████| 8533/8533 [00:21<00:00, 394.69batch/s]


Train Loss: 6.5055
Train Perplexity: 668.7841


100%|██████████| 2428/2428 [00:03<00:00, 772.52it/s]


Val Loss: 6.4135
Val Perplexity: 610.0555
Early stopping at epoch 7


100%|██████████| 8533/8533 [00:11<00:00, 731.47it/s]
100%|██████████| 2428/2428 [00:03<00:00, 736.25it/s]
100%|██████████| 1212/1212 [00:01<00:00, 738.34it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.44166
test_perplexity,627.44897
train_loss,6.47754
train_perplexity,650.36816
val_loss,6.41253
val_perplexity,609.43243


Epoch 1/10: 100%|██████████| 8533/8533 [00:20<00:00, 424.94batch/s]


Train Loss: 7.0467
Train Perplexity: 1149.0447


100%|██████████| 2428/2428 [00:03<00:00, 734.10it/s]


Val Loss: 6.3460
Val Perplexity: 570.1835


Epoch 2/10: 100%|██████████| 8533/8533 [00:20<00:00, 426.44batch/s]


Train Loss: 6.4285
Train Perplexity: 619.2415


100%|██████████| 2428/2428 [00:03<00:00, 730.82it/s]


Val Loss: 6.1361
Val Perplexity: 462.2437


Epoch 3/10: 100%|██████████| 8533/8533 [00:20<00:00, 426.02batch/s]


Train Loss: 6.2595
Train Perplexity: 522.9464


100%|██████████| 2428/2428 [00:03<00:00, 744.89it/s]


Val Loss: 5.9744
Val Perplexity: 393.2257


Epoch 4/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.50batch/s]


Train Loss: 6.1282
Train Perplexity: 458.6021


100%|██████████| 2428/2428 [00:03<00:00, 731.55it/s]


Val Loss: 5.8577
Val Perplexity: 349.9134


Epoch 5/10: 100%|██████████| 8533/8533 [00:20<00:00, 424.16batch/s]


Train Loss: 6.0331
Train Perplexity: 416.9855


100%|██████████| 2428/2428 [00:03<00:00, 743.15it/s]


Val Loss: 5.7834
Val Perplexity: 324.8612


Epoch 6/10: 100%|██████████| 8533/8533 [00:20<00:00, 425.97batch/s]


Train Loss: 5.9584
Train Perplexity: 387.0071


100%|██████████| 2428/2428 [00:03<00:00, 729.37it/s]


Val Loss: 5.7084
Val Perplexity: 301.3872


Epoch 7/10: 100%|██████████| 8533/8533 [00:20<00:00, 426.00batch/s]


Train Loss: 5.8982
Train Perplexity: 364.3930


100%|██████████| 2428/2428 [00:03<00:00, 735.51it/s]


Val Loss: 5.6567
Val Perplexity: 286.2035


Epoch 8/10: 100%|██████████| 8533/8533 [00:20<00:00, 423.62batch/s]


Train Loss: 5.8500
Train Perplexity: 347.2198


100%|██████████| 2428/2428 [00:03<00:00, 740.31it/s]


Val Loss: 5.6095
Val Perplexity: 273.0096


Epoch 9/10: 100%|██████████| 8533/8533 [00:20<00:00, 425.76batch/s]


Train Loss: 5.8079
Train Perplexity: 332.9342


100%|██████████| 2428/2428 [00:03<00:00, 733.84it/s]


Val Loss: 5.5764
Val Perplexity: 264.1101


Epoch 10/10: 100%|██████████| 8533/8533 [00:20<00:00, 426.34batch/s]


Train Loss: 5.7719
Train Perplexity: 321.1463


100%|██████████| 2428/2428 [00:03<00:00, 735.11it/s]


Val Loss: 5.5458
Val Perplexity: 256.1706


100%|██████████| 8533/8533 [00:11<00:00, 732.49it/s]
100%|██████████| 2428/2428 [00:03<00:00, 737.00it/s]
100%|██████████| 1212/1212 [00:01<00:00, 740.45it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.58612
test_perplexity,266.69797
train_loss,5.5772
train_perplexity,264.33118
val_loss,5.54617
val_perplexity,256.25513


Epoch 1/10: 100%|██████████| 8533/8533 [00:23<00:00, 360.88batch/s]


Train Loss: 6.1207
Train Perplexity: 455.1994


100%|██████████| 2428/2428 [00:03<00:00, 729.07it/s]


Val Loss: 5.7221
Val Perplexity: 305.5311


Epoch 2/10: 100%|██████████| 8533/8533 [00:23<00:00, 362.03batch/s]


Train Loss: 5.8154
Train Perplexity: 335.4323


100%|██████████| 2428/2428 [00:03<00:00, 737.45it/s]


Val Loss: 5.6626
Val Perplexity: 287.9087


Epoch 3/10: 100%|██████████| 8533/8533 [00:23<00:00, 359.41batch/s]


Train Loss: 5.7408
Train Perplexity: 311.3225


100%|██████████| 2428/2428 [00:03<00:00, 743.96it/s]


Val Loss: 5.6518
Val Perplexity: 284.8033


Epoch 4/10: 100%|██████████| 8533/8533 [00:23<00:00, 360.96batch/s]


Train Loss: 5.7115
Train Perplexity: 302.3182


100%|██████████| 2428/2428 [00:03<00:00, 736.47it/s]


Val Loss: 5.6475
Val Perplexity: 283.5822


Epoch 5/10: 100%|██████████| 8533/8533 [00:23<00:00, 360.81batch/s]


Train Loss: 5.7011
Train Perplexity: 299.2014


100%|██████████| 2428/2428 [00:03<00:00, 736.55it/s]


Val Loss: 5.6567
Val Perplexity: 286.1941


Epoch 6/10: 100%|██████████| 8533/8533 [00:23<00:00, 361.23batch/s]


Train Loss: 5.7043
Train Perplexity: 300.1611


100%|██████████| 2428/2428 [00:03<00:00, 751.58it/s]


Val Loss: 5.6521
Val Perplexity: 284.8855
Early stopping at epoch 6


100%|██████████| 8533/8533 [00:12<00:00, 694.03it/s]
100%|██████████| 2428/2428 [00:03<00:00, 694.16it/s]
100%|██████████| 1212/1212 [00:01<00:00, 681.33it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.68206
test_perplexity,293.55261
train_loss,5.47783
train_perplexity,239.32655
val_loss,5.65221
val_perplexity,284.92093


Epoch 1/10: 100%|██████████| 8533/8533 [00:20<00:00, 424.81batch/s]


Train Loss: 8.7677
Train Perplexity: 6423.0645


100%|██████████| 2428/2428 [00:03<00:00, 732.20it/s]


Val Loss: 7.2834
Val Perplexity: 1455.8701


Epoch 2/10: 100%|██████████| 8533/8533 [00:20<00:00, 425.13batch/s]


Train Loss: 7.2323
Train Perplexity: 1383.4301


100%|██████████| 2428/2428 [00:03<00:00, 706.22it/s]


Val Loss: 6.8520
Val Perplexity: 945.8177


Epoch 3/10: 100%|██████████| 8533/8533 [00:19<00:00, 426.79batch/s]


Train Loss: 6.9358
Train Perplexity: 1028.4075


100%|██████████| 2428/2428 [00:03<00:00, 729.58it/s]


Val Loss: 6.6620
Val Perplexity: 782.1249


Epoch 4/10: 100%|██████████| 8533/8533 [00:20<00:00, 426.07batch/s]


Train Loss: 6.7763
Train Perplexity: 876.7975


100%|██████████| 2428/2428 [00:03<00:00, 705.49it/s]


Val Loss: 6.5412
Val Perplexity: 693.1080


Epoch 5/10: 100%|██████████| 8533/8533 [00:20<00:00, 424.90batch/s]


Train Loss: 6.6713
Train Perplexity: 789.4272


100%|██████████| 2428/2428 [00:03<00:00, 715.84it/s]


Val Loss: 6.4599
Val Perplexity: 638.9877


Epoch 6/10: 100%|██████████| 8533/8533 [00:20<00:00, 423.84batch/s]


Train Loss: 6.5986
Train Perplexity: 734.0310


100%|██████████| 2428/2428 [00:03<00:00, 711.66it/s]


Val Loss: 6.3993
Val Perplexity: 601.4403


Epoch 7/10: 100%|██████████| 8533/8533 [00:19<00:00, 427.73batch/s]


Train Loss: 6.5437
Train Perplexity: 694.8523


100%|██████████| 2428/2428 [00:03<00:00, 710.15it/s]


Val Loss: 6.3535
Val Perplexity: 574.4807


Epoch 8/10: 100%|██████████| 8533/8533 [00:20<00:00, 423.81batch/s]


Train Loss: 6.4991
Train Perplexity: 664.5529


100%|██████████| 2428/2428 [00:03<00:00, 725.22it/s]


Val Loss: 6.3137
Val Perplexity: 552.1053


Epoch 9/10: 100%|██████████| 8533/8533 [00:20<00:00, 425.31batch/s]


Train Loss: 6.4628
Train Perplexity: 640.8441


100%|██████████| 2428/2428 [00:03<00:00, 727.94it/s]


Val Loss: 6.2816
Val Perplexity: 534.6273


Epoch 10/10: 100%|██████████| 8533/8533 [00:20<00:00, 425.53batch/s]


Train Loss: 6.4287
Train Perplexity: 619.3452


100%|██████████| 2428/2428 [00:03<00:00, 715.66it/s]


Val Loss: 6.2504
Val Perplexity: 518.1944


100%|██████████| 8533/8533 [00:12<00:00, 685.57it/s]
100%|██████████| 2428/2428 [00:03<00:00, 688.56it/s]
100%|██████████| 1212/1212 [00:01<00:00, 688.74it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.27366
test_perplexity,530.41675
train_loss,6.30917
train_perplexity,549.58899
val_loss,6.25045
val_perplexity,518.24652


Epoch 1/10: 100%|██████████| 8533/8533 [00:23<00:00, 359.41batch/s]


Train Loss: 6.6165
Train Perplexity: 747.3362


100%|██████████| 2428/2428 [00:03<00:00, 746.69it/s]


Val Loss: 6.3787
Val Perplexity: 589.1603


Epoch 2/10: 100%|██████████| 8533/8533 [00:23<00:00, 362.89batch/s]


Train Loss: 6.4951
Train Perplexity: 661.8787


100%|██████████| 2428/2428 [00:03<00:00, 726.75it/s]


Val Loss: 6.3637
Val Perplexity: 580.3668


Epoch 3/10: 100%|██████████| 8533/8533 [00:23<00:00, 361.99batch/s]


Train Loss: 6.4899
Train Perplexity: 658.4476


100%|██████████| 2428/2428 [00:03<00:00, 740.39it/s]


Val Loss: 6.3767
Val Perplexity: 588.0009


Epoch 4/10: 100%|██████████| 8533/8533 [00:23<00:00, 360.12batch/s]


Train Loss: 6.4809
Train Perplexity: 652.5411


100%|██████████| 2428/2428 [00:03<00:00, 748.99it/s]


Val Loss: 6.4069
Val Perplexity: 606.0029
Early stopping at epoch 4


100%|██████████| 8533/8533 [00:12<00:00, 690.56it/s]
100%|██████████| 2428/2428 [00:03<00:00, 687.67it/s]
100%|██████████| 1212/1212 [00:01<00:00, 700.84it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.43238
test_perplexity,621.65186
train_loss,6.46514
train_perplexity,642.35571
val_loss,6.40697
val_perplexity,606.05487


Epoch 1/10: 100%|██████████| 8533/8533 [00:20<00:00, 418.39batch/s]


Train Loss: 6.9078
Train Perplexity: 1000.0416


100%|██████████| 2428/2428 [00:03<00:00, 720.98it/s]


Val Loss: 6.2829
Val Perplexity: 535.3625


Epoch 2/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.18batch/s]


Train Loss: 6.3220
Train Perplexity: 556.6595


100%|██████████| 2428/2428 [00:03<00:00, 725.14it/s]


Val Loss: 6.0493
Val Perplexity: 423.8151


Epoch 3/10: 100%|██████████| 8533/8533 [00:20<00:00, 424.53batch/s]


Train Loss: 6.1358
Train Perplexity: 462.1084


100%|██████████| 2428/2428 [00:03<00:00, 724.09it/s]


Val Loss: 5.8760
Val Perplexity: 356.3880


Epoch 4/10: 100%|██████████| 8533/8533 [00:19<00:00, 426.69batch/s]


Train Loss: 5.9978
Train Perplexity: 402.5378


100%|██████████| 2428/2428 [00:03<00:00, 728.36it/s]


Val Loss: 5.7607
Val Perplexity: 317.5623


Epoch 5/10: 100%|██████████| 8533/8533 [00:20<00:00, 426.23batch/s]


Train Loss: 5.8994
Train Perplexity: 364.8139


100%|██████████| 2428/2428 [00:03<00:00, 721.07it/s]


Val Loss: 5.6787
Val Perplexity: 292.5808


Epoch 6/10: 100%|██████████| 8533/8533 [00:20<00:00, 425.16batch/s]


Train Loss: 5.8244
Train Perplexity: 338.4673


100%|██████████| 2428/2428 [00:03<00:00, 731.62it/s]


Val Loss: 5.6103
Val Perplexity: 273.2163


Epoch 7/10: 100%|██████████| 8533/8533 [00:19<00:00, 427.04batch/s]


Train Loss: 5.7641
Train Perplexity: 318.6382


100%|██████████| 2428/2428 [00:03<00:00, 723.42it/s]


Val Loss: 5.5548
Val Perplexity: 258.4739


Epoch 8/10: 100%|██████████| 8533/8533 [00:19<00:00, 429.38batch/s]


Train Loss: 5.7122
Train Perplexity: 302.5498


100%|██████████| 2428/2428 [00:03<00:00, 721.80it/s]


Val Loss: 5.5212
Val Perplexity: 249.9446


Epoch 9/10: 100%|██████████| 8533/8533 [00:20<00:00, 425.04batch/s]


Train Loss: 5.6679
Train Perplexity: 289.4264


100%|██████████| 2428/2428 [00:03<00:00, 726.27it/s]


Val Loss: 5.4764
Val Perplexity: 238.9846


Epoch 10/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.96batch/s]


Train Loss: 5.6310
Train Perplexity: 278.9517


100%|██████████| 2428/2428 [00:03<00:00, 726.04it/s]


Val Loss: 5.4456
Val Perplexity: 231.7256


100%|██████████| 8533/8533 [00:12<00:00, 689.48it/s]
100%|██████████| 2428/2428 [00:03<00:00, 683.09it/s]
100%|██████████| 1212/1212 [00:01<00:00, 683.20it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.48534
test_perplexity,241.13152
train_loss,5.46819
train_perplexity,237.03116
val_loss,5.44491
val_perplexity,231.57655


Epoch 1/10: 100%|██████████| 8533/8533 [00:29<00:00, 284.45batch/s]


Train Loss: 6.1171
Train Perplexity: 453.5426


100%|██████████| 2428/2428 [00:03<00:00, 681.69it/s]


Val Loss: 5.7194
Val Perplexity: 304.7272


Epoch 2/10: 100%|██████████| 8533/8533 [00:30<00:00, 280.45batch/s]


Train Loss: 5.8275
Train Perplexity: 339.5235


100%|██████████| 2428/2428 [00:03<00:00, 678.48it/s]


Val Loss: 5.6744
Val Perplexity: 291.3253


Epoch 3/10: 100%|██████████| 8533/8533 [00:30<00:00, 280.80batch/s]


Train Loss: 5.7422
Train Perplexity: 311.7535


100%|██████████| 2428/2428 [00:03<00:00, 687.76it/s]


Val Loss: 5.6424
Val Perplexity: 282.1266


Epoch 4/10: 100%|██████████| 8533/8533 [00:30<00:00, 280.86batch/s]


Train Loss: 5.7079
Train Perplexity: 301.2453


100%|██████████| 2428/2428 [00:03<00:00, 685.42it/s]


Val Loss: 5.6474
Val Perplexity: 283.5530


Epoch 5/10: 100%|██████████| 8533/8533 [00:30<00:00, 280.50batch/s]


Train Loss: 5.6981
Train Perplexity: 298.3061


100%|██████████| 2428/2428 [00:03<00:00, 670.80it/s]


Val Loss: 5.6516
Val Perplexity: 284.7563
Early stopping at epoch 5


100%|██████████| 8533/8533 [00:13<00:00, 644.47it/s]
100%|██████████| 2428/2428 [00:03<00:00, 641.25it/s]
100%|██████████| 1212/1212 [00:01<00:00, 641.89it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.6881
test_perplexity,295.33163
train_loss,5.48367
train_perplexity,240.72885
val_loss,5.65148
val_perplexity,284.71204


Epoch 1/10: 100%|██████████| 8533/8533 [00:21<00:00, 402.87batch/s]


Train Loss: 8.6052
Train Perplexity: 5459.9297


100%|██████████| 2428/2428 [00:03<00:00, 685.79it/s]


Val Loss: 7.1998
Val Perplexity: 1339.1150


Epoch 2/10: 100%|██████████| 8533/8533 [00:20<00:00, 407.12batch/s]


Train Loss: 7.1283
Train Perplexity: 1246.7933


100%|██████████| 2428/2428 [00:03<00:00, 674.36it/s]


Val Loss: 6.7944
Val Perplexity: 892.8051


Epoch 3/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.06batch/s]


Train Loss: 6.8620
Train Perplexity: 955.2524


100%|██████████| 2428/2428 [00:03<00:00, 685.71it/s]


Val Loss: 6.6207
Val Perplexity: 750.4519


Epoch 4/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.97batch/s]


Train Loss: 6.7160
Train Perplexity: 825.5353


100%|██████████| 2428/2428 [00:03<00:00, 685.20it/s]


Val Loss: 6.5064
Val Perplexity: 669.4305


Epoch 5/10: 100%|██████████| 8533/8533 [00:21<00:00, 403.21batch/s]


Train Loss: 6.6177
Train Perplexity: 748.2162


100%|██████████| 2428/2428 [00:03<00:00, 677.31it/s]


Val Loss: 6.4271
Val Perplexity: 618.3528


Epoch 6/10: 100%|██████████| 8533/8533 [00:20<00:00, 406.41batch/s]


Train Loss: 6.5481
Train Perplexity: 697.9175


100%|██████████| 2428/2428 [00:03<00:00, 672.19it/s]


Val Loss: 6.3704
Val Perplexity: 584.2714


Epoch 7/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.49batch/s]


Train Loss: 6.4950
Train Perplexity: 661.8307


100%|██████████| 2428/2428 [00:03<00:00, 681.42it/s]


Val Loss: 6.3245
Val Perplexity: 558.0928


Epoch 8/10: 100%|██████████| 8533/8533 [00:21<00:00, 402.15batch/s]


Train Loss: 6.4520
Train Perplexity: 633.9730


100%|██████████| 2428/2428 [00:03<00:00, 683.93it/s]


Val Loss: 6.2870
Val Perplexity: 537.5266


Epoch 9/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.51batch/s]


Train Loss: 6.4150
Train Perplexity: 610.9562


100%|██████████| 2428/2428 [00:03<00:00, 685.89it/s]


Val Loss: 6.2523
Val Perplexity: 519.2194


Epoch 10/10: 100%|██████████| 8533/8533 [00:21<00:00, 405.21batch/s]


Train Loss: 6.3817
Train Perplexity: 590.9238


100%|██████████| 2428/2428 [00:03<00:00, 688.64it/s]


Val Loss: 6.2218
Val Perplexity: 503.6326


100%|██████████| 8533/8533 [00:13<00:00, 647.48it/s]
100%|██████████| 2428/2428 [00:03<00:00, 643.54it/s]
100%|██████████| 1212/1212 [00:01<00:00, 639.98it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.24518
test_perplexity,515.52283
train_loss,6.2815
train_perplexity,534.59064
val_loss,6.22141
val_perplexity,503.41028


Epoch 1/10: 100%|██████████| 8533/8533 [00:30<00:00, 281.32batch/s]


Train Loss: 6.6424
Train Perplexity: 766.9230


100%|██████████| 2428/2428 [00:03<00:00, 683.85it/s]


Val Loss: 6.4228
Val Perplexity: 615.6940


Epoch 2/10: 100%|██████████| 8533/8533 [00:30<00:00, 281.44batch/s]


Train Loss: 6.5240
Train Perplexity: 681.2937


100%|██████████| 2428/2428 [00:03<00:00, 686.36it/s]


Val Loss: 6.4158
Val Perplexity: 611.4166


Epoch 3/10: 100%|██████████| 8533/8533 [00:30<00:00, 282.88batch/s]


Train Loss: 6.5129
Train Perplexity: 673.7868


100%|██████████| 2428/2428 [00:03<00:00, 682.73it/s]


Val Loss: 6.4148
Val Perplexity: 610.8024


Epoch 4/10: 100%|██████████| 8533/8533 [00:30<00:00, 282.89batch/s]


Train Loss: 6.5104
Train Perplexity: 672.0646


100%|██████████| 2428/2428 [00:03<00:00, 671.82it/s]


Val Loss: 6.4124
Val Perplexity: 609.3255


Epoch 5/10: 100%|██████████| 8533/8533 [00:30<00:00, 281.83batch/s]


Train Loss: 6.5095
Train Perplexity: 671.5104


100%|██████████| 2428/2428 [00:03<00:00, 678.00it/s]


Val Loss: 6.4133
Val Perplexity: 609.8900


Epoch 6/10: 100%|██████████| 8533/8533 [00:30<00:00, 280.70batch/s]


Train Loss: 6.5104
Train Perplexity: 672.1188


100%|██████████| 2428/2428 [00:03<00:00, 687.46it/s]


Val Loss: 6.4168
Val Perplexity: 612.0190
Early stopping at epoch 6


100%|██████████| 8533/8533 [00:13<00:00, 642.04it/s]
100%|██████████| 2428/2428 [00:03<00:00, 645.40it/s]
100%|██████████| 1212/1212 [00:01<00:00, 642.28it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.44331
test_perplexity,628.48291
train_loss,6.48713
train_perplexity,656.63843
val_loss,6.41688
val_perplexity,612.08789


Epoch 1/10: 100%|██████████| 8533/8533 [00:21<00:00, 405.22batch/s]


Train Loss: 6.8379
Train Perplexity: 932.5400


100%|██████████| 2428/2428 [00:03<00:00, 691.66it/s]


Val Loss: 6.2330
Val Perplexity: 509.2758


Epoch 2/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.06batch/s]


Train Loss: 6.2520
Train Perplexity: 519.0639


100%|██████████| 2428/2428 [00:03<00:00, 684.19it/s]


Val Loss: 5.9905
Val Perplexity: 399.6279


Epoch 3/10: 100%|██████████| 8533/8533 [00:21<00:00, 402.09batch/s]


Train Loss: 6.0554
Train Perplexity: 426.4116


100%|██████████| 2428/2428 [00:03<00:00, 681.64it/s]


Val Loss: 5.8303
Val Perplexity: 340.4470


Epoch 4/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.53batch/s]


Train Loss: 5.9197
Train Perplexity: 372.3155


100%|██████████| 2428/2428 [00:03<00:00, 682.58it/s]


Val Loss: 5.7112
Val Perplexity: 302.2373


Epoch 5/10: 100%|██████████| 8533/8533 [00:21<00:00, 401.44batch/s]


Train Loss: 5.8228
Train Perplexity: 337.9021


100%|██████████| 2428/2428 [00:03<00:00, 677.67it/s]


Val Loss: 5.6302
Val Perplexity: 278.7077


Epoch 6/10: 100%|██████████| 8533/8533 [00:21<00:00, 405.70batch/s]


Train Loss: 5.7474
Train Perplexity: 313.3824


100%|██████████| 2428/2428 [00:03<00:00, 678.11it/s]


Val Loss: 5.5602
Val Perplexity: 259.8817


Epoch 7/10: 100%|██████████| 8533/8533 [00:21<00:00, 403.74batch/s]


Train Loss: 5.6866
Train Perplexity: 294.8757


100%|██████████| 2428/2428 [00:03<00:00, 683.78it/s]


Val Loss: 5.5104
Val Perplexity: 247.2541


Epoch 8/10: 100%|██████████| 8533/8533 [00:21<00:00, 403.90batch/s]


Train Loss: 5.6341
Train Perplexity: 279.7975


100%|██████████| 2428/2428 [00:03<00:00, 676.63it/s]


Val Loss: 5.4619
Val Perplexity: 235.5523


Epoch 9/10: 100%|██████████| 8533/8533 [00:21<00:00, 405.58batch/s]


Train Loss: 5.5904
Train Perplexity: 267.8383


100%|██████████| 2428/2428 [00:03<00:00, 683.29it/s]


Val Loss: 5.4265
Val Perplexity: 227.3617


Epoch 10/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.94batch/s]


Train Loss: 5.5522
Train Perplexity: 257.8049


100%|██████████| 2428/2428 [00:03<00:00, 668.82it/s]


Val Loss: 5.3953
Val Perplexity: 220.3663


100%|██████████| 8533/8533 [00:13<00:00, 642.54it/s]
100%|██████████| 2428/2428 [00:03<00:00, 636.13it/s]
100%|██████████| 1212/1212 [00:01<00:00, 645.92it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.43446
test_perplexity,229.16878
train_loss,5.40272
train_perplexity,222.00935
val_loss,5.39506
val_perplexity,220.31528


Epoch 1/10: 100%|██████████| 8533/8533 [00:21<00:00, 396.31batch/s]


Train Loss: 6.3970
Train Perplexity: 600.0298


100%|██████████| 2428/2428 [00:03<00:00, 745.54it/s]


Val Loss: 6.0265
Val Perplexity: 414.2443


Epoch 2/10: 100%|██████████| 8533/8533 [00:21<00:00, 392.27batch/s]


Train Loss: 6.1621
Train Perplexity: 474.4361


100%|██████████| 2428/2428 [00:03<00:00, 763.79it/s]


Val Loss: 5.8942
Val Perplexity: 362.9248


Epoch 3/10: 100%|██████████| 8533/8533 [00:21<00:00, 389.05batch/s]


Train Loss: 6.1054
Train Perplexity: 448.2553


100%|██████████| 2428/2428 [00:03<00:00, 754.40it/s]


Val Loss: 5.8874
Val Perplexity: 360.4549


Epoch 4/10: 100%|██████████| 8533/8533 [00:21<00:00, 392.64batch/s]


Train Loss: 6.0850
Train Perplexity: 439.2075


100%|██████████| 2428/2428 [00:03<00:00, 766.44it/s]


Val Loss: 5.8779
Val Perplexity: 357.0411


Epoch 5/10: 100%|██████████| 8533/8533 [00:21<00:00, 391.00batch/s]


Train Loss: 6.0789
Train Perplexity: 436.5468


100%|██████████| 2428/2428 [00:03<00:00, 761.86it/s]


Val Loss: 5.8885
Val Perplexity: 360.8747


Epoch 6/10: 100%|██████████| 8533/8533 [00:21<00:00, 391.00batch/s]


Train Loss: 6.0815
Train Perplexity: 437.7020


100%|██████████| 2428/2428 [00:03<00:00, 735.25it/s]


Val Loss: 5.8867
Val Perplexity: 360.2317
Early stopping at epoch 6


100%|██████████| 8533/8533 [00:11<00:00, 730.24it/s]
100%|██████████| 2428/2428 [00:03<00:00, 729.28it/s]
100%|██████████| 1212/1212 [00:01<00:00, 735.94it/s]


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.9213
test_perplexity,372.89597
train_loss,5.87762
train_perplexity,356.95773
val_loss,5.8863
val_perplexity,360.07162


Epoch 1/10: 100%|██████████| 8533/8533 [00:20<00:00, 420.52batch/s]


Train Loss: 9.0541
Train Perplexity: 8553.3242


100%|██████████| 2428/2428 [00:03<00:00, 713.69it/s]


Val Loss: 7.6898
Val Perplexity: 2186.0234


Epoch 2/10: 100%|██████████| 8533/8533 [00:20<00:00, 420.91batch/s]


Train Loss: 7.6999
Train Perplexity: 2208.0464


100%|██████████| 2428/2428 [00:03<00:00, 730.60it/s]


Val Loss: 7.0940
Val Perplexity: 1204.7760


Epoch 3/10: 100%|██████████| 8533/8533 [00:20<00:00, 420.12batch/s]


Train Loss: 7.3135
Train Perplexity: 1500.3585


100%|██████████| 2428/2428 [00:03<00:00, 732.35it/s]


Val Loss: 6.8628
Val Perplexity: 956.0700


Epoch 4/10: 100%|██████████| 8533/8533 [00:20<00:00, 415.49batch/s]


Train Loss: 7.1216
Train Perplexity: 1238.4636


100%|██████████| 2428/2428 [00:03<00:00, 731.41it/s]


Val Loss: 6.7369
Val Perplexity: 842.9028


Epoch 5/10: 100%|██████████| 8533/8533 [00:19<00:00, 427.30batch/s]


Train Loss: 7.0014
Train Perplexity: 1098.1586


100%|██████████| 2428/2428 [00:03<00:00, 712.56it/s]


Val Loss: 6.6472
Val Perplexity: 770.5917


Epoch 6/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.17batch/s]


Train Loss: 6.9140
Train Perplexity: 1006.2739


100%|██████████| 2428/2428 [00:03<00:00, 733.01it/s]


Val Loss: 6.5740
Val Perplexity: 716.2351


Epoch 7/10: 100%|██████████| 8533/8533 [00:20<00:00, 420.86batch/s]


Train Loss: 6.8481
Train Perplexity: 942.0804


100%|██████████| 2428/2428 [00:03<00:00, 731.20it/s]


Val Loss: 6.5188
Val Perplexity: 677.7806


Epoch 8/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.57batch/s]


Train Loss: 6.7932
Train Perplexity: 891.7418


100%|██████████| 2428/2428 [00:03<00:00, 738.63it/s]


Val Loss: 6.4757
Val Perplexity: 649.2025


Epoch 9/10: 100%|██████████| 8533/8533 [00:20<00:00, 423.61batch/s]


Train Loss: 6.7485
Train Perplexity: 852.7993


100%|██████████| 2428/2428 [00:03<00:00, 719.48it/s]


Val Loss: 6.4370
Val Perplexity: 624.5115


Epoch 10/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.27batch/s]


Train Loss: 6.7092
Train Perplexity: 819.9382


100%|██████████| 2428/2428 [00:03<00:00, 735.28it/s]


Val Loss: 6.4032
Val Perplexity: 603.7974


100%|██████████| 8533/8533 [00:11<00:00, 722.79it/s]
100%|██████████| 2428/2428 [00:03<00:00, 732.95it/s]
100%|██████████| 1212/1212 [00:01<00:00, 736.59it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.42913
test_perplexity,619.63702
train_loss,6.46244
train_perplexity,640.62073
val_loss,6.40286
val_perplexity,603.56628


Epoch 1/10: 100%|██████████| 8533/8533 [00:21<00:00, 391.37batch/s]


Train Loss: 6.6293
Train Perplexity: 756.9836


100%|██████████| 2428/2428 [00:03<00:00, 767.32it/s]


Val Loss: 6.4176
Val Perplexity: 612.5331


Epoch 2/10: 100%|██████████| 8533/8533 [00:21<00:00, 391.02batch/s]


Train Loss: 6.5185
Train Perplexity: 677.5680


100%|██████████| 2428/2428 [00:03<00:00, 775.22it/s]


Val Loss: 6.4096
Val Perplexity: 607.6617


Epoch 3/10: 100%|██████████| 8533/8533 [00:21<00:00, 392.19batch/s]


Train Loss: 6.5125
Train Perplexity: 673.5205


100%|██████████| 2428/2428 [00:03<00:00, 775.65it/s]


Val Loss: 6.4069
Val Perplexity: 606.0208


Epoch 4/10: 100%|██████████| 8533/8533 [00:21<00:00, 393.69batch/s]


Train Loss: 6.5062
Train Perplexity: 669.2722


100%|██████████| 2428/2428 [00:03<00:00, 758.09it/s]


Val Loss: 6.4087
Val Perplexity: 607.1146


Epoch 5/10: 100%|██████████| 8533/8533 [00:21<00:00, 387.91batch/s]


Train Loss: 6.5058
Train Perplexity: 668.9866


100%|██████████| 2428/2428 [00:03<00:00, 777.88it/s]


Val Loss: 6.4111
Val Perplexity: 608.5450
Early stopping at epoch 5


100%|██████████| 8533/8533 [00:11<00:00, 729.06it/s]
100%|██████████| 2428/2428 [00:03<00:00, 741.19it/s]
100%|██████████| 1212/1212 [00:01<00:00, 732.01it/s]


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.42916
test_perplexity,619.65033
train_loss,6.47628
train_perplexity,649.54901
val_loss,6.41064
val_perplexity,608.28125


Epoch 1/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.80batch/s]


Train Loss: 7.2790
Train Perplexity: 1449.4974


100%|██████████| 2428/2428 [00:03<00:00, 736.60it/s]


Val Loss: 6.4420
Val Perplexity: 627.6548


Epoch 2/10: 100%|██████████| 8533/8533 [00:20<00:00, 419.76batch/s]


Train Loss: 6.6129
Train Perplexity: 744.6537


100%|██████████| 2428/2428 [00:03<00:00, 728.38it/s]


Val Loss: 6.2519
Val Perplexity: 518.9986


Epoch 3/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.88batch/s]


Train Loss: 6.4571
Train Perplexity: 637.2396


100%|██████████| 2428/2428 [00:03<00:00, 689.62it/s]


Val Loss: 6.1345
Val Perplexity: 461.5211


Epoch 4/10: 100%|██████████| 8533/8533 [00:20<00:00, 407.31batch/s]


Train Loss: 6.3502
Train Perplexity: 572.6024


100%|██████████| 2428/2428 [00:03<00:00, 729.42it/s]


Val Loss: 6.0364
Val Perplexity: 418.3859


Epoch 5/10: 100%|██████████| 8533/8533 [00:20<00:00, 425.95batch/s]


Train Loss: 6.2590
Train Perplexity: 522.6949


100%|██████████| 2428/2428 [00:03<00:00, 728.83it/s]


Val Loss: 5.9391
Val Perplexity: 379.5885


Epoch 6/10: 100%|██████████| 8533/8533 [00:20<00:00, 423.67batch/s]


Train Loss: 6.1813
Train Perplexity: 483.5977


100%|██████████| 2428/2428 [00:03<00:00, 738.00it/s]


Val Loss: 5.8732
Val Perplexity: 355.3718


Epoch 7/10: 100%|██████████| 8533/8533 [00:20<00:00, 423.37batch/s]


Train Loss: 6.1201
Train Perplexity: 454.8902


100%|██████████| 2428/2428 [00:03<00:00, 733.65it/s]


Val Loss: 5.8118
Val Perplexity: 334.2345


Epoch 8/10: 100%|██████████| 8533/8533 [00:20<00:00, 424.66batch/s]


Train Loss: 6.0690
Train Perplexity: 432.2638


100%|██████████| 2428/2428 [00:03<00:00, 727.70it/s]


Val Loss: 5.7673
Val Perplexity: 319.6791


Epoch 9/10: 100%|██████████| 8533/8533 [00:19<00:00, 427.04batch/s]


Train Loss: 6.0256
Train Perplexity: 413.8964


100%|██████████| 2428/2428 [00:03<00:00, 732.95it/s]


Val Loss: 5.7234
Val Perplexity: 305.9539


Epoch 10/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.54batch/s]


Train Loss: 5.9887
Train Perplexity: 398.8768


100%|██████████| 2428/2428 [00:03<00:00, 740.52it/s]


Val Loss: 5.6901
Val Perplexity: 295.9165


100%|██████████| 8533/8533 [00:11<00:00, 725.72it/s]
100%|██████████| 2428/2428 [00:03<00:00, 720.39it/s]
100%|██████████| 1212/1212 [00:01<00:00, 720.04it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.72551
test_perplexity,306.59109
train_loss,5.73333
train_perplexity,308.99527
val_loss,5.69007
val_perplexity,295.91409


Epoch 1/10: 100%|██████████| 8533/8533 [00:23<00:00, 362.99batch/s]


Train Loss: 6.3756
Train Perplexity: 587.3385


100%|██████████| 2428/2428 [00:03<00:00, 740.82it/s]


Val Loss: 6.0104
Val Perplexity: 407.6377


Epoch 2/10: 100%|██████████| 8533/8533 [00:23<00:00, 361.60batch/s]


Train Loss: 6.1636
Train Perplexity: 475.1160


100%|██████████| 2428/2428 [00:03<00:00, 733.05it/s]


Val Loss: 5.9303
Val Perplexity: 376.2542


Epoch 3/10: 100%|██████████| 8533/8533 [00:23<00:00, 362.58batch/s]


Train Loss: 6.1127
Train Perplexity: 451.5382


100%|██████████| 2428/2428 [00:03<00:00, 746.87it/s]


Val Loss: 5.9216
Val Perplexity: 373.0098


Epoch 4/10: 100%|██████████| 8533/8533 [00:23<00:00, 361.04batch/s]


Train Loss: 6.0973
Train Perplexity: 444.6666


100%|██████████| 2428/2428 [00:03<00:00, 744.34it/s]


Val Loss: 5.9167
Val Perplexity: 371.1676


Epoch 5/10: 100%|██████████| 8533/8533 [00:23<00:00, 360.65batch/s]


Train Loss: 6.0910
Train Perplexity: 441.8578


100%|██████████| 2428/2428 [00:03<00:00, 737.39it/s]


Val Loss: 5.9404
Val Perplexity: 380.1058


Epoch 6/10: 100%|██████████| 8533/8533 [00:23<00:00, 360.75batch/s]


Train Loss: 6.0949
Train Perplexity: 443.5925


100%|██████████| 2428/2428 [00:03<00:00, 738.76it/s]


Val Loss: 5.9183
Val Perplexity: 371.7723
Early stopping at epoch 6


100%|██████████| 8533/8533 [00:12<00:00, 683.96it/s]
100%|██████████| 2428/2428 [00:03<00:00, 696.63it/s]
100%|██████████| 1212/1212 [00:01<00:00, 694.93it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.95408
test_perplexity,385.32101
train_loss,5.91054
train_perplexity,368.90411
val_loss,5.918
val_perplexity,371.66772


Epoch 1/10: 100%|██████████| 8533/8533 [00:19<00:00, 426.82batch/s]


Train Loss: 8.8381
Train Perplexity: 6891.9805


100%|██████████| 2428/2428 [00:03<00:00, 727.08it/s]


Val Loss: 7.3854
Val Perplexity: 1612.2638


Epoch 2/10: 100%|██████████| 8533/8533 [00:19<00:00, 426.97batch/s]


Train Loss: 7.4093
Train Perplexity: 1651.2312


100%|██████████| 2428/2428 [00:03<00:00, 731.31it/s]


Val Loss: 6.9309
Val Perplexity: 1023.4040


Epoch 3/10: 100%|██████████| 8533/8533 [00:20<00:00, 425.49batch/s]


Train Loss: 7.0826
Train Perplexity: 1191.0625


100%|██████████| 2428/2428 [00:03<00:00, 728.21it/s]


Val Loss: 6.7317
Val Perplexity: 838.5472


Epoch 4/10: 100%|██████████| 8533/8533 [00:20<00:00, 425.94batch/s]


Train Loss: 6.9116
Train Perplexity: 1003.8091


100%|██████████| 2428/2428 [00:03<00:00, 714.49it/s]


Val Loss: 6.6084
Val Perplexity: 741.2645


Epoch 5/10: 100%|██████████| 8533/8533 [00:20<00:00, 425.83batch/s]


Train Loss: 6.8028
Train Perplexity: 900.3306


100%|██████████| 2428/2428 [00:03<00:00, 726.27it/s]


Val Loss: 6.5242
Val Perplexity: 681.4572


Epoch 6/10: 100%|██████████| 8533/8533 [00:20<00:00, 426.48batch/s]


Train Loss: 6.7275
Train Perplexity: 835.0928


100%|██████████| 2428/2428 [00:03<00:00, 729.84it/s]


Val Loss: 6.4608
Val Perplexity: 639.5851


Epoch 7/10: 100%|██████████| 8533/8533 [00:20<00:00, 424.62batch/s]


Train Loss: 6.6696
Train Perplexity: 788.0634


100%|██████████| 2428/2428 [00:03<00:00, 727.05it/s]


Val Loss: 6.4124
Val Perplexity: 609.3328


Epoch 8/10: 100%|██████████| 8533/8533 [00:19<00:00, 427.44batch/s]


Train Loss: 6.6244
Train Perplexity: 753.2892


100%|██████████| 2428/2428 [00:03<00:00, 730.86it/s]


Val Loss: 6.3760
Val Perplexity: 587.5606


Epoch 9/10: 100%|██████████| 8533/8533 [00:19<00:00, 428.52batch/s]


Train Loss: 6.5865
Train Perplexity: 725.2655


100%|██████████| 2428/2428 [00:03<00:00, 721.42it/s]


Val Loss: 6.3395
Val Perplexity: 566.5002


Epoch 10/10: 100%|██████████| 8533/8533 [00:20<00:00, 425.75batch/s]


Train Loss: 6.5531
Train Perplexity: 701.4307


100%|██████████| 2428/2428 [00:03<00:00, 725.81it/s]


Val Loss: 6.3105
Val Perplexity: 550.3372


100%|██████████| 8533/8533 [00:12<00:00, 688.58it/s]
100%|██████████| 2428/2428 [00:03<00:00, 693.01it/s]
100%|██████████| 1212/1212 [00:01<00:00, 697.56it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.33543
test_perplexity,564.20935
train_loss,6.37214
train_perplexity,585.30847
val_loss,6.31069
val_perplexity,550.42218


Epoch 1/10: 100%|██████████| 8533/8533 [00:23<00:00, 361.45batch/s]


Train Loss: 6.6509
Train Perplexity: 773.5181


100%|██████████| 2428/2428 [00:03<00:00, 737.15it/s]


Val Loss: 6.4196
Val Perplexity: 613.7819


Epoch 2/10: 100%|██████████| 8533/8533 [00:23<00:00, 359.34batch/s]


Train Loss: 6.5216
Train Perplexity: 679.6697


100%|██████████| 2428/2428 [00:03<00:00, 746.13it/s]


Val Loss: 6.4134
Val Perplexity: 609.9458


Epoch 3/10: 100%|██████████| 8533/8533 [00:23<00:00, 361.94batch/s]


Train Loss: 6.5183
Train Perplexity: 677.4252


100%|██████████| 2428/2428 [00:03<00:00, 741.10it/s]


Val Loss: 6.4147
Val Perplexity: 610.7339


Epoch 4/10: 100%|██████████| 8533/8533 [00:23<00:00, 361.78batch/s]


Train Loss: 6.5108
Train Perplexity: 672.3583


100%|██████████| 2428/2428 [00:03<00:00, 733.79it/s]


Val Loss: 6.4131
Val Perplexity: 609.8097


Epoch 5/10: 100%|██████████| 8533/8533 [00:23<00:00, 358.70batch/s]


Train Loss: 6.5095
Train Perplexity: 671.5140


100%|██████████| 2428/2428 [00:03<00:00, 741.20it/s]


Val Loss: 6.4115
Val Perplexity: 608.8312


Epoch 6/10: 100%|██████████| 8533/8533 [00:23<00:00, 360.85batch/s]


Train Loss: 6.5095
Train Perplexity: 671.5223


100%|██████████| 2428/2428 [00:03<00:00, 714.54it/s]


Val Loss: 6.4135
Val Perplexity: 610.0118


Epoch 7/10: 100%|██████████| 8533/8533 [00:23<00:00, 360.91batch/s]


Train Loss: 6.5096
Train Perplexity: 671.5787


100%|██████████| 2428/2428 [00:03<00:00, 736.23it/s]


Val Loss: 6.4160
Val Perplexity: 611.5301
Early stopping at epoch 7


100%|██████████| 8533/8533 [00:12<00:00, 688.02it/s]
100%|██████████| 2428/2428 [00:03<00:00, 692.57it/s]
100%|██████████| 1212/1212 [00:01<00:00, 693.08it/s]


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.4411
test_perplexity,627.09302
train_loss,6.48582
train_perplexity,655.77386
val_loss,6.41539
val_perplexity,611.18201


Epoch 1/10: 100%|██████████| 8533/8533 [00:20<00:00, 423.40batch/s]


Train Loss: 7.0436
Train Perplexity: 1145.5134


100%|██████████| 2428/2428 [00:03<00:00, 727.91it/s]


Val Loss: 6.3213
Val Perplexity: 556.2910


Epoch 2/10: 100%|██████████| 8533/8533 [00:20<00:00, 421.37batch/s]


Train Loss: 6.4373
Train Perplexity: 624.7141


100%|██████████| 2428/2428 [00:03<00:00, 726.70it/s]


Val Loss: 6.1127
Val Perplexity: 451.5636


Epoch 3/10: 100%|██████████| 8533/8533 [00:20<00:00, 420.64batch/s]


Train Loss: 6.2707
Train Perplexity: 528.8333


100%|██████████| 2428/2428 [00:03<00:00, 720.58it/s]


Val Loss: 5.9626
Val Perplexity: 388.6026


Epoch 4/10: 100%|██████████| 8533/8533 [00:20<00:00, 422.70batch/s]


Train Loss: 6.1463
Train Perplexity: 467.0065


100%|██████████| 2428/2428 [00:03<00:00, 729.03it/s]


Val Loss: 5.8590
Val Perplexity: 350.3759


Epoch 5/10: 100%|██████████| 8533/8533 [00:20<00:00, 426.35batch/s]


Train Loss: 6.0553
Train Perplexity: 426.3728


100%|██████████| 2428/2428 [00:03<00:00, 718.45it/s]


Val Loss: 5.7705
Val Perplexity: 320.6842


Epoch 6/10: 100%|██████████| 8533/8533 [00:20<00:00, 424.17batch/s]


Train Loss: 5.9831
Train Perplexity: 396.6800


100%|██████████| 2428/2428 [00:03<00:00, 718.63it/s]


Val Loss: 5.7051
Val Perplexity: 300.4017


Epoch 7/10: 100%|██████████| 8533/8533 [00:20<00:00, 420.18batch/s]


Train Loss: 5.9227
Train Perplexity: 373.4077


100%|██████████| 2428/2428 [00:03<00:00, 720.87it/s]


Val Loss: 5.6517
Val Perplexity: 284.7634


Epoch 8/10: 100%|██████████| 8533/8533 [00:20<00:00, 419.54batch/s]


Train Loss: 5.8751
Train Perplexity: 356.0603


100%|██████████| 2428/2428 [00:03<00:00, 710.93it/s]


Val Loss: 5.6086
Val Perplexity: 272.7733


Epoch 9/10: 100%|██████████| 8533/8533 [00:20<00:00, 421.21batch/s]


Train Loss: 5.8323
Train Perplexity: 341.1545


100%|██████████| 2428/2428 [00:03<00:00, 706.50it/s]


Val Loss: 5.5730
Val Perplexity: 263.2292


Epoch 10/10: 100%|██████████| 8533/8533 [00:20<00:00, 420.24batch/s]


Train Loss: 5.7948
Train Perplexity: 328.6003


100%|██████████| 2428/2428 [00:03<00:00, 728.89it/s]


Val Loss: 5.5402
Val Perplexity: 254.7328


100%|██████████| 8533/8533 [00:12<00:00, 685.49it/s]
100%|██████████| 2428/2428 [00:03<00:00, 684.78it/s]
100%|██████████| 1212/1212 [00:01<00:00, 690.23it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.57973
test_perplexity,265.00006
train_loss,5.57182
train_perplexity,262.91299
val_loss,5.53989
val_perplexity,254.64958


Epoch 1/10: 100%|██████████| 8533/8533 [00:29<00:00, 284.54batch/s]


Train Loss: 6.3760
Train Perplexity: 587.5623


100%|██████████| 2428/2428 [00:03<00:00, 675.38it/s]


Val Loss: 5.9905
Val Perplexity: 399.6004


Epoch 2/10: 100%|██████████| 8533/8533 [00:30<00:00, 281.88batch/s]


Train Loss: 6.1859
Train Perplexity: 485.8712


100%|██████████| 2428/2428 [00:03<00:00, 671.88it/s]


Val Loss: 5.9931
Val Perplexity: 400.6412


Epoch 3/10: 100%|██████████| 8533/8533 [00:30<00:00, 280.49batch/s]


Train Loss: 6.1308
Train Perplexity: 459.7902


100%|██████████| 2428/2428 [00:03<00:00, 688.38it/s]


Val Loss: 5.9258
Val Perplexity: 374.5604


Epoch 4/10: 100%|██████████| 8533/8533 [00:30<00:00, 280.54batch/s]


Train Loss: 6.1211
Train Perplexity: 455.3744


100%|██████████| 2428/2428 [00:03<00:00, 676.49it/s]


Val Loss: 5.9451
Val Perplexity: 381.8800


Epoch 5/10: 100%|██████████| 8533/8533 [00:30<00:00, 280.80batch/s]


Train Loss: 6.1187
Train Perplexity: 454.2943


100%|██████████| 2428/2428 [00:03<00:00, 689.15it/s]


Val Loss: 5.9195
Val Perplexity: 372.2225


Epoch 6/10: 100%|██████████| 8533/8533 [00:30<00:00, 279.80batch/s]


Train Loss: 6.1215
Train Perplexity: 455.5575


100%|██████████| 2428/2428 [00:03<00:00, 686.54it/s]


Val Loss: 5.9413
Val Perplexity: 380.4215


Epoch 7/10: 100%|██████████| 8533/8533 [00:30<00:00, 279.47batch/s]


Train Loss: 6.1219
Train Perplexity: 455.7515


100%|██████████| 2428/2428 [00:03<00:00, 683.76it/s]


Val Loss: 5.8956
Val Perplexity: 363.4400


Epoch 8/10: 100%|██████████| 8533/8533 [00:30<00:00, 278.48batch/s]


Train Loss: 6.1204
Train Perplexity: 455.0360


100%|██████████| 2428/2428 [00:03<00:00, 681.16it/s]


Val Loss: 5.9104
Val Perplexity: 368.8543


Epoch 9/10: 100%|██████████| 8533/8533 [00:30<00:00, 279.61batch/s]


Train Loss: 6.1185
Train Perplexity: 454.2021


100%|██████████| 2428/2428 [00:03<00:00, 677.91it/s]


Val Loss: 5.9007
Val Perplexity: 365.3029
Early stopping at epoch 9


100%|██████████| 8533/8533 [00:13<00:00, 645.32it/s]
100%|██████████| 2428/2428 [00:03<00:00, 643.09it/s]
100%|██████████| 1212/1212 [00:01<00:00, 642.00it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.94016
test_perplexity,379.99527
train_loss,5.88808
train_perplexity,360.7121
val_loss,5.90096
val_perplexity,365.38702


Epoch 1/10: 100%|██████████| 8533/8533 [00:21<00:00, 402.15batch/s]


Train Loss: 8.7031
Train Perplexity: 6021.3218


100%|██████████| 2428/2428 [00:03<00:00, 677.48it/s]


Val Loss: 7.2312
Val Perplexity: 1381.8846


Epoch 2/10: 100%|██████████| 8533/8533 [00:21<00:00, 403.12batch/s]


Train Loss: 7.2598
Train Perplexity: 1421.9280


100%|██████████| 2428/2428 [00:03<00:00, 686.36it/s]


Val Loss: 6.8298
Val Perplexity: 925.0099


Epoch 3/10: 100%|██████████| 8533/8533 [00:21<00:00, 402.03batch/s]


Train Loss: 6.9669
Train Perplexity: 1060.9023


100%|██████████| 2428/2428 [00:03<00:00, 684.32it/s]


Val Loss: 6.6507
Val Perplexity: 773.3410


Epoch 4/10: 100%|██████████| 8533/8533 [00:21<00:00, 403.80batch/s]


Train Loss: 6.8143
Train Perplexity: 910.7341


100%|██████████| 2428/2428 [00:03<00:00, 668.13it/s]


Val Loss: 6.5401
Val Perplexity: 692.3420


Epoch 5/10: 100%|██████████| 8533/8533 [00:21<00:00, 405.39batch/s]


Train Loss: 6.7107
Train Perplexity: 821.1558


100%|██████████| 2428/2428 [00:03<00:00, 681.04it/s]


Val Loss: 6.4582
Val Perplexity: 637.9114


Epoch 6/10: 100%|██████████| 8533/8533 [00:21<00:00, 402.97batch/s]


Train Loss: 6.6415
Train Perplexity: 766.2698


100%|██████████| 2428/2428 [00:03<00:00, 682.47it/s]


Val Loss: 6.4029
Val Perplexity: 603.5933


Epoch 7/10: 100%|██████████| 8533/8533 [00:21<00:00, 402.85batch/s]


Train Loss: 6.5879
Train Perplexity: 726.2456


100%|██████████| 2428/2428 [00:03<00:00, 685.68it/s]


Val Loss: 6.3585
Val Perplexity: 577.4028


Epoch 8/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.29batch/s]


Train Loss: 6.5439
Train Perplexity: 695.0153


100%|██████████| 2428/2428 [00:03<00:00, 683.38it/s]


Val Loss: 6.3211
Val Perplexity: 556.2058


Epoch 9/10: 100%|██████████| 8533/8533 [00:21<00:00, 399.70batch/s]


Train Loss: 6.5084
Train Perplexity: 670.7526


100%|██████████| 2428/2428 [00:03<00:00, 668.38it/s]


Val Loss: 6.2886
Val Perplexity: 538.3846


Epoch 10/10: 100%|██████████| 8533/8533 [00:21<00:00, 405.10batch/s]


Train Loss: 6.4780
Train Perplexity: 650.6737


100%|██████████| 2428/2428 [00:03<00:00, 679.18it/s]


Val Loss: 6.2607
Val Perplexity: 523.5665


100%|██████████| 8533/8533 [00:13<00:00, 641.23it/s]
100%|██████████| 2428/2428 [00:03<00:00, 641.72it/s]
100%|██████████| 1212/1212 [00:01<00:00, 633.28it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.28581
test_perplexity,536.90106
train_loss,6.32223
train_perplexity,556.81213
val_loss,6.26065
val_perplexity,523.5567


Epoch 1/10: 100%|██████████| 8533/8533 [00:30<00:00, 281.14batch/s]


Train Loss: 6.6640
Train Perplexity: 783.6645


100%|██████████| 2428/2428 [00:03<00:00, 682.90it/s]


Val Loss: 6.4215
Val Perplexity: 614.8959


Epoch 2/10: 100%|██████████| 8533/8533 [00:30<00:00, 282.81batch/s]


Train Loss: 6.5201
Train Perplexity: 678.6661


100%|██████████| 2428/2428 [00:03<00:00, 683.81it/s]


Val Loss: 6.4160
Val Perplexity: 611.5377


Epoch 3/10: 100%|██████████| 8533/8533 [00:30<00:00, 282.95batch/s]


Train Loss: 6.5118
Train Perplexity: 673.0078


100%|██████████| 2428/2428 [00:03<00:00, 689.11it/s]


Val Loss: 6.4157
Val Perplexity: 611.3630


Epoch 4/10: 100%|██████████| 8533/8533 [00:30<00:00, 283.94batch/s]


Train Loss: 6.5099
Train Perplexity: 671.7375


100%|██████████| 2428/2428 [00:03<00:00, 690.37it/s]


Val Loss: 6.4121
Val Perplexity: 609.1875


Epoch 5/10: 100%|██████████| 8533/8533 [00:30<00:00, 283.15batch/s]


Train Loss: 6.5097
Train Perplexity: 671.6104


100%|██████████| 2428/2428 [00:03<00:00, 683.06it/s]


Val Loss: 6.4122
Val Perplexity: 609.2563


Epoch 6/10: 100%|██████████| 8533/8533 [00:30<00:00, 281.76batch/s]


Train Loss: 6.5093
Train Perplexity: 671.3865


100%|██████████| 2428/2428 [00:03<00:00, 684.40it/s]


Val Loss: 6.4135
Val Perplexity: 609.9955
Early stopping at epoch 6


100%|██████████| 8533/8533 [00:13<00:00, 646.93it/s]
100%|██████████| 2428/2428 [00:03<00:00, 628.39it/s]
100%|██████████| 1212/1212 [00:01<00:00, 646.89it/s]


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,6.44122
test_perplexity,627.16925
train_loss,6.48556
train_perplexity,655.60815
val_loss,6.41301
val_perplexity,609.7254


Epoch 1/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.78batch/s]


Train Loss: 6.9455
Train Perplexity: 1038.4714


100%|██████████| 2428/2428 [00:03<00:00, 685.24it/s]


Val Loss: 6.2797
Val Perplexity: 533.6493


Epoch 2/10: 100%|██████████| 8533/8533 [00:21<00:00, 402.71batch/s]


Train Loss: 6.3710
Train Perplexity: 584.6555


100%|██████████| 2428/2428 [00:03<00:00, 688.50it/s]


Val Loss: 6.0786
Val Perplexity: 436.4319


Epoch 3/10: 100%|██████████| 8533/8533 [00:21<00:00, 403.47batch/s]


Train Loss: 6.1992
Train Perplexity: 492.3396


100%|██████████| 2428/2428 [00:03<00:00, 674.04it/s]


Val Loss: 5.9156
Val Perplexity: 370.7662


Epoch 4/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.93batch/s]


Train Loss: 6.0659
Train Perplexity: 430.9026


100%|██████████| 2428/2428 [00:03<00:00, 680.28it/s]


Val Loss: 5.7963
Val Perplexity: 329.0857


Epoch 5/10: 100%|██████████| 8533/8533 [00:21<00:00, 401.63batch/s]


Train Loss: 5.9695
Train Perplexity: 391.3217


100%|██████████| 2428/2428 [00:03<00:00, 683.77it/s]


Val Loss: 5.7175
Val Perplexity: 304.1444


Epoch 6/10: 100%|██████████| 8533/8533 [00:21<00:00, 403.51batch/s]


Train Loss: 5.8951
Train Perplexity: 363.2369


100%|██████████| 2428/2428 [00:03<00:00, 682.50it/s]


Val Loss: 5.6508
Val Perplexity: 284.5166


Epoch 7/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.51batch/s]


Train Loss: 5.8340
Train Perplexity: 341.7371


100%|██████████| 2428/2428 [00:03<00:00, 688.52it/s]


Val Loss: 5.5929
Val Perplexity: 268.5058


Epoch 8/10: 100%|██████████| 8533/8533 [00:21<00:00, 406.14batch/s]


Train Loss: 5.7825
Train Perplexity: 324.5702


100%|██████████| 2428/2428 [00:03<00:00, 665.47it/s]


Val Loss: 5.5515
Val Perplexity: 257.6226


Epoch 9/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.98batch/s]


Train Loss: 5.7401
Train Perplexity: 311.0913


100%|██████████| 2428/2428 [00:03<00:00, 686.79it/s]


Val Loss: 5.5115
Val Perplexity: 247.5229


Epoch 10/10: 100%|██████████| 8533/8533 [00:21<00:00, 404.84batch/s]


Train Loss: 5.7015
Train Perplexity: 299.3176


100%|██████████| 2428/2428 [00:03<00:00, 679.04it/s]


Val Loss: 5.4805
Val Perplexity: 239.9600


100%|██████████| 8533/8533 [00:13<00:00, 641.83it/s]
100%|██████████| 2428/2428 [00:03<00:00, 647.38it/s]
100%|██████████| 1212/1212 [00:01<00:00, 641.43it/s]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_loss,▁
test_perplexity,▁
train_loss,▁
train_perplexity,▁
val_loss,▁
val_perplexity,▁

0,1
test_loss,5.51914
test_perplexity,249.42097
train_loss,5.50115
train_perplexity,244.97247
val_loss,5.47999
val_perplexity,239.84546


In [24]:
# Convert tensors to floats
for result in results:
    result['train_perplexity'] = result['train_perplexity'].item()  # Convert tensor to float
    result['val_perplexity'] = result['val_perplexity'].item()
    result['test_perplexity'] = result['test_perplexity'].item()


In [27]:
results_df = pd.DataFrame(results)

results_df['hyperparam_combination'] = results_df.apply(
    lambda row: f"Dropout: {row['dropout']}, LR: {row['learning_rate']}, HD: {row['hidden_dims']}, Opt: {row['optimizer']}", axis=1)

In [28]:
best_val_perplexity_row = results_df.loc[results_df['val_perplexity'].idxmin()]
best_test_perplexity_row = results_df.loc[results_df['test_perplexity'].idxmin()]

best_val_perplexity_df = pd.DataFrame([best_val_perplexity_row])
best_test_perplexity_df = pd.DataFrame([best_test_perplexity_row])


In [29]:
print("Best Hyperparameters with Lowest Validation Perplexity:")
display(best_val_perplexity_df)

Best Hyperparameters with Lowest Validation Perplexity:


Unnamed: 0,dropout,hidden_dims,learning_rate,optimizer,train_loss,train_perplexity,val_loss,val_perplexity,test_loss,test_perplexity,hyperparam_combination
11,0.1,"[300, 300]",0.01,SGD,5.325233,205.456314,5.341418,208.80864,5.378655,216.73056,"Dropout: 0.1, LR: 0.01, HD: [300, 300], Opt: SGD"


In [30]:
print("\nBest Hyperparameters with Lowest Test Perplexity:")
display(best_test_perplexity_df)


Best Hyperparameters with Lowest Test Perplexity:


Unnamed: 0,dropout,hidden_dims,learning_rate,optimizer,train_loss,train_perplexity,val_loss,val_perplexity,test_loss,test_perplexity,hyperparam_combination
11,0.1,"[300, 300]",0.01,SGD,5.325233,205.456314,5.341418,208.80864,5.378655,216.73056,"Dropout: 0.1, LR: 0.01, HD: [300, 300], Opt: SGD"


In [31]:
import plotly.graph_objects as go

# Create an interactive line plot
fig = go.Figure()

# Plot Train Perplexity
fig.add_trace(go.Scatter(
    x=results_df['hyperparam_combination'],
    y=results_df['train_perplexity'],
    mode='lines+markers',
    name='Train Perplexity',
    marker=dict(size=8),
    line=dict(width=2)
))

# Plot Validation Perplexity
fig.add_trace(go.Scatter(
    x=results_df['hyperparam_combination'],
    y=results_df['val_perplexity'],
    mode='lines+markers',
    name='Validation Perplexity',
    marker=dict(size=8),
    line=dict(width=2)
))

# Plot Test Perplexity
fig.add_trace(go.Scatter(
    x=results_df['hyperparam_combination'],
    y=results_df['test_perplexity'],
    mode='lines+markers',
    name='Test Perplexity',
    marker=dict(size=8),
    line=dict(width=2)
))

# Update layout with increased size
fig.update_layout(
    title='Perplexity vs. Hyperparameters',
    xaxis_title='Hyperparameters',
    yaxis_title='Perplexity',
    xaxis_tickangle=-90,
    legend_title='Perplexity Type',
    template='plotly_white',
    width=1400,  # Increase width
    height=1000,   # Increase height
    xaxis_tickfont=dict(size=10),  # Smaller x-axis tick font size
)

# Show plot
fig.show()


In [32]:
import plotly.express as px

# Melt the dataframe for easier plotting with Plotly Express
melted_df = results_df.melt(id_vars='hyperparam_combination', 
                            value_vars=['train_perplexity', 'val_perplexity', 'test_perplexity'], 
                            var_name='Perplexity Type', value_name='Perplexity')

# Create an interactive bar plot
fig_bar = px.bar(melted_df, 
                 x='hyperparam_combination', 
                 y='Perplexity', 
                 color='Perplexity Type', 
                 barmode='group',
                 color_discrete_sequence=px.colors.qualitative.D3,  # Use different colors
                 labels={'hyperparam_combination': 'Hyperparameter Combinations', 'Perplexity': 'Perplexity'},
                 title='Perplexity vs. Hyperparameter Combinations')

# Update layout
fig_bar.update_layout(
    xaxis_tickangle=-90,
    template='plotly_white',
    width=1400,  # Increase width
    height=800,   # Increase height
    xaxis_tickfont=dict(size=8),  # Smaller x-axis tick font size
    bargap=0.2  # Increase space between bars
)

# Show plot
fig_bar.show()


In [26]:
results_df.to_csv('hyperparameter_tuning_results_nnlm.csv', index=False)

In [32]:
results_df

Unnamed: 0,dropout,hidden_dims,learning_rate,optimizer,train_loss,train_perplexity,val_loss,val_perplexity,test_loss,test_perplexity,hyperparam_combination
0,0.1,"[100, 100]",0.001,Adam,5.128179,168.709549,5.513472,248.010742,5.556505,258.91626,"Dropout: 0.1, LR: 0.001, HD: [100, 100], Opt: ..."
1,0.1,"[100, 100]",0.001,SGD,6.335142,564.049866,6.275821,531.5625,6.301206,545.229126,"Dropout: 0.1, LR: 0.001, HD: [100, 100], Opt: SGD"
2,0.1,"[100, 100]",0.01,Adam,6.322811,557.136963,6.268852,527.871155,6.302869,546.136169,"Dropout: 0.1, LR: 0.01, HD: [100, 100], Opt: Adam"
3,0.1,"[100, 100]",0.01,SGD,5.465257,236.336517,5.452482,233.336533,5.492457,242.853256,"Dropout: 0.1, LR: 0.01, HD: [100, 100], Opt: SGD"
4,0.1,"[200, 200]",0.001,Adam,5.094634,163.144135,5.517128,248.919235,5.556593,258.939117,"Dropout: 0.1, LR: 0.001, HD: [200, 200], Opt: ..."
5,0.1,"[200, 200]",0.001,SGD,6.258859,522.622131,6.198226,491.875427,6.222934,504.18045,"Dropout: 0.1, LR: 0.001, HD: [200, 200], Opt: SGD"
6,0.1,"[200, 200]",0.01,Adam,6.37232,585.414246,6.333749,563.264465,6.359728,578.088989,"Dropout: 0.1, LR: 0.01, HD: [200, 200], Opt: Adam"
7,0.1,"[200, 200]",0.01,SGD,5.376096,216.176727,5.372719,215.447937,5.413367,224.38588,"Dropout: 0.1, LR: 0.01, HD: [200, 200], Opt: SGD"
8,0.1,"[300, 300]",0.001,Adam,5.110413,165.738724,5.535922,253.641556,5.569774,262.374725,"Dropout: 0.1, LR: 0.001, HD: [300, 300], Opt: ..."
9,0.1,"[300, 300]",0.001,SGD,6.229542,507.523346,6.172417,479.34314,6.197181,491.361816,"Dropout: 0.1, LR: 0.001, HD: [300, 300], Opt: SGD"


## Loading and Running the model again

In [39]:
with open('/kaggle/working/data_store_nnlm.pkl', 'rb') as f:
    data = pickle.load(f)

embeddings = data['embeddings']
vocab = data['vocab']
word_to_idx = data['word_to_idx']
encoded_train = data['encoded_train']
encoded_val = data['encoded_val']
encoded_test = data['encoded_test']

train_dataset = NGramDataset(encoded_train, embeddings)
val_dataset = NGramDataset(encoded_val, embeddings)
test_dataset = NGramDataset(encoded_test, embeddings)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

print("Data loaded successfully!")

Data loaded successfully!


In [40]:
model_new = NNLM(embeddings, [300, 300], 5, 0.1)

# Load the saved state_dict
model_new.load_state_dict(torch.load('/kaggle/working/2021101072_LM1.pt', weights_only=True))

model_new.eval()
model_new.to(device)

optimizer = optim.SGD(model_new.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()


In [41]:
loss, perplexity = test_model(model_new, val_loader, criterion)
print(f'\nVal Loss: {loss}')
print(f'Val Perplexity: {perplexity}')

100%|██████████| 2465/2465 [00:04<00:00, 588.90it/s]


Val Loss: 5.361447884441631
Val Perplexity: 213.0331573486328





In [42]:
loss, perplexity = test_model(model_new, test_loader, criterion)
print(f'\nTest Loss: {loss}')
print(f'Test Perplexity: {perplexity}')

100%|██████████| 1214/1214 [00:02<00:00, 605.10it/s]


Test Loss: 5.342875943348868
Test Perplexity: 209.11325073242188



