In [1]:
is_startup = True

In [2]:
if is_startup:
  !pip install transformers[torch]
  !pip install accelerate -U

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


In [3]:
from google.colab import drive
import sys
drive.mount('/content/drive')
project_path = "/content/drive/MyDrive/Detalytics/chatbot/"
sys.path.insert(0, project_path)

Mounted at /content/drive


In [4]:
import os
if is_startup:
  os.chdir(project_path)
  print (os.getcwd())
  !pip install -e .
  !pip install -e alutils

/content/drive/MyDrive/Detalytics/chatbot
Obtaining file:///content/drive/MyDrive/Detalytics/chatbot
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: chatbot
  Running setup.py develop for chatbot
Successfully installed chatbot-0.0.0
Obtaining file:///content/drive/MyDrive/Detalytics/chatbot/alutils
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: alutils
  Running setup.py develop for alutils
Successfully installed alutils-0.0.0


In [39]:
test_dialogs = [
    "Do you have any pets?",
    "Do you have dogs?",
    "What is your favorite movie?",
    "do you like spending your (or not so) money?",
    "I am from the south, where are you from?",
    "what are u gonna do this weekend?",
    "I do not like sports.",
    "Do you have any hobbies ?",
    "I like to go to the gym.",
    "i like to eat fish",
    "Hello there!",
    "How are you?",
    "Tell me a fun fact about space.",
    "Can you recommend a good book to read?",
    "What are the health benefits of eating bananas?",
    "How can I make a chocolate cake?",
    "What's the weather like today?",
    "Can you recommend a good restaurant nearby?",
    "What are the benefits of exercise?",
    "How do I reset my email password?"
]

In [40]:
from torch.utils.data import Dataset
import json
import random

class ChatData(Dataset):
    def __init__(self, path:str, tokenizer, split='train'):
        self.data = json.load(open(path, "r"))

        X = []
        for dialog in self.data:
            texts = []
            last_sender = None
            for turn in dialog['dialog']:
                this_sender = turn['sender']
                if this_sender != last_sender:
                    texts.append(turn['text'])
                else:
                    texts[-1] += turn['text']
                last_sender = this_sender

            #add texts to self.X
            for i in range(len(texts)-1):
                X.append("<startofstring> " + texts[i] +" <bot>: "+ texts[i+1] + " <endofstring>")

        random.seed(259)
        random.shuffle(X)
        num_train = len(X) // 5 * 4

        if split == 'train':
            self.X = X[:num_train]
        else:
            self.X = X[num_train:]

        print ("Number of dialogs: ", len(self.X))
        for one_turn_dialog in self.X[:10]:
            print(one_turn_dialog)

        self.X_encoded = tokenizer(self.X, max_length=64, truncation=True, padding="max_length", return_tensors="pt")
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

        output_path = f"{constants.DATA_PATH}convai_{split}.txt"
        output_file = open(output_path, 'w', encoding="utf-8")
        for text in self.X:
            output_file.write(f"{text}\n")

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])

In [41]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch

In [47]:
from chatbot import constants

def train(train_loader, val_loader, model, optim, num_epochs):
    min_val_loss = 1e6
    test_epoch = 8
    log_path = f"{constants.RESULT_PATH}programming_hut.txt"
    log_file = open(log_path, 'w', encoding="utf-8")
    for num_epoch in tqdm.tqdm(range(num_epochs)):
        #training
        avg_loss = []
        model.train()
        for X, a in train_loader:
            X = X.to(device)
            a = a.to(device)
            optim.zero_grad()
            loss = model(X, attention_mask=a, labels=X).loss
            avg_loss.append(loss.item())
            loss.backward()
            optim.step()
        train_loss = sum(avg_loss) / len(avg_loss)

        #validation
        avg_loss = []
        model.eval()
        for X, a in val_loader:
            X = X.to(device)
            a = a.to(device)
            optim.zero_grad()
            loss = model(X, attention_mask=a, labels=X).loss
            avg_loss.append(loss.item())
            loss.backward()
            optim.step()
        val_loss = sum(avg_loss) / len(avg_loss)

        #epoch summary
        print (f"Epoch {num_epoch+1}: train_loss = {train_loss:.4f}, val_loss = {val_loss:.4f}")

        #save epoch
        if val_loss < min_val_loss:
            min_val_loss = val_loss
            torch.save(model.state_dict(), f"{constants.CKPT_PATH}programming_hut/model_state.pt")

        if (num_epoch + 1) % test_epoch == 0:
            response = "------------------------\n"
            for prompt in test_dialogs:
                response_ = infer(prompt)
                response_ = response_.split('<bot>:')[-1].split('<endofstring>')[0]
                response += f"user: {prompt} \n bot: {response_} \n\n"
            log_file.write(response)
            log_file.flush()


def infer(prompt):
    prompt = f"<startofstring> {prompt} <bot>: "
    input = tokenizer(prompt, return_tensors="pt")
    X = input["input_ids"].to(device)
    a = input["attention_mask"].to(device)
    output = model.generate(X, attention_mask=a )
    output = tokenizer.decode(output[0])
    return output

In [48]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<pad>",
                                "bos_token": "<startofstring>",
                                "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

1

In [None]:
import warnings
warnings.filterwarnings('ignore')

num_epochs = 256
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.to(device)
# model.eval()
# for query in test_dialogs:
#     print(infer(query))
model.resize_token_embeddings(len(tokenizer))

train_dataset = ChatData(f"{constants.DATA_PATH}convai_chat_data.json", tokenizer, split='train')
train_loader =  DataLoader(train_dataset, batch_size=32)
val_dataset = ChatData(f"{constants.DATA_PATH}convai_chat_data.json", tokenizer, split='val')
val_loader =  DataLoader(val_dataset, batch_size=8)

optim = Adam(model.parameters(), lr=1e-4)

print("training .... ")
train(train_loader, val_loader, model, optim, num_epochs)

Number of dialogs:  2376
<startofstring> i am a huge gamer <bot>: ok <endofstring>
<startofstring> No!  You don't behave, you go in the bad girl closet. <bot>: I'm just a little girl <endofstring>
<startofstring> On Monday, I'm going to have to tell my kindergarten class, who I teach not to run with scissors, that my fiance ran me through with a frickin' broadsword. <bot>: I like gray color <endofstring>
<startofstring> I do not like swimming, but i do like to swim <bot>: and what about the beach, sand? <endofstring>
<startofstring> where are you from? <bot>: I live in the midwest <endofstring>
<startofstring> You are stupid <bot>: I am not sure what you mean. <endofstring>
<startofstring> i am a cali . cali . <bot>: nice to meet you, i'm iza <endofstring>
<startofstring> What do you like to read <bot>: i read a lot , especially when i have time . <endofstring>
<startofstring> Are you going <bot>: Ha Na, I will go to your school tomorrow. <endofstring>
<startofstring> I don't understan

  0%|          | 0/256 [00:00<?, ?it/s]

Epoch 1: train_loss = 3.2317, val_loss = 1.1197


  0%|          | 1/256 [00:14<59:52, 14.09s/it]

Epoch 2: train_loss = 1.0962, val_loss = 0.8524


  1%|          | 2/256 [00:28<59:37, 14.09s/it]

Epoch 3: train_loss = 0.9279, val_loss = 0.7210


  1%|          | 3/256 [00:42<59:17, 14.06s/it]

Epoch 4: train_loss = 0.8475, val_loss = 0.6090


  2%|▏         | 4/256 [00:56<59:03, 14.06s/it]

Epoch 5: train_loss = 0.7899, val_loss = 0.4963


  2%|▏         | 5/256 [01:10<58:45, 14.04s/it]

Epoch 6: train_loss = 0.7446, val_loss = 0.3935


  2%|▏         | 6/256 [01:24<58:30, 14.04s/it]

Epoch 7: train_loss = 0.7005, val_loss = 0.3106


  3%|▎         | 7/256 [01:38<58:17, 14.04s/it]

Epoch 8: train_loss = 0.6655, val_loss = 0.2572


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 9: train_loss = 0.6164, val_loss = 0.2186


  4%|▎         | 9/256 [02:09<1:00:07, 14.61s/it]

Epoch 10: train_loss = 0.5758, val_loss = 0.1991


  4%|▍         | 10/256 [02:23<59:10, 14.43s/it] 

Epoch 11: train_loss = 0.5414, val_loss = 0.1800


  4%|▍         | 11/256 [02:37<58:29, 14.32s/it]

Epoch 12: train_loss = 0.5075, val_loss = 0.1728


  5%|▍         | 12/256 [02:51<57:59, 14.26s/it]

Epoch 13: train_loss = 0.4727, val_loss = 0.1591


  5%|▌         | 13/256 [03:05<57:39, 14.24s/it]

Epoch 14: train_loss = 0.4396, val_loss = 0.1539


  5%|▌         | 14/256 [03:19<57:16, 14.20s/it]

Epoch 15: train_loss = 0.4209, val_loss = 0.1522


  6%|▌         | 15/256 [03:33<56:58, 14.18s/it]

Epoch 16: train_loss = 0.3925, val_loss = 0.1434


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 17: train_loss = 0.3678, val_loss = 0.1379


  7%|▋         | 17/256 [04:04<58:34, 14.71s/it]

Epoch 18: train_loss = 0.3480, val_loss = 0.1344


  7%|▋         | 18/256 [04:18<57:33, 14.51s/it]

Epoch 19: train_loss = 0.3314, val_loss = 0.1323


  7%|▋         | 19/256 [04:32<56:47, 14.38s/it]

Epoch 20: train_loss = 0.3174, val_loss = 0.1316


  8%|▊         | 20/256 [04:46<56:10, 14.28s/it]

Epoch 21: train_loss = 0.3058, val_loss = 0.1311


  8%|▊         | 21/256 [05:00<55:43, 14.23s/it]

Epoch 22: train_loss = 0.2935, val_loss = 0.1302


  9%|▉         | 23/256 [05:27<53:30, 13.78s/it]

Epoch 23: train_loss = 0.2829, val_loss = 0.1303
Epoch 24: train_loss = 0.2719, val_loss = 0.1283


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 25: train_loss = 0.2615, val_loss = 0.1283


 10%|█         | 26/256 [06:11<53:33, 13.97s/it]

Epoch 26: train_loss = 0.2542, val_loss = 0.1286
Epoch 27: train_loss = 0.2472, val_loss = 0.1282


 11%|█         | 27/256 [06:25<53:27, 14.01s/it]

Epoch 28: train_loss = 0.2404, val_loss = 0.1269


 11%|█         | 28/256 [06:39<53:18, 14.03s/it]

Epoch 29: train_loss = 0.2324, val_loss = 0.1261


 12%|█▏        | 30/256 [07:06<51:33, 13.69s/it]

Epoch 30: train_loss = 0.2262, val_loss = 0.1274


 12%|█▏        | 31/256 [07:19<50:22, 13.43s/it]

Epoch 31: train_loss = 0.2204, val_loss = 0.1272


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 32: train_loss = 0.2113, val_loss = 0.1270


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 33: train_loss = 0.2101, val_loss = 0.1278


 13%|█▎        | 34/256 [08:00<49:42, 13.44s/it]

Epoch 34: train_loss = 0.2046, val_loss = 0.1278


 14%|█▎        | 35/256 [08:13<48:52, 13.27s/it]

Epoch 35: train_loss = 0.1999, val_loss = 0.1272


 14%|█▍        | 36/256 [08:25<48:10, 13.14s/it]

Epoch 36: train_loss = 0.1971, val_loss = 0.1279


 14%|█▍        | 37/256 [08:38<47:36, 13.04s/it]

Epoch 37: train_loss = 0.1925, val_loss = 0.1269
Epoch 38: train_loss = 0.1891, val_loss = 0.1259


 15%|█▍        | 38/256 [08:52<48:25, 13.33s/it]

Epoch 39: train_loss = 0.1841, val_loss = 0.1253


 15%|█▌        | 39/256 [09:06<48:58, 13.54s/it]

Epoch 40: train_loss = 0.1841, val_loss = 0.1237


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 41: train_loss = 0.1813, val_loss = 0.1243
Epoch 42: train_loss = 0.1782, val_loss = 0.1229


 16%|█▋        | 42/256 [09:50<49:55, 14.00s/it]

Epoch 43: train_loss = 0.1797, val_loss = 0.1226


 17%|█▋        | 43/256 [10:04<49:43, 14.01s/it]

Epoch 44: train_loss = 0.1769, val_loss = 0.1226


 18%|█▊        | 45/256 [10:32<49:02, 13.95s/it]

Epoch 45: train_loss = 0.1760, val_loss = 0.1233


 18%|█▊        | 46/256 [10:45<47:40, 13.62s/it]

Epoch 46: train_loss = 0.1750, val_loss = 0.1233
Epoch 47: train_loss = 0.1728, val_loss = 0.1199


 18%|█▊        | 47/256 [10:59<47:49, 13.73s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 48: train_loss = 0.1734, val_loss = 0.1199


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 49: train_loss = 0.1707, val_loss = 0.1220
Epoch 50: train_loss = 0.1710, val_loss = 0.1198


 20%|█▉        | 50/256 [11:41<47:35, 13.86s/it]

Epoch 51: train_loss = 0.1706, val_loss = 0.1192


 20%|██        | 52/256 [12:08<46:15, 13.60s/it]

Epoch 52: train_loss = 0.1691, val_loss = 0.5967


 21%|██        | 53/256 [12:21<45:13, 13.37s/it]

Epoch 53: train_loss = 0.2695, val_loss = 0.1454


 21%|██        | 54/256 [12:34<44:27, 13.21s/it]

Epoch 54: train_loss = 0.1989, val_loss = 0.1199
Epoch 55: train_loss = 0.1862, val_loss = 0.1181


 21%|██▏       | 55/256 [12:48<45:13, 13.50s/it]

Epoch 56: train_loss = 0.1780, val_loss = 0.1177


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 57: train_loss = 0.1725, val_loss = 0.1175


 22%|██▏       | 57/256 [13:18<47:21, 14.28s/it]

Epoch 58: train_loss = 0.1686, val_loss = 0.1167


 23%|██▎       | 59/256 [13:45<45:16, 13.79s/it]

Epoch 59: train_loss = 0.1656, val_loss = 0.1168


 23%|██▎       | 60/256 [13:58<44:11, 13.53s/it]

Epoch 60: train_loss = 0.1624, val_loss = 0.1170


 24%|██▍       | 61/256 [14:11<43:26, 13.37s/it]

Epoch 61: train_loss = 0.1612, val_loss = 0.1172


 24%|██▍       | 62/256 [14:24<42:42, 13.21s/it]

Epoch 62: train_loss = 0.1593, val_loss = 0.1171


 25%|██▍       | 63/256 [14:37<42:08, 13.10s/it]

Epoch 63: train_loss = 0.1577, val_loss = 0.1174


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 64: train_loss = 0.1573, val_loss = 0.1176


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 65: train_loss = 0.1557, val_loss = 0.1173


 26%|██▌       | 66/256 [15:18<42:06, 13.30s/it]

Epoch 66: train_loss = 0.1554, val_loss = 0.1176


 26%|██▌       | 67/256 [15:31<41:25, 13.15s/it]

Epoch 67: train_loss = 0.1548, val_loss = 0.1175


 27%|██▋       | 68/256 [15:43<40:53, 13.05s/it]

Epoch 68: train_loss = 0.1552, val_loss = 0.1178


 27%|██▋       | 69/256 [15:56<40:27, 12.98s/it]

Epoch 69: train_loss = 0.1546, val_loss = 0.1173


 27%|██▋       | 70/256 [16:09<40:06, 12.94s/it]

Epoch 70: train_loss = 0.1529, val_loss = 0.1173


 28%|██▊       | 71/256 [16:22<39:49, 12.92s/it]

Epoch 71: train_loss = 0.1529, val_loss = 0.1176


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 72: train_loss = 0.1519, val_loss = 0.1179


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 73: train_loss = 0.1532, val_loss = 0.1179


 29%|██▉       | 74/256 [17:03<40:04, 13.21s/it]

Epoch 74: train_loss = 0.1531, val_loss = 0.1183


 29%|██▉       | 75/256 [17:16<39:29, 13.09s/it]

Epoch 75: train_loss = 0.1537, val_loss = 0.1177


 30%|██▉       | 76/256 [17:29<39:01, 13.01s/it]

Epoch 76: train_loss = 0.1534, val_loss = 0.1173


 30%|███       | 77/256 [17:41<38:37, 12.95s/it]

Epoch 77: train_loss = 0.1536, val_loss = 0.1176


 30%|███       | 78/256 [17:54<38:17, 12.91s/it]

Epoch 78: train_loss = 0.1526, val_loss = 0.1168


 31%|███       | 79/256 [18:07<37:59, 12.88s/it]

Epoch 79: train_loss = 0.1528, val_loss = 0.1172


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 80: train_loss = 0.1542, val_loss = 0.1177


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 81: train_loss = 0.1535, val_loss = 0.1175


 32%|███▏      | 82/256 [18:48<38:19, 13.21s/it]

Epoch 82: train_loss = 0.1536, val_loss = 0.1198


 32%|███▏      | 83/256 [19:01<37:45, 13.10s/it]

Epoch 83: train_loss = 0.1545, val_loss = 0.1198


 33%|███▎      | 84/256 [19:14<37:20, 13.03s/it]

Epoch 84: train_loss = 0.1566, val_loss = 0.1187


 33%|███▎      | 85/256 [19:26<36:57, 12.97s/it]

Epoch 85: train_loss = 0.1571, val_loss = 0.1201


 34%|███▎      | 86/256 [19:39<36:37, 12.93s/it]

Epoch 86: train_loss = 0.1557, val_loss = 0.1201


 34%|███▍      | 87/256 [19:52<36:19, 12.90s/it]

Epoch 87: train_loss = 0.1533, val_loss = 0.1192


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 88: train_loss = 0.1524, val_loss = 0.1191


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 89: train_loss = 0.1515, val_loss = 0.1186


 35%|███▌      | 90/256 [20:33<36:37, 13.24s/it]

Epoch 90: train_loss = 0.1516, val_loss = 0.1194


 36%|███▌      | 91/256 [20:46<36:03, 13.11s/it]

Epoch 91: train_loss = 0.1526, val_loss = 0.1192


 36%|███▌      | 92/256 [20:59<35:36, 13.03s/it]

Epoch 92: train_loss = 0.1540, val_loss = 0.1184


 36%|███▋      | 93/256 [21:12<35:14, 12.97s/it]

Epoch 93: train_loss = 0.1537, val_loss = 0.1199


 37%|███▋      | 94/256 [21:24<34:54, 12.93s/it]

Epoch 94: train_loss = 0.1538, val_loss = 0.1202


 37%|███▋      | 95/256 [21:37<34:37, 12.91s/it]

Epoch 95: train_loss = 0.1546, val_loss = 0.1189


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 96: train_loss = 0.1535, val_loss = 0.1190


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 97: train_loss = 0.1518, val_loss = 0.1187
Epoch 98: train_loss = 0.1501, val_loss = 0.1163


 38%|███▊      | 98/256 [22:20<35:46, 13.59s/it]

Epoch 99: train_loss = 0.1508, val_loss = 0.1160


 39%|███▉      | 100/256 [22:46<34:59, 13.46s/it]

Epoch 100: train_loss = 0.1536, val_loss = 0.1164


 39%|███▉      | 101/256 [22:59<34:16, 13.27s/it]

Epoch 101: train_loss = 0.1548, val_loss = 0.1228


 40%|███▉      | 102/256 [23:12<33:41, 13.13s/it]

Epoch 102: train_loss = 0.1563, val_loss = 0.1193
Epoch 103: train_loss = 0.1555, val_loss = 0.1160


 40%|████      | 103/256 [23:26<34:12, 13.42s/it]

Epoch 104: train_loss = 0.1520, val_loss = 0.1145


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 105: train_loss = 0.1510, val_loss = 0.1131


 41%|████▏     | 106/256 [24:10<34:46, 13.91s/it]

Epoch 106: train_loss = 0.1512, val_loss = 0.1135
Epoch 107: train_loss = 0.1508, val_loss = 0.1130


 42%|████▏     | 107/256 [24:24<34:44, 13.99s/it]

Epoch 108: train_loss = 0.1506, val_loss = 0.1114


 42%|████▏     | 108/256 [24:38<34:34, 14.01s/it]

Epoch 109: train_loss = 0.1487, val_loss = 0.1099


 43%|████▎     | 110/256 [25:05<33:17, 13.68s/it]

Epoch 110: train_loss = 0.1485, val_loss = 0.1104


 43%|████▎     | 111/256 [25:18<32:27, 13.43s/it]

Epoch 111: train_loss = 0.1473, val_loss = 0.1110


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 112: train_loss = 0.1484, val_loss = 0.1111


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 113: train_loss = 0.1470, val_loss = 0.1097


 44%|████▍     | 113/256 [25:47<33:29, 14.05s/it]

Epoch 114: train_loss = 0.1468, val_loss = 0.1096


 45%|████▍     | 115/256 [26:14<32:10, 13.69s/it]

Epoch 115: train_loss = 0.1502, val_loss = 0.1112


 45%|████▌     | 116/256 [26:27<31:20, 13.43s/it]

Epoch 116: train_loss = 0.1503, val_loss = 0.1123


 46%|████▌     | 117/256 [26:40<30:41, 13.25s/it]

Epoch 117: train_loss = 0.1518, val_loss = 0.1126


 46%|████▌     | 118/256 [26:53<30:11, 13.13s/it]

Epoch 118: train_loss = 0.1498, val_loss = 0.1124


 46%|████▋     | 119/256 [27:06<29:46, 13.04s/it]

Epoch 119: train_loss = 0.1499, val_loss = 0.1122


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 120: train_loss = 0.1497, val_loss = 0.1101


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 121: train_loss = 0.1503, val_loss = 0.1111


 48%|████▊     | 122/256 [27:47<29:39, 13.28s/it]

Epoch 122: train_loss = 0.1501, val_loss = 0.1123


 48%|████▊     | 123/256 [28:00<29:08, 13.15s/it]

Epoch 123: train_loss = 0.1495, val_loss = 0.1124


 48%|████▊     | 124/256 [28:12<28:44, 13.06s/it]

Epoch 124: train_loss = 0.1491, val_loss = 0.1126


 49%|████▉     | 125/256 [28:25<28:23, 13.00s/it]

Epoch 125: train_loss = 0.1492, val_loss = 0.1141


 49%|████▉     | 126/256 [28:38<28:04, 12.96s/it]

Epoch 126: train_loss = 0.1495, val_loss = 0.1155


 50%|████▉     | 127/256 [28:51<27:49, 12.94s/it]

Epoch 127: train_loss = 0.1494, val_loss = 0.1162


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 128: train_loss = 0.1516, val_loss = 0.1193


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 129: train_loss = 0.1538, val_loss = 0.1160


 51%|█████     | 130/256 [29:32<27:51, 13.27s/it]

Epoch 130: train_loss = 0.1549, val_loss = 0.1145


 51%|█████     | 131/256 [29:45<27:23, 13.15s/it]

Epoch 131: train_loss = 0.1518, val_loss = 0.1141


 52%|█████▏    | 132/256 [29:58<27:00, 13.07s/it]

Epoch 132: train_loss = 0.1511, val_loss = 0.1165


 52%|█████▏    | 133/256 [30:11<26:40, 13.01s/it]

Epoch 133: train_loss = 0.1518, val_loss = 0.1189


 52%|█████▏    | 134/256 [30:24<26:22, 12.97s/it]

Epoch 134: train_loss = 0.1509, val_loss = 0.1197


 53%|█████▎    | 135/256 [30:37<26:06, 12.95s/it]

Epoch 135: train_loss = 0.1502, val_loss = 0.1221


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 136: train_loss = 0.1492, val_loss = 0.1215


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 137: train_loss = 0.1490, val_loss = 0.1193


 54%|█████▍    | 138/256 [31:18<26:02, 13.24s/it]

Epoch 138: train_loss = 0.1473, val_loss = 0.1179


 54%|█████▍    | 139/256 [31:30<25:34, 13.12s/it]

Epoch 139: train_loss = 0.1458, val_loss = 0.1169


 55%|█████▍    | 140/256 [31:43<25:11, 13.03s/it]

Epoch 140: train_loss = 0.1460, val_loss = 0.1172


 55%|█████▌    | 141/256 [31:56<24:51, 12.97s/it]

Epoch 141: train_loss = 0.1469, val_loss = 0.1196


 55%|█████▌    | 142/256 [32:09<24:33, 12.93s/it]

Epoch 142: train_loss = 0.1478, val_loss = 0.1178


 56%|█████▌    | 143/256 [32:22<24:16, 12.89s/it]

Epoch 143: train_loss = 0.1474, val_loss = 0.1191


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 144: train_loss = 0.1490, val_loss = 0.1197


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 145: train_loss = 0.1492, val_loss = 0.1205


 57%|█████▋    | 146/256 [33:03<24:14, 13.22s/it]

Epoch 146: train_loss = 0.1492, val_loss = 0.1202


 57%|█████▋    | 147/256 [33:16<23:48, 13.11s/it]

Epoch 147: train_loss = 0.1472, val_loss = 0.1176


 58%|█████▊    | 148/256 [33:28<23:26, 13.03s/it]

Epoch 148: train_loss = 0.1456, val_loss = 0.1159


 58%|█████▊    | 149/256 [33:41<23:05, 12.95s/it]

Epoch 149: train_loss = 0.1448, val_loss = 0.1168


 59%|█████▊    | 150/256 [33:54<22:47, 12.90s/it]

Epoch 150: train_loss = 0.1452, val_loss = 0.1176


 59%|█████▉    | 151/256 [34:07<22:30, 12.86s/it]

Epoch 151: train_loss = 0.1461, val_loss = 0.1154


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 152: train_loss = 0.1449, val_loss = 0.1145


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 153: train_loss = 0.1444, val_loss = 0.1149


 60%|██████    | 154/256 [34:47<22:23, 13.17s/it]

Epoch 154: train_loss = 0.1437, val_loss = 0.1146


 61%|██████    | 155/256 [35:00<21:58, 13.06s/it]

Epoch 155: train_loss = 0.1433, val_loss = 0.1141


 61%|██████    | 156/256 [35:13<21:37, 12.98s/it]

Epoch 156: train_loss = 0.1442, val_loss = 0.1152


 61%|██████▏   | 157/256 [35:26<21:18, 12.91s/it]

Epoch 157: train_loss = 0.1462, val_loss = 0.1170


 62%|██████▏   | 158/256 [35:39<21:01, 12.87s/it]

Epoch 158: train_loss = 0.1447, val_loss = 0.1159


 62%|██████▏   | 159/256 [35:51<20:45, 12.84s/it]

Epoch 159: train_loss = 0.1445, val_loss = 0.1163


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 160: train_loss = 0.1439, val_loss = 0.1180


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 161: train_loss = 0.1455, val_loss = 0.1179


 63%|██████▎   | 162/256 [36:32<20:37, 13.16s/it]

Epoch 162: train_loss = 0.1457, val_loss = 0.1172


 64%|██████▎   | 163/256 [36:45<20:12, 13.04s/it]

Epoch 163: train_loss = 0.1450, val_loss = 0.1203


 64%|██████▍   | 164/256 [36:58<19:52, 12.96s/it]

Epoch 164: train_loss = 0.1458, val_loss = 0.1207


 64%|██████▍   | 165/256 [37:10<19:33, 12.90s/it]

Epoch 165: train_loss = 0.1453, val_loss = 0.1179


 65%|██████▍   | 166/256 [37:23<19:16, 12.85s/it]

Epoch 166: train_loss = 0.1441, val_loss = 0.1160


 65%|██████▌   | 167/256 [37:36<19:02, 12.83s/it]

Epoch 167: train_loss = 0.1423, val_loss = 0.1149


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 168: train_loss = 0.1422, val_loss = 0.1130


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 169: train_loss = 0.1428, val_loss = 0.1132


 66%|██████▋   | 170/256 [38:17<18:50, 13.15s/it]

Epoch 170: train_loss = 0.1428, val_loss = 0.1141


 67%|██████▋   | 171/256 [38:29<18:27, 13.03s/it]

Epoch 171: train_loss = 0.1431, val_loss = 0.1159


 67%|██████▋   | 172/256 [38:42<18:08, 12.95s/it]

Epoch 172: train_loss = 0.1435, val_loss = 0.1177


 68%|██████▊   | 173/256 [38:55<17:50, 12.90s/it]

Epoch 173: train_loss = 0.1440, val_loss = 0.1153


 68%|██████▊   | 174/256 [39:08<17:34, 12.86s/it]

Epoch 174: train_loss = 0.1433, val_loss = 0.1135


 68%|██████▊   | 175/256 [39:21<17:18, 12.82s/it]

Epoch 175: train_loss = 0.1439, val_loss = 0.1171


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 176: train_loss = 0.1435, val_loss = 0.1153


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 177: train_loss = 0.1432, val_loss = 0.1134


 70%|██████▉   | 178/256 [40:01<17:06, 13.15s/it]

Epoch 178: train_loss = 0.1432, val_loss = 0.1139


 70%|██████▉   | 179/256 [40:14<16:44, 13.04s/it]

Epoch 179: train_loss = 0.1416, val_loss = 0.1122


 70%|███████   | 180/256 [40:27<16:25, 12.96s/it]

Epoch 180: train_loss = 0.1413, val_loss = 0.1132


 71%|███████   | 181/256 [40:40<16:07, 12.91s/it]

Epoch 181: train_loss = 0.1424, val_loss = 0.1127


 71%|███████   | 182/256 [40:52<15:51, 12.86s/it]

Epoch 182: train_loss = 0.1415, val_loss = 0.1123


 71%|███████▏  | 183/256 [41:05<15:38, 12.85s/it]

Epoch 183: train_loss = 0.1402, val_loss = 0.1111


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 184: train_loss = 0.1417, val_loss = 0.1136


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 185: train_loss = 0.1422, val_loss = 0.1139


 73%|███████▎  | 186/256 [41:46<15:22, 13.17s/it]

Epoch 186: train_loss = 0.1426, val_loss = 0.1136


 73%|███████▎  | 187/256 [41:59<15:00, 13.05s/it]

Epoch 187: train_loss = 0.1447, val_loss = 0.1156


 73%|███████▎  | 188/256 [42:12<14:41, 12.97s/it]

Epoch 188: train_loss = 0.1452, val_loss = 0.1163


 74%|███████▍  | 189/256 [42:24<14:24, 12.91s/it]

Epoch 189: train_loss = 0.1450, val_loss = 0.1192


 74%|███████▍  | 190/256 [42:37<14:10, 12.88s/it]

Epoch 190: train_loss = 0.1451, val_loss = 0.1175


 75%|███████▍  | 191/256 [42:50<13:55, 12.85s/it]

Epoch 191: train_loss = 0.1436, val_loss = 0.1176


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 192: train_loss = 0.1419, val_loss = 0.1149


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 193: train_loss = 0.1412, val_loss = 0.1143


 76%|███████▌  | 194/256 [43:31<13:36, 13.17s/it]

Epoch 194: train_loss = 0.1410, val_loss = 0.1152


 76%|███████▌  | 195/256 [43:44<13:16, 13.06s/it]

Epoch 195: train_loss = 0.1414, val_loss = 0.1165


 77%|███████▋  | 196/256 [43:56<12:57, 12.97s/it]

Epoch 196: train_loss = 0.1409, val_loss = 0.1162


 77%|███████▋  | 197/256 [44:09<12:42, 12.92s/it]

Epoch 197: train_loss = 0.1414, val_loss = 0.1174


 77%|███████▋  | 198/256 [44:22<12:26, 12.87s/it]

Epoch 198: train_loss = 0.1417, val_loss = 0.1180


 78%|███████▊  | 199/256 [44:35<12:12, 12.85s/it]

Epoch 199: train_loss = 0.1406, val_loss = 0.1158


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 200: train_loss = 0.1394, val_loss = 0.1147


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 201: train_loss = 0.1397, val_loss = 0.1135


 79%|███████▉  | 202/256 [45:16<11:51, 13.18s/it]

Epoch 202: train_loss = 0.1398, val_loss = 0.1192


 79%|███████▉  | 203/256 [45:28<11:32, 13.06s/it]

Epoch 203: train_loss = 0.1442, val_loss = 0.1154


 80%|███████▉  | 204/256 [45:41<11:15, 12.99s/it]

Epoch 204: train_loss = 0.1404, val_loss = 0.1132


 80%|████████  | 205/256 [45:54<10:59, 12.93s/it]

Epoch 205: train_loss = 0.1387, val_loss = 0.1144


 80%|████████  | 206/256 [46:07<10:44, 12.90s/it]

Epoch 206: train_loss = 0.1397, val_loss = 0.1123


 81%|████████  | 207/256 [46:20<10:31, 12.88s/it]

Epoch 207: train_loss = 0.1407, val_loss = 0.1114


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 208: train_loss = 0.1388, val_loss = 0.1110


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 209: train_loss = 0.1388, val_loss = 0.1106


 82%|████████▏ | 210/256 [47:00<10:05, 13.17s/it]

Epoch 210: train_loss = 0.1386, val_loss = 0.1108


 82%|████████▏ | 211/256 [47:13<09:47, 13.05s/it]

Epoch 211: train_loss = 0.1389, val_loss = 0.1113


 83%|████████▎ | 212/256 [47:26<09:30, 12.96s/it]

Epoch 212: train_loss = 0.1379, val_loss = 0.1160


 83%|████████▎ | 213/256 [47:39<09:15, 12.92s/it]

Epoch 213: train_loss = 0.1393, val_loss = 0.1173


 84%|████████▎ | 214/256 [47:52<09:01, 12.88s/it]

Epoch 214: train_loss = 0.1386, val_loss = 0.1147


 84%|████████▍ | 215/256 [48:04<08:47, 12.86s/it]

Epoch 215: train_loss = 0.1370, val_loss = 0.1132


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 216: train_loss = 0.1371, val_loss = 0.1113


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 217: train_loss = 0.1364, val_loss = 0.1111


 85%|████████▌ | 218/256 [48:45<08:20, 13.18s/it]

Epoch 218: train_loss = 0.1371, val_loss = 0.1126


 86%|████████▌ | 219/256 [48:58<08:03, 13.06s/it]

Epoch 219: train_loss = 0.1371, val_loss = 0.1120
Epoch 220: train_loss = 0.1374, val_loss = 0.1083


 86%|████████▋ | 221/256 [49:25<07:42, 13.20s/it]

Epoch 221: train_loss = 0.1367, val_loss = 0.1106


 87%|████████▋ | 222/256 [49:38<07:25, 13.09s/it]

Epoch 222: train_loss = 0.1371, val_loss = 0.1096


 87%|████████▋ | 223/256 [49:50<07:09, 13.00s/it]

Epoch 223: train_loss = 0.1372, val_loss = 0.1088
Epoch 224: train_loss = 0.1363, val_loss = 0.1082


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 225: train_loss = 0.1364, val_loss = 0.1098


 88%|████████▊ | 226/256 [50:32<06:42, 13.40s/it]

Epoch 226: train_loss = 0.1366, val_loss = 0.1093
Epoch 227: train_loss = 0.1373, val_loss = 0.1076


 89%|████████▊ | 227/256 [50:46<06:33, 13.57s/it]

Epoch 228: train_loss = 0.1381, val_loss = 0.1074


 89%|████████▉ | 228/256 [51:00<06:23, 13.71s/it]

Epoch 229: train_loss = 0.1375, val_loss = 0.1074


 90%|████████▉ | 230/256 [51:27<05:51, 13.51s/it]

Epoch 230: train_loss = 0.1378, val_loss = 0.1090


 90%|█████████ | 231/256 [51:40<05:32, 13.31s/it]

Epoch 231: train_loss = 0.1385, val_loss = 0.1085
Epoch 232: train_loss = 0.1387, val_loss = 0.1073


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 233: train_loss = 0.1377, val_loss = 0.1080


 91%|█████████▏| 234/256 [52:22<04:58, 13.56s/it]

Epoch 234: train_loss = 0.1384, val_loss = 0.1076


 92%|█████████▏| 235/256 [52:35<04:40, 13.36s/it]

Epoch 235: train_loss = 0.1382, val_loss = 0.1075


 92%|█████████▏| 236/256 [52:48<04:24, 13.22s/it]

Epoch 236: train_loss = 0.1385, val_loss = 0.1085


 93%|█████████▎| 237/256 [53:01<04:08, 13.11s/it]

Epoch 237: train_loss = 0.1406, val_loss = 0.1100


 93%|█████████▎| 238/256 [53:14<03:54, 13.02s/it]

Epoch 238: train_loss = 0.1408, val_loss = 0.1093


 93%|█████████▎| 239/256 [53:27<03:40, 12.95s/it]

Epoch 239: train_loss = 0.1400, val_loss = 0.1127


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 240: train_loss = 0.1401, val_loss = 0.1158


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 241: train_loss = 0.1399, val_loss = 0.1199


 95%|█████████▍| 242/256 [54:08<03:05, 13.24s/it]

Epoch 242: train_loss = 0.1391, val_loss = 0.1171


 95%|█████████▍| 243/256 [54:20<02:50, 13.11s/it]

Epoch 243: train_loss = 0.1377, val_loss = 0.1142


 95%|█████████▌| 244/256 [54:33<02:36, 13.02s/it]

Epoch 244: train_loss = 0.1375, val_loss = 0.1145


 96%|█████████▌| 245/256 [54:46<02:22, 12.96s/it]

Epoch 245: train_loss = 0.1352, val_loss = 0.1141


 96%|█████████▌| 246/256 [54:59<02:09, 12.91s/it]

Epoch 246: train_loss = 0.1354, val_loss = 0.1118


 96%|█████████▋| 247/256 [55:12<01:55, 12.88s/it]

Epoch 247: train_loss = 0.1338, val_loss = 0.1101


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 248: train_loss = 0.1344, val_loss = 0.1088


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Epoch 249: train_loss = 0.1340, val_loss = 0.1082
Epoch 250: train_loss = 0.1361, val_loss = 0.1066


 98%|█████████▊| 250/256 [55:54<01:21, 13.57s/it]

Epoch 251: train_loss = 0.1353, val_loss = 0.1061


 98%|█████████▊| 252/256 [56:21<00:53, 13.44s/it]

Epoch 252: train_loss = 0.1358, val_loss = 0.1082


 99%|█████████▉| 253/256 [56:33<00:39, 13.24s/it]

Epoch 253: train_loss = 0.1356, val_loss = 0.1096


 99%|█████████▉| 254/256 [56:46<00:26, 13.13s/it]

Epoch 254: train_loss = 0.1364, val_loss = 0.1085


100%|█████████▉| 255/256 [56:59<00:13, 13.04s/it]

Epoch 255: train_loss = 0.1367, val_loss = 0.1092


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 256: train_loss = 0.1373, val_loss = 0.1082


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene