In [1]:
import random

import torch

from aptorch.data import (
    DivinaCommediaDataset,
    divina_commedia,
    divina_commedia_tokenizer,
)
from aptorch.dlm import DLM, pretraining

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the corpus; the loader's result is unpacked into a train and a test split.
train_dataset, test_dataset = divina_commedia()
# Build the tokenizer from the training split only (the test split is not passed in).
tokenizer = divina_commedia_tokenizer(train_dataset)
# Wrap each split in DivinaCommediaDataset so it can be handed to the training code.
# NOTE(review): collate_fn indexes each item as item[0] / item[1], so items are
# presumably (prompt, response) pairs - confirm against DivinaCommediaDataset.
train_set = DivinaCommediaDataset(dataset=train_dataset)
test_set = DivinaCommediaDataset(dataset=test_dataset)


def collate_fn(batch):
    """Turn a batch of samples into two tensors of token ids.

    Each sample is indexed positionally: element 0 is treated as the prompt
    text and element 1 as the response text. Both groups are encoded with the
    notebook-level ``tokenizer`` and stacked into one tensor each, one row per
    sample.

    Args:
        batch: Sequence of indexable samples (at least two elements each).

    Returns:
        A ``(prompts_enc, responses_enc)`` tuple of tensors.

    Note:
        ``torch.tensor`` needs every id list in a group to have the same
        length. NOTE(review): presumably the tokenizer is configured to
        pad/truncate to a fixed length - confirm in
        ``divina_commedia_tokenizer``.
    """

    def _to_id_tensor(texts):
        # Encode all texts in one call and keep only the integer ids.
        encodings = tokenizer.encode_batch(texts)
        return torch.tensor([encoding.ids for encoding in encodings])

    prompt_texts = []
    response_texts = []
    for sample in batch:
        prompt_texts.append(sample[0])
        response_texts.append(sample[1])

    prompt_ids = _to_id_tensor(prompt_texts)
    response_ids = _to_id_tensor(response_texts)
    return prompt_ids, response_ids







In [None]:
# --- Hyperparameters ---------------------------------------------------------
lr = 1e-3
n_epochs = 1
batch_size = 32
emb_dim = 32
ff_dim = 512

# The masking ratio is drawn once, each time this cell runs, and then passed
# unchanged to `pretraining`.
# NOTE(review): the draw is unseeded, so every run uses a different ratio; call
# random.seed(...) beforehand if run-to-run reproducibility is wanted.
mask_ratio = random.uniform(0.01, 0.99)
print(f"mask_ratio={mask_ratio}")

# --- Special tokens ----------------------------------------------------------
# Look the ids up directly in the vocabulary. Encoding the literal string and
# taking ids[0] yields the wrong id whenever the tokenizer prepends another
# token (e.g. through a post-processor template) or splits the literal, and it
# cannot tell that a token is missing from the vocabulary.
pad_token_id = tokenizer.token_to_id("[PAD]")
mask_token_id = tokenizer.token_to_id("[MASK]")
if pad_token_id is None or mask_token_id is None:
    raise ValueError(
        "Tokenizer vocabulary must contain both '[PAD]' and '[MASK]' "
        f"(got pad_token_id={pad_token_id}, mask_token_id={mask_token_id})"
    )
num_tokens = tokenizer.get_vocab_size()

# --- Model and optimizer -----------------------------------------------------
model = DLM(
    num_tokens=num_tokens,
    emb_dim=emb_dim,
    ff_dim=ff_dim,
    pad_idx=pad_token_id,
    mask_idx=mask_token_id,
)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# Only parameters that require gradients are counted.
print(
    f"Number of parameters={sum(p.numel() for p in model.parameters() if p.requires_grad)}")

# --- Pretraining -------------------------------------------------------------
# The hyperparameters are forwarded alongside the already-built model and
# optimizer. NOTE(review): presumably `pretraining` uses them for logging or
# bookkeeping - confirm in aptorch.dlm.
pretraining(
    model=model,
    optim=optimizer,
    training_set=train_set,
    collate_fn=collate_fn,
    lr=lr,
    n_epochs=n_epochs,
    batch_size=batch_size,
    emb_dim=emb_dim,
    ff_dim=ff_dim,
    mask_ratio=mask_ratio,
    pad_idx=pad_token_id,
    mask_idx=mask_token_id,
    num_tokens=num_tokens,
)