In [118]:
import torch

from datasets import load_dataset, Dataset

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

from torch.utils.data import Dataset, DataLoader

In [119]:
dataset = load_dataset("maiurilorenzo/divina-commedia", split="train")
train_size = int(len(dataset) * 0.8)

train_dataset = dataset[:train_size]
test_dataset = dataset[train_size:]

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
tokenizer.enable_padding(pad_token="[PAD]")
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

tokenizer.train_from_iterator(
    train_dataset["text"],
    trainer=trainer,
)







In [180]:
class DivinaCommediaDataset(Dataset):

    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        # self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset["text"])

    def __getitem__(self, index):
        # texts = self.dataset["text"][index]
        # if isinstance(texts, str):
        #     texts = [texts]

        # inputs = [enc.ids for enc in self.tokenizer.encode_batch(texts)]
        # inputs = torch.tensor(inputs)
        # print(inputs.size())
        # return inputs
        return self.dataset["text"][index]


train_set = DivinaCommediaDataset(dataset=train_dataset, tokenizer=tokenizer)
test_set = DivinaCommediaDataset(dataset=test_dataset, tokenizer=tokenizer)

print(train_set[:5])

['Nel mezzo del cammin di nostra vita', 'mi ritrovai per una selva oscura,', 'ché la diritta via era smarrita.', 'Ahi quanto a dir qual era è cosa dura', 'esta selva selvaggia e aspra e forte']


In [None]:
import torch
import torch.nn.functional as F
from torch import Tensor
from torch.nn import Module, Parameter, Embedding


num_tokens = tokenizer.get_vocab_size()


class DLM(Module):

    def __init__(self, num_tokens: int, emb_dim: int):
        super().__init__()
        self.num_tokens = num_tokens
        self.emb_dim = emb_dim
        self.emb_token = Embedding(
            num_embeddings=self.num_tokens,
            embedding_dim=self.emb_dim,
        )

    def forward(self, x):
        x = self.emb_token(x)
        return x


def collate_fn(batch):
    return torch.tensor([enc.ids for enc in tokenizer.encode_batch(batch)])


train_loader = DataLoader(train_set, collate_fn=collate_fn, batch_size=16)

model = DLM(num_tokens=num_tokens, emb_dim=128)
for x in train_loader:
    y = model(x)
    print(y.shape)

torch.Size([16, 15, 128])
torch.Size([16, 13, 128])
torch.Size([16, 11, 128])
torch.Size([16, 12, 128])
torch.Size([16, 12, 128])
torch.Size([16, 12, 128])
torch.Size([16, 11, 128])
torch.Size([16, 12, 128])
torch.Size([16, 12, 128])
torch.Size([16, 15, 128])
torch.Size([16, 14, 128])
torch.Size([16, 12, 128])
torch.Size([16, 13, 128])
torch.Size([16, 14, 128])
torch.Size([16, 13, 128])
torch.Size([16, 13, 128])
torch.Size([16, 13, 128])
torch.Size([16, 13, 128])
torch.Size([16, 16, 128])
torch.Size([16, 14, 128])
torch.Size([16, 10, 128])
torch.Size([16, 12, 128])
torch.Size([16, 12, 128])
torch.Size([16, 16, 128])
torch.Size([16, 14, 128])
torch.Size([16, 11, 128])
torch.Size([16, 10, 128])
torch.Size([16, 13, 128])
torch.Size([16, 11, 128])
torch.Size([16, 11, 128])
torch.Size([16, 11, 128])
torch.Size([16, 12, 128])
torch.Size([16, 12, 128])
torch.Size([16, 11, 128])
torch.Size([16, 11, 128])
torch.Size([16, 12, 128])
torch.Size([16, 13, 128])
torch.Size([16, 14, 128])
torch.Size([