In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to local source files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [3]:
class TextDataset(Dataset):
    """Fixed-length token windows cut from a single text file.

    The whole file is read, tokenized in one call to ``tokenizer.encode``,
    and split into consecutive, non-overlapping chunks of ``seq_length``
    tokens. Trailing tokens that do not fill a complete chunk are dropped.

    Args:
        path: Path of the text file to read.
        tokenizer: Object exposing ``encode(text)`` that returns a sequence
            of integer token ids.
        seq_length: Number of tokens per sample; must be positive.
        encoding: Text encoding used to read the file. Given explicitly so
            the result does not depend on the platform's default locale.

    Attributes:
        samples: ``torch.long`` tensor of shape ``(num_samples, seq_length)``.

    Raises:
        ValueError: If ``seq_length`` is not a positive integer.
    """

    def __init__(self, path, tokenizer, seq_length=512, encoding="utf-8"):
        if seq_length <= 0:
            raise ValueError(f"seq_length must be positive, got {seq_length}")

        with open(path, encoding=encoding) as f:
            data = f.read()
        tokens = tokenizer.encode(data)

        # Keep only as many tokens as fill whole windows, then reshape in one
        # step. With zero windows this still yields a (0, seq_length) tensor.
        num_samples = len(tokens) // seq_length
        usable = tokens[:num_samples * seq_length]
        self.samples = torch.tensor(usable, dtype=torch.long).view(
            num_samples, seq_length
        )
        print('Loaded samples:', len(self.samples))

    def __len__(self):
        """Return the number of fixed-length samples."""
        return len(self.samples)

    def __getitem__(self, item):
        """Return the token-id tensor(s) selected by ``item``."""
        return self.samples[item]

In [4]:
# Fetch the tokenizer that belongs to the mGPT checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(
    pretrained_model_name_or_path="sberbank-ai/mGPT",
)

In [5]:
# Tokenize the corpus into fixed-length samples (default window size) and
# wrap it in a loader that yields one shuffled sample per step.
dataset = TextDataset(path="./data/sah.txt", tokenizer=tokenizer)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=1,
    drop_last=True,
    shuffle=True,
)

Loaded samples: 2173


In [6]:
# Load the pretrained mGPT weights into a GPT-2 language-modelling head model.
model = GPT2LMHeadModel.from_pretrained(
    pretrained_model_name_or_path="sberbank-ai/mGPT",
)

In [7]:
# Put the model on the GPU when CUDA is usable; otherwise leave it untouched.
if torch.cuda.is_available():
    model = model.to("cuda")

In [8]:
# Collect the language-modelling loss of every batch in `dataloader`.
# This is pure evaluation, so put the model in eval mode and disable autograd:
# without `torch.no_grad()` each forward pass records a computation graph
# that is never used, costing memory and time.
model.eval()
losses = []
progressbar = tqdm(dataloader)
with torch.no_grad():
    for batch in progressbar:
        # Inputs must live on the same device as the model weights.
        batch = batch.to(model.device)
        # Passing the inputs as `labels` makes the model return `.loss`.
        outputs = model(batch, labels=batch)
        # `.item()` converts the scalar loss tensor to a plain Python float.
        losses.append(outputs.loss.item())

100%|██████████| 2173/2173 [01:06<00:00, 32.43it/s]


In [9]:
# Average the per-batch losses once; perplexity is the exponential of that mean.
mean_loss = np.mean(losses)
perplexity = np.exp(mean_loss)
print("loss:", mean_loss)
print("ppl:", perplexity)

loss: 2.2140769756065226
ppl: 9.152956808961022
