In [28]:
import torch
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2Tokenizer
from transformers import GPT2LMHeadModel, GPT2Config
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset

In [2]:
tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
tokenizer.train(
    'data/small_unsegmented.txt',
    vocab_size=20000,
    special_tokens=['<eos>', '<pad>', '<bos>', '<unk>'],
)
tokenizer.save_model('gpt/', 'bpe')






['gpt/bpe-vocab.json', 'gpt/bpe-merges.txt']

In [3]:
tokenizer = GPT2Tokenizer(
    vocab_file='gpt/bpe-vocab.json',merges_file='gpt/bpe-merges.txt', unk_token='<unk>', bos_token='<bos>', 
    eos_token='<eos>', pad_token='<pad>', add_prefix_space=True, local_files_only=True)

In [4]:
configuration = GPT2Config(vocab_size=tokenizer.vocab_size)
model = GPT2LMHeadModel(configuration)

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
def perplexity(input_ids, model):
    length = model.config.n_positions
    nlls = []
    for i in range(0, input_ids.size(1)-length, length):
        curr_input_ids = input_ids[:, i:i+length].to(device)
        with torch.no_grad():
            outputs = model(curr_input_ids, labels=curr_input_ids)
        nlls.append(outputs['loss'])
    return torch.exp(sum(nlls)/len(nlls))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
test_dataset = load_dataset('text', data_files='data/test.txt')
tokenized_test_dataset = tokenizer('\n\n'.join(test_dataset['train']['text']), return_tensors="pt")

Using custom data configuration default-5414c941d6230d78
Reusing dataset text (/home/ania/.cache/huggingface/datasets/text/default-5414c941d6230d78/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08)


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
perplexity(tokenized_test_dataset.input_ids, model)

tensor(23401.0781)

In [21]:
def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss
    logits = outputs.logits
    output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
    return tokenizer.decode(output['sequences'][0])

In [23]:
generate_text("Kot siedział na drzewie i ")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' Kot siedział na drzewie i  rodzinieferfer stawektkowych miały rocznego edycję edycję edycję edycję edycję'

In [24]:
generate_text("To nie jest tak, że dobrze albo niedobrze ")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' To nie jest tak, że dobrze albo niedobrze  pracuje pracuje elektrynowanie depu depu depu depu depu'

In [25]:
max_length=30
def tokenize(row):
    return tokenizer(row['text'], padding='max_length', truncation=True, max_length=max_length)

dataset = load_dataset('text', data_files='data/small_unsegmented.txt')
tokenized_dataset = dataset['train'].map(tokenize, batched=True)

Using custom data configuration default-5414c941d6230d78
Reusing dataset text (/home/ania/.cache/huggingface/datasets/text/default-5414c941d6230d78/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08)


  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/20 [00:00<?, ?ba/s]

In [29]:
training_args = TrainingArguments(output_dir="gpt_model", save_steps=2000, save_total_limit=1)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset, data_collator=data_collator)

In [30]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 20000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 7500


Step,Training Loss


KeyboardInterrupt: 

In [31]:
trainer.save_model('gpt_model')

Saving model checkpoint to gpt_model
Configuration saved in gpt_model/config.json
Model weights saved in gpt_model/pytorch_model.bin


In [3]:
model = GPT2LMHeadModel.from_pretrained('gpt_model')

In [33]:
perplexity(tokenized_test_dataset.input_ids[:, :2000], model)

tensor(6530.4565)

In [34]:
generate_text("Kot siedział na drzewie i ")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' Kot siedział na drzewie i,,,,,,,,,,,,'

In [35]:
generate_text("To nie jest tak, że dobrze albo niedobrze ")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' To nie jest tak, że dobrze albo niedobrze,,,,,,,,,'