In [1]:
!pip install tokenizers
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 5.2 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.12.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 64.2 MB/s 
Installing 

In [3]:
# Run on colab
import torch
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2Tokenizer
from transformers import GPT2LMHeadModel, GPT2Config
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset

In [7]:
tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
tokenizer.train(
    'data/small_train.txt',
    vocab_size=20000,
    special_tokens=['<eos>', '<pad>', '<bos>', '<unk>'],
)
tokenizer.save_model('gpt/', 'bpe')

['gpt/bpe-vocab.json', 'gpt/bpe-merges.txt']

In [8]:
tokenizer = GPT2Tokenizer(
    vocab_file='gpt/bpe-vocab.json',merges_file='gpt/bpe-merges.txt', unk_token='<unk>', bos_token='<bos>', 
    eos_token='<eos>', pad_token='<pad>', add_prefix_space=True, local_files_only=True)

In [9]:
configuration = GPT2Config(vocab_size=tokenizer.vocab_size)
model = GPT2LMHeadModel(configuration)

In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(20000, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [11]:
def perplexity(input_ids, model):
    length = model.config.n_positions
    nlls = []
    for i in range(0, input_ids.size(1)-length, length):
        curr_input_ids = input_ids[:, i:i+length].to(device)
        with torch.no_grad():
            outputs = model(curr_input_ids, labels=curr_input_ids)
        nlls.append(outputs['loss'])
    return torch.exp(sum(nlls)/len(nlls))

In [12]:
test_dataset = load_dataset('text', data_files='data/small_test.txt')
tokenized_test_dataset = tokenizer('\n\n'.join(test_dataset['train']['text']), return_tensors="pt")

Using custom data configuration default-2250ffc83885abe4


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-2250ffc83885abe4/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-2250ffc83885abe4/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
perplexity(tokenized_test_dataset.input_ids, model)

tensor(22896.9551, device='cuda:0')

In [14]:
def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss
    logits = outputs.logits
    output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
    return tokenizer.decode(output['sequences'][0])

In [16]:
generate_text("Kot siedział na drzewie i ")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' Kot siedział na drzewie i  wypadkach wypadkach Telewizjiówka Podsta Podsta Podsta Podsta Szcze Szcze Dwo Dwo'

In [17]:
generate_text("To nie jest tak, że dobrze albo niedobrze ")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' To nie jest tak, że dobrze albo niedobrze  sukces sukces sukces Boguwiły York York Fan zajmujących'

In [19]:
max_length=30
def tokenize(row):
    return tokenizer(row['text'], padding='max_length', truncation=True, max_length=max_length)

dataset = load_dataset('text', data_files='data/small_train.txt')
tokenized_dataset = dataset['train'].map(tokenize, batched=True)

Using custom data configuration default-1e639c9c93a95f5e


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-1e639c9c93a95f5e/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-1e639c9c93a95f5e/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/200 [00:00<?, ?ba/s]

In [20]:
training_args = TrainingArguments(output_dir="gpt_model", save_steps=2000, save_total_limit=1)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset, data_collator=data_collator)

In [21]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 200000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 75000


Step,Training Loss
500,8.2771
1000,7.9579
1500,7.8328
2000,7.7147
2500,7.5766
3000,7.455
3500,7.3525
4000,7.2529
4500,7.1995
5000,7.0757


Saving model checkpoint to gpt_model/checkpoint-2000
Configuration saved in gpt_model/checkpoint-2000/config.json
Model weights saved in gpt_model/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to gpt_model/checkpoint-4000
Configuration saved in gpt_model/checkpoint-4000/config.json
Model weights saved in gpt_model/checkpoint-4000/pytorch_model.bin
Saving model checkpoint to gpt_model/checkpoint-6000
Configuration saved in gpt_model/checkpoint-6000/config.json
Model weights saved in gpt_model/checkpoint-6000/pytorch_model.bin
Saving model checkpoint to gpt_model/checkpoint-8000
Configuration saved in gpt_model/checkpoint-8000/config.json
Model weights saved in gpt_model/checkpoint-8000/pytorch_model.bin
Deleting older checkpoint [gpt_model/checkpoint-6000] due to args.save_total_limit
Saving model checkpoint to gpt_model/checkpoint-10000
Configuration saved in gpt_model/checkpoint-10000/config.json
Model weights saved in gpt_model/checkpoint-10000/pytorch_model.bin
Saving mo

TrainOutput(global_step=75000, training_loss=5.795820774739584, metrics={'train_runtime': 7407.4503, 'train_samples_per_second': 81.0, 'train_steps_per_second': 10.125, 'total_flos': 9186048000000000.0, 'train_loss': 5.795820774739584, 'epoch': 3.0})

In [22]:
trainer.save_model('gpt_model')

Saving model checkpoint to gpt_model
Configuration saved in gpt_model/config.json
Model weights saved in gpt_model/pytorch_model.bin


In [23]:
model = GPT2LMHeadModel.from_pretrained('gpt_model')

loading configuration file gpt_model/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "use_cache": true,
  "vocab_size": 20000
}

loading weights file gpt_model/pytorch_model.bin
All model checkpoint weights were used when initializing GPT2LMHeadModel.

All the weights of GPT2LMHeadModel were initialized

In [25]:
model.to(device)
perplexity(tokenized_test_dataset.input_ids, model)

tensor(7339.0576, device='cuda:0')

In [26]:
generate_text("Kot siedział na drzewie i ")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' Kot siedział na drzewie i owe, a nie na to......'

In [27]:
generate_text("To nie jest tak, że dobrze albo niedobrze ")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' To nie jest tak, że dobrze albo niedobrze nia się z tym, co się dzieje.'