In [1]:
!pip install tokenizers
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 5.6 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.12.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.0 MB/s 
Insta

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Run on colab
import torch
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2Tokenizer
from transformers import GPT2LMHeadModel, GPT2Config
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset

In [4]:
# Pre-trained polish GPT2 model (https://huggingface.co/flax-community/papuGaPT2)
model = GPT2LMHeadModel.from_pretrained('flax-community/papuGaPT2')
tokenizer = GPT2Tokenizer.from_pretrained('flax-community/papuGaPT2')
tokenizer.pad_token = 0

Downloading:   0%|          | 0.00/864 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/487M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/867k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/534k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208 [00:00<?, ?B/s]

In [20]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dro

In [25]:
def perplexity(input_ids, model):
    length = model.config.n_positions
    nlls = []
    size = (input_ids.size(1)//length)*length # Round to the number divisible by n_positions
    for i in range(0, size, length): 
        curr_input_ids = input_ids[:, i:i+length].to(device)
        with torch.no_grad():
            outputs = model(curr_input_ids, labels=curr_input_ids)
        nlls.append(outputs['loss'])
    return torch.exp(sum(nlls)/len(nlls)).item()

In [9]:
# Run calculations on a subset of the test file with 20000 lines
test_dataset = load_dataset('text', data_files='data/small_test.txt')
tokenized_test_dataset = tokenizer('\n\n'.join(test_dataset['train']['text']), return_tensors="pt")

Using custom data configuration default-2250ffc83885abe4


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-2250ffc83885abe4/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-2250ffc83885abe4/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
perplexity(tokenized_test_dataset.input_ids, model)

298.61334228515625

In [27]:
def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss
    logits = outputs.logits
    output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, 
                            min_length=30, max_length=50)
    return tokenizer.decode(output['sequences'][0])

In [28]:
generate_text("Kot siedział na drzewie i ")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Kot siedział na drzewie i łkał. Gdy się obudził, zobaczył, że w jego pokoju jest kot. Kot zaczął się drapać i mruczeć. Kot zaczął się śmiać. Kot zaczął się śmiać. Kot zaczął się śmiać. Kot zaczął'

In [29]:
generate_text("To nie jest tak, że dobrze albo niedobrze ")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'To nie jest tak, że dobrze albo niedobrze rządzenie jest złe. To jest po prostu bardzo dobre.\nW tym roku, w którym obchodzimy 100-lecie odzyskania niepodległości, w wielu miastach Polski odbędą się uroczystości związane z obchodami tego święta.'

In [30]:
def tokenize(row):
    return tokenizer(row['text'])

train_dataset = load_dataset('text', data_files='data/small_train.txt')
tokenized_train_dataset = train_dataset['train'].map(tokenize, batched=True)

Using custom data configuration default-1e639c9c93a95f5e


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-1e639c9c93a95f5e/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-1e639c9c93a95f5e/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/200 [00:00<?, ?ba/s]

In [31]:
training_args = TrainingArguments(output_dir="gpt_pretrained", save_steps=2000, save_total_limit=1)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_train_dataset, data_collator=data_collator)

In [32]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 200000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 75000


Step,Training Loss
500,3.9143
1000,3.8074
1500,3.82
2000,3.7573
2500,3.7525
3000,3.7479
3500,3.742
4000,3.7077
4500,3.7175
5000,3.6932


Saving model checkpoint to gpt_pretrained/checkpoint-2000
Configuration saved in gpt_pretrained/checkpoint-2000/config.json
Model weights saved in gpt_pretrained/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to gpt_pretrained/checkpoint-4000
Configuration saved in gpt_pretrained/checkpoint-4000/config.json
Model weights saved in gpt_pretrained/checkpoint-4000/pytorch_model.bin
Deleting older checkpoint [gpt_pretrained/checkpoint-2000] due to args.save_total_limit
Saving model checkpoint to gpt_pretrained/checkpoint-6000
Configuration saved in gpt_pretrained/checkpoint-6000/config.json
Model weights saved in gpt_pretrained/checkpoint-6000/pytorch_model.bin
Deleting older checkpoint [gpt_pretrained/checkpoint-4000] due to args.save_total_limit
Saving model checkpoint to gpt_pretrained/checkpoint-8000
Configuration saved in gpt_pretrained/checkpoint-8000/config.json
Model weights saved in gpt_pretrained/checkpoint-8000/pytorch_model.bin
Deleting older checkpoint [gpt_pretraine

KeyboardInterrupt: ignored

In [34]:
# Best iteration (saved manually)
model = GPT2LMHeadModel.from_pretrained('gpt_pretrained/checkpoint-28000')

loading configuration file gpt_pretrained/checkpoint-28000/config.json
Model config GPT2Config {
  "_name_or_path": "flax-community/papuGaPT2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.0,
  "bos_token_id": 50256,
  "embd_pdrop": 0.0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.0,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version"

In [35]:
trainer.save_model('gpt_pretrained')

Saving model checkpoint to gpt_pretrained
Configuration saved in gpt_pretrained/config.json
Model weights saved in gpt_pretrained/pytorch_model.bin


In [38]:
model = GPT2LMHeadModel.from_pretrained('gpt_pretrained')
model.to(device)

loading configuration file gpt_pretrained/config.json
Model config GPT2Config {
  "_name_or_path": "flax-community/papuGaPT2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.0,
  "bos_token_id": 50256,
  "embd_pdrop": 0.0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.0,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "us

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dro

In [39]:
generate_text("Kot siedział na drzewie i ")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Kot siedział na drzewie i łkał, a jego oczy były koloru niebieskiego..................................'

In [40]:
generate_text("To nie jest tak, że dobrze albo niedobrze ")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'To nie jest tak, że dobrze albo niedobrze rządzenie jest w Polsce...................................'

In [41]:
perplexity(tokenized_test_dataset.input_ids, model)

622.496337890625