# Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
import pickle

In [2]:
!pip install transformers==4.19.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.19.0
  Downloading transformers-4.19.0-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 48.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 42.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.8 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uni

# Connect to GPU

In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
DEVICE = torch.device("cuda")

In [4]:
!nvidia-smi

Wed Jun 15 11:51:08 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Install model

In [5]:
model_name_or_path = "sberbank-ai/rugpt3medium_based_on_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
model = GPT2LMHeadModel.from_pretrained('AnyaSchen/rugpt3_blok').to(DEVICE)

Downloading:   0%|          | 0.00/1.54M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/978 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

# Fine-tuning step-by-step

## Add special tokens

In [9]:
SPECIAL_TOKENS = {'bos_token' : "<bos>", "eos_token" :"<eos>", 'pad_token':'<pad>'}
tokenizer.add_special_tokens(SPECIAL_TOKENS)
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 1024)

## Get data

In [10]:
!wget https://www.dropbox.com/s/b6q4tokw0skjsfz/dataset.pkl?dl=0 -O dataset.pkl

--2022-06-15 09:17:47--  https://www.dropbox.com/s/b6q4tokw0skjsfz/dataset.pkl?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.1.18, 2620:100:6016:18::a27d:112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.1.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/b6q4tokw0skjsfz/dataset.pkl [following]
--2022-06-15 09:17:47--  https://www.dropbox.com/s/raw/b6q4tokw0skjsfz/dataset.pkl
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc0f52332712b4b0d0f7289d27c5.dl.dropboxusercontent.com/cd/0/inline/BnMUWU3COR4b1mK829_SRSZa2FQwhQ1EYyOXR9n6ktZ2DmOp54DCs3BM5iNMVNR-L20-ccWyB_uHdb5QwwHuUnDBJsVpXrgjthkPBHp3p3SRzDIFzXyUHSNAs31Fa1Eq1mtWx-bk-SoCfYpi7y8XdaTSxPfzD8s7E6hU6o44xYdfHA/file# [following]
--2022-06-15 09:17:48--  https://uc0f52332712b4b0d0f7289d27c5.dl.dropboxusercontent.com/cd/0/inline/BnMUWU3COR4b1mK829_SRSZa2FQwhQ1EYyOXR9n6ktZ2DmOp54DCs3BM5iNMVNR-L2

In [11]:
with open('dataset.pkl', 'rb') as f:
    dataset = pickle.load(f)

## Greate a Dataset

In [12]:
import torch
torch.manual_seed(42)
from torch.utils.data import Dataset # this is the pytorch class import

class myDataset(Dataset):

  def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=150):

    self.tokenizer = tokenizer # the gpt2 tokenizer we instantiated
    self.input_ids = []
    self.attn_masks = []

    for txt in txt_list:
      """
      This loop will iterate through each entry in the flavour text corpus.
      For each bit of text it will prepend it with the start of text token,
      then append the end of text token and pad to the maximum length with the 
      pad token. 
      """

      encodings_dict = tokenizer('<bos>'+ txt + '<eos>', 
                                 truncation=True, 
                                 max_length=max_length, 
                                 padding="max_length")
      
      """
      Each iteration then appends either the encoded tensor to a list,
      or the attention mask for that encoding to a list. The attention mask is
      a binary list of 1's or 0's which determine whether the langauge model
      should take that token into consideration or not. 
      """
      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
    
  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return {
        'input_ids': self.input_ids[idx],
        'attn_masks': self.attn_masks[idx]
    }

In [13]:
train_dataset = myDataset(dataset['blok'], tokenizer)

## Add Datacollator

In [14]:
from transformers import TextDataset, DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [15]:
train_dataset[2]

{'attn_masks': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([50257,  7798,   416,  1490,    16,  1490,  2059,    16,   203, 37653,
           420,   289,  4297, 31007, 10951,   800,    16,   203,  8756,   636,
         23022,    16,  5834,   203,  6039,  1354, 10544, 12186,     5,   203,
         11459, 24779,   702,    16,   537,  5020,    16,   537,  4411,   203,
          3096,  6597,  2803,    16,  7680,    16,  5794,    16,   203, 21885,
          1472,  6192,   694,  3670,   203, 19115,   460, 17648

## Training

In [16]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./drive/MyDrive/blok_checkouts_2/", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=10, # number of training epochs
    per_device_train_batch_size=4, # batch size for training
    per_device_eval_batch_size=4,  # batch size for evaluation
    warmup_steps=150,# number of warmup steps for learning rate scheduler
    gradient_accumulation_steps=2, # to make "virtual" batch size larger
    save_steps = 1000
    )


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    optimizers = (torch.optim.AdamW(model.parameters(),lr=1e-5),None) # Optimizer and lr scheduler
)

In [17]:
trainer.train()

***** Running training *****
  Num examples = 2122
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 2650
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: attn_masks. If attn_masks are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


Step,Training Loss
500,1.5524
1000,1.4407
1500,1.3326
2000,1.2512
2500,1.1992


Saving model checkpoint to ./drive/MyDrive/blok_checkouts_2/checkpoint-1000
Configuration saved in ./drive/MyDrive/blok_checkouts_2/checkpoint-1000/config.json
Model weights saved in ./drive/MyDrive/blok_checkouts_2/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./drive/MyDrive/blok_checkouts_2/checkpoint-2000
Configuration saved in ./drive/MyDrive/blok_checkouts_2/checkpoint-2000/config.json
Model weights saved in ./drive/MyDrive/blok_checkouts_2/checkpoint-2000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2650, training_loss=1.344735136212043, metrics={'train_runtime': 2639.6138, 'train_samples_per_second': 8.039, 'train_steps_per_second': 1.004, 'total_flos': 5772999414988800.0, 'train_loss': 1.344735136212043, 'epoch': 10.0})

## Save models

In [18]:
trainer.save_model('/content/drive/MyDrive/blok_model_2/')

Saving model checkpoint to /content/drive/MyDrive/blok_model_2/
Configuration saved in /content/drive/MyDrive/blok_model_2/config.json
Model weights saved in /content/drive/MyDrive/blok_model_2/pytorch_model.bin


# Generation

In [6]:
# Пример вероятностного сэмплирвоания с ограничением
text = "И ветер дул, и небо моросило"
input_ids = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
model.eval()
with torch.no_grad():
    out = model.generate(input_ids, 
                        do_sample=True,
                        num_beams=3,
                        temperature=1.5,
                        top_p=0.9,
                        max_length=100,
                        )

generated_text = list(map(tokenizer.decode, out))[0]
print()
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



И ветер дул, и небо моросило,
И от дождя синяя пелена
Озаряла унылый путь.
И каждый думал о своем.
Я думал: «Как хочется жить!
Как хочется гулять, дышать,
Как хочется напиться чаю,
Поспать на диване, в уюте,
Уснуть на бархатной перине!
Как хочется напиться чаю,
Поспать на диване, в уюте,
Уснуть на
