# Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
import pickle

In [2]:
!pip install transformers==4.19.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.19.0
  Downloading transformers-4.19.0-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 16.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 49.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 52.3 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Un

# Connect to GPU

In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
DEVICE = torch.device("cuda")

In [4]:
!nvidia-smi

Wed Jun 15 11:42:27 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Install model

In [5]:
model_name_or_path = "sberbank-ai/rugpt3medium_based_on_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
model = GPT2LMHeadModel.from_pretrained('AnyaSchen/rugpt3_tyutchev').to(DEVICE)

Downloading:   0%|          | 0.00/1.54M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/982 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

# Fine-tuning step-by-step

## Add special tokens

In [7]:
SPECIAL_TOKENS = {'bos_token' : "<bos>", "eos_token" :"<eos>", 'pad_token':'<pad>'}
tokenizer.add_special_tokens(SPECIAL_TOKENS)
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 1024)

## Get data

In [8]:
!wget https://www.dropbox.com/s/b6q4tokw0skjsfz/dataset.pkl?dl=0 -O dataset.pkl

--2022-06-15 09:48:39--  https://www.dropbox.com/s/b6q4tokw0skjsfz/dataset.pkl?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.1.18, 2620:100:6016:18::a27d:112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.1.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/b6q4tokw0skjsfz/dataset.pkl [following]
--2022-06-15 09:48:39--  https://www.dropbox.com/s/raw/b6q4tokw0skjsfz/dataset.pkl
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucc511ae93047352e2c410fd444f.dl.dropboxusercontent.com/cd/0/inline/BnMAd-sPm7oBbj4V3yEPU6LG9keC44_Vl-R1EKqzghSWa11EOyNHk--bGzLBoBRflS_DArGclaCo4h6DE21r3sVmg91owvqabRNb7_P7Uu-BQNugFKZeuOLcDg1ldwoMorY7pYZVhBmOg3amV1O9oe-yK-wLKvttj6VIkYhAuDqtZg/file# [following]
--2022-06-15 09:48:39--  https://ucc511ae93047352e2c410fd444f.dl.dropboxusercontent.com/cd/0/inline/BnMAd-sPm7oBbj4V3yEPU6LG9keC44_Vl-R1EKqzghSWa11EOyNHk--bGzLBoBRflS

In [9]:
with open('dataset.pkl', 'rb') as f:
    dataset = pickle.load(f)

## Greate a Dataset

In [10]:
import torch
torch.manual_seed(42)
from torch.utils.data import Dataset # this is the pytorch class import

class myDataset(Dataset):

  def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=500):

    self.tokenizer = tokenizer # the gpt2 tokenizer we instantiated
    self.input_ids = []
    self.attn_masks = []

    for txt in txt_list:
      """
      This loop will iterate through each entry in the flavour text corpus.
      For each bit of text it will prepend it with the start of text token,
      then append the end of text token and pad to the maximum length with the 
      pad token. 
      """

      encodings_dict = tokenizer('<bos>'+ txt + '<eos>', 
                                 truncation=True, 
                                 max_length=max_length, 
                                 padding="max_length")
      
      """
      Each iteration then appends either the encoded tensor to a list,
      or the attention mask for that encoding to a list. The attention mask is
      a binary list of 1's or 0's which determine whether the langauge model
      should take that token into consideration or not. 
      """
      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
    
  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return {
        'input_ids': self.input_ids[idx],
        'attn_masks': self.attn_masks[idx]
    }

In [11]:
train_dataset = myDataset(dataset['tyutchev'], tokenizer)

## Add Datacollator

In [12]:
from transformers import TextDataset, DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [13]:
train_dataset[2]

{'attn_masks': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1,

## Training

In [14]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./drive/MyDrive/tyutchev_checkouts/", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs = 15, # number of training epochs
    per_device_train_batch_size=2, # batch size for training
    per_device_eval_batch_size=2,  # batch size for evaluation
    warmup_steps=150,# number of warmup steps for learning rate scheduler
    gradient_accumulation_steps=1, # to make "virtual" batch size larger
    save_steps = 1000
    )


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    optimizers = (torch.optim.AdamW(model.parameters(),lr=1e-5),None) # Optimizer and lr scheduler
)

In [15]:
trainer.train()

***** Running training *****
  Num examples = 459
  Num Epochs = 15
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 3450
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: attn_masks. If attn_masks are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


Step,Training Loss
500,0.7893
1000,0.6849
1500,0.5894
2000,0.5214
2500,0.4634
3000,0.4312


Saving model checkpoint to ./drive/MyDrive/tyutchev_checkouts/checkpoint-1000
Configuration saved in ./drive/MyDrive/tyutchev_checkouts/checkpoint-1000/config.json
Model weights saved in ./drive/MyDrive/tyutchev_checkouts/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./drive/MyDrive/tyutchev_checkouts/checkpoint-2000
Configuration saved in ./drive/MyDrive/tyutchev_checkouts/checkpoint-2000/config.json
Model weights saved in ./drive/MyDrive/tyutchev_checkouts/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./drive/MyDrive/tyutchev_checkouts/checkpoint-3000
Configuration saved in ./drive/MyDrive/tyutchev_checkouts/checkpoint-3000/config.json
Model weights saved in ./drive/MyDrive/tyutchev_checkouts/checkpoint-3000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=3450, training_loss=0.5578373961517776, metrics={'train_runtime': 3231.2568, 'train_samples_per_second': 2.131, 'train_steps_per_second': 1.068, 'total_flos': 6244242462720000.0, 'train_loss': 0.5578373961517776, 'epoch': 15.0})

## Save models

In [16]:
trainer.save_model('/content/drive/MyDrive/tyutchev_model_2/')

Saving model checkpoint to /content/drive/MyDrive/tyutchev_model_2/
Configuration saved in /content/drive/MyDrive/tyutchev_model_2/config.json
Model weights saved in /content/drive/MyDrive/tyutchev_model_2/pytorch_model.bin


# Generation

In [6]:
# Пример вероятностного сэмплирвоания с ограничением
text = "И ветер дул, и небо моросило"
input_ids = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
model.eval()
with torch.no_grad():
    out = model.generate(input_ids, 
                        do_sample=True,
                        num_beams=3,
                        temperature=1.5,
                        top_p=0.9,
                        max_length=100,
                        )

generated_text = list(map(tokenizer.decode, out))[0]
print()
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



И ветер дул, и небо моросило,
И целый год бесновался непогожий, —
И всё для сердца и для глаз уныло-беспечно.
И в самом деле, что нам до этих бурь и ненастий!
По зрелом размышленьи,
Давно минувших дней печаль нам ни к чему.
Они не стоят ломаного гроша,
И нашей жизни, и нашего труда,
И нашей мимолетной славы,
