[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ZXourEIpzByuA5YpUTxyOBic_pNt0htu?usp=sharing)


# Imports

In [None]:
# !pip install transformers==4.19.0

In [None]:
import torch

# These diagnostics talk to the CUDA allocator, so only run them when a GPU is
# actually usable; on a CPU-only host the cell reports that and moves on instead
# of aborting the notebook.
if torch.cuda.is_available():
    # Clear GPU memory cached by PyTorch
    torch.cuda.empty_cache()

    # Check available memory
    print(torch.cuda.memory_summary())
else:
    print("CUDA is not available; skipping the GPU memory summary.")

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------

In [None]:
import pandas as pd
import numpy as np

In [None]:
from transformers import GPT2LMHeadModel, AutoTokenizer
import torch
from torch.utils.data import Dataset # this is the pytorch class import
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer
from transformers import StoppingCriteria, StoppingCriteriaList

RuntimeError: ignored

In [None]:
# Display the name reported for CUDA device 0.
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3080 Ti'

In [None]:
# Use the first GPU when CUDA is available; otherwise fall back to an explicit
# CPU device (not None) so every later `.to(DEVICE)` call receives a real device.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Display the index of the currently selected CUDA device.
torch.cuda.current_device()

0

In [None]:
# Hub ids of the checkpoints referenced by this notebook, followed by the path
# fragment that is interpolated into the Trainer output directory.
model_sber = "ai-forever/rugpt3medium_based_on_gpt2"
model_mayak = "AnyaSchen/rugpt3_mayakovskij"
my_path = "/"

# Load the pretrained model

In [None]:
# Fetch the tokenizer and the LM-head weights of the base checkpoint, then move
# the model onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_sber)
model = GPT2LMHeadModel.from_pretrained(model_sber)
model = model.to(DEVICE)

Downloading (…)lve/main/config.json: 100%|███████████████████████████████████████████████████████████| 674/674 [00:00<00:00, 194kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████████████████████████████████████████████████| 1.61M/1.61M [00:00<00:00, 3.15MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████████████████████████████████████████████████| 1.27M/1.27M [00:00<00:00, 2.44MB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Downloading pytorch_model.bin: 100%|████████████████████████████████████████████████████████████| 1.73G/1.73G [03:01<00:00, 9.55MB/s]


# Fine-tuning step-by-step

## Add special tokens

In [None]:
# Marker tokens used by the training text format ('<eos>' closes every poem).
SPECIAL_TOKENS = {'bos_token' : "<bos>", "eos_token" :"<eos>", 'pad_token':'<pad>'}
tokenizer.add_special_tokens(SPECIAL_TOKENS)

# The vocabulary may have grown, so the embedding matrix must match its new size.
model.resize_token_embeddings(len(tokenizer))

# Keep the model config in sync with the tokenizer for *all* three markers, not
# only bos: otherwise the saved config keeps the base checkpoint's eos/pad ids
# and generation falls back to guessing the pad token.
model.config.bos_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

## Get data

In [None]:
!wget https://www.dropbox.com/s/neb74j04nfxay14/poetry_keywords.csv?dl=0 -O poetry_keywords.csv

--2023-05-13 19:35:37--  https://www.dropbox.com/s/neb74j04nfxay14/poetry_keywords.csv?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.71.18, 205.251.194.52, 205.251.199.157, ...
Connecting to www.dropbox.com (www.dropbox.com)|162.125.71.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/neb74j04nfxay14/poetry_keywords.csv [following]
--2023-05-13 19:35:37--  https://www.dropbox.com/s/raw/neb74j04nfxay14/poetry_keywords.csv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucd18937b93694b8e97bc7191eff.dl.dropboxusercontent.com/cd/0/inline/B79r4uwFkhMxf_1SzLlLEWsYV-wI2qiG_gt7XOxM-MGxWKmtjnkBMAB9K9z7CuWSGpu_4rQTwJ-VzpC0C3LHQ4wa1_ucnW7gslwnsz99LRK2NExxa2ms4jDseHzJGdcMUNBKEzlaoyY3wjMjaZAH3N4NBWOe7tbqsEM3iauCBjjLgA/file# [following]
--2023-05-13 19:35:38--  https://ucd18937b93694b8e97bc7191eff.dl.dropboxusercontent.com/cd/0/inline/B79r4uwFkhMxf_1SzLlLEWsYV-wI2qiG_gt7XOxM-

In [None]:
import pandas as pd

# Read the downloaded CSV; later cells use its 'author', 'keywords' and 'text' columns.
dataset = pd.read_csv(filepath_or_buffer='poetry_keywords.csv')

## Create a Dataset

In [None]:
torch.manual_seed(42) # fix PyTorch's global RNG seed

class myDataset(Dataset):
  """Pre-tokenized poems wrapped as a PyTorch ``Dataset``.

  Each row of ``data`` is rendered as one text sample built from its
  ``author``, ``keywords`` and ``text`` columns, terminated with ``<eos>``,
  and tokenized with truncation and padding to ``max_length``.

  Args:
    data: frame with string columns ``author``, ``keywords`` and ``text``.
      ``keywords`` is a string with single-quoted items (e.g. ``"['a', 'b']"``).
      Any index is accepted (e.g. a filtered frame); rows are read in order.
    tokenizer: callable returning a mapping with ``input_ids`` and
      ``attention_mask`` when called with
      ``(text, truncation=..., max_length=..., padding=...)``.
    gpt2_type: unused; kept so existing callers keep working.
    max_length: length every sample is truncated/padded to.
  """

  def __init__(self, data: pd.DataFrame, tokenizer, gpt2_type="gpt2", max_length=150):

    self.tokenizer = tokenizer # the gpt2 tokenizer we instantiated
    self.input_ids = []
    self.attn_masks = []

    # Walk the columns directly instead of mixing index *labels* with
    # positional `.iloc` lookups: on a filtered frame the labels are no longer
    # 0..n-1, so `data.iloc[label]` would pick the wrong row or raise IndexError.
    for author_name, raw_keywords, poem in zip(data['author'], data['keywords'], data['text']):

      # NOTE(review): there is no space/newline between the author name and
      # 'Ключевые слова', unlike the prompt used in the Generation section —
      # confirm which format is intended before retraining.
      author = 'Автор:' + author_name
      # Every second chunk of a split on "'" is the content of a quoted item.
      keywords = 'Ключевые слова: ' + ', '.join(raw_keywords.split("'")[1:-1:2])+ '\n'
      poetry = 'Поэзия: ' + poem + '<eos>'

      form = author + keywords + poetry
      encodings_dict = tokenizer(form,
                                 truncation=True,
                                 max_length=max_length,
                                 padding="max_length")

      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

  def __len__(self):
    """Return the number of encoded samples."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the ``input_ids`` / ``attention_mask`` tensors of sample ``idx``."""
    return {
        'input_ids': self.input_ids[idx],
        'attention_mask': self.attn_masks[idx]
    }

In [None]:
# Keep only Mayakovsky's poems. The filtered frame is renumbered 0..n-1 so that
# positional and label-based row lookups inside the dataset class agree.
train_dataset = myDataset(
    dataset[dataset['author']=='Маяковский'].reset_index(drop=True),
    tokenizer,
)

## Add Datacollator

In [None]:
# Collator handed to the Trainer below; mlm=False turns off masked-LM masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Inspect one encoded training example (index 950 is presumably an arbitrary spot check).
train_dataset[950]

{'input_ids': tensor([  684,   623,   390,  1936,  1694,    30,  3377, 14817,    16,  6020,
            16, 43160,    16, 22182,    16,  8282,   203,  4408,   599,  8253,
            30, 37402,   203, 38864, 10252, 34151,   357,   334, 47092, 47092,
          8324,    30,   203,   384,   682,  1161,  2559,   323,   203,   338,
           230,   338,   230,   338,   230,   338,   230,   338,   230,   338,
           230,   338,   230,   338,   230,   338,   230,  4379,   623,  2559,
           278,     5,   203, 46249,  6620,   357,   477, 39221,    16,   203,
          4379,   623,  2559,   278, 31429,  2661,    18,   203,  4670,   353,
          4565,   357,   387,   289,   281,   867,    16,   203,  2824,  6398,
          5053,   834,   843,    18,   203, 10269, 33171,   203,   338,   230,
           338,   230,   338,   230,   338,   230, 20355, 10252, 10252, 19696,
          1139,   203, 32592,  3606,   620,   557,  2869,   282,  2575,  1262,
            18,   203, 50259,     0,   

## Training

In [None]:
# Hyper-parameters of the fine-tuning run.
training_args = TrainingArguments(
    output_dir=f'.{my_path}checkouts/',  # checkpoints are written here
    overwrite_output_dir=True,           # reuse the directory if it already exists
    num_train_epochs=65,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=5,       # accumulate 5 batches of 4 per optimizer step
    warmup_steps=150,                    # learning-rate warmup steps
    save_steps=5000,
    fp16=True,
)

# The optimizer is built explicitly; no scheduler object is supplied (None).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    optimizers=(torch.optim.AdamW(model.parameters(), lr=1e-5), None),
)

In [None]:
# Run the fine-tuning loop configured in the Trainer above.
trainer.train()

    There is an imbalance between your GPUs. You may want to exclude GPU 0 which
    has less than 75% of the memory or cores of GPU 1. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


## Save models

In [None]:
# Write the fine-tuned model to ./model/
trainer.save_model('./model/')

In [None]:
# Persist the tokenizer files to ./tokenizer (the model itself goes to ./model/ in the previous cell).
tokenizer.save_pretrained('./tokenizer')

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.json',
 './tokenizer/merges.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')

## Upload to the Hugging Face Hub

In [None]:
!pip install huggingface_hub
!!huggingface-cli login --token {auth_token}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


['Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.',
 'Token is valid.',
 'Your token has been saved to /home/revolt/.cache/huggingface/token',
 'Login successful']

In [None]:
# Upload the fine-tuned weights and the tokenizer to the same Hub repository.
# NOTE(review): the Generation section loads 'AnyaSchen/rugpt3-large-key2poetry',
# which is a different repo id from the one pushed here — confirm which is intended.
# NOTE(review): the base checkpoint loaded earlier is the "medium" model while
# this repo id says "large" — confirm the naming.
model.push_to_hub('AnyaSchen/rugpt3-large-keywords2poetry')
tokenizer.push_to_hub('AnyaSchen/rugpt3-large-keywords2poetry')

CommitInfo(commit_url='https://huggingface.co/AnyaSchen/rugpt3-large-keywords2poetry/commit/c76b450b9e0c9cda363432c7d156e032c68b86bd', commit_message='Upload tokenizer', commit_description='', oid='c76b450b9e0c9cda363432c7d156e032c68b86bd', pr_url=None, pr_revision=None, pr_num=None)

# Generation

In [None]:
# Prefer the GPU; fall back to an explicit CPU device (not None) so that
# `.to(DEVICE)` always receives a real device.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the published fine-tuned checkpoint (tokenizer + weights) and move the
# model onto the selected device.
tokenizer = AutoTokenizer.from_pretrained('AnyaSchen/rugpt3-large-key2poetry')
model = GPT2LMHeadModel.from_pretrained('AnyaSchen/rugpt3-large-key2poetry')
model = model.to(DEVICE)

# Alternative: the medium-size checkpoint
# tokenizer = AutoTokenizer.from_pretrained('AnyaSchen/rugpt3-medium-key2poetry')
# model = GPT2LMHeadModel.from_pretrained('AnyaSchen/rugpt3-medium-key2poetry').to(DEVICE)

In [None]:
# Prompt in the three-field layout: author, keywords, then the open 'Поэзия:'
# field that the model is asked to continue.
inp = (
    'Автор: Маяковский\n'
    'Ключевые слова: любовь, жизнь, сон\n'
    'Поэзия:'
)

In [None]:
def generate_poetry(input: str, model, num_beams=3, max_length=200, temperature=2.0, top_p=0.9):
  """Generate a poem continuing the prompt ``input``.

  Uses the notebook-level ``tokenizer``. Inputs are placed on the device the
  model already lives on, so the function does not depend on a global device.

  Args:
    input: prompt text; an empty string is replaced by the tokenizer's bos token.
    model: causal LM exposing ``generate`` and ``device``.
    num_beams: number of beams passed to ``generate``.
    max_length: maximum total length (prompt + continuation) in tokens.
    temperature: sampling temperature.
    top_p: nucleus-sampling threshold.

  Returns:
    The first decoded sequence with special tokens stripped; it starts with
    the prompt text.
  """
  # Fall back to the beginning-of-sequence token when no prompt is given.
  prompt = input if len(input) > 0 else tokenizer.bos_token
  device = model.device

  # Let the tokenizer build the attention mask together with the ids.
  encoded = tokenizer(prompt, return_tensors="pt")
  input_ids = encoded["input_ids"].to(device)
  attention_mask = encoded["attention_mask"].to(device)

  # Pass the pad id explicitly so generate() does not have to guess it (and
  # warn); tokenizers without a pad token reuse eos.
  pad_token_id = tokenizer.pad_token_id
  if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id

  with torch.no_grad():
    out = model.generate(input_ids,
                         attention_mask=attention_mask,
                         do_sample=True,
                         num_beams=num_beams,
                         temperature=temperature,
                         top_p=top_p,
                         max_length=max_length,
                         eos_token_id=tokenizer.eos_token_id,
                         bos_token_id=tokenizer.bos_token_id,
                         pad_token_id=pad_token_id,
                         )
  return tokenizer.batch_decode(out, skip_special_tokens=True)[0]

In [None]:
# Generate one poem for the prompt defined above and print it.
print(generate_poetry(inp, model))

Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.


Автор: Маяковский
Ключевые слова: любовь, жизнь, сон
Поэзия: А мне
        только снится
         настоящая жизнь.
Снится
         любовь,
         настоящая жизнь.
Только не пойму —
за что
       мне такая малость?!
Если б длились сны
такие же длительные,
я б
       жилы
           из любви к тебе бы
алиловым соком исцарапал.

