[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/15UUJKrBZCU2iZwyUeT95qL0A-5VbAVk-?usp=sharing)

# Get real data

In [None]:
from tqdm import tqdm

In [None]:
import pandas as pd

# The first column of the CSV is an unnamed running row number (it appears to be
# an index saved together with the data). Read it as the index so that it does
# not show up as a spurious 'Unnamed: 0' data column.
dataset = pd.read_csv('../../data/poetry_keywords.csv', index_col=0)

In [None]:
# Display the loaded frame to check its columns (text, keywords, author) and contents.
dataset

Unnamed: 0,text,keywords,author
0,"Влас Прогулкин —\nмилый мальчик,\nспать ложилс...","['спать', 'журнальчик', 'заставить', 'мальчик'...",Маяковский
1,"Засыпает на рассвете,\nскомкав\nёрзаньем\nкров...","['вставать', 'детвора', 'отец', 'засыпать', 'у...",Маяковский
2,"Разошлись\nдругие\nв школы,–\nВлас\nу крана\nп...","['мочить', 'дрематься', 'выходить', 'школа', '...",Маяковский
3,Пошагал\nи встал разиней:\nвывеска на магазине...,"['магазин', 'вывеска', 'прочесть', 'пошагать',...",Маяковский
4,"С конца прочёл\nзнаток наук, —\nНомисвыходит\n...","['номисвыходить', 'наука', 'пять', 'прочесть',...",Маяковский
...,...,...,...
7750,Чудный сон мне бог послал—\nС длинной белой бо...,"['сон', 'старец', 'ангел', 'плаватель', 'готов...",Пушкин
7751,"Бедный пахарь утомленный,\nОтрешишь волов от п...","['утомлённый', 'сон', 'ждать', 'исповедовать',...",Пушкин
7752,"И страшуся и надеюсь,\nКазни вечныя страшуся,\...","['успокоить', 'надеяться', 'казнь', 'творец', ...",Пушкин
7753,"О нет, мне жизнь не надоела,\nЯ жить люблю, я ...","['надоесть', 'охладеть', 'жизнь', 'хотеть', 'м...",Пушкин


In [None]:
# Distinct poets present in the data, in order of first appearance.
authors = pd.unique(dataset['author'])

In [None]:
import torch

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Medium model generation

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Identifier of the medium keywords-to-poetry checkpoint used in this section.
path = 'AnyaSchen/rugpt3-medium-keywords2poetry'
tokenizer = AutoTokenizer.from_pretrained(path)
# Load the weights first, then move the model to the selected device.
model = AutoModelForCausalLM.from_pretrained(path)
model = model.to(device)

In [None]:
def generate_poetry(input: str, model, num_beams=3, temperature=2.0, top_p=0.9, max_length=200):
  """Generate one poem that continues the prompt ``input`` using ``model``.

  The prompt is encoded with the notebook-level ``tokenizer`` and passed to
  ``model.generate`` with sampling enabled (``do_sample=True``) over
  ``num_beams`` beams.

  Args:
    input: Prompt text. An empty string is replaced by the tokenizer's
      beginning-of-sequence token. (The name shadows the builtin; it is kept
      so that existing keyword callers keep working.)
    model: Causal language model exposing ``generate`` and ``device``.
    num_beams: Number of beams used during generation.
    temperature: Sampling temperature.
    top_p: Nucleus-sampling probability mass.
    max_length: Maximum total length (prompt plus continuation) in tokens.

  Returns:
    The decoded text of the first returned sequence, with special tokens removed.
  """
  prompt = input if len(input) > 0 else tokenizer.bos_token  # beginning-of-sequence token
  # Put the prompt on the same device as the model that will consume it.
  input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

  # The prompt is a single, unpadded sequence, so every position is a real token.
  # Deriving the mask from `tokenizer.pad_token_id` was fragile: that id was only
  # assigned after the mask had been built, so the first call and later calls
  # compared against different ids.
  attention_mask = torch.ones_like(input_ids)

  with torch.no_grad():
    out = model.generate(input_ids,
                         attention_mask=attention_mask,
                         do_sample=True,
                         num_beams=num_beams,
                         temperature=temperature,
                         top_p=top_p,
                         max_length=max_length,
                         eos_token_id=tokenizer.eos_token_id,
                         bos_token_id=tokenizer.bos_token_id,
                         # Passed explicitly instead of mutating the shared tokenizer;
                         # this is the value `generate` otherwise falls back to while
                         # logging a warning on every single call.
                         pad_token_id=tokenizer.eos_token_id)
  return tokenizer.batch_decode(out, skip_special_tokens=True)[0]

In [None]:
# Sample prompt: the author is fixed and the keyword list is left empty, so the
# model has to continue from the keywords onwards.
inp = 'Автор: Маяковский\nКлючевые слова:'

In [None]:
# Smoke test: generate a single poem for the sample prompt and print it.
print(generate_poetry(input=inp, model=model))

Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.


Автор: Маяковский
Ключевые слова: германия, фашистский, гитлерюгенд, рабочий, стена
Поэзия: Слушайте,
рабочие мира,
злые языки утверждают, будто
в Германии
во время войны
вейсками
строился Ад.
Слушайте,
фашистские наймиты,
строители рая,
вопите, что все неправда!
Это неправда!
Это –
строился
рабочий мира
на Эльбе!



In [None]:
import ast


def _keywords_to_text(raw):
  """Convert a stringified keyword list such as "['a', 'b']" into 'a, b'.

  Strings that do not look like a list literal are returned unchanged, so
  running this cell a second time no longer wipes the already converted column.
  """
  text = raw.strip()
  if not text.startswith('['):
    return raw
  try:
    # Parsing the literal also handles items whose repr uses double quotes.
    return ', '.join(map(str, ast.literal_eval(text)))
  except (ValueError, SyntaxError):
    # Malformed literal: fall back to taking the text between single quotes.
    return ', '.join(raw.split("'")[1:-1:2])


dataset['keywords'] = dataset['keywords'].apply(_keywords_to_text)

In [None]:
# One separate, initially empty list of generated poems per author.
generated_poetry = dict((poet, []) for poet in authors)

In [None]:
for author in authors:
  print(f'Current author: {author}')
  data = dataset[dataset['author'] == author].reset_index()

  # Start from an empty list so that re-running this cell replaces the author's
  # poems instead of appending a second copy after those of the previous run.
  generated_poetry[author] = []
  # Iterate over the keyword strings directly; tqdm takes the total from the Series.
  for keywords in tqdm(data['keywords']):
    inp = f"Автор: {author}\nКлючевые слова: {keywords}"
    generated_poetry[author].append(generate_poetry(inp, model))

Current author: Маяковский


  0%|                                                                                          | 0/1107 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                  | 1/1107 [00:02<53:40,  2.91s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                                                               | 2/1107 [00:07<1:09:38,  3.78s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                                                               | 3/1107 [00:10<1:04:39,  3.51s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▎                                                                                 | 4/1107 [00:11<48:24,  2.63s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▎                                  

Current author: Тютчев


  0%|                                                                                          | 0/1000 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                  | 1/1000 [00:02<45:48,  2.75s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                                                                 | 2/1000 [00:05<47:54,  2.88s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                                                                 | 3/1000 [00:08<47:06,  2.84s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▎                                                                                 | 4/1000 [00:12<57:18,  3.45s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▍                                  

Current author: Блок


  0%|                                                                                          | 0/1963 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                | 1/1963 [00:03<1:52:35,  3.44s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                | 2/1963 [00:04<1:14:05,  2.27s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                | 3/1963 [00:08<1:29:34,  2.74s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                                                               | 4/1963 [00:10<1:20:25,  2.46s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                  

Current author: Есенин


  0%|                                                                                          | 0/1478 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                | 1/1478 [00:02<1:11:49,  2.92s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                  | 2/1478 [00:03<40:50,  1.66s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                                                                 | 3/1478 [00:05<45:40,  1.86s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                                                                 | 4/1478 [00:08<56:23,  2.30s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▎                                  

Current author: Пушкин


  0%|                                                                                          | 0/2207 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                | 1/2207 [00:01<1:05:19,  1.78s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                  | 2/2207 [00:03<59:08,  1.61s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                | 3/2207 [00:07<1:42:07,  2.78s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                                                               | 4/2207 [00:12<2:13:51,  3.65s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                  

In [None]:
import pickle

In [None]:
def upload_generated_poetry(data: dict, file_name: str = './medium_all_poets.pkl'):
  """Serialize ``data`` with pickle and write it to ``file_name``.

  Args:
    data: Object to store; in this notebook a dict mapping each author to the
      list of poems generated for that author.
    file_name: Destination path. Defaults to the medium-model output file, so
      existing one-argument calls keep writing to the same place.
  """
  with open(file_name, 'wb') as f:
    pickle.dump(data, f)

In [None]:
# Save the medium-model poems to disk as a pickle (./medium_all_poets.pkl).
upload_generated_poetry(data=generated_poetry)

In [None]:
import gc

# Removing the name alone does not guarantee that the memory is handed back
# immediately: collect unreachable objects and release PyTorch's cached GPU
# blocks so the larger checkpoint loaded next has as much free memory as possible.
del model
gc.collect()
if torch.cuda.is_available():
  torch.cuda.empty_cache()

# Large model generation

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Identifier of the large keywords-to-poetry checkpoint; `tokenizer` and `model`
# are rebound here, so the cells below use the large model.
path = 'AnyaSchen/rugpt3-large-keywords2poetry'
tokenizer = AutoTokenizer.from_pretrained(path)
# Load the weights first, then move the model to the selected device.
model = AutoModelForCausalLM.from_pretrained(path)
model = model.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/754 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.74M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/868 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# One separate, initially empty list of large-model poems per author.
generated_poetry_large = dict((poet, []) for poet in authors)

In [None]:
for author in authors:
  print(f'Current author: {author}')
  data = dataset[dataset['author'] == author].reset_index()

  # Start from an empty list so that re-running this cell replaces the author's
  # poems instead of appending a second copy after those of the previous run.
  generated_poetry_large[author] = []
  # Iterate over the keyword strings directly; tqdm takes the total from the Series.
  for keywords in tqdm(data['keywords']):
    inp = f"Автор: {author}\nКлючевые слова: {keywords}"
    generated_poetry_large[author].append(generate_poetry(inp, model))

Current author: Маяковский


Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                | 1/1107 [00:03<1:06:22,  3.60s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                                                               | 2/1107 [00:09<1:29:20,  4.85s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                                                               | 3/1107 [00:16<1:46:41,  5.80s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▎                                                                               | 4/1107 [00:19<1:30:56,  4.95s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▎                                                                               | 5/1107 [00:24<1:26:29,  4.71s/it]Setting `pad_token_id` to `eos_token_id`

Current author: Тютчев


  0%|                                                                                          | 0/1000 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                  | 1/1000 [00:02<37:57,  2.28s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                                                               | 2/1000 [00:07<1:04:11,  3.86s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                                                                 | 3/1000 [00:09<53:48,  3.24s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▎                                                                                 | 4/1000 [00:10<37:06,  2.24s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▍                                  

Current author: Блок


  0%|                                                                                          | 0/1963 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                | 1/1963 [00:03<1:56:15,  3.56s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                | 2/1963 [00:05<1:31:16,  2.79s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                | 3/1963 [00:08<1:23:46,  2.56s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                                                               | 4/1963 [00:11<1:40:34,  3.08s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                  

Current author: Есенин


  0%|                                                                                          | 0/1478 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                  | 1/1478 [00:02<50:11,  2.04s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                  | 2/1478 [00:03<39:20,  1.60s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                                                                 | 3/1478 [00:04<30:29,  1.24s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                                                                 | 4/1478 [00:05<32:30,  1.32s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▎                                  

Current author: Пушкин


  0%|                                                                                          | 0/2207 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                | 1/2207 [00:04<2:38:38,  4.31s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                | 2/2207 [00:06<1:58:06,  3.21s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|                                                                                | 3/2207 [00:10<2:09:01,  3.51s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                                                               | 4/2207 [00:15<2:25:24,  3.96s/it]Setting `pad_token_id` to `eos_token_id`:50259 for open-end generation.
  0%|▏                                  

In [None]:
def upload_generated_poetry(data: dict, file_name: str = './large_all_poets.pkl'):
  """Serialize ``data`` with pickle and write it to ``file_name``.

  Args:
    data: Object to store; in this notebook a dict mapping each author to the
      list of poems generated for that author.
    file_name: Destination path. Defaults to the large-model output file, so
      existing one-argument calls keep writing to the same place.
  """
  with open(file_name, 'wb') as f:
    pickle.dump(data, f)

In [None]:
# Save the large-model poems to disk as a pickle (./large_all_poets.pkl).
upload_generated_poetry(data=generated_poetry_large)