[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1DFPverSl5ADn2czBmxfDrJdChgJ93LGi?usp=sharing)

In [None]:
# Shell escape: print the NVIDIA driver / GPU status table for this runtime.
!nvidia-smi

Fri May 26 00:01:53 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.57.02    Driver Version: 516.93       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:0B:00.0 Off |                  N/A |
| 35%   57C    P0   111W / 350W |     14MiB / 12288MiB |      2%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Quadro RTX 6000     On   | 00000000:43:00.0 Off |                  Off |
| 42%   66C    P0    74W / 260W |    343MiB / 24576MiB |      2%      Default |
|

In [None]:
from datasets import load_dataset

# Load the image-to-poetry dataset by its repository id. Later cells read the
# 'train' split and its 'author' and 'image' fields.
data = load_dataset('AnyaSchen/image2poetry_ru')

Found cached dataset parquet (/home/revolt/.cache/huggingface/datasets/AnyaSchen___parquet/AnyaSchen--image2poetry_ru-bd53c8b353e828ac/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00, 55.98it/s]


In [None]:
import torch
# Prefer the first CUDA GPU, but fall back to the CPU so the notebook still
# runs (slowly) on a runtime without a GPU instead of failing at the first
# `.to(device)` call.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# Release cached, currently unused GPU memory held by PyTorch before the
# models are loaded.
torch.cuda.empty_cache()

In [None]:
from PIL import Image
import requests
from transformers import AutoTokenizer, CLIPProcessor, VisionEncoderDecoderModel, ViTImageProcessor

def generate_poetry(fine_tuned_model, image, tokenizer, author):
    """Generate a poem for ``image``, optionally prompted with ``author``.

    Relies on two notebook-level globals that must be defined before the
    call: ``feature_extractor`` (the image processor that matches the model)
    and ``device`` (the device the model has been moved to).

    Args:
        fine_tuned_model: Model exposing ``generate``; it receives the pixel
            values and the decoder prompt ids (a ``VisionEncoderDecoderModel``
            in this notebook).
        image: Image accepted by ``feature_extractor``.
        tokenizer: Tokenizer paired with the decoder; must provide
            ``encode``, ``decode``, ``pad_token_id`` and ``eos_token_id``.
        author: Poet name inserted into the decoder prompt. May be ``''``,
            in which case the prompt carries no name and nothing is stripped
            from the decoded text.

    Returns:
        The decoded text with U+FFFD replacement characters removed and, when
        ``author`` is non-empty and present, everything up to and including
        its first occurrence (the echoed prompt) cut off.
    """
    # Preprocess the image with the model's image processor (global)
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # Encode author's name and prepare as input to the decoder
    author_input = f"<bos> {author} <sep>"
    decoder_input_ids = tokenizer.encode(author_input, return_tensors="pt").to(device)

    # Generate the poetry with the fine-tuned VisionEncoderDecoder model
    generated_tokens = fine_tuned_model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=300,
        num_beams=3,
        top_p=0.8,
        temperature=2.0,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode the generated tokens and drop undecodable-byte placeholders
    generated_poetry = tokenizer.decode(generated_tokens[0], skip_special_tokens=True).replace('�', '')

    # Strip the echoed prompt. An empty author must be skipped explicitly:
    # str.split('') / str.partition('') raise "ValueError: empty separator".
    # The prompt sits at the start of the sequence, so cut after the FIRST
    # occurrence; cutting after the last one would discard the beginning of
    # any poem that mentions the poet by name.
    if author:
        _, matched, remainder = generated_poetry.partition(author)
        if matched:
            generated_poetry = remainder
    return generated_poetry

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from tqdm import tqdm

In [None]:
import pickle

def upload_generated_poetry(data: dict, file_name):
  """Pickle ``data`` into ``file_name``, overwriting any existing file.

  Despite the name nothing is uploaded: the object is written to the local
  path ``file_name`` in binary mode.
  """
  with open(file_name, 'wb') as sink:
    pickle.Pickler(sink).dump(data)

# Generate poetry with the fine-tuned vit-rugpt3 model

In [None]:
# Model, image processor and tokenizer are all loaded from the same repo id.
path = 'AnyaSchen/vit-rugpt3-large-poetry-ft'
fine_tuned_model = VisionEncoderDecoderModel.from_pretrained(path).to(device)
# generate_poetry() reads `feature_extractor` as a global, so the name matters.
feature_extractor = ViTImageProcessor.from_pretrained(path)

tokenizer = AutoTokenizer.from_pretrained(path)

# One (initially empty) list of generated poems per distinct author.
poetry_all_poets = {author: [] for author in set(data['train']['author'])}
# Iterate the rows directly so each example is fetched once per iteration
# instead of twice (once for the author, once for the image).
for row in tqdm(data['train']):
  author = row['author']
  poetry_all_poets[author].append(generate_poetry(fine_tuned_model, row['image'], tokenizer, author))

# NOTE(review): the file name says "rugpt2" although the checkpoint is rugpt3;
# left unchanged in case other code reads this exact path.
upload_generated_poetry(poetry_all_poets, './vit_rugpt2_ft.pkl')
# Drop the reference so the model can be freed before other models are loaded.
del fine_tuned_model

 10%|████▏                                    | 797/7755 [1:00:45<7:09:26,  3.70s/it]

# Generate poetry with the vit-rugpt3-medium-poet models

In [None]:
# Poet name as compared against the dataset's 'author' field -> name of the
# local directory that holds that poet's model, feature extractor and
# tokenizer. Iteration order is the processing order.
info_author = {
    'Маяковский': 'mayak',
    'Тютчев': 'tyutchev',
    'Блок': 'blok',
    'Пушкин': 'pushkin',
    'Есенин': 'esenin',
}

In [None]:
# Author of every train row, read once up front so rows of other poets can be
# skipped without fetching the whole example.
row_authors = data['train']['author']

for author in tqdm(info_author):
  path = f'{info_author[author]}/'
  # Load the fine-tuned model
  fine_tuned_model = VisionEncoderDecoderModel.from_pretrained(f'./{path}model')
  # generate_poetry() reads `feature_extractor` as a global, so it is rebound
  # here for each poet's model.
  feature_extractor = ViTImageProcessor.from_pretrained(f'./{path}vit_feature_extractor')

  # Load a GPT tokenizer for the Russian language
  tokenizer = AutoTokenizer.from_pretrained(f'./{path}tokenizer')
  fine_tuned_model.to(device)

  poetry_list = []
  for i in tqdm(range(data['train'].num_rows)):
    author_curr = row_authors[i]
    if author_curr != author:
      continue
    # The per-poet model is prompted with an empty author name
    # (presumably because each model covers a single poet - TODO confirm).
    poetry_list.append(generate_poetry(fine_tuned_model, data['train'][i]['image'], tokenizer, ''))

  upload_generated_poetry({author: poetry_list}, f'./{path}generated_image2poetry_vit_rugpt2_{info_author[author]}.pkl')
  # Drop the reference so this model can be freed before the next one loads.
  del fine_tuned_model

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
 58%|██████████████████████████████████████████████▋                                 | 4530/7755 [3:04:52<00:33, 97.08it/s][A
 59%|██████████████████████████████████████████████▊                                 | 4541/7755 [3:04:52<00:32, 99.40it/s][A
 59%|██████████████████████████████████████████████▎                                | 4552/7755 [3:04:52<00:31, 102.01it/s][A
 59%|██████████████████████████████████████████████▍                                | 4563/7755 [3:04:52<00:31, 100.02it/s][A
 59%|███████████████████████████████████████████████▏                                | 4574/7755 [3:04:53<00:32, 99.04it/s][A
 59%|██████████████████████████████████████████████▋                                | 4585/7755 [3:04:53<00:31, 101.17it/s][A
 59%|██████████████████████████████████████████████▊                                | 4596/7755 [3:04:53<00:31, 100.04it/s][A
 59%|████████████████████████