[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1WK9q4TG1TTm2FYJgTOAFw8hHG8Dw3Zna?usp=sharing)




## Fix device issues

In [None]:
import os
# Pin the process to GPU index 1 and ask CUDA to launch kernels synchronously
# (easier-to-read errors, at the cost of speed). Both variables are written
# here, before the cell that imports torch.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '1',
    'CUDA_LAUNCH_BLOCKING': '1',
})

In [None]:
import torch

In [None]:
# Display the name of CUDA device 0 as the cell output (sanity check of the GPU selection).
torch.cuda.get_device_name(0)

'Quadro RTX 6000'

In [None]:
# Use the first visible GPU when CUDA is available; otherwise fall back to the
# CPU so that `device` is always a real torch.device (never None) when it is
# later passed to `.to(device)`.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# 1. Get the dataset

In [None]:
# !pip install datasets

In [None]:
from datasets import load_dataset

# Load the image-to-poetry dataset from the Hugging Face Hub (a local cached copy is reused when present).
data_train = load_dataset("AnyaSchen/image2poetry_ru")

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset parquet (/home/revolt/.cache/huggingface/datasets/AnyaSchen___parquet/AnyaSchen--image2poetry_ru-bd53c8b353e828ac/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 332.09it/s]


In [None]:
# Keep only the 'train' split; later cells read 'image', 'author' and 'poetry' from its rows.
data = data_train['train']

# 2. Preprocess the dataset: 

Preprocess the images and poetry text for training. Images are preprocessed with the ViT image processor that matches the vision encoder. For text, the author and the poem are joined into one sequence (`<bos> author <sep> poem <eos>`) and tokenized with a Russian GPT tokenizer.

In [None]:
from PIL import Image
from io import BytesIO
from torch.utils.data import Dataset
from transformers import ViTImageProcessor, AutoTokenizer

class ImagePoetryDataset(Dataset):
    """Pairs each image with its ``<bos> author <sep> poem <eos>`` token sequence.

    Args:
        dataset: Indexable collection whose rows provide ``'image'`` (an object
            with ``.convert("RGB")``), ``'author'`` and ``'poetry'``.
        vit_image_processor: Callable turning an image into a mapping that
            contains a batched ``"pixel_values"`` tensor.
        tokenizer: Callable tokenizer returning batched ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        max_length: Fixed number of tokens every text is padded/truncated to.
    """

    # Label value skipped by the cross-entropy loss used in Hugging Face models.
    IGNORE_INDEX = -100

    def __init__(self, dataset, vit_image_processor, tokenizer, max_length=128):
        self.dataset = dataset
        self.vit_image_processor = vit_image_processor
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``pixel_values``, ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        # Fetch the row once instead of three times; each row access may have
        # to load/decode the image again.
        row = self.dataset[idx]

        # Load and preprocess the image (drop the batch dimension the processor adds).
        image = row['image'].convert("RGB")
        inputs = self.vit_image_processor(images=image, return_tensors="pt")
        pixel_values = inputs["pixel_values"].squeeze(0)

        # Concatenate author and poetry with separator
        text = f"<bos> {row['author']} <sep> {row['poetry']} <eos>"

        # Tokenize the combined text to a fixed length.
        tokens = self.tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
        )
        input_ids = tokens["input_ids"].squeeze(0)
        attention_mask = tokens["attention_mask"].squeeze(0)

        # Labels are the input IDs, except on padding positions: those are set
        # to the ignore index so the loss is not computed on <pad> tokens.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            "pixel_values": pixel_values,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            'labels': labels
        }


# Image preprocessing comes from the ViT checkpoint; the text side uses a
# GPT tokenizer for the Russian language.
vit_image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
tokenizer = AutoTokenizer.from_pretrained('ai-forever/rugpt3medium_based_on_gpt2')

# Register the marker tokens that ImagePoetryDataset places in every training text.
SPECIAL_TOKENS = dict(bos_token="<bos>", eos_token="<eos>", pad_token='<pad>', sep_token='<sep>')
tokenizer.add_special_tokens(SPECIAL_TOKENS)

# Wrap the raw rows so that indexing yields model-ready tensors.
dataset = ImagePoetryDataset(data, vit_image_processor, tokenizer)

# Example usage: print the shape of the processed image tensor, the token IDs
# and the attention mask of the first sample.
sample = dataset[0]
for field in ("pixel_values", "input_ids", "attention_mask"):
    print(sample[field].shape)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


torch.Size([3, 224, 224])
torch.Size([128])
torch.Size([128])


# 3. Fine-tune the model

Fine-tune a vision-language model using the preprocessed dataset. You can use a pre-trained VisionEncoderDecoder model from Hugging Face as a starting point, and then fine-tune it on your custom dataset. This way, the model will learn the relationship between the images and the corresponding poetry.

In [None]:
import torch
from torch.utils.data import DataLoader, random_split
from transformers import VisionEncoderDecoderModel, VisionEncoderDecoderConfig, TrainingArguments, Trainer

# Start from a pretrained VisionEncoderDecoder image-captioning checkpoint.
model = VisionEncoderDecoderModel.from_pretrained("tuman/vit-rugpt2-image-captioning")
model.to(device)
# The tokenizer received extra special tokens, so the decoder embedding matrix
# has to be resized to the new vocabulary size.
model.decoder.resize_token_embeddings(len(tokenizer))
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Split the dataset into train and validation sets (99% / 1%).
# A seeded generator keeps the split identical across kernel restarts, so the
# validation loss stays comparable between runs.
SPLIT_SEED = 42
train_size = int(0.99 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Create data loaders.
# NOTE: these loaders are not passed to the Trainer below, which receives the
# datasets directly; they are only useful for manual inspection.
batch_size = 3
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# Define the training arguments.
# With gradient accumulation, one optimizer step covers
# batch_size * gradient_accumulation_steps = 30 samples per device.
training_args = TrainingArguments(
    output_dir="./checkouts",
    num_train_epochs=27,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=10,
    evaluation_strategy="epoch",
    logging_dir="./image_poetry_logs",
    save_steps = 1000,
    learning_rate=3e-5,
    weight_decay=0.01,
    # fp16=True,  # Use mixed precision training if possible (requires an NVIDIA GPU with Tensor Cores)
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Persist the model, tokenizer and image processor.
model.save_pretrained('./model')
tokenizer.save_pretrained('./tokenizer')
vit_image_processor.save_pretrained('./processor')

## Upload to the Hugging Face Hub

In [None]:
# Install the Hugging Face Hub client and log in with an access token.
# NOTE(review): `auth_token` is not defined in any visible cell; it must already
# exist in the kernel when this runs. Read it from an environment variable or
# getpass rather than hardcoding it in the notebook.
!pip install huggingface_hub
!huggingface-cli login --token {auth_token}

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /home/revolt/.cache/huggingface/token
Login successful


In [None]:
# Publish the fine-tuned model together with the tokenizer and the image
# processor used for training. The objects from the training cells are used
# here, so this cell also works on a fresh top-to-bottom run: the names
# `fine_tuned_model` and `feature_extractor` are only defined in later cells.
HUB_REPO_ID = 'AnyaSchen/vit-rugpt3-large-poetry-ft'
model.push_to_hub(HUB_REPO_ID)
tokenizer.push_to_hub(HUB_REPO_ID)
vit_image_processor.push_to_hub(HUB_REPO_ID)


pytorch_model.bin: 100%|█████████████████████████████████████████████████| 4.30G/4.30G [10:29<00:00, 6.83MB/s]

Upload 1 LFS files: 100%|██████████████████████████████████████████████████████| 1/1 [10:30<00:00, 630.29s/it]


CommitInfo(commit_url='https://huggingface.co/AnyaSchen/vit-rugpt3-large-poetry-ft/commit/9e4a00999612f9cd4b5f1d1ce979aea76ba46a95', commit_message='Upload feature extractor', commit_description='', oid='9e4a00999612f9cd4b5f1d1ce979aea76ba46a95', pr_url=None, pr_revision=None, pr_num=None)

# 4. Generation

In [None]:
import torch
# Prefer the GPU, but fall back to the CPU so the generation cells also run on
# machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from PIL import Image
import requests
from transformers import AutoTokenizer, CLIPProcessor, VisionEncoderDecoderModel, ViTFeatureExtractor

def generate_poetry(fine_tuned_model, image, tokenizer, author, image_processor=None):
    """Generate a poem for ``image`` conditioned on ``author``.

    Args:
        fine_tuned_model: Model exposing ``generate(...)`` and a ``device``
            attribute (as Hugging Face pretrained models do).
        image: Input picture. Objects with a ``convert`` method (PIL images)
            are converted to RGB first, as the training data was.
        tokenizer: Tokenizer used to encode the prompt and decode the result;
            must provide ``pad_token_id`` and ``eos_token_id``.
        author: Author name placed in the decoder prompt.
        image_processor: Optional callable that turns the image into
            ``pixel_values``. Defaults to the notebook-level
            ``feature_extractor``.

    Returns:
        The decoded text with special tokens removed. The decoder prompt is
        part of the generated sequence, so the text starts with the author.
    """
    processor = feature_extractor if image_processor is None else image_processor
    # Run the inputs on whatever device the model lives on instead of relying
    # on a global `device` variable.
    target_device = fine_tuned_model.device

    # Training images were converted to RGB; do the same here so RGBA,
    # palette or grayscale inputs have the expected three channels.
    if hasattr(image, "convert"):
        image = image.convert("RGB")

    # Preprocess the image with the ViT image processor
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(target_device)

    # Encode author's name and prepare as input to the decoder
    author_input = f"<bos> {author} <sep>"
    decoder_input_ids = tokenizer.encode(author_input, return_tensors="pt").to(target_device)

    # Generate the poetry with the fine-tuned VisionEncoderDecoder model
    generated_tokens = fine_tuned_model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=300,
        num_beams=3,
        top_p=0.8,
        temperature=2.0,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode the generated tokens
    generated_poetry = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    return generated_poetry


# A single Hub repository holds the model weights, the image preprocessing
# config and the tokenizer.
CHECKPOINT = "AnyaSchen/vit-rugpt3-large-poetry-ft"

fine_tuned_model = VisionEncoderDecoderModel.from_pretrained(CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
feature_extractor = ViTFeatureExtractor.from_pretrained(CHECKPOINT)

# Move the model to the selected device; as the last expression of the cell
# this also displays the model.
fine_tuned_model.to(device)

In [None]:
url = 'https://gamerwall.pro/uploads/posts/2022-07/1657962598_1-gamerwall-pro-p-grustnaya-zima-oboi-1.jpg'
# Test with a new image. Use a timeout so a stalled server cannot hang the
# notebook, and fail on HTTP errors instead of handing an error page to PIL.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
# Generate poetry based on the input image
generated_poetry = generate_poetry(fine_tuned_model, image, tokenizer, 'Пушкин')
print(generated_poetry)

 Пушкин Метель серебрится,
Немолчный звук,
То отголосок прежних дней
Серебрится в лунном сияньи.
Умчались, умчалися прочь
Все счастливые дни моей жизни,
И сердце остыло, и ум холодный
Страшен мне.
Брожу один, и в лунном сияньи
Мне вспомнилась прежняя жизнь,
Метель серебристится,
Немолчный звук...
Отчего ты, любовь моя,
Так грустна и так уныла?
 
