In [1]:
import os
from PIL import Image
from tqdm import tqdm
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
)
import torch
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from config_file import config

In [None]:
# Compute device: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Folder with the images to caption, and the path of the captions output file.
images_folder = config.IMAGES_PATH / "mnk" / "real-validation"
output_file = config.DATA_PATH / "captions.txt"

In [6]:
# Load the BLIP captioning processor and model once; the captioning
# function below reads them as notebook-level globals.
processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base",
    use_fast=True,
    cache_dir=config.PROJECT_PATH / ".cache",
)
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base",
    cache_dir=config.PROJECT_PATH / ".cache",
).to(device)

In [81]:
def generate_caption(image_path: str, text: str, game: pd.Series) -> str:
    """Caption one image and prepend the game's genres and perspectives.

    Relies on the notebook-level ``processor``, ``model`` and ``device``.

    Args:
        image_path: Location of the image file, handed to ``Image.open``
            (the annotation loop in this notebook passes a path object).
        text: Prompt passed to the processor together with the image.
        game: Row providing the ``"genres"`` and ``"player_perspectives"``
            values; missing values are replaced by an "Unknown ..." label.

    Returns:
        ``"Game has genres: <genres>; perspectives: <perspectives>. "``
        followed by the decoded model output, or ``""`` if any step raises.
    """
    try:
        # The context manager closes the file handle deterministically, even
        # when decoding or conversion fails; convert() returns a fully loaded
        # image that no longer needs the file.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")

        inputs = processor(images=image, text=text, return_tensors="pt").to(device)
        output = model.generate(**inputs)
        caption = processor.decode(output[0], skip_special_tokens=True)

        genres = "Unknown genres" if pd.isna(game["genres"]) else game["genres"]
        perspectives = (
            "Unknown perspectives"
            if pd.isna(game["player_perspectives"])
            else game["player_perspectives"]
        )
        return f"Game has genres: {genres}; perspectives: {perspectives}. " + caption
    except Exception as e:
        # Deliberate best-effort: one bad image must not abort a long run,
        # so the failure is reported and an empty caption is returned.
        print(f"Ошибка при обработке {image_path}: {e}")
        return ""

In [82]:
# Games table; the annotation loop reads "image_id" from each row, and
# generate_caption reads "genres" and "player_perspectives".
df = pd.read_csv(config.DATA_PATH / "similar_games.csv")

In [None]:
# Image processing: caption the image referenced by every row of `df`.
# NOTE(review): `image_files` is not read by any code visible in this notebook.
image_files = os.listdir(images_folder)
annotations = {}
text = "this game screenshot shows"

for i, game in tqdm(df.iterrows(), total=len(df), desc="Генерация аннотаций"):
    # Image files are named "<image_id>.jpg" inside images_folder.
    img_file = game["image_id"] + ".jpg"
    image_path = images_folder / img_file
    caption = generate_caption(image_path, text, game)

    # Keyed by the bare image id (no ".jpg" suffix).
    annotations[game["image_id"]] = caption

Генерация аннотаций: 100%|██████████| 2561/2561 [14:01<00:00,  3.04it/s]

✅ Аннотации сохранены в annotations.txt





In [89]:
# Split every line of annotations.txt into its tab-separated fields.
with open("annotations.txt", "r") as f:
    # Build a real list: a lazy `map` object cannot be inspected as cell
    # output and is exhausted after a single pass.
    lines = [line.strip().split("\t") for line in f]

In [1]:
import pandas as pd

# Read the tab-separated annotations file (it has no header row) and name
# its two columns in the same expression.
annotations = pd.read_csv(
    "annotations.txt",
    sep="\t",
    header=None,
).set_axis(["filename", "annotation"], axis="columns")
annotations.head()

Unnamed: 0,filename,annotation
0,koajaesby7cmhujlcwkl.jpg,"Game has genres: Shooter, Platform, Puzzle, Ad..."
1,poegvjadtys8fflxpxnu.jpg,"Game has genres: Shooter, Platform, Puzzle, Ad..."
2,lrx26njeiciksjqby7ou.jpg,"Game has genres: Shooter, Platform, Puzzle, Ad..."
3,vux5gzepdeqvhvm84rcz.jpg,"Game has genres: Shooter, Platform, Puzzle, Ad..."
4,jlt7ncnm7cheka0hmypd.jpg,"Game has genres: Shooter, Platform, Puzzle, Ad..."


In [6]:
# Save the annotations table as CSV under DATA_PATH, without the index column.
annotations.to_csv(config.DATA_PATH / "annotations.csv", index=False)

# FuseCap

In [15]:
import os
from PIL import Image

In [16]:
from config_file import config

In [None]:
# Compute device: GPU when CUDA is available, otherwise CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Folder with the images to caption, and the path of the captions output file.
images_folder = config.IMAGES_PATH / "mnk" / "real-validation"
output_file = config.DATA_PATH / "captions.txt"

In [35]:
def generate_batch_captions(image_paths: list[str], text: str) -> list[str]:
    """Caption a batch of images with one processor/model call.

    Relies on the notebook-level ``processor``, ``model`` and ``device``.

    Args:
        image_paths: Locations of the image files, each handed to
            ``Image.open``.
        text: Prompt used for every image in the batch.

    Returns:
        The captions produced by ``processor.batch_decode`` for the batch.
        An empty list is returned for an empty batch or when any step raises,
        so the result always matches the annotated ``list[str]`` type.
    """
    # Nothing to caption: skip the processor/model call entirely.
    if len(image_paths) == 0:
        return []

    try:
        images = []
        for image_path in image_paths:
            # Close each file handle as soon as the image has been decoded;
            # convert() returns a fully loaded image independent of the file.
            with Image.open(image_path) as source_image:
                images.append(source_image.convert("RGB"))

        inputs = processor(
            images=images,
            text=[text] * len(images),
            return_tensors="pt",
        ).to(device)
        outputs = model.generate(**inputs)
        return processor.batch_decode(outputs, skip_special_tokens=True)

    except Exception as e:
        # Deliberate best-effort: report the failing batch and keep going.
        print(f"Ошибка при обработке {image_paths}: {e}")
        return []

In [9]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

In [11]:
# Rebind the notebook-level `processor` and `model` to the FuseCap
# checkpoint; the captioning functions pick them up as globals.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
processor = BlipProcessor.from_pretrained(
    "noamrot/FuseCap",
    use_fast=True,
    cache_dir=config.PROJECT_PATH / ".cache",
)
model = BlipForConditionalGeneration.from_pretrained(
    "noamrot/FuseCap",
    cache_dir=config.PROJECT_PATH / ".cache",
).to(device)

In [36]:
def chunks(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[start : start + n] for start in range(0, len(lst), n))

text = "A screenshot from a video game shows"
batch_size = 4

# Smoke test: the listing is truncated to the first `batch_size` entries,
# so exactly one batch is captioned and printed.
for frames in chunks(
    [images_folder / frame for frame in os.listdir(images_folder)][:batch_size],
    batch_size,
):
    captions = generate_batch_captions(frames, text)
    print(frames)
    print(captions)

[WindowsPath('E:/PythonProjects/Scraping Dataset/generating-annotations/images/mnk/real-validation/akfsuhtx1gc3dsntrj8g.jpg'), WindowsPath('E:/PythonProjects/Scraping Dataset/generating-annotations/images/mnk/real-validation/cn1sfluxpm7spva4kamr.jpg'), WindowsPath('E:/PythonProjects/Scraping Dataset/generating-annotations/images/mnk/real-validation/d0uionjuun4dw3euapeg.jpg'), WindowsPath('E:/PythonProjects/Scraping Dataset/generating-annotations/images/mnk/real-validation/de6fei40snc0otebm0pm.jpg')]
['a screenshot from a video game shows a dark room with a metal ladder and a black wall in the background, illuminated by a red light', 'a screenshot from a video game shows a cityscape with tall buildings and a green tree in the background a red car is parked on the street, and a white line marks the edge', 'a screenshot from a video game shows a large brown rock and a black tire in the foreground, with a statue in the background', 'a screenshot from a video game shows two men standing in 

In [None]:
# Single-image check on the sample picture hosted in the FuseCap space.
img_url = 'https://huggingface.co/spaces/noamrot/FuseCap/resolve/main/bike.jpg'

# Time out instead of hanging indefinitely, fail on HTTP errors before PIL
# sees the body, and release the connection once the image is decoded.
with requests.get(img_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    # convert() fully decodes the image, so closing the response is safe.
    raw_image = Image.open(response.raw).convert('RGB')

text = "a picture of "
inputs = processor(raw_image, text, return_tensors="pt").to(device)

out = model.generate(**inputs, num_beams=3)
print(processor.decode(out[0], skip_special_tokens=True))


# PaliGemma

In [3]:
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
import requests

In [4]:
# Checkpoint, target device and weight precision for the PaliGemma run.
model_id = "google/paligemma2-3b-mix-448"
device = "cuda:0"
dtype = torch.bfloat16

# Pass the `dtype` constant instead of repeating the literal, so changing
# the precision above actually takes effect here.
model = PaliGemmaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=dtype,
    device_map=device,
    cache_dir=config.PROJECT_PATH / ".cache",
).eval()
processor = AutoProcessor.from_pretrained(model_id, cache_dir=config.PROJECT_PATH / ".cache")

Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.57s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [5]:
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"

# Time out instead of hanging indefinitely, fail on HTTP errors before PIL
# sees the body, and release the connection once the image is decoded.
with requests.get(url, stream=True, timeout=30) as response:
    response.raise_for_status()
    image = Image.open(response.raw)
    # Image.open is lazy: decode the pixels now, while the stream is open.
    image.load()

prompt = "caption en"
model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
input_len = model_inputs["input_ids"].shape[-1]

with torch.inference_mode():
    generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
    # Drop the first `input_len` tokens so only the newly generated part is decoded.
    generation = generation[0][input_len:]
    decoded = processor.decode(generation, skip_special_tokens=True)
    print(decoded)


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.


A blue Volkswagen Beetle parked on the side of a street.
