In [1]:
import os
from PIL import Image
from tqdm import tqdm
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
)
import torch
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from config_file import config

In [3]:
# Input/output locations, both taken from the project config.
output_file = config.DATA_PATH / "captions.txt"
images_folder = config.IMAGES_PATH

# Compute device: CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [6]:
# Load the BLIP captioning processor and model once; both use the same
# checkpoint name and the same cache directory under the project path.
blip_checkpoint = "Salesforce/blip-image-captioning-base"
blip_cache_dir = config.PROJECT_PATH / ".cache"

processor = BlipProcessor.from_pretrained(blip_checkpoint, cache_dir=blip_cache_dir)
model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint, cache_dir=blip_cache_dir)
model = model.to(device)

In [81]:
def generate_caption(image_path: "str | os.PathLike", text: str, game: pd.Series) -> str:
    """Caption one screenshot and prefix the caption with the game's metadata.

    The image is converted to RGB, passed together with the prompt ``text``
    through the module-level ``processor`` and ``model``, and the first decoded
    sequence is used as the caption.

    Args:
        image_path: Location of the image file (string or path-like object).
        text: Prompt handed to the processor alongside the image.
        game: Row with the "genres" and "player_perspectives" entries; missing
            (NaN) values are replaced by "Unknown ..." placeholders.

    Returns:
        "Game has genres: <genres>; perspectives: <perspectives>. <caption>",
        or an empty string if anything fails (the error is printed, not raised).
    """
    try:
        # The context manager releases the file handle deterministically, even
        # when decoding fails part-way; convert() returns an independent image.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")

        inputs = processor(images=image, text=text, return_tensors="pt").to(device)
        output = model.generate(**inputs)
        caption = processor.decode(output[0], skip_special_tokens=True)

        genres = "Unknown genres" if pd.isna(game["genres"]) else game["genres"]
        perspectives = (
            "Unknown perspectives"
            if pd.isna(game["player_perspectives"])
            else game["player_perspectives"]
        )
        return f"Game has genres: {genres}; perspectives: {perspectives}. " + caption
    except Exception as e:
        # Deliberate best-effort: one bad image must not abort a long batch run.
        print(f"Ошибка при обработке {image_path}: {e}")
        return ""

In [82]:
# Game metadata table; the captioning code reads its "image_id", "genres"
# and "player_perspectives" columns.
similar_games_csv = config.DATA_PATH / "similar_games.csv"
df = pd.read_csv(similar_games_csv)

In [None]:
# Caption every screenshot listed in `df`, then persist the results so the
# cells that read "annotations.txt" work after Restart & Run All.
image_files = os.listdir(images_folder)
annotations = {}
text = "this game screenshot shows"

for _, game in tqdm(df.iterrows(), total=len(df), desc="Генерация аннотаций"):
    img_file = game["image_id"] + ".jpg"
    image_path = images_folder / img_file
    # generate_caption returns "" on failure, so a broken image yields an
    # empty annotation instead of stopping the run.
    annotations[game["image_id"]] = generate_caption(image_path, text, game)

# One "<filename>\t<caption>" line per image, no header — the layout the
# downstream pd.read_csv(..., sep="\t", header=None) call expects.
with open("annotations.txt", "w", encoding="utf-8") as annotations_out:
    for image_id, caption in annotations.items():
        # Keep each record on a single line with exactly one tab separator.
        flat_caption = caption.replace("\t", " ").replace("\n", " ")
        annotations_out.write(f"{image_id}.jpg\t{flat_caption}\n")

print("✅ Аннотации сохранены в annotations.txt")

Генерация аннотаций: 100%|██████████| 2561/2561 [14:01<00:00,  3.04it/s]

✅ Аннотации сохранены в annotations.txt





In [89]:
# Parse the tab-separated annotations file into one list of fields per line.
with open("annotations.txt", "r", encoding="utf-8") as f:
    # Split on tabs BEFORE stripping: stripping the whole line first would eat
    # the trailing tab of a row with an empty caption and drop that column.
    # A list (not a lazy map) lets the rows be inspected more than once.
    lines = [
        [field.strip() for field in line.rstrip("\r\n").split("\t")]
        for line in f
    ]

In [1]:
import pandas as pd

# Headerless tab-separated file; name its two columns after loading.
annotations = pd.read_csv("annotations.txt", header=None, sep="\t")
annotations = annotations.set_axis(["filename", "annotation"], axis="columns")
annotations.head()

Unnamed: 0,filename,annotation
0,koajaesby7cmhujlcwkl.jpg,"Game has genres: Shooter, Platform, Puzzle, Ad..."
1,poegvjadtys8fflxpxnu.jpg,"Game has genres: Shooter, Platform, Puzzle, Ad..."
2,lrx26njeiciksjqby7ou.jpg,"Game has genres: Shooter, Platform, Puzzle, Ad..."
3,vux5gzepdeqvhvm84rcz.jpg,"Game has genres: Shooter, Platform, Puzzle, Ad..."
4,jlt7ncnm7cheka0hmypd.jpg,"Game has genres: Shooter, Platform, Puzzle, Ad..."


In [6]:
# Re-export the annotations table as CSV under the data directory, without
# the index column.
annotations_csv = config.DATA_PATH / "annotations.csv"
annotations.to_csv(annotations_csv, index=False)

# FuseCap

In [5]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

In [8]:
# Load the FuseCap checkpoint. NOTE: this rebinds the module-level
# `processor` and `model`, so any code run afterwards uses FuseCap.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

fusecap_checkpoint = "noamrot/FuseCap"
fusecap_cache_dir = config.PROJECT_PATH / ".cache"

processor = BlipProcessor.from_pretrained(fusecap_checkpoint, cache_dir=fusecap_cache_dir)
model = BlipForConditionalGeneration.from_pretrained(fusecap_checkpoint, cache_dir=fusecap_cache_dir)
model = model.to(device)

In [None]:
# Smoke-test the loaded captioning model on a single sample image.
img_url = 'https://huggingface.co/spaces/noamrot/FuseCap/resolve/main/bike.jpg'

# Bound the wait with a timeout and fail loudly on an HTTP error, so an error
# page is never handed to PIL; the `with` block releases the connection once
# convert() has finished reading the stream.
with requests.get(img_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    raw_image = Image.open(response.raw).convert('RGB')

# Prompt prefix given to the processor together with the image.
text = "a picture of "
inputs = processor(raw_image, text, return_tensors="pt").to(device)

out = model.generate(**inputs, num_beams=3)
print(processor.decode(out[0], skip_special_tokens=True))
