In [1]:
# Mount Google Drive at /content/drive so the later cells can read the
# annotation CSV and the generated images stored under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import gc
import nltk
import pandas as pd
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Fetch the WordNet corpus into the local NLTK data directory.
nltk.download('wordnet')

# Prefer the GPU when one is visible to torch; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

csv_file = '/content/drive/MyDrive/updated_annotations.csv'
generated_images_folder = '/content/drive/MyDrive/generated_images'

# Index the names of all regular files in the generated-images folder once,
# so that each CSV row below only needs a set lookup.
generated_files = set()
try:
    with os.scandir(generated_images_folder) as entries:
        generated_files.update(entry.name for entry in entries if entry.is_file())
    print("Number of generated files:", len(generated_files))
except OSError as e:
    print("Klasör taraması hatası:", e)
    # A failed scan leaves no usable index; discard anything partially read.
    generated_files = set()

# Collect the captions and the matching generated-image paths from the CSV.
captions = []
image_paths = []
chunksize = 1000
count = 0

# Stream the CSV in fixed-size chunks instead of loading it in one piece.
for chunk in pd.read_csv(csv_file, chunksize=chunksize):
    for raw_caption, raw_path in zip(chunk['generated_captions'], chunk['filepath']):
        # Skip rows whose caption or file path is missing or blank.
        if pd.isna(raw_caption) or not str(raw_caption).strip():
            continue
        if pd.isna(raw_path) or not str(raw_path).strip():
            continue

        relative_path = str(raw_path).strip()
        # Drop the leading "images2/" directory component when present.
        if relative_path.startswith("images2/"):
            relative_path = relative_path[len("images2/"):]

        # Generated images are named "<original stem>_generated.jpg".
        stem = os.path.splitext(relative_path)[0]
        candidate_name = f"{stem}_generated.jpg"
        if candidate_name not in generated_files:
            continue

        captions.append(str(raw_caption).strip())
        image_paths.append(os.path.join(generated_images_folder, candidate_name))

    # Release the chunk before reading the next one.
    del chunk
    gc.collect()
    print("Processed chunk:", count)
    count += 1

print("Number of valid image-caption pairs:", len(captions))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Using device: cuda
Number of generated files: 123081
Processed chunk: 0
Processed chunk: 1
Processed chunk: 2
Processed chunk: 3
Processed chunk: 4
Processed chunk: 5
Processed chunk: 6
Processed chunk: 7
Processed chunk: 8
Processed chunk: 9
Processed chunk: 10
Processed chunk: 11
Processed chunk: 12
Processed chunk: 13
Processed chunk: 14
Processed chunk: 15
Processed chunk: 16
Processed chunk: 17
Processed chunk: 18
Processed chunk: 19
Processed chunk: 20
Processed chunk: 21
Processed chunk: 22
Processed chunk: 23
Processed chunk: 24
Processed chunk: 25
Processed chunk: 26
Processed chunk: 27
Processed chunk: 28
Processed chunk: 29
Processed chunk: 30
Processed chunk: 31
Processed chunk: 32
Processed chunk: 33
Processed chunk: 34
Processed chunk: 35
Processed chunk: 36
Processed chunk: 37
Processed chunk: 38
Processed chunk: 39
Processed chunk: 40
Processed chunk: 41
Processed chunk: 42
Processed chunk: 43
Processed chunk: 44
Processed chunk: 45
Processed chunk: 46
Processed chunk: 

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Load the pretrained CLIP checkpoint (moved to the selected device) together
# with the processor that tokenizes text and preprocesses images for it.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Single source of truth for the extended text context length; the tokenizer
# limit and the positional-embedding table below are both derived from it.
new_max_length = 128
clip_processor.tokenizer.model_max_length = new_max_length

text_embeddings = clip_model.text_model.embeddings
old_max_length = clip_model.config.text_config.max_position_embeddings
old_pos_embed = text_embeddings.position_embedding.weight.data
hidden_size = old_pos_embed.shape[1]

if new_max_length > old_max_length:
    print(f"Extending positional embeddings from {old_max_length} to {new_max_length}")
    num_extra_positions = new_max_length - old_max_length

    # Keep the existing rows unchanged and fill every additional position with
    # a copy of the last existing row. torch.cat inherits dtype and device from
    # the original table, so no separate allocation or .to(device) is needed.
    # NOTE(review): the added positions carry no position information of their
    # own; confirm this heuristic is acceptable for captions longer than the
    # original limit.
    new_pos_embed = torch.cat(
        [old_pos_embed, old_pos_embed[-1:, :].expand(num_extra_positions, hidden_size)],
        dim=0,
    )

    # Build the replacement layer directly from the extended table instead of
    # creating a randomly initialised layer and overwriting its weights.
    new_embedding = torch.nn.Embedding.from_pretrained(new_pos_embed, freeze=False)
    text_embeddings.position_embedding = new_embedding
    clip_model.config.text_config.max_position_embeddings = new_max_length

    # The embeddings module also caches a `position_ids` buffer sized for the
    # old limit. Resize it as well so the model handles sequences longer than
    # the old limit even when the caller does not pass `position_ids`.
    if isinstance(getattr(text_embeddings, "position_ids", None), torch.Tensor):
        text_embeddings.position_ids = torch.arange(
            new_max_length, dtype=torch.long, device=new_pos_embed.device
        ).unsqueeze(0)

def load_image(img_path):
    """Load an image from disk and return it converted to RGB.

    Args:
        img_path: Path of the image file to open.

    Returns:
        The RGB-converted image, or ``None`` when the file cannot be opened
        or decoded (the error is printed rather than raised, so one bad file
        does not abort a whole batch).
    """
    try:
        # The context manager closes the underlying file handle as soon as the
        # pixel data has been copied by convert(), including when decoding fails.
        with Image.open(img_path) as source_image:
            return source_image.convert("RGB")
    except Exception as e:
        print(f"Error loading image {img_path}: {e}")
        return None

batch_size = 128
clip_scores = []
num_batches = (len(captions) + batch_size - 1) // batch_size

# One thread pool is shared by all batches; creating and tearing down a pool
# per batch only adds overhead.
with ThreadPoolExecutor() as executor:
    for i in range(0, len(captions), batch_size):
        batch_captions = captions[i:i+batch_size]
        batch_image_paths = image_paths[i:i+batch_size]

        # executor.map yields results in input order, so each image stays
        # aligned with its caption and clip_scores follows the caption order.
        loaded_images = list(executor.map(load_image, batch_image_paths))

        # Drop pairs whose image could not be loaded (load_image returned None).
        valid_batch_images = []
        valid_batch_captions = []
        for image, cap in zip(loaded_images, batch_captions):
            if image is not None:
                valid_batch_images.append(image)
                valid_batch_captions.append(cap)

        if not valid_batch_images:
            continue

        # Tokenize with the same extended length that the model setup configured.
        text_inputs = clip_processor(
            text=valid_batch_captions,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=new_max_length,
        )
        image_inputs = clip_processor(images=valid_batch_images, return_tensors="pt")

        text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
        image_inputs = {k: v.to(device) for k, v in image_inputs.items()}

        # Pass explicit position ids 0..seq_len-1 for every row so the text
        # encoder does not depend on its internally cached position index.
        input_ids = text_inputs["input_ids"]
        position_ids = torch.arange(input_ids.shape[1], dtype=torch.long, device=device).unsqueeze(0).expand_as(input_ids)
        text_inputs["position_ids"] = position_ids

        with torch.no_grad():
            text_features = clip_model.get_text_features(**text_inputs)
            image_features = clip_model.get_image_features(**image_inputs)

        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        # Row-wise cosine similarity for the whole batch in one call, moved to
        # the host with a single transfer instead of one .item() per sample.
        batch_scores = torch.nn.functional.cosine_similarity(text_features, image_features, dim=-1)
        clip_scores.extend(batch_scores.tolist())

        print(f"Processed batch {i//batch_size + 1}/{num_batches}")

# NOTE(review): the value reported below is the mean raw cosine similarity;
# confirm whether the rescaled CLIPScore definition (w * max(cos, 0)) is wanted.
if clip_scores:
    avg_clip_score = sum(clip_scores) / len(clip_scores)
    print("Average CLIPScore for generated images: {:.4f}".format(avg_clip_score))
else:
    print("No valid image-caption pairs were processed.")


Extending positional embeddings from 77 to 128
Processed batch 1/962
Processed batch 2/962
Processed batch 3/962
Processed batch 4/962
Processed batch 5/962
Processed batch 6/962
Processed batch 7/962
Processed batch 8/962
Processed batch 9/962
Processed batch 10/962
Processed batch 11/962
Processed batch 12/962
Processed batch 13/962
Processed batch 14/962
Processed batch 15/962
Processed batch 16/962
Processed batch 17/962
Processed batch 18/962
Processed batch 19/962
Processed batch 20/962
Processed batch 21/962
Processed batch 22/962
Processed batch 23/962
Processed batch 24/962
Processed batch 25/962
Processed batch 26/962
Processed batch 27/962
Processed batch 28/962
Processed batch 29/962
Processed batch 30/962
Processed batch 31/962
Processed batch 32/962
Processed batch 33/962
Processed batch 34/962
Processed batch 35/962
Processed batch 36/962
Processed batch 37/962
Processed batch 38/962
Processed batch 39/962
Processed batch 40/962
Processed batch 41/962
Processed batch 42/