<a href="https://colab.research.google.com/github/alessioborgi/StyleAlignedDiffModels/blob/main/MetricsStyleAlign.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download the file
!gdown --folder https://drive.google.com/drive/folders/1Rdb3XkwW1H_IMFVh3tShj4adh-XUNt03?usp=sharing

Retrieving folder contents
Processing file 1IMgH31F3-T9RasojQnjqbGXvaYqC2DuJ OnlyPromptFile.zip
Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From (original): https://drive.google.com/uc?id=1IMgH31F3-T9RasojQnjqbGXvaYqC2DuJ
From (redirected): https://drive.google.com/uc?id=1IMgH31F3-T9RasojQnjqbGXvaYqC2DuJ&confirm=t&uuid=15bd417c-1ca6-4e48-be3d-c51ca6fa582f
To: /content/AlignZipFolder/OnlyPromptFile.zip
100% 111M/111M [00:05<00:00, 21.1MB/s]
Download completed


In [2]:
# Unzip the file if it's a zip
!unzip -q AlignZipFolder/OnlyPromptFile.zip

In [3]:
# Install required libraries
!pip install transformers ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.2.0
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-oyc59yts
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-oyc59yts
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==

In [4]:
import os
import torch
import clip
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import AutoImageProcessor, AutoModel
from torchvision import transforms

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# CLIP ViT-B/32: the loader hands back the model together with its image preprocessing callable
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

# DINO ViT-B/8 from the Hugging Face Hub: image processor plus the model, moved onto `device`
dino_checkpoint = "facebook/dino-vitb8"
dino_processor = AutoImageProcessor.from_pretrained(dino_checkpoint)
dino_model = AutoModel.from_pretrained(dino_checkpoint).to(device)

def get_clip_similarity(image_path, prompt):
    """Score how well an image matches a text prompt using CLIP.

    Args:
        image_path: Path of the image file to score.
        prompt: Text to compare the image against.

    Returns:
        float: Cosine similarity between the L2-normalised CLIP image and
        text embeddings.
    """
    # Open the file in a context manager so its handle is released as soon as
    # preprocessing has produced the tensor, instead of leaking one per image.
    with Image.open(image_path) as pil_image:
        image = clip_preprocess(pil_image).unsqueeze(0).to(device)
    text = clip.tokenize([prompt]).to(device)

    with torch.no_grad():
        image_features = clip_model.encode_image(image)
        text_features = clip_model.encode_text(text)

    # Normalise to unit length so the dot product below is a cosine similarity.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # One image against one prompt gives a 1x1 matrix; .item() unwraps the scalar.
    similarity = (image_features @ text_features.T).item()
    return similarity

def get_dino_embeddings(image_paths):
    """Embed a list of images with the DINO model.

    Args:
        image_paths: Iterable of image file paths. All images are processed
            as a single batch.

    Returns:
        numpy.ndarray: One row per input image, taken from position 0 (the
        CLS token) of the model's last hidden state.
    """
    images = []
    for image_path in image_paths:
        # Close each file once its pixels are loaded, and force 3-channel RGB:
        # PNGs may be RGBA, palette or greyscale, which the image processor
        # would otherwise receive with an unexpected channel count.
        with Image.open(image_path) as pil_image:
            images.append(pil_image.convert("RGB"))

    inputs = dino_processor(images=images, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = dino_model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # Taking the CLS token

    return embeddings

def compute_pairwise_similarity(embeddings):
    """Return the mean cosine similarity over all distinct pairs of embeddings.

    Args:
        embeddings: 2-D array-like with one embedding per row.

    Returns:
        The average of the strictly-upper-triangular entries of the cosine
        similarity matrix (each unordered pair counted once, self-similarity
        excluded). NaN when fewer than two embeddings are given, because no
        pair exists to compare.
    """
    # With zero or one embedding the upper triangle is empty; averaging it would
    # only yield NaN through a NumPy "mean of empty slice" warning. Make that
    # outcome explicit so callers can detect and skip it.
    if len(embeddings) < 2:
        return float("nan")

    similarities = cosine_similarity(embeddings)
    # k=1 skips the diagonal, so an embedding is never compared with itself.
    upper_tri_indices = np.triu_indices_from(similarities, k=1)
    return similarities[upper_tri_indices].mean()

# File-name prefix that marks an image as "non aligned".
_NON_ALIGNED_MARKER = "znon"
# Number of leading characters dropped from a non-aligned file name before it is
# used as a prompt. NOTE(review): this is one more than len("znon"), so the files
# are presumably named "znon<separator><prompt>.<ext>" - confirm against the dataset.
_NON_ALIGNED_PREFIX_LEN = 5
_IMAGE_EXTENSIONS = (".jpg", ".png")


def _mean_or_nan(values):
    """Average `values`; return NaN without a NumPy warning when the list is empty."""
    return np.mean(values) if len(values) > 0 else float("nan")


def _split_images(subfolder):
    """Return (aligned_paths, non_aligned_paths) for the .jpg/.png files directly in `subfolder`."""
    aligned, non_aligned = [], []
    # Sorted so the processing order does not depend on the file system.
    for file in sorted(os.listdir(subfolder)):
        if not file.endswith(_IMAGE_EXTENSIONS):
            continue
        bucket = non_aligned if file.startswith(_NON_ALIGNED_MARKER) else aligned
        bucket.append(os.path.join(subfolder, file))
    return aligned, non_aligned


def process_folder(folder_path):
    """Compute average CLIP and DINO scores for every image group under a folder.

    Each immediate subdirectory of `folder_path` is scanned for `.jpg`/`.png`
    files. Files whose name starts with "znon" form the "non_aligned" group;
    all others form the "aligned" group.

    * CLIP: every image is scored against the prompt
      "<file name without prefix/extension>, <subfolder name>".
    * DINO: for each subfolder and group, the mean pairwise cosine similarity
      of the image embeddings is computed. Groups with fewer than two images
      are skipped, since they contain no pair and would otherwise contribute
      a NaN that turns the overall average into NaN.

    Args:
        folder_path: Directory whose subdirectories hold the images.

    Returns:
        dict with keys "clip_aligned", "clip_non_aligned", "dino_aligned" and
        "dino_non_aligned", each the mean of the collected scores (NaN when
        nothing was collected for that key).
    """
    clip_results = {'aligned': [], 'non_aligned': []}
    dino_results = {'aligned': [], 'non_aligned': []}

    subfolders = sorted(f.path for f in os.scandir(folder_path) if f.is_dir())
    for subfolder in subfolders:
        subfolder_name = os.path.basename(subfolder)
        aligned_images, non_aligned_images = _split_images(subfolder)

        # (result key, image paths, leading characters to drop from the file name)
        groups = (
            ('aligned', aligned_images, 0),
            ('non_aligned', non_aligned_images, _NON_ALIGNED_PREFIX_LEN),
        )
        for group, image_paths, prefix_len in groups:
            for image_path in image_paths:
                image_name = os.path.basename(image_path)[prefix_len:]
                # Drop the extension; the rest of the file name is the prompt text.
                stem = os.path.splitext(image_name)[0]
                prompt = f"{stem}, {subfolder_name}"
                clip_results[group].append(get_clip_similarity(image_path, prompt))

            # A pairwise similarity needs at least two images.
            if len(image_paths) >= 2:
                embeddings = get_dino_embeddings(image_paths)
                dino_results[group].append(compute_pairwise_similarity(embeddings))

    return {
        "clip_aligned": _mean_or_nan(clip_results['aligned']),
        "clip_non_aligned": _mean_or_nan(clip_results['non_aligned']),
        "dino_aligned": _mean_or_nan(dino_results['aligned']),
        "dino_non_aligned": _mean_or_nan(dino_results['non_aligned'])
    }

# Score every subfolder of the extracted dataset and show the four averaged metrics
folder_path = "OnlyPrompt"
results = process_folder(folder_path=folder_path)
print(results)

100%|███████████████████████████████████████| 338M/338M [00:05<00:00, 61.8MiB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/343M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at facebook/dino-vitb8 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'clip_aligned': 0.34217529296875, 'clip_non_aligned': 0.3454345703125, 'dino_aligned': 0.5018175, 'dino_non_aligned': 0.34539917}


In [5]:
results

{'clip_aligned': 0.34217529296875,
 'clip_non_aligned': 0.3454345703125,
 'dino_aligned': 0.5018175,
 'dino_non_aligned': 0.34539917}