In [None]:
# Hugging Face authentication (do NOT hardcode tokens in notebooks)
import getpass
import os

# Use the token exported in the environment when there is one; otherwise ask
# for it with a hidden prompt so the secret never lands in the notebook source.
HF_TOKEN = os.environ.get('HF_TOKEN') or getpass.getpass(
    'Enter your Hugging Face token (input hidden): '
)

# Publish the token under both variable names so that libraries looking for
# either of them can pick it up.
os.environ.update({
    'HF_TOKEN': HF_TOKEN,
    'HUGGINGFACEHUB_API_TOKEN': HF_TOKEN,
})


### Setup

In [None]:
# connect to drive
# (google.colab is only importable inside a Colab runtime; the mount makes
# Drive contents reachable under /content/drive.)
from google.colab import drive
drive.mount('/content/drive')

# Move to the data path
# %cd changes the kernel's working directory, so every relative path used in
# the cells below (e.g. path=".") is resolved against this folder.
%cd "/content/drive/MyDrive/Image_data/Data/brset/images_224"

In [None]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from torchvision import models
#from segment_anything import SamPredictor, sam_model_registry
from transformers import AutoImageProcessor, ConvNextV2ForImageClassification
from transformers import ViTModel
from transformers import CLIPProcessor, CLIPModel
from transformers import AutoImageProcessor, AutoModel

import subprocess
from PIL import Image
import os
import pandas as pd
import joblib

import warnings
warnings.filterwarnings("ignore")

### Dataloader and Backbone

In [None]:
# Define a custom dataset to load images from a folder
class ImageFolderDataset(Dataset):
    """Dataset yielding ``(file_name, transformed_image)`` for every image in a folder.

    Only the files directly inside ``folder_path`` are considered (no recursion).
    File names are kept in sorted order so that the item order is reproducible
    across runs and machines.

    Args:
        folder_path: Directory that contains the image files.
        shape: ``(height, width)`` used by the default transform's resize step.
        transform: Callable applied to the RGB ``PIL.Image``. When omitted, the
            default pipeline is resize -> tensor -> normalisation with the
            mean/std values listed below.
    """

    # Accepted suffixes, compared case-insensitively. The leading dot matters:
    # without it a file called "weirdjpg" would be treated as an image.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif')

    def __init__(self, folder_path, shape=(224, 224), transform=None):
        self.folder_path = folder_path
        # os.listdir returns entries in arbitrary order; sort for determinism.
        self.image_files = sorted(
            f for f in os.listdir(folder_path)
            if f.lower().endswith(self.IMAGE_EXTENSIONS)
        )
        self.shape = shape
        self.transform = transform or transforms.Compose([
            transforms.Resize(self.shape),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        """Return the number of image files found in the folder."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load image ``idx`` and return ``(file_name, transformed_image)``."""
        img_name = self.image_files[idx]
        img_path = os.path.join(self.folder_path, img_name)
        # The context manager closes the underlying file as soon as the pixels
        # have been decoded, so long runs do not accumulate open handles.
        with Image.open(img_path) as raw:
            img = raw.convert("RGB")
        img = self.transform(img)
        return img_name, img


class CLIPImageEmbeddings(nn.Module):
    """Image branch of a CLIP model: vision encoder followed by its projection.

    Args:
        vision_model: Callable whose result exposes a ``'pooler_output'`` entry.
        visual_projection: Callable applied to that pooled output.
    """

    def __init__(self, vision_model, visual_projection):
        super().__init__()
        self.vision_model = vision_model
        self.visual_projection = visual_projection

    def forward(self, images):
        """Return the projected pooled output of the vision model for ``images``."""
        pooled = self.vision_model(images)['pooler_output']
        return self.visual_projection(pooled)

class FoundationalCVModel(torch.nn.Module):
    """Pretrained vision backbone exposed as an image-embedding extractor.

    ``backbone`` selects one of the pretrained models listed in the class-level
    tables below. For the torchvision ConvNeXt / Swin checkpoints and for
    ConvNeXt V2 the final classifier layer is dropped.

    Args:
        backbone: Name of the backbone, e.g. ``'dinov2_base'``,
            ``'dinov3_vitb16'``, ``'sam_base'``, ``'convnextv2_base'``,
            ``'convnext_tiny'``, ``'swin_tiny'``, ``'vit_base'`` or
            ``'clip_base'``.
        mode: ``'eval'`` switches the module to evaluation mode and
            ``'fine_tune'`` to training mode; any other value leaves the
            mode untouched.

    Raises:
        ValueError: If ``backbone`` is not one of the supported names.
    """

    # Repo: https://github.com/facebookresearch/dinov2
    # Paper: https://arxiv.org/abs/2304.07193
    _DINOV2_HUB_NAMES = {
        'dinov2_small': 'dinov2_vits14',
        'dinov2_base': 'dinov2_vitb14',
        'dinov2_large': 'dinov2_vitl14',
        'dinov2_giant': 'dinov2_vitg14',
    }

    # DINOv3: backbone name -> Hugging Face model id
    _DINOV3_MODEL_IDS = {
        'dinov3_vits16':         'facebook/dinov3-vits16-pretrain-lvd1689m',
        'dinov3_vits16plus':     'facebook/dinov3-vits16plus-pretrain-lvd1689m',
        'dinov3_vitb16':         'facebook/dinov3-vitb16-pretrain-lvd1689m',
        'dinov3_vitl16':         'facebook/dinov3-vitl16-pretrain-lvd1689m',
        'dinov3_vith16plus':     'facebook/dinov3-vith16plus-pretrain-lvd1689m',
        'dinov3_vit7b16':        'facebook/dinov3-vit7b16-pretrain-lvd1689m',
        'dinov3_convnext_tiny':  'facebook/dinov3-convnext-tiny-pretrain-lvd1689m',
        'dinov3_convnext_small': 'facebook/dinov3-convnext-small-pretrain-lvd1689m',
        'dinov3_convnext_base':  'facebook/dinov3-convnext-base-pretrain-lvd1689m',
        'dinov3_convnext_large': 'facebook/dinov3-convnext-large-pretrain-lvd1689m',
    }

    # Repo: https://github.com/facebookresearch/segment-anything
    # Paper: https://arxiv.org/abs/2304.02643
    # backbone name -> (registry key, checkpoint URL)
    _SAM_CHECKPOINTS = {
        'sam_base':  ('vit_b', 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth'),
        'sam_large': ('vit_l', 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth'),
        'sam_huge':  ('vit_h', 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth'),
    }

    # Repo: https://huggingface.co/facebook/convnextv2-base-22k-224
    # Paper: https://arxiv.org/abs/2301.00808
    _CONVNEXTV2_MODEL_IDS = {
        'convnextv2_tiny': 'facebook/convnextv2-tiny-22k-224',
        'convnextv2_base': 'facebook/convnextv2-base-22k-224',
        'convnextv2_large': 'facebook/convnextv2-large-22k-224',
    }

    # torchvision ConvNeXt: the backbone name is also the builder name in
    # ``models.convnext``.
    _CONVNEXT_BUILDERS = ('convnext_tiny', 'convnext_small', 'convnext_base', 'convnext_large')

    # https://pytorch.org/vision/main/models/generated/torchvision.models.swin_t.html#torchvision.models.swin_t
    # backbone name -> builder name in ``models.swin_transformer``
    _SWIN_BUILDERS = {
        'swin_tiny': 'swin_t',
        'swin_small': 'swin_s',
        'swin_base': 'swin_b',
    }

    # https://huggingface.co/docs/transformers/model_doc/vit
    # paper: https://arxiv.org/abs/2010.11929
    _VIT_MODEL_IDS = {
        'vit_base': 'google/vit-base-patch16-224-in21k',
        'vit_large': 'google/vit-large-patch16-224-in21k',
    }

    # https://huggingface.co/openai/clip-vit-base-patch16
    # paper: https://arxiv.org/abs/2103.00020
    # (These two ids used to be swapped, so 'clip_base' loaded the large model.)
    _CLIP_MODEL_IDS = {
        'clip_base': 'openai/clip-vit-base-patch16',
        'clip_large': 'openai/clip-vit-large-patch14',
    }

    # Backbones whose call result is indexed with 'pooler_output' in forward().
    _POOLER_OUTPUT_BACKBONES = ('vit_base', 'vit_large', 'convnextv2_tiny', 'convnextv2_base', 'convnextv2_large')

    def __init__(self, backbone, mode='eval'):
        super().__init__()

        self.backbone_name = backbone

        # Select the backbone from the possible foundational models
        if backbone in self._DINOV2_HUB_NAMES:
            self.backbone = torch.hub.load('facebookresearch/dinov2', self._DINOV2_HUB_NAMES[backbone])

        elif backbone in self._DINOV3_MODEL_IDS:
            model_id = self._DINOV3_MODEL_IDS[backbone]
            # Read the token from the environment instead of relying on a
            # notebook-level global. ``None`` lets the Hugging Face libraries
            # fall back to their own credential lookup.
            hf_token = os.environ.get('HF_TOKEN') or None
            # The same token is needed for the processor and for the weights.
            self.processor = AutoImageProcessor.from_pretrained(model_id, token=hf_token)
            self.backbone = AutoModel.from_pretrained(model_id, token=hf_token)

        elif backbone in self._SAM_CHECKPOINTS:
            registry_key, checkpoint_url = self._SAM_CHECKPOINTS[backbone]
            checkpoint_path = f"sam/{registry_key}.pth"
            self.download_and_rename(url=checkpoint_url, filename=checkpoint_path)

            # NOTE(review): `sam_model_registry` comes from the `segment_anything`
            # package, whose import is commented out in the notebook's import
            # cell; re-enable that import before using a SAM backbone.
            sam = sam_model_registry[registry_key](checkpoint=checkpoint_path)
            # Keep every child module except the last two, then pool the
            # resulting feature map down to one vector per image.
            self.backbone = nn.Sequential(*list(sam.children())[:-2], nn.AdaptiveAvgPool2d(1), nn.Flatten())

        elif backbone in self._CONVNEXTV2_MODEL_IDS:
            classifier_model = ConvNextV2ForImageClassification.from_pretrained(self._CONVNEXTV2_MODEL_IDS[backbone])
            # Remove the final classifier layer
            self.backbone = nn.Sequential(*list(classifier_model.children())[:-1])

        elif backbone in self._CONVNEXT_BUILDERS:
            # Get the backbone
            full_model = getattr(models.convnext, backbone)(pretrained=True)
            # Remove the final classifier layer
            self.backbone = nn.Sequential(*list(full_model.children())[:-1], nn.Flatten())

        elif backbone in self._SWIN_BUILDERS:
            # Get the backbone
            full_model = getattr(models.swin_transformer, self._SWIN_BUILDERS[backbone])(pretrained=True)
            # Remove the final classifier layer
            self.backbone = nn.Sequential(*list(full_model.children())[:-1])

        elif backbone in self._VIT_MODEL_IDS:
            # Get the backbone
            self.backbone = ViTModel.from_pretrained(self._VIT_MODEL_IDS[backbone])

        elif backbone in self._CLIP_MODEL_IDS:
            clip_model = CLIPModel.from_pretrained(self._CLIP_MODEL_IDS[backbone])
            # Get image part of CLIP model
            self.backbone = CLIPImageEmbeddings(clip_model.vision_model, clip_model.visual_projection)

        else:
            # Report the offending name itself; the class has no `model_name`
            # attribute, so reading one here would mask this error.
            raise ValueError(f"Unsupported backbone model: {backbone}")

        # Set the model to evaluation or fine-tuning mode
        self.mode = mode
        if mode == 'eval':
            self.eval()
        elif mode == 'fine_tune':
            self.train()

    def download_and_rename(self, url, filename):
        """Download ``url`` with ``wget`` and store it as ``filename``.

        The download is skipped when ``filename`` already exists and is not
        empty. Data is first written to ``filename + '.part'`` and moved into
        place only after ``wget`` succeeded, so an interrupted download never
        leaves a file at ``filename``.

        Args:
            url: The URL of the file to download.
            filename: Destination path of the downloaded file.

        Raises:
            subprocess.CalledProcessError: If ``wget`` exits with an error.
        """
        target_dir = os.path.dirname(filename)
        if target_dir:  # os.makedirs('') raises, so skip it for bare file names
            os.makedirs(target_dir, exist_ok=True)

        if os.path.isfile(filename) and os.path.getsize(filename) > 0:
            print(f'Weights already available at {filename}, skipping download.')
            return

        print(f'Downloading the weights of the model: {url} ...')
        partial_path = f'{filename}.part'
        subprocess.run(["wget", "-q", "-O", partial_path, url], check=True)
        os.replace(partial_path, filename)
        print('Done!')

    def forward(self, x):
        """Return the embeddings produced by the selected backbone for ``x``.

        Args:
            x: Batch of images passed to the backbone as is.
                # assumes an already-normalised (B, 3, H, W) tensor - TODO confirm
        """
        # If we're using a Transformers model (e.g., DINOv3), it expects 'pixel_values'
        if self.backbone_name.startswith('dinov3_'):
            outputs = self.backbone(pixel_values=x)
            # Prefer the pooled output when the model provides one ...
            if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
                return outputs.pooler_output
            # ... otherwise fall back to the first token of last_hidden_state.
            last_hidden = outputs.last_hidden_state
            return last_hidden[:, 0, :]

        # Torch/timm-like models
        features = self.backbone(x)

        if self.backbone_name in self._POOLER_OUTPUT_BACKBONES:
            features = features['pooler_output']

        return features

### Embeddings Generation

In [None]:
# Define a function to generate embeddings in parallel
def generate_embeddings(batch, batch_number, model):
    """Run ``model`` on one batch without tracking gradients.

    Args:
        batch: Indexable whose first item holds the image names and whose
            second item holds the images passed to ``model``.
        batch_number: Position of the batch; every 10th one is reported.
        model: Callable applied to the images.

    Returns:
        Tuple ``(image_names, features)``.
    """
    names = batch[0]
    pixels = batch[1]

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        embeddings = model(pixels)

    is_report_step = batch_number % 10 == 0
    if is_report_step:
        print(f"Processed batch number: {batch_number}")

    return names, embeddings


def get_embeddings_df(batch_size=32, path="../BRSET/images/", backbone="dinov2_base", directory='Embeddings'):
    """Embed every image in ``path`` with ``backbone`` and save the result as CSV.

    The CSV is written to ``<directory>/Embeddings_<backbone>.csv`` with an
    ``ImageName`` column followed by one column per embedding dimension
    (named ``0``, ``1``, ...).

    Args:
        batch_size: Number of images per forward pass.
        path: Folder containing the images.
        backbone: Backbone name understood by ``FoundationalCVModel``. The
            default is ``'dinov2_base'``; the previous default ``'dinov2'`` was
            not a supported name and always failed.
        directory: Output folder; created when missing.

    Returns:
        The DataFrame that was written to disk.
    """
    print('#'*50, f' {backbone} ', '#'*50)

    # SAM backbones are fed 1024 x 1024 inputs; everything else uses 224 x 224.
    shape = (1024, 1024) if 'sam' in backbone else (224, 224)
    dataset = ImageFolderDataset(folder_path=path, shape=shape)

    # Create a DataLoader to generate embeddings
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Run on the GPU when one is available; model and inputs must share a device.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = FoundationalCVModel(backbone).to(device)

    img_names = []
    features = []
    for batch_number, (names, images) in enumerate(dataloader, start=1):
        names_aux, features_aux = generate_embeddings((names, images.to(device)), batch_number, model)
        img_names.extend(names_aux)
        # Move results off the accelerator right away to keep its memory free.
        features.append(features_aux.cpu())

    # One row of plain Python floats per image; torch.cat cannot take an empty
    # list, so an empty folder yields an empty table instead of an error.
    rows = torch.cat(features, dim=0).tolist() if features else []

    # Create a DataFrame with image names and embeddings
    df = pd.concat([pd.DataFrame({'ImageName': img_names}), pd.DataFrame(rows)], axis=1)

    os.makedirs(directory, exist_ok=True)
    df.to_csv(os.path.join(directory, f'Embeddings_{backbone}.csv'), index=False)

    return df


In [None]:
# Foundational Models
dino_backbone = [
    'dinov2_small',
    'dinov2_base',
    'dinov2_large',
    'dinov2_giant',
]

dinov3_backbone = [
    'dinov3_vits16', 'dinov3_vits16plus', 'dinov3_vitb16',
    'dinov3_vitl16', 'dinov3_vith16plus', 'dinov3_vit7b16',
    'dinov3_convnext_tiny', 'dinov3_convnext_small',
    'dinov3_convnext_base', 'dinov3_convnext_large',
]

# Requires 1024 x 1024 x 3 Images. Right Now 224 x 224 x 3
sam_backbone = [
    'sam_base',
    'sam_large',
    'sam_huge',
]

clip_backbone = [
    'clip_base',
    'clip_large',
]

# ImageNet:

### Convnext (V2 checkpoints first, then the torchvision ones)
convnext_backbone = [
    'convnextv2_tiny', 'convnextv2_base', 'convnextv2_large',
    'convnext_tiny', 'convnext_small', 'convnext_base', 'convnext_large',
]

### Swin Transformer
swin_transformer_backbone = [
    'swin_tiny',
    'swin_small',
    'swin_base',
]

### ViT
vit_backbone = [
    'vit_base',
    'vit_large',
]

# Flat list of the families above (the DINOv3 names are not part of it).
backbones = [
    *dino_backbone,
    *sam_backbone,
    *clip_backbone,
    *convnext_backbone,
    *swin_transformer_backbone,
    *vit_backbone,
]

backbones

In [None]:
#!ls
# NOTE(review): a `!` command runs in its own temporary subshell, so this `cd`
# does not change the notebook kernel's working directory. Use `%cd brset` if
# the following cells are meant to run from inside that folder.
!cd brset

In [None]:
# DINOv3 variants embedded in this run. Other supported names, currently
# switched off: dinov3_vits16, dinov3_vits16plus, dinov3_vitl16,
# dinov3_vith16plus, dinov3_vit7b16, dinov3_convnext_tiny,
# dinov3_convnext_small, dinov3_convnext_large.
dino_backbone = ['dinov3_vitb16', 'dinov3_convnext_base']

for backbone in dino_backbone:
    get_embeddings_df(
        batch_size=32,
        path="brset/images_224/",
        backbone=backbone,
        directory='Embeddings',
    )

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
import pandas as pd

# Destination folder on Drive (created if it is not there yet)
output_dir = "/content/drive/MyDrive/brset_ConvnextV2_embeddings"
os.makedirs(output_dir, exist_ok=True)

# Local scratch folder that get_embeddings_df writes its CSV into
temp_local_dir = "/content/temp_embeddings_output"
os.makedirs(temp_local_dir, exist_ok=True)

convnext_backbone = ["convnextv2_base"]

for backbone in convnext_backbone:
    # Step 1: compute the embeddings; the CSV lands in the scratch folder
    get_embeddings_df(backbone=backbone, batch_size=32, path=".", directory=temp_local_dir)

    # Step 2: load the CSV that was just produced
    temp_csv_path = os.path.join(temp_local_dir, f"Embeddings_{backbone}.csv")
    embeddings_df = pd.read_csv(temp_csv_path)

    # Step 3: store a copy on Drive under the per-backbone file name
    output_path = os.path.join(output_dir, f"{backbone}_embeddings.csv")
    embeddings_df.to_csv(output_path, index=False)
    print("Saved:", output_path)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
import pandas as pd  # needed for pd.read_csv below

# Folder on Drive that receives the final CSV; create it when missing
output_dir = "/content/drive/MyDrive/brset_VitBase_embeddings"
os.makedirs(output_dir, exist_ok=True)

# Writable local folder used for the intermediate CSV
temp_local_dir = "/content/temp_embeddings_output"
os.makedirs(temp_local_dir, exist_ok=True)

vit_base_backbone = ["vit_base"]

for backbone in vit_base_backbone:
    # Generate the embeddings into the local folder
    get_embeddings_df(
        backbone=backbone, batch_size=32, path=".", directory=temp_local_dir
    )

    # Read the intermediate CSV back in
    temp_csv_path = os.path.join(temp_local_dir, f"Embeddings_{backbone}.csv")
    embeddings_df = pd.read_csv(temp_csv_path)

    # Write the table to Drive
    output_path = os.path.join(output_dir, f"{backbone}_embeddings.csv")
    embeddings_df.to_csv(output_path, index=False)
    print("Saved:", output_path)


In [None]:
!pip install -q "huggingface_hub>=0.23.0" "transformers>=4.44.0"

from huggingface_hub import login, hf_hub_download, list_repo_files
import os

# 1. Put your OWN token here, don't share it
HF_TOKEN = "your token"

# 2. Login (stores token in HF cache)
login(HF_TOKEN, add_to_git_credential=False)

# 3. Also expose it via env vars (for transformers, etc.)
os.environ["HF_TOKEN"] = HF_TOKEN
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_TOKEN

# 4. Repo you want to test
repo_id = "facebook/dinov3-vitb16-pretrain-lvd1689m"

# 5. List files in the repo so we only request something that exists
files = list_repo_files(repo_id)
print("Number of files in repo:", len(files))
print("First 50 files:\n", files[:50])

# 6. Pick a sensible config file name that actually exists
candidate_names = [
    "preprocessor_config.json",
    "image_processor_config.json",
    "config.json",
]

target_filename = None
for name in candidate_names:
    if name in files:
        target_filename = name
        break

if target_filename is None:
    # Fallback: pick *any* JSON file as proof of access
    json_files = [f for f in files if f.endswith(".json")]
    if not json_files:
        raise RuntimeError(
            "No JSON config file found in the repo. "
            "Check the printed file list above."
        )
    target_filename = json_files[0]

print(f"\nWill download: {target_filename}")

# 7. Hard access test: download that file
local_path = hf_hub_download(
    repo_id=repo_id,
    filename=target_filename,
    token=HF_TOKEN,
)

print("Access OK.")
print("Downloaded file path:", local_path)

In [None]:
!pip install -q "huggingface_hub>=0.23.0" "transformers>=4.44.0"

import os
import torch
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from huggingface_hub import login
from transformers import AutoImageProcessor, AutoModel

# 1) Put the token that you KNOW works for hf_hub_download here
HF_TOKEN = "<HF_TOKEN>"

# 2) Make this the only active token
os.environ["HF_TOKEN"] = HF_TOKEN
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_TOKEN
login(HF_TOKEN, add_to_git_credential=False)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Image folder and output folder on Drive
image_dir = "/content/drive/MyDrive/Image_data/Data/brset/images_224"  # <- adjust if needed
output_dir = "/content/drive/MyDrive/brset_dino_embeddings"
os.makedirs(output_dir, exist_ok=True)

# Map backbone names to HF repo IDs
BACKBONES = {
    "dinov3_vitb16": "facebook/dinov3-vitb16-pretrain-lvd1689m",
    "dinov3_convnext_base": "facebook/dinov3-convnext-base-pretrain-lvd1689m",
}

def load_dinov3(backbone_id: str, token: str):
    """Return ``(processor, model)`` for the DINOv3 repo ``backbone_id``.

    Both objects are fetched with the explicit ``token``. The model is moved
    to the notebook-level ``device`` and switched to evaluation mode.
    """
    image_processor = AutoImageProcessor.from_pretrained(backbone_id, token=token)

    network = AutoModel.from_pretrained(backbone_id, token=token)
    network.to(device)
    network.eval()

    return image_processor, network

def get_dinov3_embeddings(image_dir: str, backbone_name: str, backbone_id: str, output_dir: str, token: str):
    """Embed every image in ``image_dir`` with a DINOv3 backbone and save a CSV.

    Images are processed one at a time in sorted file-name order. Files that
    cannot be opened are reported and skipped. The CSV is written to
    ``<output_dir>/Embeddings_<backbone_name>.csv`` with a ``name`` column
    followed by ``feature_0`` ... ``feature_{D-1}``.

    Args:
        image_dir: Folder containing the images.
        backbone_name: Short name used in log messages and in the file name.
        backbone_id: Hugging Face repo id passed to ``load_dinov3``.
        output_dir: Folder receiving the CSV; created when missing.
        token: Hugging Face access token passed to ``load_dinov3``.

    Returns:
        The DataFrame that was written, or ``None`` when no image could be
        embedded (in which case no file is written).
    """
    processor, model = load_dinov3(backbone_id, token)

    image_files = [
        f for f in sorted(os.listdir(image_dir))
        if f.lower().endswith((".png", ".jpg", ".jpeg", ".tif", ".tiff"))
    ]
    print(f"\nBackbone: {backbone_name}")
    print(f"Found {len(image_files)} images in {image_dir}")

    names = []
    feats = []

    for fname in tqdm(image_files):
        path = os.path.join(image_dir, fname)
        try:
            # Close the file handle as soon as the pixels have been decoded.
            with Image.open(path) as raw:
                img = raw.convert("RGB")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"Skipping {fname}: {e}")
            continue

        inputs = processor(images=img, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
            emb = outputs.pooler_output          # (1, D)
        else:
            emb = outputs.last_hidden_state[:, 0, :]  # (1, D) first token

        emb = emb.squeeze(0).cpu().numpy()  # (D,)
        names.append(fname)
        feats.append(emb)

    if not feats:
        # np.stack raises on an empty list; report the situation instead.
        print(f"No embeddings computed for {backbone_name}: nothing to save.")
        return None

    feats = np.stack(feats, axis=0)  # (N, D)
    print("Embeddings shape:", feats.shape)

    df = pd.DataFrame(feats, columns=[f"feature_{i}" for i in range(feats.shape[1])])
    df.insert(0, "name", names)

    os.makedirs(output_dir, exist_ok=True)
    out_path = os.path.join(output_dir, f"Embeddings_{backbone_name}.csv")
    df.to_csv(out_path, index=False)
    print("Saved embeddings to:", out_path)

    return df

# Compute and save the embeddings of every configured backbone, always
# passing the token explicitly.
for short_name, repo_id in BACKBONES.items():
    get_dinov3_embeddings(
        image_dir=image_dir,
        backbone_name=short_name,
        backbone_id=repo_id,
        output_dir=output_dir,
        token=HF_TOKEN,
    )