In [None]:
import clip
import torch
import os
import json
import math
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms
from transformers import AutoImageProcessor, AutoModel, AutoProcessor, AutoModelForImageClassification
import torch.nn.functional as F
from sklearn.random_projection import SparseRandomProjection, GaussianRandomProjection
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, roc_curve
from scipy.special import softmax

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
SRC_PATH = "../Data/GenImage/"
generator_names = ["adm", "bgan", "glide", "midj", "sd_14", "sd_15", "vqdm", "wukong"]
with open("classes.json", "r", encoding="utf-8") as f:
    data = json.load(f)
classes_idx = data["1k_idx"]
classes_names = data["21k_idx"]

In [29]:
class ImageFolderDataset(Dataset):
    def __init__(self, image_paths, transform):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        path = self.image_paths[idx]
        try:
            image = Image.open(path).convert("RGB")
            image = self.transform(image)
            return image, str(path)
        except:
            print("Failure open image.")
            return None


def collate_fn(batch):
    batch = [item for item in batch if item is not None]
    if not batch:
        return None, None
    images, paths = zip(*batch)
    return torch.stack(images), paths


def clip_patch_tokens(image_folder, batch_size = 32, device='cuda'):
    model, preprocess = clip.load("ViT-B/32", device=device)
    model.float()
    visual = model.visual
    input_dir = Path(image_folder)
    image_paths = list(input_dir.glob("*"))
    dataset = ImageFolderDataset(image_paths, transform=preprocess)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    patch_tokens_all = []
    for images, paths in tqdm(dataloader, desc="Image Encoding"):
        with torch.no_grad():
            x = images.to(device)
            x = visual.conv1(x)  # shape = [*, width, grid, grid]
            x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
            x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
            x = torch.cat([visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
            x = x + visual.positional_embedding.to(x.dtype)
            x = visual.ln_pre(x)
            x = x.permute(1, 0, 2)  # NLD -> LND
            x = visual.transformer(x)
            x = x.permute(1, 0, 2)
            patch_tokens = visual.ln_post(x[:, 1:, :])
            patch_tokens_all.append(patch_tokens.reshape(-1, patch_tokens.shape[-1]).cpu())
    memory_bank = torch.cat(patch_tokens_all, dim=0)

    return memory_bank 

def clip_images(image_folder, batch_size = 32, device='cuda'):
    model, preprocess = clip.load("ViT-B/32", device=device)
    model.float()
    visual = model.visual
    input_dir = Path(image_folder)
    image_paths = list(input_dir.glob("*"))
    dataset = ImageFolderDataset(image_paths, transform=preprocess)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    patch_tokens_all = []
    for images, paths in tqdm(dataloader, desc="Image Encoding"):
        with torch.no_grad():
            x = images.to(device)
            x = visual.conv1(x)  # shape = [*, width, grid, grid]
            x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
            x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
            x = torch.cat([visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
            x = x + visual.positional_embedding.to(x.dtype)
            x = visual.ln_pre(x)
            x = x.permute(1, 0, 2)  # NLD -> LND
            x = visual.transformer(x)
            x = x.permute(1, 0, 2)
            patch_tokens = visual.ln_post(x[:, 1:, :])
            patch_tokens_all.append(patch_tokens.cpu())
    patches = torch.cat(patch_tokens_all, dim=0)
    return patches
        

In [None]:
class ImageFolderDataset(Dataset):
    def __init__(self, image_paths):
        self.image_paths = image_paths

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        path = self.image_paths[idx]
        try:
            image = Image.open(path).convert("RGB")
            return image, str(path)
        except Exception as e:
            print(f"Failure open image because of {e}")
            return None


def collate_fn(batch):
    batch = [item for item in batch if item is not None]
    if not batch:
        return None, None
    images, paths = zip(*batch)
    return list(images), paths


def dinov2_patch_tokens(image_folder, batch_size = 32, model_name='facebook/dinov2-with-registers-base', device='cuda'):
    processor = AutoImageProcessor.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    input_dir = Path(image_folder)
    image_paths = list(input_dir.glob("*"))
    dataset = ImageFolderDataset(image_paths)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    patch_tokens_all = []

    for images, paths in tqdm(dataloader, desc="Extracting patch tokens"):
        if images is None:
                continue
        # processor expects a list of PIL images
        inputs = processor(images=images, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            patch_tokens = outputs.last_hidden_state[:, 1:, :]  # remove CLS
            patch_tokens_all.append(patch_tokens.reshape(-1, patch_tokens.shape[-1]).detach().cpu())
    
    memory_bank = torch.cat(patch_tokens_all, dim=0)

    return memory_bank  


def dinov2_images(image_folder, batch_size = 32, model_name='facebook/dinov2-with-registers-base', device='cuda'):
    processor = AutoImageProcessor.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    input_dir = Path(image_folder)
    image_paths = list(input_dir.glob("*"))
    dataset = ImageFolderDataset(image_paths)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    patch_tokens_all = []

    for images, paths in tqdm(dataloader, desc="Extracting patch tokens"):
        if images is None:
                continue
        # processor expects a list of PIL images
        inputs = processor(images=images, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            patch_tokens = outputs.last_hidden_state[:, 1:, :]  # remove CLS
            patch_tokens_all.append(patch_tokens.detach().cpu())

    patches = torch.cat(patch_tokens_all, dim=0)
    
    return patches


In [None]:
class SparseJLProjector:
    def __init__(self, input_dim, target_dim, s=None, device='cuda', seed=None):
        """
        input_dim:  D
        target_dim:  d
        s:  parameter of sparsity, by default sqrt(D)
        device: 'cuda' or 'cpu'
        """
        self.input_dim = input_dim
        self.target_dim = target_dim
        self.device = device
        self.s = s or int(math.sqrt(input_dim))
        self.seed = seed

        if seed is not None:
            torch.manual_seed(seed)

        self.projection_matrix = self._generate_sparse_projection_matrix()

    def _generate_sparse_projection_matrix(self):
        D, d, s = self.input_dim, self.target_dim, self.s
        prob_nonzero = 1.0 / s

        # Initialize as 0
        R = torch.zeros(D, d, device=self.device)

        # mask：1/(2s) be +1，1/(2s) be -1，rest to 0
        rand_vals = torch.rand(D, d, device=self.device)

        pos_mask = rand_vals < (1 / (2 * s))
        neg_mask = (rand_vals >= (1 / (2 * s))) & (rand_vals < (1 / s))

        R[pos_mask] = math.sqrt(s)
        R[neg_mask] = -math.sqrt(s)

        return R  # shape: [D, d]

    def project(self, X):
        """
        X: Tensor of shape [N, D], must be on same device
        Return: [N, d]
        """
        if X.device != self.device:
            X = X.to(self.device)
        return X @ self.projection_matrix

In [None]:

def extract_patches(image_folder, extract_fn):
    patches = extract_fn(image_folder)
    return patches

def random_project(X, proj_dim=100):
    projector = SparseRandomProjection(n_components=proj_dim)
    X_proj = projector.fit_transform(X)
    return X_proj, projector

def greedy_coreset_selection(X, l):
    N, D = X.shape
    L = int(N * l)
    selected = []
    remaining = list(range(N))

    idx = np.random.choice(remaining)
    selected.append(idx)
    remaining.remove(idx)

    for _ in tqdm(range(L)):
        dists = []
        for i in remaining:
            dist = np.min(np.linalg.norm(X[i] - X[selected], axis=1))
            dists.append(dist)
        new_idx = remaining[np.argmax(dists)]
        selected.append(new_idx)
        remaining.remove(new_idx)

    return X[selected], selected

def greedy_coreset_selection_gpu(X, l, device='cuda'):
    X = X.to(device)  # X: [N, D]
    N, D = X.shape
    L = int(N * l)

    selected_idx = []
    remaining_mask = torch.ones(N, dtype=torch.bool, device=device)
    dtype = X.dtype
    if dtype == torch.float16:
        mask_val = -1e4
    else:
        mask_val = -1e9

    idx = torch.randint(0, N, (1,)).item()
    selected_idx.append(idx)
    remaining_mask[idx] = False

    min_dists = torch.cdist(X, X[idx].unsqueeze(0)).squeeze(1)  # [N]

    for _ in range(L):
        min_dists[~remaining_mask] = mask_val
        next_idx = torch.argmax(min_dists).item()
        selected_idx.append(next_idx)
        remaining_mask[next_idx] = False

        dist_to_new_point = torch.cdist(X, X[next_idx].unsqueeze(0)).squeeze(1)
        min_dists = torch.minimum(min_dists, dist_to_new_point)

    return X[selected_idx].cpu(), selected_idx



In [None]:
def batch_anomaly_scores(test_patches_batch, memory_bank, b=10, device='cuda'):
    """
    test_patches_batch: [B, N_test, D]
    memory_bank: [N_mem, D]
    return:
        scores: [B], anomaly scores for every batch
    """
    B, N_test, D = test_patches_batch.shape
    N_mem = memory_bank.shape[0]
    
    test_patches_batch = test_patches_batch.to(device)
    memory_bank = memory_bank.to(device)
    
    # Step 1: Compute distances from every patch to memory bank [B, N_test, N_mem]
    dists = torch.cdist(test_patches_batch, memory_bank.unsqueeze(0).expand(B, -1, -1))  # broadcast memory_bank
    
    # Step 2: Search distance and index of nearest neighbour for every patch [B, N_test]
    min_dists, nn_indices = torch.min(dists, dim=2)  # minimum distance and index
    
    # Step 3: Search index of patch in every batch which has maximum in minimum distances [B]
    max_min_dists, max_idx = torch.max(min_dists, dim=1)  # max(min(distances)) for every batch
    
    # Step 4: Take corresponding patch and the corresponding NN point in memory bank [B, D]
    batch_idx = torch.arange(B, device=device)
    m_test_star = test_patches_batch[batch_idx, max_idx]  # [B, D]
    m_star = memory_bank[nn_indices[batch_idx, max_idx]]   # [B, D]
    
    # Step 5: Compute b-NN points of m_star [B, b, D]
    # For every batch element, compute distance between m_star and memory bank
    m_star_expand = m_star.unsqueeze(1)  # [B,1,D]
    dists_b = torch.cdist(m_star_expand, memory_bank.unsqueeze(0).expand(B, -1, -1)).squeeze(1)  # [B, N_mem]
    _, neighbors_b_idx = torch.topk(dists_b, b, largest=False)  # [B, b]
    neighbors_b = memory_bank[neighbors_b_idx]  # [B, b, D]
    
    # Step 6: Compute weights of distance
    dist_to_neighbors = torch.norm(neighbors_b - m_test_star.unsqueeze(1), dim=2)  # [B, b]
    d_star = torch.norm(m_test_star - m_star, dim=1)  # [B]
    
    # Step 7: Compute softmax weights
    scores_cat = torch.cat([d_star.unsqueeze(1), dist_to_neighbors], dim=1)  # [B, b+1]
    probs = F.softmax(scores_cat, dim=1)  # [B, b+1]
    weight = 1 - probs[:, 0]  # [B]
    
    # Step 8: Final weighted scores
    final_scores = weight * max_min_dists  # [B]
    
    return final_scores

def anomaly_scores(test_patches, memory_bank, batch_size=32, b=10, device='cuda'):
    """
    test_patches: [N_total, N_patches, D]
    memory_bank: [N_mem, D]
    return:
    final_scores: [N_total]
    """
    N_total = test_patches.shape[0]
    memory_bank = memory_bank.to(device)

    final_scores_list = []

    with torch.no_grad():
        for start_idx in range(0, N_total, batch_size):
            end_idx = min(start_idx + batch_size, N_total)
            batch = test_patches[start_idx:end_idx].to(device)  # [batch_size, N_patches, D]
            scores_batch = batch_anomaly_scores(batch, memory_bank, b=b, device=device)  # [batch_size]
            final_scores_list.append(scores_batch)

    final_scores = torch.cat(final_scores_list, dim=0)
    return final_scores.cpu().numpy()

In [None]:
def patchcore_detector(classes_idx, embedder, memory_fn, image_fn):
    auroc = []
    auprc = []
    generators = []
    for cls in classes_idx:
        memory = extract_patches(SRC_PATH + cls + "/nature", memory_fn)
        projector = SparseJLProjector(input_dim=768, target_dim=100)
        memory_proj = projector.project(memory)
        coreset, selected_indices = greedy_coreset_selection_gpu(memory_proj, l=0.5)
        memory_bank = memory[selected_indices]
        for generator in ["bgan", "midj", "sd_15"]:
            
            scores = []
            test_patches_1 = image_fn(SRC_PATH + cls + "/" + generator)
            test_patches_2 = image_fn(SRC_PATH + cls + "/nature_2")
            # ai_paths = get_images(SRC_PATH + cls + "/" + generator)
            # nature_paths = get_images(SRC_PATH + cls + "/nature_2")

            labels = np.concatenate((np.zeros(test_patches_1.shape[0]), np.ones(test_patches_2.shape[0])))
            
            score1 = anomaly_scores(test_patches_1,memory_bank)
            scores.append(-score1)
            score2 = anomaly_scores(test_patches_2,memory_bank)
            scores.append(-score2)
            scores = np.concatenate(scores)

            fpr, tpr, thresholds = roc_curve(labels, scores)
            distances = np.sqrt((1 - tpr) ** 2 + fpr**2)
            best_threshold = thresholds[np.argmin(distances)]
            print("Best threshold(ROC):", best_threshold)
            roc_auc = roc_auc_score(labels, scores)
            # print("AUROC:", roc_auc)

            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
            ax1.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
            ax1.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
            ax1.set_xlim([0.0, 1.0])
            ax1.set_ylim([0.0, 1.05])
            ax1.set_xlabel("False Positive Rate (FPR)")
            ax1.set_ylabel("True Positive Rate (TPR)")
            ax1.set_title("Receiver Operating Characteristic (ROC) Curve")
            ax1.legend(loc="lower right")

            precision, recall, thresholds = precision_recall_curve(labels, scores)
            pr_auc = auc(recall, precision)
            # print("AUPRC:", pr_auc)

            ax2.plot(recall, precision, color="blue", lw=2, label=f"PR curve (area = {pr_auc:.2f})")
            ax2.set_xlim([0.0, 1.0])
            ax2.set_ylim([0.0, 1.05])
            ax2.set_xlabel("Recall")
            ax2.set_ylabel("Precision")
            ax2.set_title("Precision-Recall Curve")
            ax2.legend(loc="best")

            plt.tight_layout()
            save_path = f"../Data/Patchcore_results/{embedder}/{cls}/{generator}.png"
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            plt.savefig(save_path)
            plt.close()

            auroc.append(roc_auc)
            auprc.append(pr_auc)
        generators.extend(["bgan", "midj", "sd_15"])

    data = {"CLASS": [x for x in classes_idx for _ in range(3)], "GENERATOR": generators, "AUROC": auroc, "AUPRC": auprc}
    df = pd.DataFrame(data)
    df.to_csv(embedder + "_patchcore_result.csv", index=False)
    print(embedder + f" auroc: {np.mean(auroc)}, auprc : {np.mean(auprc)}")

    
    

In [24]:
patchcore_detector(classes_idx, "dinov2", dinov2_patch_tokens, dinov2_images)

Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 40.57it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 49.78it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 40.99it/s]


Best threshold(ROC): -25.991726


Extracting patch tokens: 100%|██████████| 162/162 [00:08<00:00, 18.94it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 42.55it/s]


Best threshold(ROC): -25.939419


Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 34.89it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 39.29it/s]


Best threshold(ROC): -23.719162


Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 40.13it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 49.85it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 42.91it/s]


Best threshold(ROC): -22.641808


Extracting patch tokens: 100%|██████████| 162/162 [00:09<00:00, 17.83it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 41.51it/s]


Best threshold(ROC): -22.696753


Extracting patch tokens: 100%|██████████| 162/162 [00:06<00:00, 25.81it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 42.23it/s]


Best threshold(ROC): -24.668608


Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 33.17it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 38.49it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 32.84it/s]


Best threshold(ROC): -25.065311


Extracting patch tokens: 100%|██████████| 162/162 [00:10<00:00, 15.77it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 43.76it/s]


Best threshold(ROC): -24.506172


Extracting patch tokens: 100%|██████████| 162/162 [00:05<00:00, 28.92it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 44.37it/s]


Best threshold(ROC): -23.44636


Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 35.68it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 39.91it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 34.02it/s]


Best threshold(ROC): -24.275915


Extracting patch tokens: 100%|██████████| 162/162 [00:11<00:00, 14.56it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 39.43it/s]


Best threshold(ROC): -24.90686


Extracting patch tokens: 100%|██████████| 162/162 [00:05<00:00, 27.43it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 42.31it/s]


Best threshold(ROC): -24.118734


Extracting patch tokens: 100%|██████████| 162/162 [00:05<00:00, 32.28it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 35.90it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 33.44it/s]


Best threshold(ROC): -26.484957


Extracting patch tokens: 100%|██████████| 162/162 [00:11<00:00, 14.47it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 40.01it/s]


Best threshold(ROC): -24.449669


Extracting patch tokens: 100%|██████████| 162/162 [00:05<00:00, 29.87it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 42.16it/s]


Best threshold(ROC): -24.959229


Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 34.30it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 38.18it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:05<00:00, 31.95it/s]


Best threshold(ROC): -25.897093


Extracting patch tokens: 100%|██████████| 162/162 [00:10<00:00, 15.78it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 42.20it/s]


Best threshold(ROC): -23.452425


Extracting patch tokens: 100%|██████████| 162/162 [00:05<00:00, 27.58it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 42.89it/s]


Best threshold(ROC): -25.22849


Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 32.95it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 38.95it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 34.49it/s]


Best threshold(ROC): -25.716589


Extracting patch tokens: 100%|██████████| 162/162 [00:10<00:00, 15.81it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 44.45it/s]


Best threshold(ROC): -26.16177


Extracting patch tokens: 100%|██████████| 162/162 [00:05<00:00, 27.89it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 47.21it/s]


Best threshold(ROC): -23.937956


Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 35.71it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 36.53it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 32.97it/s]


Best threshold(ROC): -26.535555


Extracting patch tokens: 100%|██████████| 162/162 [00:10<00:00, 15.96it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 41.32it/s]


Best threshold(ROC): -25.783361


Extracting patch tokens: 100%|██████████| 162/162 [00:05<00:00, 27.31it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 41.78it/s]


Best threshold(ROC): -25.83523


Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 37.43it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 49.10it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:05<00:00, 31.37it/s]


Best threshold(ROC): -24.55193


Extracting patch tokens: 100%|██████████| 162/162 [00:11<00:00, 14.40it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 43.20it/s]


Best threshold(ROC): -24.152302


Extracting patch tokens: 100%|██████████| 162/162 [00:05<00:00, 27.78it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:03<00:00, 40.56it/s]


Best threshold(ROC): -25.044916


Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 34.64it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 36.67it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 33.47it/s]


Best threshold(ROC): -24.703081


Extracting patch tokens: 100%|██████████| 162/162 [00:11<00:00, 14.39it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 39.43it/s]


Best threshold(ROC): -22.728994


Extracting patch tokens: 100%|██████████| 162/162 [00:06<00:00, 25.39it/s]
Extracting patch tokens: 100%|██████████| 162/162 [00:04<00:00, 39.78it/s]


Best threshold(ROC): -22.728994
dinov2 auroc: 0.5927043641721281, auprc : 0.6424010996438525


In [33]:
patchcore_detector(classes_idx, "clip", clip_patch_tokens, clip_images)

Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  3.63it/s]
Image Encoding: 100%|██████████| 6/6 [00:00<00:00,  8.28it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  3.88it/s]


Best threshold(ROC): -20.337343


Image Encoding: 100%|██████████| 6/6 [00:05<00:00,  1.08it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.03it/s]


Best threshold(ROC): -21.367601


Image Encoding: 100%|██████████| 6/6 [00:02<00:00,  2.52it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  3.86it/s]


Best threshold(ROC): -21.354746


Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.11it/s]
Image Encoding: 100%|██████████| 6/6 [00:00<00:00,  7.34it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.38it/s]


Best threshold(ROC): -18.475313


Image Encoding: 100%|██████████| 6/6 [00:06<00:00,  1.02s/it]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.59it/s]


Best threshold(ROC): -17.609821


Image Encoding: 100%|██████████| 6/6 [00:02<00:00,  2.45it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.03it/s]


Best threshold(ROC): -17.609821


Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  3.83it/s]
Image Encoding: 100%|██████████| 6/6 [00:00<00:00,  7.43it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.01it/s]


Best threshold(ROC): -19.345541


Image Encoding: 100%|██████████| 6/6 [00:05<00:00,  1.00it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.27it/s]


Best threshold(ROC): -20.282612


Image Encoding: 100%|██████████| 6/6 [00:02<00:00,  2.54it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.03it/s]


Best threshold(ROC): -20.360209


Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.69it/s]
Image Encoding: 100%|██████████| 6/6 [00:00<00:00,  7.55it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.60it/s]


Best threshold(ROC): -20.10237


Image Encoding: 100%|██████████| 6/6 [00:06<00:00,  1.04s/it]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.46it/s]


Best threshold(ROC): -20.598587


Image Encoding: 100%|██████████| 6/6 [00:02<00:00,  2.65it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.61it/s]


Best threshold(ROC): -20.598587


Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.39it/s]
Image Encoding: 100%|██████████| 6/6 [00:00<00:00,  7.31it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.23it/s]


Best threshold(ROC): -20.45969


Image Encoding: 100%|██████████| 6/6 [00:06<00:00,  1.06s/it]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.23it/s]


Best threshold(ROC): -17.72832


Image Encoding: 100%|██████████| 6/6 [00:02<00:00,  2.79it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.47it/s]


Best threshold(ROC): -19.370546


Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.72it/s]
Image Encoding: 100%|██████████| 6/6 [00:00<00:00,  7.75it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  5.09it/s]


Best threshold(ROC): -19.501953


Image Encoding: 100%|██████████| 6/6 [00:05<00:00,  1.09it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.86it/s]


Best threshold(ROC): -18.00021


Image Encoding: 100%|██████████| 6/6 [00:02<00:00,  2.58it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  5.27it/s]


Best threshold(ROC): -18.780025


Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.73it/s]
Image Encoding: 100%|██████████| 6/6 [00:00<00:00,  7.01it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.46it/s]


Best threshold(ROC): -20.477165


Image Encoding: 100%|██████████| 6/6 [00:05<00:00,  1.03it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.83it/s]


Best threshold(ROC): -19.651491


Image Encoding: 100%|██████████| 6/6 [00:02<00:00,  2.58it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  5.19it/s]


Best threshold(ROC): -17.81779


Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.69it/s]
Image Encoding: 100%|██████████| 6/6 [00:00<00:00,  7.74it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.38it/s]


Best threshold(ROC): -20.07657


Image Encoding: 100%|██████████| 6/6 [00:05<00:00,  1.11it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.39it/s]


Best threshold(ROC): -19.87541


Image Encoding: 100%|██████████| 6/6 [00:02<00:00,  2.65it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.79it/s]


Best threshold(ROC): -19.764399


Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  3.97it/s]
Image Encoding: 100%|██████████| 6/6 [00:00<00:00,  7.43it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.47it/s]


Best threshold(ROC): -19.48168


Image Encoding: 100%|██████████| 6/6 [00:06<00:00,  1.05s/it]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.85it/s]


Best threshold(ROC): -19.11981


Image Encoding: 100%|██████████| 6/6 [00:02<00:00,  2.55it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.56it/s]


Best threshold(ROC): -19.11981


Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.77it/s]
Image Encoding: 100%|██████████| 6/6 [00:00<00:00,  7.81it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.44it/s]


Best threshold(ROC): -18.667114


Image Encoding: 100%|██████████| 6/6 [00:06<00:00,  1.09s/it]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.63it/s]


Best threshold(ROC): -18.19967


Image Encoding: 100%|██████████| 6/6 [00:02<00:00,  2.53it/s]
Image Encoding: 100%|██████████| 6/6 [00:01<00:00,  4.60it/s]


Best threshold(ROC): -18.228678
clip auroc: 0.38894985520499925, auprc : 0.46116908391860645
