# Assignment 2 Part 4

Abhinav Kumar
10/13/2025

Feature Extraction + PGVector

In [None]:

from pathlib import Path
import json, numpy as np
import torch, torch.nn.functional as F
import psycopg2
from psycopg2.extras import execute_batch
from anomalib.data import MVTec
from anomalib.models import Patchcore
from lightning import Trainer, seed_everything

# Settings for the Postgres instance that stores the embeddings. "table" is the
# destination table name; the remaining keys are passed to psycopg2.connect().
PG = {
    "host": "localhost",
    "port": 5432,
    "dbname": "mvtec",
    "user": "postgres",
    "password": "postgres",
    "table": "mvtec_embeddings",
}

# MVTec categories processed by the driver loop at the bottom of the file.
CATEGORIES = ("tile", "leather", "grid")

# Fix the global seed (via Lightning) so repeated runs are reproducible.
seed_everything(42)

def get_conn():
    """Open and return a new psycopg2 connection built from ``PG``.

    Only the connection-related entries of ``PG`` are forwarded; the
    ``table`` entry is not a connection parameter and is left out.
    """
    connect_keys = ("host", "port", "dbname", "user", "password")
    connect_kwargs = {key: PG[key] for key in connect_keys}
    return psycopg2.connect(**connect_kwargs)

def ensure_table(dim: int):
    """Create the pgvector extension and the embeddings table if missing.

    Args:
        dim: Dimensionality of the ``embedding VECTOR(dim)`` column. Must be
            a positive integer.

    Raises:
        TypeError: If ``dim`` is not an integer-like value.
        ValueError: If ``dim`` is smaller than 1.

    Note:
        ``CREATE TABLE IF NOT EXISTS`` leaves an existing table untouched, so
        ``dim`` only takes effect the first time the table is created.
    """
    import operator
    from contextlib import closing

    # `dim` is interpolated into the DDL text, so make sure it really is a
    # positive integer before building the statement.
    dim = operator.index(dim)
    if dim < 1:
        raise ValueError(f"embedding dimension must be a positive integer, got {dim}")

    # PG['table'] is a constant defined in this file, not user input.
    ddl = f"""
    CREATE EXTENSION IF NOT EXISTS vector;
    CREATE TABLE IF NOT EXISTS {PG['table']} (
        id SERIAL PRIMARY KEY,
        category TEXT,
        split TEXT,
        image_path TEXT,
        label TEXT,
        anomaly_score DOUBLE PRECISION,
        embedding VECTOR({dim}),
        metadata JSONB
    );
    """
    # psycopg2's `with conn:` only commits / rolls back the transaction; it
    # does NOT close the connection. `closing` guarantees the connection is
    # released even when the DDL fails.
    with closing(get_conn()) as conn:
        with conn, conn.cursor() as cur:
            cur.execute(ddl)


In [None]:

def global_feature(x: torch.Tensor) -> torch.Tensor:
    """Reduce a 4-D feature map to one flat vector per sample.

    A 4-D input is average-pooled to a 1x1 spatial size and flattened from
    dimension 1 onward. Inputs of any other rank are returned unchanged.
    """
    if x.ndim == 4:
        x = torch.nn.functional.adaptive_avg_pool2d(x, 1).flatten(1)
    return x


def extract_features(model, batch):
    """Run the model's feature network on a batch and return pooled vectors.

    Args:
        model: Object whose ``model`` attribute exposes the feature network,
            either as a callable ``backbone`` or as ``feature_extractor``.
        batch: Mapping with an ``"image"`` entry holding the input tensor.

    Returns:
        A NumPy array produced by ``global_feature`` from the network output.
        When the network returns a dict, only its last value is used
        (presumably the deepest layer - verify against the extractor).
    """
    net = model.model
    extractor = getattr(net, "backbone", None)
    if not callable(extractor):
        # In anomalib's PatchcoreModel, `backbone` is the backbone *name*
        # (a str) and the actual network lives in `feature_extractor`;
        # calling the string would raise TypeError.
        extractor = net.feature_extractor

    with torch.no_grad():
        feats = extractor(batch["image"])
        if isinstance(feats, dict):
            feats = list(feats.values())[-1]
        return global_feature(feats).cpu().numpy()


In [None]:

def build_split_embeddings(model, dm, split: str, category: str):
    """Compute embeddings and per-image metadata for one split of a datamodule.

    Args:
        model: Model passed through to ``extract_features``.
        dm: Datamodule providing ``train_dataloader()`` / ``test_dataloader()``.
        split: ``"train"`` selects the train loader; any other value selects
            the test loader.
        category: Category name recorded in every metadata row.

    Returns:
        ``(embeddings, meta)`` where ``embeddings`` is the row-wise
        concatenation of all batch vectors and ``meta`` holds one dict per
        row (category, split, image_path, label, anomaly_score). When the
        loader yields nothing, returns an empty ``(0, 0)`` array and ``[]``.
    """
    loader = dm.train_dataloader() if split == "train" else dm.test_dataloader()
    all_vecs, meta = [], []
    for batch in loader:
        vecs = extract_features(model, batch)
        all_vecs.append(vecs)
        batch_size = vecs.shape[0]
        paths = batch.get("image_path", [""] * batch_size)
        labels = batch.get("label", ["unknown"] * batch_size)
        for i in range(batch_size):
            label = labels[i]
            # Tensor / NumPy scalars stringify as e.g. "tensor(0)"; unwrap
            # them to the plain Python value before converting to text.
            if hasattr(label, "item"):
                label = label.item()
            meta.append(
                dict(
                    category=category,
                    split=split,
                    # str() keeps the row JSON-serialisable if paths are Path objects.
                    image_path=str(paths[i]),
                    label=str(label),
                    # No score is computed here; NaN marks it as not available.
                    anomaly_score=float("nan"),
                )
            )
    if not all_vecs:
        return np.empty((0, 0)), []
    return np.concatenate(all_vecs, axis=0), meta


In [None]:

def insert_embeddings(embeddings, meta_rows):
    """Insert embeddings and their metadata into the ``PG['table']`` table.

    Args:
        embeddings: 2-D array-like with one embedding per row.
        meta_rows: Sequence of dicts, one per embedding, with the keys
            ``category``, ``split``, ``image_path``, ``label`` and
            ``anomaly_score``.

    Raises:
        ValueError: If ``embeddings`` and ``meta_rows`` differ in length.

    The table is created on demand with the embedding width taken from
    ``embeddings.shape[1]``. Empty input is a no-op.
    """
    import math
    from contextlib import closing

    n_vecs, n_meta = len(embeddings), len(meta_rows)
    if n_vecs != n_meta:
        # zip() would silently drop the surplus rows otherwise.
        raise ValueError(
            f"embeddings ({n_vecs}) and meta_rows ({n_meta}) must have the same length"
        )
    if n_vecs == 0:
        # Nothing to store; avoid creating a VECTOR(0) table.
        print("Inserted 0 rows.")
        return

    dim = embeddings.shape[1]
    ensure_table(dim)

    rows = []
    for vec, m in zip(embeddings, meta_rows):
        # json.dumps renders NaN/Infinity as bare tokens, which are not valid
        # JSON and are rejected by PostgreSQL's JSONB type; store null instead.
        json_meta = {
            key: None if isinstance(val, float) and not math.isfinite(val) else val
            for key, val in m.items()
        }
        rows.append(
            (
                m["category"],
                m["split"],
                m["image_path"],
                m["label"],
                m["anomaly_score"],
                list(map(float, vec)),
                json.dumps(json_meta),
            )
        )

    sql = f"""
    INSERT INTO {PG['table']}
    (category, split, image_path, label, anomaly_score, embedding, metadata)
    VALUES (%s, %s, %s, %s, %s, %s::vector, %s);
    """
    # `with conn:` only ends the transaction (commit / rollback); `closing`
    # is what actually releases the connection.
    with closing(get_conn()) as conn:
        with conn, conn.cursor() as cur:
            execute_batch(cur, sql, rows, page_size=256)
    print(f"Inserted {len(rows)} rows.")


In [None]:

# Driver: for every category, build the datamodule and model, then store the
# train-split embeddings followed by the test-split embeddings.
for cat in CATEGORIES:
    print(f"\n[{cat}] building embeddings...")

    dm = MVTec(
        root="./data",
        category=cat,
        image_size=256,
        task="segmentation",
        download=True,
        train_batch_size=8,
        eval_batch_size=8,
        num_workers=4,
    )
    model = Patchcore()

    # fit() is invoked with zero epochs on CPU, with logging and
    # checkpointing disabled (presumably just to set up the datamodule and
    # model - TODO confirm).
    trainer = Trainer(
        accelerator="cpu",
        devices=1,
        max_epochs=0,
        logger=False,
        enable_checkpointing=False,
    )
    trainer.fit(model, dm)

    # Train split first; skip the insert when nothing was extracted.
    tr_vecs, tr_meta = build_split_embeddings(model, dm, "train", cat)
    if tr_vecs.size:
        insert_embeddings(tr_vecs, tr_meta)

    # Then the test split, handled the same way.
    te_vecs, te_meta = build_split_embeddings(model, dm, "test", cat)
    if te_vecs.size:
        insert_embeddings(te_vecs, te_meta)

print("\nDone.")
