## 1 Read Overall Summary

Quickly print the number of samples, the dataset shapes, the template list, and the model parameters stored in the file.

In [None]:
from pathlib import Path
import h5py, json

def _fmt_shape(shape):
    """Render a shape as ``[d0, d1, ...]``.

    Every dimension is coerced with ``int``. If the shape cannot be iterated
    or a dimension cannot be converted, ``"[]"`` is returned instead of
    raising, so the tree printer never fails on an odd dataset.
    """
    try:
        dims = [str(int(dim)) for dim in shape]
    except Exception:
        return "[]"
    return "[" + ", ".join(dims) + "]"

def _dtype(ds):
    """Return ``str(ds.dtype)``, or ``"unknown"`` if the dtype cannot be read."""
    try:
        name = str(ds.dtype)
    except Exception:
        name = "unknown"
    return name

def _is_vlen_str(ds):
    """Tell whether the dtype of *ds* looks like a string type.

    True when the dtype metadata carries ``vlen`` set to ``str``, or when the
    dtype's string form starts with ``"|S"`` or equals ``"object"``.
    Any error while inspecting the dtype yields False.
    """
    try:
        dt = ds.dtype
        meta = getattr(dt, "metadata", None)
        if meta and meta.get("vlen") is str:
            return True
        dt_text = str(dt)
        return dt_text.startswith("|S") or dt_text == "object"
    except Exception:
        return False

def _root_attr_keys(f):
    """Split the root attribute names of *f* into two lists.

    Returns:
        ``(known, remaining)``: ``known`` holds the well-known keys that are
        present, in a fixed display order; ``remaining`` holds every other
        attribute name, sorted.
    """
    display_order = (
        "D", "T_img", "created_at", "version", "split_counts",
        "logit_scale", "logit_bias", "concept_logit_scale", "concept_logit_bias",
        "label_texts", "prompt_temp_for_labels",
        "concept_texts", "prompt_temp_for_concepts",
    )
    attrs = f.attrs
    known = []
    for name in display_order:
        if name in attrs:
            known.append(name)
    remaining = [name for name in attrs.keys() if name not in known]
    remaining.sort()
    return known, remaining

def _template_attr_keys(g):
    """Split the attribute names of template group *g* into two lists.

    Returns:
        ``(known, remaining)``: ``known`` holds the well-known template keys
        that are present, in a fixed display order; ``remaining`` holds every
        other attribute name, sorted.
    """
    display_order = ("K", "D", "T_txt", "texts_hash", "created_at")
    attrs = g.attrs
    known = []
    for name in display_order:
        if name in attrs:
            known.append(name)
    remaining = [name for name in attrs.keys() if name not in known]
    remaining.sort()
    return known, remaining

def _build_h5_tree(path: str, max_templates: int = 3):
    """Build a text tree describing the layout of an HDF5 feature file.

    Args:
        path: Path to the HDF5 file. Its file name is used as the root line.
        max_templates: Maximum number of template groups to expand; any
            further groups are collapsed into a single ``...`` entry.

    Returns:
        A list of strings, one per line of the rendered tree.
    """
    lines = [Path(path).name]
    with h5py.File(path, "r") as f:
        # Root entries are collected without connectors first, so the final
        # one can be drawn with "└──" whether or not "templates" exists.
        root_entries = []
        if "image_features" in f:
            root_entries.append(f"image_features               # {_fmt_shape(f['image_features'].shape)}")
        if "image_token_features" in f:
            root_entries.append(f"image_token_features         # {_fmt_shape(f['image_token_features'].shape)}")
        if "ids" in f:
            root_entries.append(f"ids                          # {_fmt_shape(f['ids'].shape)} {_dtype(f['ids'])}")
        if "labels" in f:
            root_entries.append(f"labels                       # {_fmt_shape(f['labels'].shape)} {_dtype(f['labels'])}")
        if "split" in f:
            root_entries.append(f"split                        # {_fmt_shape(f['split'].shape)}")

        # Root attrs: well-known keys by name, everything else folded into "..."
        pref, others = _root_attr_keys(f)
        attr_list = pref + (["..."] if others else [])
        root_entries.append("attrs: {" + ", ".join(attr_list) + "}")

        has_templates = "templates" in f
        if has_templates:
            root_entries.append("templates/")

        last_root = len(root_entries) - 1
        for pos, entry in enumerate(root_entries):
            lines.append(("└── " if pos == last_root else "├── ") + entry)

        if has_templates:
            tmpl_names = sorted(f["templates"].keys())
            show = tmpl_names[:max_templates]
            truncated = len(tmpl_names) > max_templates
            for i, tid in enumerate(show):
                g = f["templates"][tid]
                # The last shown template only closes the branch when no
                # "..." line follows it.
                is_last_template = (i == len(show) - 1) and not truncated
                lines.append(f"    {'└──' if is_last_template else '├──'} {tid}/")
                # Children keep a vertical rule while sibling templates follow
                prefix = "    " + ("    " if is_last_template else "│   ")
                if "text_features" in g:
                    tf_shape = _fmt_shape(g["text_features"].shape)
                    tf_dtype = _dtype(g["text_features"])
                    lines.append(f"{prefix}├── text_features        # {tf_shape} ({tf_dtype})")
                if "text_token_features" in g:
                    ttf_shape = _fmt_shape(g["text_token_features"].shape)
                    ttf_dtype = _dtype(g["text_token_features"])
                    lines.append(f"{prefix}├── text_token_features  # {ttf_shape} ({ttf_dtype})")
                if "texts" in g:
                    tx_shape = _fmt_shape(g["texts"].shape)
                    vlen = " (variable length string)" if _is_vlen_str(g["texts"]) else ""
                    lines.append(f"{prefix}├── texts                # {tx_shape}{vlen}")
                pref_t, others_t = _template_attr_keys(g)
                attr_list_t = pref_t + (["..."] if others_t else [])
                # Plain (non-f) strings here, so single braces are literal.
                lines.append(prefix + "└── attrs: {" + ", ".join(attr_list_t) + "}")
            if truncated:
                # Ellipsis for the templates that were not expanded
                lines.append("    └── ...")
    return lines

def summarize_h5(path: str = "./conceptclip_features.h5", max_templates: int = 3):
    """Print a short overview of the feature file, then its tree layout.

    Args:
        path: Path to the H5 file.
        max_templates: How many template names to list and expand before
            abbreviating with ``...``.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    if not Path(path).exists():
        raise FileNotFoundError(path)

    with h5py.File(path, "r") as h5:
        feat_shape = h5["image_features"].shape if "image_features" in h5 else None
        print("image_features:", feat_shape)

        id_preview = h5["ids"][:5] if "ids" in h5 else None
        print("ids preview (first 5):", id_preview)

        # Best effort: a missing or unparsable attribute is shown as {}
        try:
            counts = json.loads(h5.attrs.get("split_counts", "{}"))
        except Exception:
            counts = {}
        print("split counts:", counts)

        names = list(h5["templates"].keys()) if "templates" in h5 else []
        shown = names[:max_templates]
        if len(names) > max_templates:
            shown = shown + ["..."]
        print("templates:", shown)

    print("\nH5 structure:\n")
    for row in _build_h5_tree(path, max_templates=max_templates):
        print(row)

# Summarize the default file ./conceptclip_features.h5 (raises FileNotFoundError if it is absent).
summarize_h5()

image_features: (17092, 1152)
ids preview (first 5): [0 1 2 3 4]
split counts: {'train': 11959, 'val': 1712, 'test': 3421}
templates: ['concept_prompts_t01', 'concept_prompts_t02', 'concept_prompts_t03', '...']

H5 structure:

conceptclip_features.h5
├── image_features               # [17092, 1152]
├── image_token_features         # [17092, 729, 1152]
├── ids                          # [17092] int64
├── labels                       # [17092] int64
├── split                        # [17092]
├── attrs: {D, T_img, created_at, version, split_counts, logit_scale, logit_bias, concept_logit_scale, concept_logit_bias, label_texts, prompt_temp_for_labels, concept_texts, prompt_temp_for_concepts}
└── templates/
    ├── concept_prompts_t01/
    │   ├── text_features        # [15, 1152] (float32)
    │   ├── text_token_features  # [15, 15, 1152] (float32)
    │   ├── texts                # [15] (variable length string)
    │   └── attrs: {K, D, T_txt, texts_hash, created_at}}
    ├── concept_promp

## 2 Clear All Template Entries

In [None]:
from pathlib import Path
import h5py

def clear_all_templates(path: str = "./conceptclip_features.h5"):
    """Delete every template group stored under ``templates`` in the H5 file.

    The file is opened in append mode. When the ``templates`` group is
    missing, a message is printed and nothing is changed. Otherwise the whole
    group is deleted and an empty ``templates`` group is created in its
    place, so the top-level structure stays stable.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    if not Path(path).exists():
        raise FileNotFoundError(path)

    with h5py.File(path, "a") as h5:
        if "templates" in h5:
            # Remember the names first so the report can list what was removed
            removed = list(h5["templates"].keys())
            del h5["templates"]
            h5.create_group("templates")
            h5.flush()
            suffix = " ..." if len(removed) > 5 else ""
            print(f"Cleared {len(removed)} template groups: {removed[:5]}" + suffix)
        else:
            print("No 'templates' group found. Nothing to clear.")

# Execute clearing on the default file. This is destructive: every group under
# `templates` is deleted and only an empty `templates` group is left behind.
clear_all_templates()

Cleared 0 template groups: []


In [2]:
# summarize_h5()

## 3 Retrieve Image Features for a Specific Sample

In [None]:
import numpy as np
import h5py

# Access sample by its global index in the database
def access_sample_by_index(index: int, path: str = "./conceptclip_features.h5"):
    """Access a single sample by its global index.

    Args:
        index: Global index in range [0, N-1], where N is the total number of samples.
        path: Path to the H5 file.

    Returns:
        A dict with the keys ``"image_feature"``, ``"image_token_feature"``,
        ``"id"``, ``"label"`` and ``"split"``.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    # Local import: this cell itself only imports numpy and h5py.
    from pathlib import Path

    if not Path(path).exists():
        raise FileNotFoundError(path)

    with h5py.File(path, "r") as f:
        img_feat = f["image_features"][index]
        img_tokens = f["image_token_features"][index]
        sample_id = int(f["ids"][index])
        label = int(f["labels"][index])
        raw_split = f["split"][index]

    # Depending on the h5py version and the storage dtype, a string element is
    # read back as bytes or as str; only bytes have .decode().
    if isinstance(raw_split, (bytes, bytearray)):
        split_name = raw_split.decode("utf-8")
    else:
        split_name = str(raw_split)

    print(f"sample_id: {sample_id}, label: {label}, split_name: {split_name}, img_feat.shape: {img_feat.shape}, img_tokens.shape: {img_tokens.shape}")
    return {
        "image_feature": img_feat,
        "image_token_feature": img_tokens,
        "id": sample_id,
        "label": label,
        "split": split_name,
    }

# Example: access sample at index 12000 in the default file
# (a one-line summary is printed; the returned dict is echoed as the cell output)
access_sample_by_index(12000)

sample_id: 12000, label: 6, split_name: val, img_feat.shape: (1152,), img_tokens.shape: (729, 1152)


{'image_feature': array([-0.0667  ,  0.02164 ,  0.013214, ...,  0.003517,  0.002188,
         0.006638], dtype=float16),
 'image_token_feature': array([[ 0.172  ,  1.203  ,  0.811  , ..., -0.637  ,  0.4094 , -1.029  ],
        [-0.04007,  0.01625,  1.405  , ..., -0.7    ,  1.091  , -0.5845 ],
        [-0.3083 , -0.2852 ,  0.763  , ..., -0.2854 ,  0.68   , -0.807  ],
        ...,
        [-0.2632 ,  0.3813 ,  1.758  , ...,  0.4058 ,  0.644  , -0.603  ],
        [-0.2456 ,  0.596  ,  1.686  , ...,  0.4138 ,  1.04   , -0.6357 ],
        [ 0.1019 ,  1.638  ,  0.8374 , ...,  0.03757,  0.4624 ,  0.2737 ]],
       dtype=float16),
 'id': 12000,
 'label': 6,
 'split': 'val'}

In [None]:
# Customized batch retrieval utilities

import numpy as np
from pathlib import Path
import h5py

def _to_str_array(arr):
    """Convert the items of *arr* into a NumPy array of text.

    ``bytes``/``bytearray`` items are decoded as UTF-8; everything else is
    passed through ``str``.
    """
    decoded = []
    for item in arr:
        if isinstance(item, (bytes, bytearray)):
            decoded.append(item.decode("utf-8"))
        else:
            decoded.append(str(item))
    return np.array(decoded)

def _read_rows(ds, indices) -> np.ndarray:
    """Read rows of *ds* at *indices*, keeping the requested order and repeats.

    h5py only accepts index lists that are strictly increasing, so the
    selection is read sorted and de-duplicated and then re-expanded in memory.
    """
    indices = np.asarray(indices, dtype=np.int64)
    if indices.size == 0:
        return np.empty((0,) + tuple(ds.shape[1:]), dtype=ds.dtype)

    if indices.size > 1 and not np.all(np.diff(indices) > 0):
        read_idx, restore = np.unique(indices, return_inverse=True)
    else:
        read_idx, restore = indices, None

    first, last = int(read_idx[0]), int(read_idx[-1])
    if last - first + 1 == read_idx.size:
        # Contiguous run: a plain slice avoids slow point-by-point selection.
        rows = np.asarray(ds[first:last + 1])
    else:
        rows = np.asarray(ds[read_idx])
    return rows if restore is None else rows[restore]


def access_batch(split: str = "all", idx=None, type: str = "image", path: str = "./conceptclip_features.h5") -> np.ndarray:
    """Return batched data as a numpy.ndarray.

    Args:
        split: One of "all" | "train" | "test" | "val", or any custom split
            name stored in the file. "all" (or an empty value) disables filtering.
        idx: Optional selection over the filtered subset; may be an int, a
            slice, or a sequence of ints. Positions refer to the subset AFTER
            split filtering. A sequence may be in any order and may contain
            repeats; negative positions are rejected for int and sequence
            input. If None, the whole split is returned.
        type: One of "image" | "patches" | "label", mapping to
            image_features / image_token_features / labels.
        path: H5 file path.

    Returns:
        numpy.ndarray with the requested rows, in the requested order.

    Raises:
        FileNotFoundError: If *path* does not exist.
        ValueError: If *type* is not one of the supported names.
        KeyError: If the target dataset, or ``split`` when filtering, is missing.
        IndexError: If *idx* is out of range for the selected split.
        TypeError: If *idx* has an unsupported type.
    """
    if not Path(path).exists():
        raise FileNotFoundError(path)

    kind = type.lower()
    key_map = {
        "image": "image_features",
        "patches": "image_token_features",
        "label": "labels",
    }
    if kind not in key_map:
        raise ValueError(f"Unsupported type '{kind}'. Use one of {list(key_map.keys())}.")
    target_key = key_map[kind]

    with h5py.File(path, "r") as f:
        if target_key not in f:
            raise KeyError(f"Dataset '{target_key}' not found in file.")
        ds = f[target_key]
        total_n = ds.shape[0]

        # Split filtering -> global indices belonging to the split
        split = (split or "all")
        if split == "all":
            indices_in_split = np.arange(total_n, dtype=np.int64)
        else:
            if "split" not in f:
                raise KeyError("Dataset 'split' not found for filtering.")
            split_arr = np.array([
                x.decode("utf-8") if isinstance(x, (bytes, bytearray)) else str(x)
                for x in f["split"][:]
            ])
            indices_in_split = np.flatnonzero(split_arr == split).astype(np.int64)

        # Normalize idx and map subset positions to global indices
        subset_size = indices_in_split.shape[0]
        if idx is None:
            selected_global_idx = indices_in_split
        elif isinstance(idx, slice):
            selected_global_idx = indices_in_split[idx]
        elif isinstance(idx, (list, tuple, np.ndarray)):
            idx_arr = np.asarray(idx, dtype=np.int64)
            if idx_arr.size > 0 and (idx_arr.min() < 0 or idx_arr.max() >= subset_size):
                raise IndexError("Index out of range for the selected split.")
            selected_global_idx = indices_in_split[idx_arr]
        elif isinstance(idx, (int, np.integer)):
            if idx < 0 or idx >= subset_size:
                raise IndexError("Index out of range for the selected split.")
            selected_global_idx = np.array([indices_in_split[int(idx)]], dtype=np.int64)
        else:
            raise TypeError("idx must be None, int, slice, or a sequence of ints.")

        # Must happen while the file is still open
        result = _read_rows(ds, selected_global_idx)
    return result

In [None]:
# Examples:
# Image-level features of the first 128 samples in the "train" split
imgs = access_batch(split="train", idx=slice(0, 128), type="image")
# Token-level (patch) features of four samples, addressed by position within the "val" split
tokens = access_batch(split="val", idx=[0, 2, 4, 1000], type="patches")
# Labels of every sample in the "test" split (idx=None selects the whole split)
labels = access_batch(split="test", idx=None, type="label")
imgs.shape, tokens.shape, labels.shape

((128, 1152), (4, 729, 1152), (3421,))

## 4 Retrieve Text Features for a Specific Prompt Template

The following example demonstrates how to read CLS and token features for a given template.

In [None]:
import h5py

# Get template names
def get_template_names(path: str = "./conceptclip_features.h5"):
    """List the names of all groups stored under ``templates``.

    Returns an empty list when the file has no ``templates`` group.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    if not Path(path).exists():
        raise FileNotFoundError(path)
    with h5py.File(path, "r") as h5:
        if "templates" not in h5:
            return []
        return list(h5["templates"].keys())

# List every template group stored in the default file.
print("templates:", get_template_names())

def _read_json_attr(f: h5py.File, key: str, default=None):
    """Read a root attribute of *f* and JSON-decode it when it is textual.

    Args:
        f: Open H5 file whose root ``attrs`` are consulted.
        key: Attribute name.
        default: Value returned when the attribute is missing or its text is
            not valid JSON.

    Returns:
        The parsed JSON value for ``str``/``bytes`` attributes; the raw
        attribute value for any other type; *default* otherwise.
    """
    if key not in f.attrs:
        return default

    value = f.attrs[key]
    if isinstance(value, (bytes, bytearray)):
        value = value.decode("utf-8", errors="ignore")
    if not isinstance(value, str):
        # Non-textual attributes are handed back untouched
        return value

    try:
        parsed = json.loads(value)
    except json.JSONDecodeError:
        return default
    return parsed

# Open the file once inside a context manager so the handle is closed when the
# block ends; handles left open can get in the way of later cells that reopen
# the same file (for example in append mode).
with h5py.File("./conceptclip_features.h5", "r") as _attrs_file:
    print(_read_json_attr(_attrs_file, "concept_texts"))
    print(_read_json_attr(_attrs_file, "prompt_temp_for_concepts"))
    # NOTE(review): in the layout printed in section 1, "labels" is a dataset and
    # not a root attribute, so this prints None; "label_texts" may be what was
    # intended - confirm before changing.
    print(_read_json_attr(_attrs_file, "labels"))
    print(_read_json_attr(_attrs_file, "logit_scale"))

templates: ['concept_prompts_t01', 'concept_prompts_t02', 'concept_prompts_t03', 'concept_prompts_t04', 'concept_prompts_t05', 'concept_prompts_t06', 'concept_prompts_t07', 'concept_prompts_t08', 'concept_prompts_t09', 'label_prompts_t01', 'label_prompts_t02', 'label_prompts_t03']
['Segmented nucleus', 'Band nucleus (band form)', 'Reniform / indented nucleus', 'Round nucleus', 'Fine azurophilic granules', 'Eosinophilic granules', 'Basophilic granules', 'Basophilic cytoplasm', 'Cytoplasmic vacuoles', 'High nuclear-to-cytoplasmic ratio', 'Pale cytoplasm', 'Nucleated erythrocyte (erythroblast)', 'Platelet fragments / clumps', 'Stain precipitate (artifact)', 'Overlapping cell clumps (artifact)']
['a cell photo with sign of {}', 'a photo of a cell with {}', 'a cell image indicating {}', 'an image of a cell showing {}', 'blood cell with {}', 'a blood cell photo with sign of {}', 'a photo of a blood cell with {}', 'a blood cell image indicating {}', 'an image of blood cell showing {}']
None
4

In [None]:
from pathlib import Path
import numpy as np
import h5py, json

def fetch_template_features(template_id=None, *,
                            is_concept: bool = True,
                            concept_list: list | None = None,
                            is_label: bool = False,
                            type: str = "text",
                            path: str = "./conceptclip_features.h5"):
    """Load text-side features of one or several prompt templates.

    Args:
        template_id: A template name or a list/tuple/set/array of names. When
            given, only names that also survive the prefix filter are used.
        is_concept: Keep only templates whose name starts with 'concept'.
            Cannot be combined with is_label.
        concept_list: With is_concept=True, restrict rows to these concept
            names (looked up in root attr 'concept_texts'), in the given
            order. Unknown names are reported and ignored.
        is_label: Keep only templates whose name starts with 'label'.
            Cannot be combined with is_concept.
        type: 'text' reads 'text_features'; 'tokens' reads 'text_token_features'.
        path: Path to the H5 file.

    Returns:
        A single numpy.ndarray ([K,D] or [K,T_txt,D]) when exactly one
        template yields data; otherwise a dict mapping template id -> array.

    Raises:
        ValueError: Conflicting flags, unknown type, no matching template, or
            no usable concept.
        FileNotFoundError: *path* does not exist.
        KeyError: 'templates' group or 'concept_texts' attr missing.
        IndexError: Concept rows could not be selected from a template.
    """
    if is_concept and is_label:
        raise ValueError("is_concept and is_label are mutually exclusive; please set only one True.")

    kind = type.lower().strip()
    dataset_by_kind = {"text": "text_features", "tokens": "text_token_features"}
    if kind not in dataset_by_kind:
        raise ValueError("type must be 'text' or 'tokens'")
    ds_key = dataset_by_kind[kind]

    if not Path(path).exists():
        raise FileNotFoundError(path)

    # Requested ids as a set of strings; None means "no restriction"
    if template_id is None:
        wanted = None
    elif isinstance(template_id, (list, tuple, set, np.ndarray)):
        wanted = set(map(str, template_id))
    else:
        wanted = {str(template_id)}

    # Name prefix to keep; the empty prefix matches every template
    if is_concept:
        name_prefix = "concept"
    elif is_label:
        name_prefix = "label"
    else:
        name_prefix = ""

    collected = {}
    with h5py.File(path, "r") as h5:
        if "templates" not in h5:
            raise KeyError("Group 'templates' not found.")
        tmpl_root = h5["templates"]

        candidates = [name for name in sorted(tmpl_root.keys()) if name.startswith(name_prefix)]
        if wanted is not None:
            candidates = [name for name in candidates if name in wanted]
            if not candidates:
                raise ValueError("Provided template_id has no intersection with filtered templates.")

        # Optional row selection by concept name
        row_idx = None
        if is_concept and concept_list:
            known_concepts = _read_json_attr(h5, "concept_texts", default=None)
            if not known_concepts:
                raise KeyError("Root attr 'concept_texts' missing or empty; cannot subset by concepts.")
            index_of = {str(name): i for i, name in enumerate(known_concepts)}
            unknown = []
            rows = []
            for concept in concept_list:
                if str(concept) in index_of:
                    rows.append(index_of[str(concept)])
                else:
                    unknown.append(concept)
            if unknown:
                print(f"Warning: Concepts not found in concept_texts, will ignore: {unknown}")
            if not rows:
                raise ValueError("concept_list empty or no matches in concept_texts.")
            row_idx = np.array(rows, dtype=np.int64)

        for name in candidates:
            group = tmpl_root[name]
            if ds_key not in group:
                # Token features may legitimately be absent for a template
                continue
            data = group[ds_key][:]  # [K,D] or [K,T_txt,D]
            if row_idx is not None:
                try:
                    data = data[row_idx]
                except Exception as e:
                    raise IndexError(f"Failed to slice template {name} with selected concept indices: {e}")
            collected[name] = np.asarray(data)

    if not collected:
        raise ValueError("No template matched criteria or target dataset missing.")
    if len(collected) == 1:
        (only,) = collected.values()
        return only
    return collected

In [None]:
# Usage examples:
# 1) Return CLS features of all concept templates, with rows restricted to the
#    listed concepts (in the listed order). A dict keyed by template id is
#    returned whenever more than one template matches.
feats = fetch_template_features(is_concept=True, concept_list=["Segmented nucleus", "Round nucleus"], type="text")
# 2) Return token features of two specific concept templates
feats_map = fetch_template_features(template_id=["concept_prompts_t01", "concept_prompts_t02"], is_concept=True, type="tokens")
# 3) Return CLS features of label templates (is_concept must be switched off,
#    because it defaults to True and the two flags are mutually exclusive)
label_feats = fetch_template_features(is_concept=False, is_label=True, type="text")
# Indexing by template id assumes several label templates matched (dict result)
print(label_feats['label_prompts_t03'].shape)

(8, 1152)
