In [1]:
import pandas as pd

# 1 Initialize Concept and Prompt Template Lists

In [None]:
CONCEPTS_dict = {
    # --- Margins/Shape ---
    
    # 1. Spiculation
    # Malignant sign: Refers to radial fine lines extending from the nodule margin into surrounding lung parenchyma.
    "spiculation": [
        "spiculated",             # Spiculated
        "spiculation",            # Spiculation
        "radial spicules",        # Radial spicules
        "spiculated margins",     # Spiculated margins
        "spikes extending from the surface" # Spikes extending from the surface
    ],

    # 2. Lobulation
    # Malignant sign: Due to uneven growth rates in different parts of the tumor, the margin appears wavy or scalloped.
    "lobulation": [
        "lobulated",              # Lobulated
        "lobulation",             # Lobulation
        "a scalloped contour",    # Scalloped/wavy contour
        "lobulated margins",      # Lobulated margins
        "wavy contour"            # Wavy contour
    ],

    # 3. Round/Sphericity
    # Benign tendency: Often seen in hamartomas, tuberculomas, etc., but also seen in metastases.
    "round_sphericity": [
        "round",                  # Round
        "spherical",              # Spherical
        "high-sphericity",        # High sphericity
        "a round shape",          # A round shape
        "highly spherical",       # Highly spherical
        "circular shape"          # Circular shape
    ],

    # --- Density/Attenuation ---

    # 4. Pure Ground-Glass Nodule (Pure GGN)
    # Alveolar spaces are air-filled but walls are thickened; does not obscure vascular markings.
    "pure_GGN": [
        "pure ground-glass",            # Pure ground-glass
        "non-solid",                    # Non-solid
        "pure ground-glass appearance", # Pure ground-glass appearance
        "non-solid attenuation",        # Non-solid attenuation
        "pure GGO"                      # Pure GGO (Opacities)
    ],

    # 5. Part-solid/Mixed
    # Highest probability of malignancy: Contains both ground-glass and solid components.
    "part_solid": [
        "part-solid",                   # Part-solid
        "subsolid",                     # Subsolid
        "a solid focus within ground-glass", # A solid focus within ground-glass
        "mixed ground-glass density",   # Mixed ground-glass density
        "mixed attenuation"             # Mixed attenuation values
    ],

    # 6. Solid Nodule
    # Completely obscures pulmonary vascular markings; high density.
    "solid": [
        "solid",                        # Solid
        "soft-tissue attenuation",      # Soft-tissue attenuation
        "solid attenuation",            # Solid attenuation
        "dense solid structure"         # Dense solid structure
    ],

    # --- Internal Structure & Special Signs ---

    # 7. Benign Calcification
    # Usually presents as diffuse, central, laminar, or popcorn-like.
    "benign_calc": [
        "diffuse calcification",        # Diffuse calcification
        "central calcification",        # Central calcification
        "laminar calcification",        # Laminar calcification
        "popcorn calcification",        # Popcorn-like calcification
        "benign calcification pattern"  # Benign calcification pattern
    ],

    # 8. Eccentric/Punctate Calcification
    # Higher malignant risk: e.g., eccentric distribution or scattered punctate.
    "eccentric_punctate_calc": [
        "eccentrically calcified",      # Eccentrically calcified
        "eccentric calcification",      # Eccentric calcification
        "punctate calcification",       # Punctate calcification
        "stippled calcification",       # Stippled calcification
        "small scattered calcifications" # Small scattered calcifications
    ],

    # 9. Air Bronchogram/Cavitation
    # Air-filled bronchi or cavities visible inside the nodule.
    "air_bronch_cav": [
        "cavitary",                     # Cavitary
        "an air bronchogram",           # Air bronchogram
        "internal air lucency",         # Internal air lucency
        "cavitation",                   # Cavitation formation
        "air-filled pockets"            # Air-filled pockets
    ],

    # --- Image Quality & Visibility ---

    # 10. Very Subtle/Occult Features
    # Low contrast, even hard for the human eye to catch; used to test model sensitivity to weak signals.
    "very_subtle": [
        "very subtle",                  # Very subtle/occult
        "low-contrast",                 # Low-contrast
        "faint-margin",                 # Blurred margin
        "low contrast",                 # Low contrast
        "faint margins",                # Faint margins
        "hard to see",                  # Hard to see
        "indistinct boundaries"         # Indistinct boundaries
    ]
}

# ---------------------------------------------------------
# Universal Templates
# ---------------------------------------------------------

PROMPT_TEMPLATES = [
    # Format A: Simple Concatenation (Context: {Description})
    "chest CT showing a pulmonary nodule: {}.",
    "axial chest CT of a pulmonary nodule, {}.",
    "lung window CT depicting a pulmonary nodule, {}.",
    
    # Format B: Descriptive (Context showing {Description})
    # This format works well for both nouns (spiculation) and adjectives (spiculated)
    "chest CT image of a pulmonary nodule showing {}.",
    "axial lung CT slice demonstrating {}.",
    "a pulmonary nodule characterized by {}.",
    
    # Format C: Emphasizing Features (The nodule is/has {Description})
    "chest CT where the pulmonary nodule is {}.",
    "chest CT showing a pulmonary nodule with features of {}.",
    
    # Format D: Short Medical Description Style
    "pulmonary nodule, {}.",
    "CT scan, lung nodule, {}."
]

# Deduplicate all concepts from CONCEPTS_dict into a list, preserving order
concept_list = []

for concept_type, concepts in CONCEPTS_dict.items():
    for concept in concepts:
        if concept not in concept_list:
            concept_list.append(concept)

concept_prompt_template_list = PROMPT_TEMPLATES

label_list = ['benign', 'malignant']

label_prompt_template_list = [
    "a {} lung nodule in CT scan",
    "a lung nodule showing {} in CT scan",
    "a CT scan image of a {} lung nodule",
    "a CT slice with a {} lung nodule",
    "a Chest CT image showing a {} nodule",
    "{} nodule in Chest CT scan",
    "{} nodule in Chest CT"
]

DB_METADATA = {
    "label_texts": label_list,
    "prompt_temp_for_labels": label_prompt_template_list,
    "concept_texts": concept_list,
    "prompt_temp_for_concepts": concept_prompt_template_list,
}

print("Number of sub-concepts:", len(concept_list))

Number of sub-concepts: 52


# 2 Database Overview

In [3]:
from pathlib import Path
import h5py, json

def _fmt_shape(shape):
    try:
        return "[" + ", ".join(str(int(x)) for x in shape) + "]"
    except Exception:
        return "[]"

def _dtype(ds):
    try:
        return str(ds.dtype)
    except Exception:
        return "unknown"

def _is_vlen_str(ds):
    try:
        dt = ds.dtype
        return (hasattr(dt, "metadata") and dt.metadata and dt.metadata.get("vlen") is str) or str(dt).startswith("|S") or str(dt) == "object"
    except Exception:
        return False

def _root_attr_keys(f):
    preferred = [
        "D","T_img","created_at","version","split_counts","logit_scale","logit_bias","concept_logit_scale","concept_logit_bias",
        "label_texts","prompt_temp_for_labels","concept_texts","prompt_temp_for_concepts",
    ]
    exists = [k for k in preferred if k in f.attrs]
    others = sorted([k for k in f.attrs.keys() if k not in exists])
    return exists, others

def _template_attr_keys(g):
    preferred = ["K","D","T_txt","texts_hash","created_at"]
    exists = [k for k in preferred if k in g.attrs]
    others = sorted([k for k in g.attrs.keys() if k not in exists])
    return exists, others

def _build_h5_tree(path: str, max_templates: int = 3):
    lines = []
    with h5py.File(path, "r") as f:
        lines.append("conceptclip_features.h5")
        # Datasets at root
        if "image_features" in f:
            lines.append(f"├── image_features               # {_fmt_shape(f['image_features'].shape)}")
        if "image_token_features" in f:
            lines.append(f"├── image_token_features         # {_fmt_shape(f['image_token_features'].shape)}")
        if "ids" in f:
            lines.append(f"├── ids                          # {_fmt_shape(f['ids'].shape)} int64")
        if "labels" in f:
            lines.append(f"├── labels                       # {_fmt_shape(f['labels'].shape)} int64")
        if "split" in f:
            lines.append(f"├── split                        # {_fmt_shape(f['split'].shape)}")
        # Root attrs
        pref, others = _root_attr_keys(f)
        attr_list = pref + (["..."] if others else [])
        lines.append("├── attrs: {" + ", ".join(attr_list) + "}")
        # Templates group
        if "templates" in f:
            lines.append("└── templates/")
            tmpl_names = sorted(list(f["templates"].keys()))
            show = tmpl_names[:max_templates]
            for i, tid in enumerate(show):
                g = f["templates"][tid]
                is_last_template = (i == len(show) - 1) and (len(tmpl_names) <= max_templates)
                # Template group header
                lines.append(f"    {'└──' if is_last_template else '├──'} {tid}/")
                # Inside template group
                prefix = "    " + ("    " if is_last_template else "│   ")
                if "text_features" in g:
                    tf_shape = _fmt_shape(g["text_features"].shape)
                    tf_dtype = _dtype(g["text_features"])
                    lines.append(f"{prefix}├── text_features        # {tf_shape} ({tf_dtype})")
                if "text_token_features" in g:
                    ttf_shape = _fmt_shape(g["text_token_features"].shape)
                    ttf_dtype = _dtype(g["text_token_features"])
                    lines.append(f"{prefix}├── text_token_features  # {ttf_shape} ({ttf_dtype})")
                if "texts" in g:
                    tx_shape = _fmt_shape(g["texts"].shape)
                    vlen = " (variable length string)" if _is_vlen_str(g["texts"]) else ""
                    lines.append(f"{prefix}├── texts                # {tx_shape}{vlen}")
                pref_t, others_t = _template_attr_keys(g)
                attr_list_t = pref_t + (["..."] if others_t else [])
                lines.append(f"{prefix}└── attrs: {{" + ", ".join(attr_list_t) + "}}")
            if len(tmpl_names) > max_templates:
                # Ellipsis for more templates
                lines.append("    └── ...")
    return lines

def summarize_h5(path: str = "./conceptclip_features.h5", max_templates: int = 3):
    if not Path(path).exists():
        raise FileNotFoundError(path)
    with h5py.File(path, "r") as f:
        print("image_features:", f["image_features"].shape if "image_features" in f else None)
        print("ids preview (first 5):", f["ids"][:5] if "ids" in f else None)
        try:
            print("split counts:", json.loads(f.attrs.get("split_counts", "{}")))
        except Exception:
            print("split counts:", {})
        templates = list(f["templates"].keys()) if "templates" in f else []
        print("templates:", templates[:max_templates] + (["..."] if len(templates) > max_templates else []))
    print("\nH5 structure:\n")
    for line in _build_h5_tree(path, max_templates=max_templates):
        print(line)

summarize_h5()

image_features: (2532, 1152)
ids preview (first 5): [0 1 2 3 4]
split counts: {'LIDC-IDRI-0001_0': 5, 'LIDC-IDRI-0002_0': 5, 'LIDC-IDRI-0003_0': 5, 'LIDC-IDRI-0003_1': 5, 'LIDC-IDRI-0003_2': 2, 'LIDC-IDRI-0003_3': 4, 'LIDC-IDRI-0006_0': 1, 'LIDC-IDRI-0007_0': 5, 'LIDC-IDRI-0007_1': 4, 'LIDC-IDRI-0011_0': 3, 'LIDC-IDRI-0012_0': 4, 'LIDC-IDRI-0013_0': 1, 'LIDC-IDRI-0013_1': 5, 'LIDC-IDRI-0014_0': 5, 'LIDC-IDRI-0015_0': 5, 'LIDC-IDRI-0016_0': 4, 'LIDC-IDRI-0016_1': 2, 'LIDC-IDRI-0016_2': 5, 'LIDC-IDRI-0018_0': 3, 'LIDC-IDRI-0018_1': 5, 'LIDC-IDRI-0019_0': 5, 'LIDC-IDRI-0020_0': 5, 'LIDC-IDRI-0022_0': 5, 'LIDC-IDRI-0023_0': 5, 'LIDC-IDRI-0024_0': 4, 'LIDC-IDRI-0024_1': 2, 'LIDC-IDRI-0027_0': 1, 'LIDC-IDRI-0027_1': 1, 'LIDC-IDRI-0029_0': 5, 'LIDC-IDRI-0031_0': 3, 'LIDC-IDRI-0031_1': 5, 'LIDC-IDRI-0033_0': 2, 'LIDC-IDRI-0036_0': 2, 'LIDC-IDRI-0037_0': 4, 'LIDC-IDRI-0041_0': 5, 'LIDC-IDRI-0043_0': 5, 'LIDC-IDRI-0043_1': 5, 'LIDC-IDRI-0044_0': 2, 'LIDC-IDRI-0044_1': 5, 'LIDC-IDRI-0044_2': 3, '

# 3 Extract Image Features

In [4]:
# Customized batch retrieval utilities

import numpy as np
from pathlib import Path
import h5py

def _to_str_array(arr):
    return np.array([x.decode("utf-8") if isinstance(x, (bytes, bytearray)) else str(x) for x in arr])

def access_batch(split: str = "all", idx=None, type: str = "image", path: str = "./conceptclip_features.h5") -> np.ndarray:
    """Return batched data as a numpy.ndarray.
    Args:
        split: One of "all" | "train" | "test" | "val" | "arbitrary_name". Filters by data split; "all" means no filtering. can be an arbitrary name if custom splits are used.
        idx:   Optional selection over the filtered subset; may be an int, slice, or a sequence of ints. Applies to indices AFTER split filtering. If None, returns all indices in the split.
        type:  One of "image" | "patches" | "label" mapping to image_features / image_token_features / labels.
        path:  H5 file path.
    Returns:
        numpy.ndarray with the requested data subset.
    Raises:
        FileNotFoundError, ValueError, IndexError, KeyError, TypeError for invalid inputs or missing datasets.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(path)

    type = type.lower()
    key_map = {
        "image": "image_features",
        "patches": "image_token_features",
        "label": "labels",
    }
    if type not in key_map:
        raise ValueError(f"Unsupported type '{type}'. Use one of {list(key_map.keys())}.")
    target_key = key_map[type]

    with h5py.File(path, "r") as f:
        if target_key not in f:
            raise KeyError(f"Dataset '{target_key}' not found in file.")
        total_n = f[target_key].shape[0]

        # Split filtering
        split = (split or "all")
        if split == "all":
            indices_in_split = np.arange(total_n, dtype=np.int64)
        else:
            if "split" not in f:
                raise KeyError("Dataset 'split' not found for filtering.")
            split_arr = _to_str_array(f["split"][:])

            mask = (split_arr == split)
            indices_in_split = np.flatnonzero(mask).astype(np.int64)

        # Normalize idx and map to global indices
        if idx is None:
            selected_global_idx = indices_in_split
        elif isinstance(idx, slice):
            selected_global_idx = indices_in_split[idx]
        elif isinstance(idx, (list, tuple, np.ndarray)):
            idx_arr = np.asarray(idx, dtype=np.int64)
            if idx_arr.size > 0 and (idx_arr.min() < 0 or idx_arr.max() >= indices_in_split.shape[0]):
                raise IndexError("Index out of range for the selected split.")
            selected_global_idx = indices_in_split[idx_arr]
        elif isinstance(idx, (int, np.integer)):
            if idx < 0 or idx >= indices_in_split.shape[0]:
                raise IndexError("Index out of range for the selected split.")
            selected_global_idx = np.array([indices_in_split[int(idx)]], dtype=np.int64)
        else:
            raise TypeError("idx must be None, int, slice, or a sequence of ints.")

        # Fetch from dataset
        ds = f[target_key]
        result = ds[selected_global_idx]
        result = np.asarray(result)
    return result

In [None]:
# Store image embeddings into DataFrame by split, using the previously saved curated_metadata.csv
df_image_features = pd.read_csv('curated_metadata.csv')
# Remove column img_path_curated
# df_image_features = df_image_features.drop(columns=['img_path_curated'])
if 'image_embedding' not in df_image_features:
    df_image_features['image_embedding'] = pd.Series([None]*len(df_image_features), dtype='object')
df_image_features

Unnamed: 0,patient_id,pid_nid_combo,img_path_curated,area_mm2,malignancy_label,img_shape_H,img_shape_W,image_embedding
0,LIDC-IDRI-0078,LIDC-IDRI-0078_0,./datasets/curation2/lidc_patches_all\LIDC-IDR...,156.325000,1,44,54,
1,LIDC-IDRI-0078,LIDC-IDRI-0078_0,./datasets/curation2/lidc_patches_all\LIDC-IDR...,184.210000,1,44,54,
2,LIDC-IDRI-0078,LIDC-IDRI-0078_0,./datasets/curation2/lidc_patches_all\LIDC-IDR...,191.392500,1,44,54,
3,LIDC-IDRI-0078,LIDC-IDRI-0078_0,./datasets/curation2/lidc_patches_all\LIDC-IDR...,147.875000,1,44,54,
4,LIDC-IDRI-0078,LIDC-IDRI-0078_0,./datasets/curation2/lidc_patches_all\LIDC-IDR...,81.120000,1,44,54,
...,...,...,...,...,...,...,...,...
2527,LIDC-IDRI-0127,LIDC-IDRI-0127_0,./datasets/curation2/lidc_patches_all\LIDC-IDR...,904.275089,0,61,62,
2528,LIDC-IDRI-0127,LIDC-IDRI-0127_0,./datasets/curation2/lidc_patches_all\LIDC-IDR...,974.722309,0,61,62,
2529,LIDC-IDRI-0127,LIDC-IDRI-0127_0,./datasets/curation2/lidc_patches_all\LIDC-IDR...,999.014454,0,61,62,
2530,LIDC-IDRI-0127,LIDC-IDRI-0127_0,./datasets/curation2/lidc_patches_all\LIDC-IDR...,949.822861,0,61,62,


In [None]:
# Extract image features by split

# Display split distribution
with h5py.File("./conceptclip_features.h5", "r") as f:
    split_counts = f.attrs['split_counts']
    print("Split distribution:", split_counts)

split_counts = eval(split_counts)

Split distribution: {"LIDC-IDRI-0001_0": 5, "LIDC-IDRI-0002_0": 5, "LIDC-IDRI-0003_0": 5, "LIDC-IDRI-0003_1": 5, "LIDC-IDRI-0003_2": 2, "LIDC-IDRI-0003_3": 4, "LIDC-IDRI-0006_0": 1, "LIDC-IDRI-0007_0": 5, "LIDC-IDRI-0007_1": 4, "LIDC-IDRI-0011_0": 3, "LIDC-IDRI-0012_0": 4, "LIDC-IDRI-0013_0": 1, "LIDC-IDRI-0013_1": 5, "LIDC-IDRI-0014_0": 5, "LIDC-IDRI-0015_0": 5, "LIDC-IDRI-0016_0": 4, "LIDC-IDRI-0016_1": 2, "LIDC-IDRI-0016_2": 5, "LIDC-IDRI-0018_0": 3, "LIDC-IDRI-0018_1": 5, "LIDC-IDRI-0019_0": 5, "LIDC-IDRI-0020_0": 5, "LIDC-IDRI-0022_0": 5, "LIDC-IDRI-0023_0": 5, "LIDC-IDRI-0024_0": 4, "LIDC-IDRI-0024_1": 2, "LIDC-IDRI-0027_0": 1, "LIDC-IDRI-0027_1": 1, "LIDC-IDRI-0029_0": 5, "LIDC-IDRI-0031_0": 3, "LIDC-IDRI-0031_1": 5, "LIDC-IDRI-0033_0": 2, "LIDC-IDRI-0036_0": 2, "LIDC-IDRI-0037_0": 4, "LIDC-IDRI-0041_0": 5, "LIDC-IDRI-0043_0": 5, "LIDC-IDRI-0043_1": 5, "LIDC-IDRI-0044_0": 2, "LIDC-IDRI-0044_1": 5, "LIDC-IDRI-0044_2": 3, "LIDC-IDRI-0045_0": 1, "LIDC-IDRI-0045_1": 3, "LIDC-IDRI-00

In [7]:
for key, value in split_counts.items():
    mask = df_image_features['pid_nid_combo'].eq(key)
    features = access_batch(split=key, type="image", path="./conceptclip_features.h5")
    # print(f"{key} features shape:", features.shape)
    vals = [row.copy() for row in np.asarray(features)]
    df_image_features.loc[mask, 'image_embedding'] = pd.Series(vals, index=df_image_features.index[mask], dtype='object')

In [None]:
# Save the DataFrame containing image features to a CSV file
df_image_features_tocsv = df_image_features.copy()[['patient_id', 'pid_nid_combo', 'img_path_curated', 'area_mm2',
       'malignancy_label', 'img_shape_H', 'img_shape_W', 'image_embedding']]
# The 'image_embedding' column is an Array type; expand its elements into multiple columns appended at the end
image_embedding_expanded = pd.DataFrame(df_image_features_tocsv['image_embedding'].tolist(),
                                        index=df_image_features_tocsv.index)
df_image_features_tocsv = pd.concat([df_image_features_tocsv.drop(columns=['image_embedding']), image_embedding_expanded], axis=1)

df_image_features_tocsv.to_csv('./image_features/df_image_features.csv', index=False)
# df_image_features_tocsv.head(3)

# 4 Extract Text Features

In [9]:
import h5py

# Get template names
def get_template_names(path: str = "./conceptclip_features.h5"):
    if not Path(path).exists():
        raise FileNotFoundError(path)
    with h5py.File(path, "r") as f:
        templates = list(f["templates"].keys()) if "templates" in f else []
    return templates

print("templates:", get_template_names())

def _read_json_attr(f: h5py.File, key: str, default=None):
    """Read and parse a JSON-formatted attribute from the H5 file root attrs."""
    if key not in f.attrs:
        return default
    raw = f.attrs[key]
    if isinstance(raw, (bytes, bytearray)):
        raw = raw.decode("utf-8", errors="ignore")
    if isinstance(raw, str):
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            return default
    return raw

print(_read_json_attr(h5py.File("./conceptclip_features.h5", "r"), "concept_texts"))
print(_read_json_attr(h5py.File("./conceptclip_features.h5", "r"), "prompt_temp_for_concepts"))
print(_read_json_attr(h5py.File("./conceptclip_features.h5", "r"), "label_texts"))
print(_read_json_attr(h5py.File("./conceptclip_features.h5", "r"), "prompt_temp_for_labels"))

templates: ['concept_prompts_t01', 'concept_prompts_t02', 'concept_prompts_t03', 'concept_prompts_t04', 'concept_prompts_t05', 'concept_prompts_t06', 'concept_prompts_t07', 'concept_prompts_t08', 'concept_prompts_t09', 'concept_prompts_t10', 'label_prompts_t01', 'label_prompts_t02', 'label_prompts_t03', 'label_prompts_t04', 'label_prompts_t05', 'label_prompts_t06', 'label_prompts_t07']
['spiculated', 'spiculation', 'radial spicules', 'spiculated margins', 'spikes extending from the surface', 'lobulated', 'lobulation', 'a scalloped contour', 'lobulated margins', 'wavy contour', 'round', 'spherical', 'high-sphericity', 'a round shape', 'highly spherical', 'circular shape', 'pure ground-glass', 'non-solid', 'pure ground-glass appearance', 'non-solid attenuation', 'pure GGO', 'part-solid', 'subsolid', 'a solid focus within ground-glass', 'mixed ground-glass density', 'mixed attenuation', 'solid', 'soft-tissue attenuation', 'solid attenuation', 'dense solid structure', 'diffuse calcificatio

In [10]:
from pathlib import Path
import numpy as np
import h5py, json

Get all combinations of concepts and prompt templates by concept class

In [None]:
# Generate all combinations of concepts and prompt templates by concept class
# Output:
# 1. combos_by_concept_class dict: {concept_class: [ {concept, concept_idx, template, template_idx, filled_prompt}, ... ]}
# 2. df_combos_by_concept_class DataFrame: columns include
#    ['concept_class', 'concept', 'concept_idx', 'template', 'template_idx', 'filled_prompt']
# Note: concept_idx is based on the global concept_list index constructed earlier; template_idx is based on PROMPT_TEMPLATES / concept_prompt_template_list index.

# Safety check: ensure prerequisite variables exist
required_vars = [
    'CONCEPTS_dict', 'concept_list', 'concept_prompt_template_list'
]
for v in required_vars:
    if v not in globals():
        raise NameError(f"Missing required variable: {v}")

combos_by_concept_class = {}
rows = []

for concept_class, variants in CONCEPTS_dict.items():
    class_entries = []
    for concept in variants:  # Iterate through all synonyms/expressions under this category
        try:
            concept_idx = concept_list.index(concept)
        except ValueError:
            # Skip if not in the flat list (theoretically shouldn't happen)
            continue
        for template_idx, template in enumerate(concept_prompt_template_list):
            # Generate the filled prompt (replace {} placeholder in template with concept term)
            try:
                filled = template.format(concept)
            except Exception:
                # If template doesn't contain {} or formatting fails, concatenate directly
                filled = f"{template} {concept}".strip()
            entry = {
                'concept_class': concept_class,
                'concept': concept,
                'concept_idx': concept_idx,
                'template': template,
                'template_idx': template_idx,
                'filled_prompt': filled,
            }
            class_entries.append(entry)
            rows.append(entry)
    combos_by_concept_class[concept_class] = class_entries

# Build DataFrame
import pandas as pd
if rows:
    df_combos_by_concept_class = pd.DataFrame(rows)
else:
    df_combos_by_concept_class = pd.DataFrame(columns=[
        'concept_class', 'concept', 'concept_idx', 'template', 'template_idx', 'filled_prompt'
    ])

print(f"Number of concept classes: {len(combos_by_concept_class)}; Total combination entries: {len(rows)}")
print("DataFrame preview:")
display(df_combos_by_concept_class.head(10))

概念大类数: 10; 总组合条目数: 520
DataFrame 预览：


Unnamed: 0,concept_class,concept,concept_idx,template,template_idx,filled_prompt
0,spiculation,spiculated,0,chest CT showing a pulmonary nodule: {}.,0,chest CT showing a pulmonary nodule: spiculated.
1,spiculation,spiculated,0,"axial chest CT of a pulmonary nodule, {}.",1,"axial chest CT of a pulmonary nodule, spiculated."
2,spiculation,spiculated,0,"lung window CT depicting a pulmonary nodule, {}.",2,"lung window CT depicting a pulmonary nodule, s..."
3,spiculation,spiculated,0,chest CT image of a pulmonary nodule showing {}.,3,chest CT image of a pulmonary nodule showing s...
4,spiculation,spiculated,0,axial lung CT slice demonstrating {}.,4,axial lung CT slice demonstrating spiculated.
5,spiculation,spiculated,0,a pulmonary nodule characterized by {}.,5,a pulmonary nodule characterized by spiculated.
6,spiculation,spiculated,0,chest CT where the pulmonary nodule is {}.,6,chest CT where the pulmonary nodule is spicula...
7,spiculation,spiculated,0,chest CT showing a pulmonary nodule with featu...,7,chest CT showing a pulmonary nodule with featu...
8,spiculation,spiculated,0,"pulmonary nodule, {}.",8,"pulmonary nodule, spiculated."
9,spiculation,spiculated,0,"CT scan, lung nodule, {}.",9,"CT scan, lung nodule, spiculated."


In [None]:
# Build mapping from template_id (f.templates.keys()) to template_key in H5 database
with h5py.File("./conceptclip_features.h5", "r") as f:
    template_keys = f['templates'].keys()
    template_keys = [template for template in template_keys if template.startswith("concept")]
    print(template_keys)
    # template_proj = [(template_id, template) for template_id, template in zip(template_keys, concept_prompt_template_list) if template_id.startswith("concept")]

def fetch_text_features(row):
    template_id = row['template_idx']
    template_key = template_keys[template_id]
    concept_idx = row['concept_idx']
    # print(f"Fetching text features for template_key: {template_key}, concept_idx: {concept_idx}")
    with h5py.File("./conceptclip_features.h5", "r") as f:
        return f['templates'][template_key]['text_features'][concept_idx]

# Add a new column to store the corresponding text_embedding
df_combos_by_concept_class['text_embedding'] = df_combos_by_concept_class.apply(
    fetch_text_features, axis=1
)

['concept_prompts_t01', 'concept_prompts_t02', 'concept_prompts_t03', 'concept_prompts_t04', 'concept_prompts_t05', 'concept_prompts_t06', 'concept_prompts_t07', 'concept_prompts_t08', 'concept_prompts_t09', 'concept_prompts_t10']


In [13]:
df_combos_by_concept_class

Unnamed: 0,concept_class,concept,concept_idx,template,template_idx,filled_prompt,text_embedding
0,spiculation,spiculated,0,chest CT showing a pulmonary nodule: {}.,0,chest CT showing a pulmonary nodule: spiculated.,"[0.0072476836, 0.024374694, 0.015739452, 0.025..."
1,spiculation,spiculated,0,"axial chest CT of a pulmonary nodule, {}.",1,"axial chest CT of a pulmonary nodule, spiculated.","[-0.0043996465, 0.028638378, 0.020175708, 0.02..."
2,spiculation,spiculated,0,"lung window CT depicting a pulmonary nodule, {}.",2,"lung window CT depicting a pulmonary nodule, s...","[-0.00091899675, 0.023336254, 0.018487338, 0.0..."
3,spiculation,spiculated,0,chest CT image of a pulmonary nodule showing {}.,3,chest CT image of a pulmonary nodule showing s...,"[-0.0007351318, 0.028997866, 0.024500718, 0.02..."
4,spiculation,spiculated,0,axial lung CT slice demonstrating {}.,4,axial lung CT slice demonstrating spiculated.,"[-0.025196226, 0.010378807, 0.030166976, 0.018..."
...,...,...,...,...,...,...,...
515,very_subtle,indistinct boundaries,51,a pulmonary nodule characterized by {}.,5,a pulmonary nodule characterized by indistinct...,"[-0.0017267592, 0.027832087, 0.0128417555, 0.0..."
516,very_subtle,indistinct boundaries,51,chest CT where the pulmonary nodule is {}.,6,chest CT where the pulmonary nodule is indisti...,"[-0.004122755, 0.0043851356, 0.015300023, 0.01..."
517,very_subtle,indistinct boundaries,51,chest CT showing a pulmonary nodule with featu...,7,chest CT showing a pulmonary nodule with featu...,"[-0.0023465124, 0.015994048, 0.018817179, 0.01..."
518,very_subtle,indistinct boundaries,51,"pulmonary nodule, {}.",8,"pulmonary nodule, indistinct boundaries.","[0.0015386346, 0.03336902, 0.012055268, 0.0359..."


Perform Prompt Ensembling on the obtained text_embeddings by concept_class.

Notes:
- You must L2-normalize image and text vectors before computing dot products/aggregation. Otherwise, "vector magnitude differences" will bias the similarity scores.
- In the open_clip library, the text encoder and image encoder outputs are already L2-normalized, so they can be used directly.
- Do NOT perform column-wise (dimension-wise) normalization/standardization on text (or image) embeddings. CLIP/ConceptCLIP dimensions have no interpretable "per-column semantics"; arbitrarily changing column scales will alter the metric geometry and break the alignment and gscale/gbias calibration learned during training.

In [None]:
# Group by concept_class and perform Averaging and L2 normalization on the text_embedding vectors

df_grouped_by_concept_class = df_combos_by_concept_class.groupby('concept_class')['text_embedding'].agg(lambda x: np.mean(np.array(x).tolist(), axis=0)).reset_index()
df_grouped_by_concept_class['text_embedding_l2_normalized'] = df_grouped_by_concept_class['text_embedding'].apply(lambda x: x / np.linalg.norm(x, ord=2))
df_grouped_by_concept_class

Unnamed: 0,concept_class,text_embedding,text_embedding_l2_normalized
0,air_bronch_cav,"[-0.020725762, 0.029581975, 0.0074337153, 0.02...","[-0.02215803, 0.031626258, 0.007947427, 0.0310..."
1,benign_calc,"[0.005749, 0.032680143, 0.0075095426, 0.039118...","[0.006022272, 0.03423356, 0.007866501, 0.04097..."
2,eccentric_punctate_calc,"[0.010322199, 0.030712616, 0.0133252, 0.038021...","[0.010820801, 0.032196153, 0.01396886, 0.03985..."
3,lobulation,"[-0.004971062, 0.031714868, 0.012939545, 0.031...","[-0.005294568, 0.0337788, 0.013781623, 0.03309..."
4,part_solid,"[-0.0076178466, 0.029471604, 0.013019356, 0.02...","[-0.008074296, 0.031237494, 0.013799455, 0.026..."
5,pure_GGN,"[-0.008121113, 0.03070038, 0.011562627, 0.0184...","[-0.008544817, 0.03230211, 0.0121658845, 0.019..."
6,round_sphericity,"[-7.052375e-05, 0.027904151, 0.013606882, 0.03...","[-7.416261e-05, 0.029343942, 0.014308967, 0.03..."
7,solid,"[-0.0024543812, 0.028322702, 0.008468461, 0.03...","[-0.0025939518, 0.029933296, 0.008950028, 0.03..."
8,spiculation,"[-0.0051476713, 0.02755172, 0.011830095, 0.031...","[-0.0054463474, 0.029150315, 0.012516497, 0.03..."
9,very_subtle,"[0.00074864697, 0.019593306, 0.005656914, 0.02...","[0.0007953316, 0.02081512, 0.006009672, 0.0246..."


# 5 Compute Concept Scores for Images and Save Results

Using logit_scale and logit_bias

In [None]:
# Extract logit_scale and logit_bias
with h5py.File("./conceptclip_features.h5", "r") as f:
    logit_scale = f.attrs['logit_scale']
    logit_bias = f.attrs['logit_bias']
    concept_logit_scale = f.attrs['concept_logit_scale']
    concept_logit_bias = f.attrs['concept_logit_bias']

In [None]:
# Compute the score vector for each image against each concept_class and save results
import numpy as np
import h5py

# Prepare concept matrix (K x D) and names
concept_names = df_grouped_by_concept_class['concept_class'].tolist()
concept_mat = np.vstack(df_grouped_by_concept_class['text_embedding_l2_normalized'].values).astype(np.float32)
K, D = concept_mat.shape

# Prepare image matrix (N x D)
img_list = df_image_features['image_embedding'].values
# img_mat = np.vstack([np.asarray(v, dtype=np.float32) for v in img_list])
img_mat = np.vstack(img_list).astype(np.float32)
N = img_mat.shape[0]
assert img_mat.shape[1] == D, f"Dimension mismatch: image dim={img_mat.shape[1]}, concept dim={D}"

# Read scaling parameters (use existing variables if available)
ls = float(np.array(logit_scale).reshape(()))
lb = float(np.array(logit_bias).reshape(()))

# Dot product and rescaling
scores = img_mat @ concept_mat.T              # (N, K)
print("Raw scores shape:", scores.shape)
scores = ls * scores + lb                     # Linear transformation
print("Calibrated scores shape:", scores.shape)

Raw scores shape: (2532, 10)
Calibrated scores shape: (2532, 10)


In [None]:
# Populate back to df_image_features
df_image_features['concept_scores'] = pd.Series([row for row in scores], index=df_image_features.index, dtype='object')

# Also expand into separate columns for easier filtering and export
for i, name in enumerate(concept_names):
    df_image_features[f'score__{name}'] = scores[:, i]

# # Save to CSV
# df_image_features.to_csv("curated_metadata_with_scores.csv", index=False)
# print(f"Done. images={N}, concepts={K}, saved -> curated_metadata_with_scores.csv")

# df_image_features.head(3)[['concept_scores'] + [f'score__{n}' for n in concept_names[:3]]]
df_image_features.head(6)[['patient_id', 'pid_nid_combo', 'concept_scores']]

Unnamed: 0,patient_id,pid_nid_combo,concept_scores
0,LIDC-IDRI-0078,LIDC-IDRI-0078_0,"[-10.59276, -10.627075, -10.621136, -10.468430..."
1,LIDC-IDRI-0078,LIDC-IDRI-0078_0,"[-10.754835, -10.715984, -10.689967, -10.65090..."
2,LIDC-IDRI-0078,LIDC-IDRI-0078_0,"[-10.650417, -10.654758, -10.642848, -10.53187..."
3,LIDC-IDRI-0078,LIDC-IDRI-0078_0,"[-10.614265, -10.615544, -10.5928545, -10.4933..."
4,LIDC-IDRI-0078,LIDC-IDRI-0078_0,"[-10.685362, -10.711727, -10.679584, -10.65178..."
5,LIDC-IDRI-0078,LIDC-IDRI-0078_1,"[-10.623442, -10.691663, -10.64212, -10.534759..."


# 6 Nodule-Level Feature Modeling

Nodule-level features (aggregated dimension-wise from $\tilde S$, choose 2–3 statistics):

- $\textbf{mean}_j=\frac{1}{m}\sum_i \tilde s_{ij}$
- $\textbf{top1}_j$: Take the maximum value for that dimension

These features are fed into **Logistic Regression**.

In [18]:
nodule_features = df_image_features[['patient_id', 'pid_nid_combo', 'area_mm2', 'concept_scores', 'malignancy_label']]
nodule_features

Unnamed: 0,patient_id,pid_nid_combo,area_mm2,concept_scores,malignancy_label
0,LIDC-IDRI-0078,LIDC-IDRI-0078_0,156.325000,"[-10.59276, -10.627075, -10.621136, -10.468430...",1
1,LIDC-IDRI-0078,LIDC-IDRI-0078_0,184.210000,"[-10.754835, -10.715984, -10.689967, -10.65090...",1
2,LIDC-IDRI-0078,LIDC-IDRI-0078_0,191.392500,"[-10.650417, -10.654758, -10.642848, -10.53187...",1
3,LIDC-IDRI-0078,LIDC-IDRI-0078_0,147.875000,"[-10.614265, -10.615544, -10.5928545, -10.4933...",1
4,LIDC-IDRI-0078,LIDC-IDRI-0078_0,81.120000,"[-10.685362, -10.711727, -10.679584, -10.65178...",1
...,...,...,...,...,...
2527,LIDC-IDRI-0127,LIDC-IDRI-0127_0,904.275089,"[-10.621713, -10.600826, -10.6044235, -10.4398...",0
2528,LIDC-IDRI-0127,LIDC-IDRI-0127_0,974.722309,"[-10.754862, -10.748146, -10.742528, -10.59976...",0
2529,LIDC-IDRI-0127,LIDC-IDRI-0127_0,999.014454,"[-10.61699, -10.660582, -10.640759, -10.480125...",0
2530,LIDC-IDRI-0127,LIDC-IDRI-0127_0,949.822861,"[-10.571918, -10.612805, -10.596398, -10.43918...",0


In [None]:
# Aggregate concept_scores (10 dimensions) at the nodule (pid_nid_combo) level
import numpy as np
import pandas as pd
from typing import Literal, List

def _expand_score_columns(df: pd.DataFrame, vec_col: str, prefix: str, concept_names: List[str] | None = None) -> pd.DataFrame:
    """Expand a column containing vectors into multiple numeric columns, named f"{prefix}__{concept_name}" or f"{prefix}__c{i}".
    Returns a new DataFrame (does not modify the original)."""
    if df.empty:
        return df.copy()
    # Infer vector dimension
    sample_vec = next((v for v in df[vec_col].values if isinstance(v, (list, tuple, np.ndarray))), None)
    if sample_vec is None:
        return df.copy()
    K = int(np.asarray(sample_vec).shape[0])
    if not concept_names or len(concept_names) != K:
        concept_names = [f"c{i}" for i in range(K)]
    # Assemble expanded columns
    expanded = {}
    mat = np.vstack([np.asarray(v) for v in df[vec_col].values])
    for i, name in enumerate(concept_names):
        expanded[f"{prefix}__{name}"] = mat[:, i]
    out = df.copy()
    for k, v in expanded.items():
        out[k] = v
    return out


def aggregate_nodule_scores(
    df: pd.DataFrame,
    method: Literal["mean", "max"] = "mean",
    group_key: str = "pid_nid_combo",
    scores_col: str = "concept_scores",
    concept_names: List[str] | None = None,
) -> pd.DataFrame:
    """Aggregate concept_scores by nodule.
    - df[group_key]: Nodule ID
    - df[scores_col]: 10-dimensional score vector for each slice (array-like)
    - method: "mean" or "max"
    Returns: One row per nodule, containing the aggregated vector column and expanded columns.
    """
    if group_key not in df or scores_col not in df:
        raise KeyError(f"DataFrame missing required columns: {group_key} or {scores_col}")

    # Filter out invalid vectors
    def _valid_vec(x):
        try:
            arr = np.asarray(x)
            return arr.ndim == 1 and arr.size > 0 and np.all(np.isfinite(arr))
        except Exception:
            return False

    work = df[[group_key, scores_col]].copy()
    work = work[work[scores_col].apply(_valid_vec)]
    if work.empty:
        return pd.DataFrame(columns=[group_key, f"concept_scores_{method}"])

    # Aggregation function
    def _reduce(group):
        mat = np.vstack([np.asarray(v) for v in group[scores_col].values])
        if method == "mean":
            vec = mat.mean(axis=0)
        elif method == "max":
            vec = mat.max(axis=0)
        else:
            raise ValueError("method must be 'mean' or 'max'")
        return pd.Series({f"concept_scores_{method}": vec, "n_slices": mat.shape[0]})

    out = work.groupby(group_key, as_index=False).apply(_reduce, include_groups=False).reset_index(drop=True)

    # Expand into separate columns
    out = _expand_score_columns(out, vec_col=f"concept_scores_{method}", prefix=f"{method}", concept_names=concept_names)
    return out

# Execute aggregation
try:
    concept_names  # Use if exists
except NameError:
    # Fallback: infer from any vector length
    sample_vec = next((v for v in df_image_features['concept_scores'].values if isinstance(v, (list, tuple, np.ndarray))), None)
    K = int(np.asarray(sample_vec).shape[0]) if sample_vec is not None else 10
    concept_names = [f"c{i}" for i in range(K)]


df_nodule_mean = aggregate_nodule_scores(df_image_features, method="mean", concept_names=concept_names)
df_nodule_max  = aggregate_nodule_scores(df_image_features, method="max",  concept_names=concept_names)

print("Aggregated per-nodule shapes:", len(df_nodule_mean), len(df_nodule_max))

Aggregated per-nodule shapes: 678 678


In [None]:
# Add other labels (malignancy_label) to df_nodule_mean and df_nodule_max
# malignancy_label is already in df_image_features
df_nodule_mean = df_nodule_mean.merge(
    df_image_features[['pid_nid_combo', 'malignancy_label']].drop_duplicates(),
    on='pid_nid_combo', how='left'
)

df_nodule_max = df_nodule_max.merge(
    df_image_features[['pid_nid_combo', 'malignancy_label']].drop_duplicates(),
    on='pid_nid_combo', how='left'
)

df_nodule_mean.head(3)
# df_nodule_max.head(3)

Unnamed: 0,pid_nid_combo,concept_scores_mean,n_slices,mean__air_bronch_cav,mean__benign_calc,mean__eccentric_punctate_calc,mean__lobulation,mean__part_solid,mean__pure_GGN,mean__round_sphericity,mean__solid,mean__spiculation,mean__very_subtle,malignancy_label
0,LIDC-IDRI-0001_0,"[-10.647757, -10.752212, -10.748182, -10.55358...",5,-10.647757,-10.752212,-10.748182,-10.553584,-10.613453,-10.629376,-10.591333,-10.562655,-10.528476,-10.652799,1
1,LIDC-IDRI-0002_0,"[-10.69259, -10.773565, -10.718737, -10.647285...",5,-10.69259,-10.773565,-10.718737,-10.647285,-10.537394,-10.562625,-10.664721,-10.663836,-10.567609,-10.612231,1
2,LIDC-IDRI-0003_0,"[-10.837697, -10.87017, -10.841608, -10.765376...",5,-10.837697,-10.87017,-10.841608,-10.765376,-10.736403,-10.768175,-10.797726,-10.780489,-10.701387,-10.780124,0


In [None]:
# Merge mean and max results into a single DataFrame
df_nodule_max_to_merge = df_nodule_max.drop(columns=['concept_scores_max', 'n_slices', 'malignancy_label'])
df_nodule_features = df_nodule_mean.merge(df_nodule_max_to_merge, on="pid_nid_combo", how="left")
df_nodule_features = df_nodule_features.drop(columns=['concept_scores_mean'])
df_nodule_features.head(3)

# Merge area_mm2 column from df_image_features to df_nodule_features, taking the max area_mm2 value for each pid_nid_combo
df_area = df_image_features.groupby('pid_nid_combo', as_index=False)['area_mm2'].max()
df_nodule_features = df_nodule_features.merge(df_area, on='pid_nid_combo', how='left')
df_nodule_features.head(3)

Unnamed: 0,pid_nid_combo,n_slices,mean__air_bronch_cav,mean__benign_calc,mean__eccentric_punctate_calc,mean__lobulation,mean__part_solid,mean__pure_GGN,mean__round_sphericity,mean__solid,...,max__benign_calc,max__eccentric_punctate_calc,max__lobulation,max__part_solid,max__pure_GGN,max__round_sphericity,max__solid,max__spiculation,max__very_subtle,area_mm2
0,LIDC-IDRI-0001_0,5,-10.647757,-10.752212,-10.748182,-10.553584,-10.613453,-10.629376,-10.591333,-10.562655,...,-10.652637,-10.658212,-10.498566,-10.58617,-10.598174,-10.48806,-10.475222,-10.512434,-10.604401,456.811523
1,LIDC-IDRI-0002_0,5,-10.69259,-10.773565,-10.718737,-10.647285,-10.537394,-10.562625,-10.664721,-10.663836,...,-10.701831,-10.666624,-10.563818,-10.461214,-10.50011,-10.569547,-10.574261,-10.496264,-10.563615,414.453932
2,LIDC-IDRI-0003_0,5,-10.837697,-10.87017,-10.841608,-10.765376,-10.736403,-10.768175,-10.797726,-10.780489,...,-10.708755,-10.692435,-10.640465,-10.599843,-10.635,-10.64204,-10.634002,-10.602088,-10.630994,423.261508


# 7 Machine Learning Modeling (Logistic Regression)

In [None]:
# Use logistic regression for a simple classification task and evaluate performance with K-fold cross-validation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
# Prepare data
X_cols = [f"mean__{name}" for name in concept_names]
y_col = "malignancy_label"

X = df_nodule_mean[X_cols]
y = df_nodule_mean[y_col]
# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Logistic regression model
model = LogisticRegression(random_state=42)
# 5-fold cross-validation, evaluate AUC and accuracy
scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
print("5-Fold CV AUC scores:", scores)
print("Mean AUC:", scores.mean())
accuracy_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("5-Fold CV Accuracy scores:", accuracy_scores)
print("Mean Accuracy:", accuracy_scores.mean())

5-Fold CV AUC scores: [0.74732143 0.68482143 0.74006218 0.72197107 0.80944846]
Mean AUC: 0.740724913549697
5-Fold CV Accuracy scores: [0.64705882 0.61029412 0.67647059 0.65185185 0.72592593]
Mean Accuracy: 0.6623202614379085


In [None]:
# For df_nodule_max

# Prepare data
X_cols = [f"max__{name}" for name in concept_names]
y_col = "malignancy_label"

X = df_nodule_max[X_cols]
y = df_nodule_max[y_col]
# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Logistic regression model
model = LogisticRegression(random_state=42)
# 5-fold cross-validation, evaluate AUC and accuracy
scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
print("5-Fold CV AUC scores:", scores)
print("Mean AUC:", scores.mean())
accuracy_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("5-Fold CV Accuracy scores:", accuracy_scores)
print("Mean Accuracy:", accuracy_scores.mean())

5-Fold CV AUC scores: [0.78080357 0.71004464 0.73895181 0.74480108 0.79520796]
Mean AUC: 0.7539618131563085
5-Fold CV Accuracy scores: [0.68382353 0.63970588 0.65441176 0.67407407 0.67407407]
Mean Accuracy: 0.6652178649237472


In [None]:
# Consider features from both mean and max aggregation

# For df_nodule_features
# Prepare data
X_cols = [col for col in df_nodule_features.columns if col not in ['pid_nid_combo', 'malignancy_label', 'n_slices', 'area_mm2']]
y_col = 'malignancy_label'
X = df_nodule_features[X_cols]
y = df_nodule_features[y_col]
# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Logistic regression model
model = LogisticRegression(random_state=42)
# 5-fold cross-validation, evaluate AUC and accuracy
scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
print("5-Fold CV AUC scores:", scores)
print("Mean AUC:", scores.mean())
accuracy_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("5-Fold CV Accuracy scores:", accuracy_scores)
print("Mean Accuracy:", accuracy_scores.mean())

5-Fold CV AUC scores: [0.76450893 0.72232143 0.76848767 0.73191682 0.82097649]
Mean AUC: 0.7616422682497384
5-Fold CV Accuracy scores: [0.68382353 0.67647059 0.70588235 0.67407407 0.73333333]
Mean Accuracy: 0.6947167755991286
