In [1]:
CONCEPTS_dict = {
    # ==========================================
    # Group 1: Density & Composition
    # ==========================================
    
    # 1. Solid
    "solid": [
        "solid", "soft-tissue attenuation", "solid attenuation", 
        "dense solid structure", "high density nodule"
    ],

    # 2. Part-solid
    "part_solid": [
        "part-solid", "subsolid", "mixed ground-glass density",
        "a solid focus within ground-glass", "heterogeneous attenuation"
    ],

    # 3. Pure GGN
    "pure_GGN": [
        "pure ground-glass", "non-solid", "pure GGO",
        "hazy ground-glass opacity", "non-solid attenuation"
    ],

    # 4. Fat Containing
    "fat_containing": [
        "fat density", "fatty attenuation", "lipid-rich",
        "low density fat component", "containing macroscopic fat"
    ],

    # 5. Halo Sign
    # Significance: Ground-glass halo surrounding the nodule (hemorrhage or inflammation).
    "halo_sign": [
        "halo sign", "surrounded by ground-glass halo", "perilesional halo",
        "nodule with a ground-glass rim"
    ],

    # 6. Reversed Halo / Atoll Sign - [NEW]
    # Significance: Central ground-glass with a dense rim. Common in Cryptogenic Organizing Pneumonia (COP) or fungal infection.
    "reversed_halo": [
        "reversed halo sign", "atoll sign", "central ground-glass with dense rim",
        "ring-shaped opacity", "organizing pneumonia pattern"
    ],

    # ==========================================
    # Group 2: Margins & Borders
    # ==========================================

    # 7. Spiculation
    "spiculation": [
        "spiculated", "radial spicules", "sunburst appearance",
        "spiculated margins", "spikes extending outward"
    ],

    # 8. Lobulation
    "lobulation": [
        "lobulated", "lobulation", "scalloped contour",
        "deeply lobulated", "wavy border"
    ],

    # 9. Smooth/Well-defined Margin
    "smooth_margin": [
        "smooth margin", "well-circumscribed", "sharp borders",
        "well-defined edges", "clear boundary"
    ],

    # 10. Ill-defined/Fuzzy
    "ill_defined": [
        "ill-defined", "fuzzy margins", "indistinct borders",
        "blurred edges", "poorly defined"
    ],

    # ==========================================
    # Group 3: Shape & Geometry
    # ==========================================

    # 11. Round/Spherical
    "round_shape": [
        "round", "spherical", "circular shape",
        "perfectly round", "ball-like"
    ],

    # 12. Polygonal/Irregular
    "polygonal_shape": [
        "polygonal shape", "angular shape", "geometric shape",
        "flat edges", "irregular non-spherical shape"
    ],

    # 13. Elongated/Elliptical
    "elongated": [
        "elongated", "elliptical", "oval shape",
        "sausage-like appearance", "fusiform"
    ],

    # 14. Branching/Tubular - [NEW]
    # Significance: Suggests mucous plugs, vascular shadows, or bronchiectasis rather than true tumor nodules.
    "branching_tubular": [
        "branching shape", "tubular opacity", "finger-in-glove sign",
        "Y-shaped opacity", "mucus plugging"
    ],

    # ==========================================
    # Group 4: Internal Texture
    # ==========================================

    # 15. Air Bronchogram - [Patent]
    "air_bronchogram": [
        "air bronchogram", "tubular air lucency", "patent bronchus within nodule",
        "air-filled bronchus"
    ],
    
    # 16. Bronchial Cut-off - [NEW] [Obstructed]
    # Significance: Abrupt interruption of the bronchus entering the nodule; highly suggestive of malignant obstruction.
    "bronchial_cutoff": [
        "bronchial cut-off", "abrupt bronchus termination", "obstructed bronchus",
        "blocked airway", "amputated bronchus sign"
    ],

    # 17. Bubble Lucency
    "bubble_lucency": [
        "bubble-like lucency", "vacuole sign", "pseudocavitation",
        "small internal air pockets", "cystic airspaces"
    ],

    # 18. Thick-walled/Large Cavity
    "cavity": [
        "cavitary", "thick-walled cavity", "central cavitation",
        "large central air space", "necrotic cavity"
    ],

    # 19. Typical Benign Calcification
    "benign_calc": [
        "diffuse calcification", "central calcification", "popcorn calcification",
        "laminar calcification", "calcified granuloma pattern"
    ],

    # 20. Eccentric/Punctate (Suspicious) Calcification
    "suspicious_calc": [
        "eccentric calcification", "punctate calcification", "stippled calcification",
        "amorphous calcification", "dystrophic calcification"
    ],

    # 21. Internal Heterogeneity - [NEW]
    # Significance: Uneven density; suggests internal necrosis, hemorrhage, or uneven growth activity in the tumor.
    "heterogeneous_texture": [
        "heterogeneous attenuation", "mottled texture", "complex internal echogenicity",
        "coarse texture", "non-uniform density"
    ],
    
    # 22. Internal Homogeneity - [NEW]
    # Significance: Very uniform density; common in benign lesions or lymphoma.
    "homogeneous_texture": [
        "homogeneous density", "uniform attenuation", "smooth internal texture",
        "consistent density", "monotonous appearance"
    ],
    
    # 23. Crazy Paving - [NEW]
    # Significance: Ground-glass background + interlobular septal thickening. Common in mucinous adenocarcinoma or alveolar proteinosis.
    "crazy_paving": [
        "crazy paving pattern", "ground-glass with septal thickening", 
        "reticular pattern over GGO", "geographic mosaic pattern"
    ],

    # ==========================================
    # Group 5: Spatial Location - [NEW CATEGORY]
    # ==========================================
    
    # 24. Perifissural - [NEW]
    # Significance: Typical benign feature, mostly intrapulmonary lymph nodes (PFN).
    "perifissural": [
        "perifissural nodule", "attached to fissure", "along the interlobar fissure",
        "fissural-based", "triangle shape on fissure"
    ],
    
    # 25. Subpleural/Juxtapleural - [NEW]
    # Significance: Located at the peripheral edge of the lung.
    "subpleural": [
        "subpleural location", "abutting the pleura", "juxtapleural",
        "peripheral lung nodule", "pleural-based"
    ],
    
    # 26. Centrilobular - [NEW]
    # Significance: Located in the center of the secondary pulmonary lobule, not connected to the pleura; often suggests airway-derived pathologies (inflammation/TB).
    "centrilobular": [
        "centrilobular", "center of secondary pulmonary lobule", 
        "spaced away from pleura", "rosette pattern"
    ],

    # ==========================================
    # Group 6: Context & Associated Signs
    # ==========================================

    # 27. Pleural Attachment/Tag
    "pleural_attachment": [
        "pleural tag", "pleural tail", "indenting the pleura",
        "attached to pleura", "pleural retraction"
    ],

    # 28. Vascular Convergence
    "vascular_convergence": [
        "vascular convergence", "vessel convergence", "vessels drawn into nodule",
        "vascular notch", "supplying vessels"
    ],
    
    # 29. Satellite Nodules - [NEW]
    # Significance: Small nodules appearing around the main lesion; can be benign (TB) or malignant (metastasis), but highly distinctive.
    "satellite_nodules": [
        "satellite nodules", "surrounding small nodules", "galaxy sign",
        "cluster of small nodules", "daughter lesions"
    ],

    # 30. Very Subtle/Low Contrast
    "very_subtle": [
        "very subtle", "faint appearance", "low-contrast",
        "barely visible", "ghost-like opacity"
    ]
}

In [2]:
# 1. Flatten 30 concept synonyms into a single list
concept_list = []
for k, synonyms in CONCEPTS_dict.items():
    for s in synonyms:
        if s not in concept_list:
            concept_list.append(s)
print('Total concepts (deduplicated synonyms):', len(concept_list))
concept_list[:10]

Total concepts (deduplicated synonyms): 145


['solid',
 'soft-tissue attenuation',
 'solid attenuation',
 'dense solid structure',
 'high density nodule',
 'part-solid',
 'subsolid',
 'mixed ground-glass density',
 'a solid focus within ground-glass',
 'heterogeneous attenuation']

In [3]:
# 2. PROMPT_TEMPLATES (consistent with 20-concept version)
PROMPT_TEMPLATES = [
    "chest CT showing a pulmonary nodule: {}.",
    "axial chest CT of a pulmonary nodule, {}.",
    "lung window CT depicting a pulmonary nodule, {}.",
    "chest CT image of a pulmonary nodule showing {}.",
    "axial lung CT slice demonstrating {}.",
    "a pulmonary nodule characterized by {}.",
    "chest CT where the pulmonary nodule is {}.",
    "chest CT showing a pulmonary nodule with features of {}.",
    "pulmonary nodule, {}.",
    "CT scan, lung nodule, {}."
]
print('Number of templates:', len(PROMPT_TEMPLATES))

Number of templates: 10


In [4]:
# 3. Generate full concept prompt list for each template
template_texts_map = {}
for idx, template in enumerate(PROMPT_TEMPLATES, start=1):
    texts = [template.format(c) for c in concept_list]
    tid = f"concept30_prompts_t{idx:02d}"
    template_texts_map[tid] = texts
print('Template keys sample:', list(template_texts_map.keys())[:3])
print('Prompts per template (should equal concept count):', len(next(iter(template_texts_map.values()))))

Template keys sample: ['concept30_prompts_t01', 'concept30_prompts_t02', 'concept30_prompts_t03']
Prompts per template (should equal concept count): 145


In [None]:
# 4. Load ConceptCLIP model
from transformers import AutoModel, AutoProcessor
import torch, os, h5py, json, time, numpy as np, pandas as pd
MODEL_NAME = 'JerrryNie/ConceptCLIP'
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Optional: Use HF_TOKEN environment variable to login to private repository
# token = os.environ.get('HF_TOKEN', None)
token="YOUR_HF_TOKEN"
if token:
    try:
        from huggingface_hub import login
        login(token=token)
    except Exception as e:
        print('HF login skipped:', e)
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True).to(device).eval()
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
processor.image_processor.do_rescale = False
processor.image_processor.do_normalize = True
print('Loaded ConceptCLIP on', device)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\chenk\.cache\huggingface\token
Login successful


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Loaded ConceptCLIP on cuda:0


In [None]:
# 5. H5 write utility for text features (text only)
TEXT_H5_PATH = 'conceptclip_features_30.h5'
DTYPE_TEXT = 'float32'

def _hash_texts(texts):
    import hashlib
    h = hashlib.sha256()
    for t in texts:
        h.update(t.encode('utf-8')); h.update(b'\0')
    return h.hexdigest()

def init_text_file(path):
    if not os.path.exists(path):
        with h5py.File(path, 'w') as f:
            f.create_group('templates')
            f.attrs['created_at'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
            f.attrs['version'] = 'text-only-1.0'
    return h5py.File(path, 'a')

def write_template_text_only(f, template_id, texts, text_features):
    g_root = f['templates']
    if template_id in g_root:
        del g_root[template_id]
    g = g_root.create_group(template_id)
    arr = text_features.detach().cpu().numpy().astype(DTYPE_TEXT)
    g.create_dataset('text_features', data=arr, compression='lzf')
    dt_str = h5py.string_dtype(encoding='utf-8')
    ds_txt = g.create_dataset('texts', shape=(len(texts),), dtype=dt_str, compression='lzf')
    ds_txt[:] = np.array(texts, dtype=object)
    g.attrs['K'] = arr.shape[0]
    g.attrs['D'] = arr.shape[1]
    g.attrs['texts_hash'] = _hash_texts(texts)
    g.attrs['created_at'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    f.flush()

In [None]:
# 6. Encode and save all template text features
stored_templates = []
with torch.no_grad():
    with init_text_file(TEXT_H5_PATH) as f:
        f.attrs['concept_texts'] = json.dumps(concept_list, ensure_ascii=False)
        f.attrs['prompt_temp_for_concepts'] = json.dumps(PROMPT_TEMPLATES, ensure_ascii=False)
        for key in ['logit_scale','logit_bias','concept_logit_scale','concept_logit_bias']:
            if hasattr(model, key):
                val = getattr(model, key)
                try:
                    scalar = float(val.item()) if torch.is_tensor(val) else float(val)
                except Exception:
                    scalar = None
                if scalar is not None:
                    f.attrs[key] = scalar
        for tid, texts in template_texts_map.items():
            inputs = processor(text=texts, return_tensors='pt', padding=True, truncation=True).to(device)
            text_cls, text_tokens = model.encode_text(inputs['input_ids'], normalize=True)
            write_template_text_only(f, tid, texts, text_cls)
            stored_templates.append((tid, text_cls.shape))
print('Stored templates count:', len(stored_templates))
stored_templates[:3]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Stored templates count: 10


[('concept30_prompts_t01', torch.Size([145, 1152])),
 ('concept30_prompts_t02', torch.Size([145, 1152])),
 ('concept30_prompts_t03', torch.Size([145, 1152]))]

In [None]:
# 7. Validate H5 file structure
def validate_text_h5(path=TEXT_H5_PATH, max_templates=2):
    if not os.path.exists(path):
        print('File not found:', path); return
    with h5py.File(path, 'r') as f:
        tids = list(f['templates'].keys())
        print('Templates stored:', len(tids))
        clist = json.loads(f.attrs.get('concept_texts','[]'))
        print('Concept count:', len(clist))
        for tid in tids[:max_templates]:
            g = f['templates'][tid]
            print(f' - {tid}: text_features shape={g["text_features"].shape}')
            print('   sample:', g['texts'][:2])
validate_text_h5()

Templates stored: 10
Concept count: 145
 - concept30_prompts_t01: text_features shape=(145, 1152)
   sample: [b'chest CT showing a pulmonary nodule: solid.'
 b'chest CT showing a pulmonary nodule: soft-tissue attenuation.']
 - concept30_prompts_t02: text_features shape=(145, 1152)
   sample: [b'axial chest CT of a pulmonary nodule, solid.'
 b'axial chest CT of a pulmonary nodule, soft-tissue attenuation.']


In [None]:
# 8. Load image features from CSV
df_image_features = pd.read_csv('./image_features/df_image_features.csv')
print('Image features shape:', df_image_features.shape)
df_image_features.head(2)

Image features shape: (2532, 1159)


Unnamed: 0,patient_id,pid_nid_combo,img_path_curated,area_mm2,malignancy_label,img_shape_H,img_shape_W,0,1,2,...,1142,1143,1144,1145,1146,1147,1148,1149,1150,1151
0,LIDC-IDRI-0078,LIDC-IDRI-0078_0,./datasets/curation2/lidc_patches_all\LIDC-IDR...,156.325,1,44,54,-0.01431,0.04358,0.01958,...,0.01117,-0.01567,0.004425,0.001112,0.001486,-0.01164,-0.012375,0.005116,0.00555,-0.000785
1,LIDC-IDRI-0078,LIDC-IDRI-0078_0,./datasets/curation2/lidc_patches_all\LIDC-IDR...,184.21,1,44,54,-0.00985,0.04022,0.01633,...,0.02235,0.000509,-0.003216,-0.007042,-0.00609,-0.00836,-0.00344,0.01619,0.007324,-0.011856


In [None]:
# 9. Build concept -> [T,D] text embedding mapping and normalize
concept_embeddings_map = {}
with h5py.File(TEXT_H5_PATH, 'r') as f:
    clist = json.loads(f.attrs.get('concept_texts','[]'))
    tids = sorted(list(f['templates'].keys()))
    collector = {c: [] for c in clist}
    for tid in tids:
        g = f['templates'][tid]
        feats = g['text_features'][:]  # [K,D]
        for i, c in enumerate(clist):
            collector[c].append(feats[i])
    for c, lst in collector.items():
        arr = np.stack(lst, axis=0)  # [T,D]
        arr_norm = np.linalg.norm(arr, axis=1, keepdims=True)
        arr_norm[arr_norm==0] = 1.0
        concept_embeddings_map[c] = arr / arr_norm
print('Loaded concepts for scoring:', len(concept_embeddings_map))
print('Per concept template variants:', concept_embeddings_map[next(iter(concept_embeddings_map))].shape[0])

Loaded concepts for scoring: 145
Per concept template variants: 10


In [None]:
import warnings
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

# 10. Compute image-level mean/max prompt scores (dot product similarity)
exclude_cols = {'patient_id','pid_nid_combo','img_path_curated','area_mm2','malignancy_label','img_shape_H','img_shape_W','split','nodule_index','nodule_index_in_patient'}
num_cols = [c for c in df_image_features.columns if c not in exclude_cols and pd.api.types.is_numeric_dtype(df_image_features[c])]
if len(num_cols) == 0:
    raise ValueError('No image embedding columns found')
emb_matrix = df_image_features[num_cols].to_numpy(dtype='float32')
row_norm = np.linalg.norm(emb_matrix, axis=1, keepdims=True)
row_norm[row_norm==0] = 1.0
emb_matrix = emb_matrix / row_norm
D_img = emb_matrix.shape[1]
D_txt = next(iter(concept_embeddings_map.values())).shape[1]
if D_img != D_txt:
    raise ValueError(f'D mismatch: image {D_img} vs text {D_txt}')
concept_names = list(concept_embeddings_map.keys())
img_mean = np.zeros((emb_matrix.shape[0], len(concept_names)), dtype='float32')
img_max  = np.zeros((emb_matrix.shape[0], len(concept_names)), dtype='float32')
for ci, cname in enumerate(concept_names):
    c_emb = concept_embeddings_map[cname]  # [T,D]
    scores = emb_matrix @ c_emb.T          # [N,T]
    img_mean[:, ci] = scores.mean(axis=1)
    img_max[:, ci]  = scores.max(axis=1)
image_scores_df = pd.DataFrame({
    **({'pid_nid_combo': df_image_features['pid_nid_combo']} if 'pid_nid_combo' in df_image_features.columns else {}),
    **({'patient_id': df_image_features['patient_id']} if 'patient_id' in df_image_features.columns else {}),
    **({'malignancy_label': df_image_features['malignancy_label']} if 'malignancy_label' in df_image_features.columns else {}),
})
for ci, cname in enumerate(concept_names):
    image_scores_df[f'{cname}_mean_prompt'] = img_mean[:, ci]
    image_scores_df[f'{cname}_max_prompt']  = img_max[:, ci]
image_scores_df.head(3)

Unnamed: 0,pid_nid_combo,patient_id,malignancy_label,solid_mean_prompt,solid_max_prompt,soft-tissue attenuation_mean_prompt,soft-tissue attenuation_max_prompt,solid attenuation_mean_prompt,solid attenuation_max_prompt,dense solid structure_mean_prompt,...,very subtle_mean_prompt,very subtle_max_prompt,faint appearance_mean_prompt,faint appearance_max_prompt,low-contrast_mean_prompt,low-contrast_max_prompt,barely visible_mean_prompt,barely visible_max_prompt,ghost-like opacity_mean_prompt,ghost-like opacity_max_prompt
0,LIDC-IDRI-0078_0,LIDC-IDRI-0078,1,0.134531,0.185346,0.125419,0.184737,0.147826,0.206102,0.128972,...,0.104277,0.175782,0.111158,0.175536,0.142865,0.203103,0.081449,0.167393,0.13483,0.191516
1,LIDC-IDRI-0078_0,LIDC-IDRI-0078,1,0.102361,0.128306,0.102469,0.147085,0.118833,0.161967,0.09503,...,0.094658,0.146197,0.084143,0.125878,0.114378,0.156437,0.066458,0.125159,0.096404,0.12923
2,LIDC-IDRI-0078_0,LIDC-IDRI-0078,1,0.125261,0.161946,0.123365,0.174834,0.139258,0.188709,0.122283,...,0.099665,0.157756,0.092139,0.143806,0.131508,0.177364,0.072622,0.141979,0.121691,0.165448


In [None]:
# 11. Nodule-level aggregation (mean / max over images)
if 'pid_nid_combo' in image_scores_df.columns:
    group_key = 'pid_nid_combo'
else:
    group_key = 'patient_id' if 'patient_id' in image_scores_df.columns else None
if group_key is None:
    raise ValueError('Missing nodule aggregation key')
agg_rows = []
for pid_nid, sub in image_scores_df.groupby(group_key):
    rec = {group_key: pid_nid}
    if 'malignancy_label' in sub.columns:
        vals = sub['malignancy_label'].dropna().values
        rec['malignancy_label'] = int(np.round(vals.mean())) if vals.size else -1
    for cname in concept_names:
        m_scores = sub[f'{cname}_mean_prompt'].values
        x_scores = sub[f'{cname}_max_prompt'].values
        rec[f'{cname}_mean_over_images'] = float(m_scores.mean())
        rec[f'{cname}_max_over_images']  = float(x_scores.max())
    agg_rows.append(rec)
df_nodule_features = pd.DataFrame(agg_rows)
print('df_nodule_features shape:', df_nodule_features.shape)
df_nodule_features.head(3)

df_nodule_features shape: (678, 292)


Unnamed: 0,pid_nid_combo,malignancy_label,solid_mean_over_images,solid_max_over_images,soft-tissue attenuation_mean_over_images,soft-tissue attenuation_max_over_images,solid attenuation_mean_over_images,solid attenuation_max_over_images,dense solid structure_mean_over_images,dense solid structure_max_over_images,...,very subtle_mean_over_images,very subtle_max_over_images,faint appearance_mean_over_images,faint appearance_max_over_images,low-contrast_mean_over_images,low-contrast_max_over_images,barely visible_mean_over_images,barely visible_max_over_images,ghost-like opacity_mean_over_images,ghost-like opacity_max_over_images
0,LIDC-IDRI-0001_0,1,0.130282,0.190296,0.110652,0.177217,0.130308,0.19249,0.12462,0.171977,...,0.067671,0.150291,0.076787,0.145256,0.119016,0.181759,0.050313,0.136802,0.113962,0.162114
1,LIDC-IDRI-0002_0,1,0.105857,0.174993,0.100741,0.156,0.118502,0.188378,0.086971,0.148155,...,0.104785,0.172811,0.110235,0.160199,0.114909,0.181646,0.088935,0.152967,0.09929,0.171082
2,LIDC-IDRI-0003_0,0,0.084831,0.146845,0.064679,0.139538,0.089721,0.152957,0.076247,0.13286,...,0.061748,0.144194,0.065547,0.136548,0.083606,0.166524,0.055905,0.142804,0.086553,0.156031


In [None]:
# 12. Save CSV
# df_nodule_features.to_csv('df_nodule_features_concept30.csv', index=False)
# print('Saved df_nodule_features_concept30.csv')

In [None]:
# 13. Logistic Regression (mean features)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
mean_cols = [c for c in df_nodule_features.columns if c.endswith('_mean_over_images')]
X_mean = df_nodule_features[mean_cols]
y = df_nodule_features['malignancy_label']
scaler_mean = StandardScaler(); X_mean_scaled = scaler_mean.fit_transform(X_mean)
logreg = LogisticRegression(max_iter=1000, random_state=42)
auc_mean = cross_val_score(logreg, X_mean_scaled, y, cv=5, scoring='roc_auc')
acc_mean = cross_val_score(logreg, X_mean_scaled, y, cv=5, scoring='accuracy')
print('Mean features AUC:', auc_mean, 'Mean AUC:', auc_mean.mean())
print('Mean features ACC:', acc_mean, 'Mean ACC:', acc_mean.mean())

Mean features AUC: [0.81941964 0.73616071 0.7493893  0.69439421 0.85488246] Mean AUC: 0.7708492651724248
Mean features ACC: [0.72794118 0.66911765 0.66176471 0.64444444 0.77777778] Mean ACC: 0.6962091503267973


In [None]:
# 14. Logistic Regression (max features)
max_cols = [c for c in df_nodule_features.columns if c.endswith('_max_over_images')]
X_max = df_nodule_features[max_cols]
scaler_max = StandardScaler(); X_max_scaled = scaler_max.fit_transform(X_max)
logreg2 = LogisticRegression(max_iter=1000, random_state=42)
auc_max = cross_val_score(logreg2, X_max_scaled, y, cv=5, scoring='roc_auc')
acc_max = cross_val_score(logreg2, X_max_scaled, y, cv=5, scoring='accuracy')
print('Max features AUC:', auc_max, 'Mean AUC:', auc_max.mean())
print('Max features ACC:', acc_max, 'Mean ACC:', acc_max.mean())

Max features AUC: [0.80044643 0.69955357 0.77692649 0.71292948 0.79566004] Mean AUC: 0.7571032010405762
Max features ACC: [0.72058824 0.65441176 0.71323529 0.67407407 0.71851852] Mean ACC: 0.696165577342048


In [None]:
# 15. Logistic Regression (mean+max combined features)
combo_cols = mean_cols + max_cols
X_combo = df_nodule_features[combo_cols]
scaler_combo = StandardScaler(); X_combo_scaled = scaler_combo.fit_transform(X_combo)
logreg3 = LogisticRegression(max_iter=1000, random_state=42)
auc_combo = cross_val_score(logreg3, X_combo_scaled, y, cv=5, scoring='roc_auc')
acc_combo = cross_val_score(logreg3, X_combo_scaled, y, cv=5, scoring='accuracy')
print('Combo features AUC:', auc_combo, 'Mean AUC:', auc_combo.mean())
print('Combo features ACC:', acc_combo, 'Mean ACC:', acc_combo.mean())

Combo features AUC: [0.80625    0.73147321 0.77759272 0.73711573 0.82210669] Mean AUC: 0.7749076706798641
Combo features ACC: [0.77205882 0.625      0.72794118 0.71111111 0.72592593] Mean ACC: 0.7124074074074074
