## Bad Performance on DSC

In [1]:
import os
import pandas as pd

# Path to the root folder (adjust as needed)
models_root = 'final_14dec2024_results/jhh_results - table 2 in paper'

bad_performance_threshold = 0.3

# Dictionary to store results
low_performance_dict = {}

# Loop through each model folder
for model_name in os.listdir(models_root):
    model_path = os.path.join(models_root, model_name)
    dsc_path = os.path.join(model_path, 'dsc.csv')

    if os.path.isfile(dsc_path):
        dsc_df = pd.read_csv(dsc_path)

        # Initialize dictionary for this model
        model_results = {}

        for organ in dsc_df.columns[1:]:  # Skip 'name' column
            if organ in dsc_df:
                low_perf_ids = dsc_df[dsc_df[organ] < bad_performance_threshold]['name'].tolist()
                model_results[organ] = low_perf_ids

        low_performance_dict[model_name] = model_results

In [17]:
organs = dsc_df.columns[1:]

#### Certain models are terrible at certain organs

In [2]:
# Now count how many low-performance cases there are per model per organ
counts_dict = {
    model: {organ: len(ct_ids) for organ, ct_ids in organs.items()}
    for model, organs in low_performance_dict.items()
}

# Convert to DataFrame for visualization
counts_df = pd.DataFrame.from_dict(counts_dict, orient='index').fillna(0).astype(int)

# Display or save the DataFrame
counts_df
counts_df.style.background_gradient(cmap='Blues')

Unnamed: 0,aorta,gall_bladder,kidney_left,kidney_right,liver,pancreas,postcava,spleen,stomach
UNETR,139,235,44,40,11,147,76,37,16
U-Net_CLIP,32,60,25,18,7,43,12,14,7
Diff-UNet,15,98,30,28,12,65,15,26,12
LHU-Net,14,89,30,26,10,64,17,20,10
NexToU,17,146,38,35,15,93,16,33,18
Swin_UNETR_CLIP,25,95,26,24,9,46,19,17,13
STU-Net H,12,51,27,25,10,60,11,17,9
nnU-Net_ResEncL,13,65,26,25,11,63,12,22,10
SegVol,20,188,27,26,11,84,18,24,12
UNEST,137,278,35,37,11,85,51,21,16


In [3]:
low_performance_dict.keys()

dict_keys(['UNETR', 'U-Net_CLIP', 'Diff-UNet', 'LHU-Net', 'NexToU', 'Swin_UNETR_CLIP', 'STU-Net H', 'nnU-Net_ResEncL', 'SegVol', 'UNEST', 'MedNeXt', 'nnU-Net U-Net', 'MedFormer', 'Swin_UNETR', 'STU-Net B', 'STU-Net L', 'UCTransNet', 'UniSeg', 'SAM-Adapter'])

In [4]:
len(list(low_performance_dict.keys()))

19

In [5]:
import pandas as pd

# Extract a dictionary of sets for LIVER only:
#   model_bad_cts[model_name] = {all CTs flagged by this model for liver}
model_bad_cts = {}
for model_name, organs_dict in low_performance_dict.items():
    # If "liver" is one of the organs in the dictionary:
    if "liver" in organs_dict:
        # Convert the list of "bad" CTs to a set for easier union/counting
        model_bad_cts[model_name] = set(organs_dict["liver"])
    else:
        # Or you can store an empty set if "liver" is not in the dictionary
        model_bad_cts[model_name] = set()

# 1) Create a union of all CTs flagged by at least one model
all_bad_cts = set()
for ct_set in model_bad_cts.values():
    all_bad_cts |= ct_set  # union

# 2) Count how many models flagged each CT
ct_flag_count = {}
for ct_id in all_bad_cts:
    count = sum(ct_id in ct_set for ct_set in model_bad_cts.values())
    ct_flag_count[ct_id] = count

# 3) Put it into a DataFrame and sort by number of flags descending
liver_union_df = pd.DataFrame.from_dict(
    ct_flag_count, orient='index', columns=['num_models_flagged']
).sort_values(by='num_models_flagged', ascending=False)

# 4) (Optional) Style the DataFrame with a background gradient
liver_union_df_styled = liver_union_df.style.background_gradient(cmap='Blues')

# Finally, display or save the styled DataFrame
liver_union_df_styled

Unnamed: 0,num_models_flagged
BDMAP_A0000761,19
BDMAP_A0000637,19
BDMAP_A0002501,19
BDMAP_A0000778,19
BDMAP_V0001722,18
BDMAP_V0000726,17
BDMAP_A0000763,17
BDMAP_V0001931,17
BDMAP_V0001954,17
BDMAP_V0001936,16


In [None]:
import pandas as pd

def get_low_dsc_agreement_table(low_performance_dict, organ_name, agreement_cutoff=3):
    """
    For a given organ, returns a DataFrame showing which CTs were flagged as poor-performing
    (DSC < threshold) and how many models flagged each one. Filters out outliers based on cutoff.

    Parameters:
    - low_performance_dict: dict in the form {model_name: {organ: [list of bad CTs]}}
    - organ_name: name of the organ to analyze (e.g., "liver")
    - agreement_cutoff: minimum number of models that must have flagged a CT for it to be included.
                        (e.g., agreement_cutoff=4 keeps only CTs flagged by 4+ models)

    Returns:
    - pandas DataFrame sorted by number of models that flagged each CT (descending)
    - pandas Styler object for easy visual inspection
    """

    # Step 1: Build {model_name: set of bad CTs for the organ}
    model_bad_cts = {
        model_name: set(organs.get(organ_name, []))
        for model_name, organs in low_performance_dict.items()
    }

    # Step 2: Union of all CTs flagged by at least one model
    all_bad_cts = set.union(*model_bad_cts.values()) if model_bad_cts else set()

    # Step 3: Count how many models flagged each CT
    ct_flag_count = {
        ct_id: sum(ct_id in ct_set for ct_set in model_bad_cts.values())
        for ct_id in all_bad_cts
    }

    # Step 4: Make a DataFrame and apply the cutoff
    df = pd.DataFrame.from_dict(ct_flag_count, orient='index', columns=['num_models_flagged'])
    df.index.name = 'CT_ID'
    df = df[df['num_models_flagged'] > agreement_cutoff]
    df.sort_values(by='num_models_flagged', ascending=False, inplace=True)

    return df, df.style.background_gradient(cmap='Blues')

In [15]:
liver_df, liver_styled = get_low_dsc_agreement_table(low_performance_dict, "liver")
liver_styled  # Display in notebook

# Or use another organ:
# spleen_df, spleen_styled = get_low_dsc_agreement_table(low_performance_dict, "spleen")

Unnamed: 0_level_0,num_models_flagged
CT_ID,Unnamed: 1_level_1
BDMAP_A0000761,19
BDMAP_A0000637,19
BDMAP_A0002501,19
BDMAP_A0000778,19
BDMAP_V0001722,18
BDMAP_V0000726,17
BDMAP_A0000763,17
BDMAP_V0001931,17
BDMAP_V0001954,17
BDMAP_V0001936,16


Disrega

In [24]:
for organ in organs:
    organ_df, organ_styled = get_low_dsc_agreement_table(low_performance_dict, organ, agreement_cutoff=3)
    print(organ)
    display(organ_styled)  # Display in notebook

aorta


Unnamed: 0_level_0,num_models_flagged
CT_ID,Unnamed: 1_level_1
BDMAP_A0002501,19
BDMAP_A0000761,19
BDMAP_A0000763,17
BDMAP_V0001954,17
BDMAP_V0000726,17
BDMAP_A0000698,17
BDMAP_V0001416,17
BDMAP_V0002601,17
BDMAP_A0000773,17
BDMAP_A0001985,16


gall_bladder


Unnamed: 0_level_0,num_models_flagged
CT_ID,Unnamed: 1_level_1
BDMAP_A0001782,19
BDMAP_A0002501,19
BDMAP_A0000778,19
BDMAP_V0002017,19
BDMAP_A0002017,19
BDMAP_V0001116,18
BDMAP_A0001116,18
BDMAP_A0001807,18
BDMAP_A0001761,18
BDMAP_A0000773,18


kidney_left


Unnamed: 0_level_0,num_models_flagged
CT_ID,Unnamed: 1_level_1
BDMAP_V0000955,19
BDMAP_V0001229,19
BDMAP_A0000778,19
BDMAP_V0000111,19
BDMAP_A0000761,19
BDMAP_A0002501,19
BDMAP_A0001740,18
BDMAP_A0000111,18
BDMAP_A0002624,18
BDMAP_A0000773,18


kidney_right


Unnamed: 0_level_0,num_models_flagged
CT_ID,Unnamed: 1_level_1
BDMAP_A0002501,19
BDMAP_A0001285,19
BDMAP_A0000761,19
BDMAP_A0002299,19
BDMAP_V0002499,18
BDMAP_A0001740,18
BDMAP_A0002183,18
BDMAP_A0000773,18
BDMAP_V0001740,18
BDMAP_V0001954,17


liver


Unnamed: 0_level_0,num_models_flagged
CT_ID,Unnamed: 1_level_1
BDMAP_A0000761,19
BDMAP_A0000637,19
BDMAP_A0000778,19
BDMAP_A0002501,19
BDMAP_V0001722,18
BDMAP_V0001931,17
BDMAP_A0000763,17
BDMAP_V0001954,17
BDMAP_V0000726,17
BDMAP_V0001936,16


pancreas


Unnamed: 0_level_0,num_models_flagged
CT_ID,Unnamed: 1_level_1
BDMAP_A0000637,19
BDMAP_A0000629,19
BDMAP_V0000476,19
BDMAP_A0000761,19
BDMAP_A0000476,19
BDMAP_A0002629,19
BDMAP_A0002427,19
BDMAP_V0002427,19
BDMAP_A0002501,19
BDMAP_V0000022,19


postcava


Unnamed: 0_level_0,num_models_flagged
CT_ID,Unnamed: 1_level_1
BDMAP_A0000761,19
BDMAP_A0002501,19
BDMAP_V0001416,17
BDMAP_V0001931,17
BDMAP_V0002601,17
BDMAP_A0000698,17
BDMAP_V0001954,17
BDMAP_A0000763,17
BDMAP_V0001936,16
BDMAP_A0001985,16


spleen


Unnamed: 0_level_0,num_models_flagged
CT_ID,Unnamed: 1_level_1
BDMAP_A0002036,19
BDMAP_A0000778,19
BDMAP_A0002501,19
BDMAP_A0000761,19
BDMAP_V0002036,18
BDMAP_A0000773,18
BDMAP_A0000057,17
BDMAP_V0000726,17
BDMAP_A0002473,17
BDMAP_V0001416,17


stomach


Unnamed: 0_level_0,num_models_flagged
CT_ID,Unnamed: 1_level_1
BDMAP_A0000778,19
BDMAP_A0000761,19
BDMAP_A0000773,18
BDMAP_V0001416,17
BDMAP_V0001931,17
BDMAP_A0000763,17
BDMAP_V0001954,17
BDMAP_V0000726,17
BDMAP_V0001936,16
BDMAP_A0002636,6


In [37]:
def build_ct_organ_agreement_table(low_performance_dict, organs, agreement_cutoff=0):
    """
    Creates a master table where rows are CT_IDs and columns are organs.
    Each cell shows how many models flagged that CT for that organ (DSC < threshold).

    Parameters:
    - low_performance_dict: dict of {model_name: {organ_name: [bad CTs]}}
    - organs: list of organ names to include
    - agreement_cutoff: minimum number of models that must have flagged a CT (per organ)

    Returns:
    - pandas DataFrame with CT_IDs as rows and organs as columns
    """

    organ_dfs = []

    for organ in organs:
        # Use previous logic per organ
        model_bad_cts = {
            model_name: set(organs_dict.get(organ, []))
            for model_name, organs_dict in low_performance_dict.items()
        }
        all_bad_cts = set.union(*model_bad_cts.values()) if model_bad_cts else set()
        ct_flag_count = {
            ct_id: sum(ct_id in ct_set for ct_set in model_bad_cts.values())
            for ct_id in all_bad_cts
        }
        df = pd.DataFrame.from_dict(ct_flag_count, orient='index', columns=[organ])
        df = df[df[organ] > agreement_cutoff]
        df.index.name = 'CT_ID'
        organ_dfs.append(df)

    # Merge all organ DataFrames on CT_ID (outer join to keep all CTs that appear in any organ)
    merged_df = pd.concat(organ_dfs, axis=1).fillna(0).astype(int)

    return merged_df

In [38]:
organs = organs  # whatever you want to include
ct_summary_df = build_ct_organ_agreement_table(low_performance_dict, organs, agreement_cutoff=0)

#### Keep `agreement_cutoff=0` when displaying this, otherwise it may be misleading


CTs will otherwise show up as 0 when in fact they may have 1:(aggregate_cutouff-1) agreements

In [39]:
ct_summary_df

Unnamed: 0_level_0,aorta,gall_bladder,kidney_left,kidney_right,liver,pancreas,postcava,spleen,stomach
CT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BDMAP_V0000023,2,0,0,0,0,1,1,0,0
BDMAP_A0001457,1,0,2,1,0,1,0,0,0
BDMAP_V0002453,1,0,0,0,0,0,1,0,0
BDMAP_V0001132,1,0,0,0,0,1,0,0,0
BDMAP_V0000317,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
BDMAP_V0000719,0,0,0,0,0,0,0,0,1
BDMAP_V0001766,0,0,0,0,0,0,0,0,5
BDMAP_V0002257,0,0,0,0,0,0,0,0,1
BDMAP_V0002636,0,0,0,0,0,0,0,0,4


In [40]:
ct_summary_df.to_csv("outlier_cts.csv")