In [4]:
import pandas as pd

# Read the train and test attribute tables, then stack them into one frame
# with a fresh 0..n-1 index.
train = pd.read_csv('pbc_attr_v1_train.csv')
test = pd.read_csv('pbc_attr_v1_test.csv')
meta = pd.concat([train, test], ignore_index=True)

# Columns whose distinct values are listed below: the label column followed
# by the attribute columns.
considered_columns = [
    'label',
    'cell_size',
    'cell_shape',
    'nucleus_shape',
    'nuclear_cytoplasmic_ratio',
    'chromatin_density',
    'cytoplasm_vacuole',
    'cytoplasm_texture',
    'cytoplasm_colour',
    'granule_type',
    'granule_colour',
    'granularity',
]

# Print the distinct values observed in each column (order of first appearance).
for col in considered_columns:
    distinct_values = meta[col].unique()
    print(f"{col}: {distinct_values}")

label: ['Neutrophil' 'Eosinophil' 'Basophil' 'Lymphocyte' 'Monocyte']
cell_size: ['big' 'small']
cell_shape: ['round' 'irregular']
nucleus_shape: ['unsegmented-band' 'unsegmented-round' 'segmented-multilobed'
 'segmented-bilobed' 'irregular' 'unsegmented-indented']
nuclear_cytoplasmic_ratio: ['low' 'high']
chromatin_density: ['densely' 'loosely']
cytoplasm_vacuole: ['no' 'yes']
cytoplasm_texture: ['clear' 'frosted']
cytoplasm_colour: ['light blue' 'blue' 'purple blue']
granule_type: ['small' 'round' 'coarse' 'nil']
granule_colour: ['pink' 'purple' 'red' 'nil']
granularity: ['yes' 'no']


In [5]:
# Combine train and test datasets (rebuilt from `train`/`test` so this cell
# does not depend on whatever `meta` currently holds in the kernel).
meta = pd.concat([train, test], ignore_index=True)

# List of columns to consider (medical concepts); 'label' is the target and is
# deliberately excluded.
considered_columns = [
    'cell_size',
    'cell_shape',
    'nucleus_shape',
    'nuclear_cytoplasmic_ratio',
    'chromatin_density',
    'cytoplasm_vacuole',
    'cytoplasm_texture',
    'cytoplasm_colour',
    'granule_type',
    'granule_colour',
    'granularity',
]

# Maps "P(<label> | <concept>=<value>)" -> empirical conditional probability,
# i.e. (#rows with that label and that value) / (#rows with that value).
proba_results = {}

# All unique labels, in order of first appearance.
label_values = meta['label'].dropna().unique().tolist()

# For each medical concept (except label)
for concept in considered_columns:
    # Distinct non-missing values, in order of first appearance.
    unique_vals = meta[concept].dropna().unique().tolist()
    if not unique_vals or not label_values:
        # Nothing to tabulate for this concept.
        continue

    # One pass over the frame: joint counts of (value, label). Rows where
    # either key is missing are dropped by groupby; reindexing restores the
    # first-appearance ordering and fills never-observed combinations with 0.
    joint_counts = (
        meta.groupby([concept, 'label'], sort=False)
        .size()
        .unstack(fill_value=0)
        .reindex(index=unique_vals, columns=label_values, fill_value=0)
    )

    # Denominator: every row carrying this value, including rows whose label
    # is missing. Each value in unique_vals occurs at least once, so the
    # denominator is never zero.
    totals = meta[concept].value_counts().reindex(unique_vals)

    # Row-wise normalisation: P(label | concept=value).
    proba_table = joint_counts.div(totals, axis=0)

    for value in unique_vals:
        for label in label_values:
            proba_results[f"P({label} | {concept}={value})"] = proba_table.at[value, label]

# Display results
for k, v in proba_results.items():
    print(f"{k}: {v}")

P(Neutrophil | cell_size=big): 0.36001600960576347
P(Eosinophil | cell_size=big): 0.35641384830898537
P(Basophil | cell_size=big): 0.06063638182909746
P(Lymphocyte | cell_size=big): 0.0018010806483890335
P(Monocyte | cell_size=big): 0.22113267960776467
P(Neutrophil | cell_size=small): 0.2800280964645282
P(Eosinophil | cell_size=small): 0.2392882228986186
P(Basophil | cell_size=small): 0.18496839147740576
P(Lymphocyte | cell_size=small): 0.2561461016155467
P(Monocyte | cell_size=small): 0.03956918754390073
P(Neutrophil | cell_shape=round): 0.3238533389098006
P(Eosinophil | cell_shape=round): 0.3591244946326502
P(Basophil | cell_shape=round): 0.1357869789488359
P(Lymphocyte | cell_shape=round): 0.10330405687996654
P(Monocyte | cell_shape=round): 0.0779311306287467
P(Neutrophil | cell_shape=irregular): 0.320763723150358
P(Eosinophil | cell_shape=irregular): 0.10835322195704057
P(Basophil | cell_shape=irregular): 0.056801909307875896
P(Lymphocyte | cell_shape=irregular): 0.1727923627684964