In [2]:
import pandas as pd

# Read the train and test attribute tables, then stack them into one frame
# with a fresh 0..n-1 index.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('pbc_attr_v1_train.csv', 'pbc_attr_v1_test.csv')
)
meta = pd.concat([train, test], ignore_index=True)

# Columns whose distinct values are listed below: the label column followed by
# the attribute columns.
considered_columns = [
    'label',
    'cell_size',
    'cell_shape',
    'nucleus_shape',
    'nuclear_cytoplasmic_ratio',
    'chromatin_density',
    'cytoplasm_vacuole',
    'cytoplasm_texture',
    'cytoplasm_colour',
    'granule_type',
    'granule_colour',
    'granularity',
]

# Print one line per column showing its distinct values in order of first
# appearance (Series.unique does not sort).
for col in considered_columns:
    distinct_values = meta[col].unique()
    print(f"{col}: {distinct_values}")

label: ['Neutrophil' 'Eosinophil' 'Basophil' 'Lymphocyte' 'Monocyte']
cell_size: ['big' 'small']
cell_shape: ['round' 'irregular']
nucleus_shape: ['unsegmented-band' 'unsegmented-round' 'segmented-multilobed'
 'segmented-bilobed' 'irregular' 'unsegmented-indented']
nuclear_cytoplasmic_ratio: ['low' 'high']
chromatin_density: ['densely' 'loosely']
cytoplasm_vacuole: ['no' 'yes']
cytoplasm_texture: ['clear' 'frosted']
cytoplasm_colour: ['light blue' 'blue' 'purple blue']
granule_type: ['small' 'round' 'coarse' 'nil']
granule_colour: ['pink' 'purple' 'red' 'nil']
granularity: ['yes' 'no']


In [6]:
# Estimate P(label | concept=value) as a relative frequency over the combined
# train + test table, then print the estimates for one target label.
meta = pd.concat([train, test], ignore_index=True)

# Attribute ("medical concept") columns to condition on. The 'label' column is
# the outcome and is deliberately not in this list.
considered_columns = [
    'cell_size',
    'cell_shape',
    'nucleus_shape',
    'nuclear_cytoplasmic_ratio',
    'chromatin_density',
    'cytoplasm_vacuole',
    'cytoplasm_texture',
    'cytoplasm_colour',
    'granule_type',
    'granule_colour',
    'granularity',
]

# Reporting filter: the label to display and the probability an entry must
# exceed to be printed.
TARGET_LABEL = 'Neutrophil'
MIN_PROBA = 0.1

# Distinct non-missing labels, in order of first appearance.
label_values = meta['label'].dropna().unique().tolist()

# (label, concept, value) -> probability. Keeping the three parts separate lets
# the report below compare the label exactly instead of substring-matching a
# formatted string.
proba_records = {}

for concept in considered_columns:
    unique_vals = meta[concept].dropna().unique().tolist()
    for value in unique_vals:
        # `value` was taken from this very column, so the selection always
        # holds at least one row; no zero-total guard is needed.
        labels_for_value = meta.loc[meta[concept] == value, 'label']
        # The denominator counts every selected row, including any row whose
        # label is missing.
        total = len(labels_for_value)
        # Count all labels in one pass; labels that never occur with this
        # value get a count of 0.
        label_counts = labels_for_value.value_counts().reindex(label_values, fill_value=0)
        for label in label_values:
            proba_records[(label, concept, value)] = label_counts.at[label] / total

# Same mapping keyed by the human-readable "P(label | concept=value)" string,
# in the same insertion order.
proba_results = {
    f"P({label} | {concept}={value})": proba
    for (label, concept, value), proba in proba_records.items()
}

# Display results: only the target label, and only probabilities above the
# reporting threshold.
for (label, concept, value), proba in proba_records.items():
    if label == TARGET_LABEL and proba > MIN_PROBA:
        print(f"P({label} | {concept}={value}): {proba:.2f}")

P(Neutrophil | cell_size=big): 0.36
P(Neutrophil | cell_size=small): 0.28
P(Neutrophil | cell_shape=round): 0.32
P(Neutrophil | cell_shape=irregular): 0.32
P(Neutrophil | nucleus_shape=unsegmented-band): 0.85
P(Neutrophil | nucleus_shape=segmented-multilobed): 0.34
P(Neutrophil | nucleus_shape=segmented-bilobed): 0.20
P(Neutrophil | nuclear_cytoplasmic_ratio=low): 0.37
P(Neutrophil | chromatin_density=densely): 0.35
P(Neutrophil | cytoplasm_vacuole=no): 0.34
P(Neutrophil | cytoplasm_vacuole=yes): 0.15
P(Neutrophil | cytoplasm_texture=clear): 0.40
P(Neutrophil | cytoplasm_colour=light blue): 0.43
P(Neutrophil | granule_type=small): 1.00
P(Neutrophil | granule_colour=pink): 1.00
P(Neutrophil | granularity=yes): 0.43
