In [2]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway, chi2_contingency
from pathlib import Path

# Paths
IN_PATH = Path("../data/processed/02_clustered_data.csv")

# Load
df = pd.read_csv(IN_PATH)
if "Cluster" not in df.columns:
    raise RuntimeError("Cluster column missing. Run 02_pca_clustering.ipynb first.")

# Top biomarkers to test (edit as needed)
top_biomarkers = [
    'Proteina C reativa mg/dL',
    'Leukocytes',
    'Platelets',
    'Creatinine',
    'Urea',
]

results = []
for feat in top_biomarkers:
    if feat not in df.columns:
        print(f"Skipping missing feature: {feat}")
        continue
    vals = pd.to_numeric(df[feat], errors='coerce')
    df_feat = pd.DataFrame({'Cluster': df['Cluster'], 'val': vals}).dropna()
    groups = [df_feat.loc[df_feat['Cluster'] == c, 'val'] for c in sorted(df_feat['Cluster'].unique())]
    if any(len(g) == 0 for g in groups):
        print(f"Skipping {feat}: empty group")
        continue
    f_stat, p_val = f_oneway(*groups)
    means = df_feat.groupby('Cluster')['val'].mean()
    results.append({
        'Feature': feat,
        'Cluster 0 Mean': means.get(0, float('nan')),
        'Cluster 1 Mean': means.get(1, float('nan')),
        'Cluster 2 Mean': means.get(2, float('nan')),
        'p_value': p_val,
        'F_stat': f_stat,
    })

if results:
    evidence_df = pd.DataFrame(results)
    evidence_df['p_value_fmt'] = evidence_df['p_value'].apply(lambda p: '< 0.001' if p < 1e-3 else f"{p:.3f}")
    evidence_df['Significance'] = evidence_df['p_value'].apply(lambda p: '*' if p < 0.05 else '')
    evidence_df = evidence_df.sort_values(by='p_value')
    evidence_df = evidence_df[[
        'Feature', 'Cluster 0 Mean', 'Cluster 1 Mean', 'Cluster 2 Mean', 'p_value_fmt', 'Significance'
    ]]
    print("One-way ANOVA (top biomarkers):")
    display(evidence_df)
else:
    print("No ANOVA results computed. Check feature availability.")

# Chi-square for ICU admission
icu_col = 'Patient addmited to intensive care unit (1=yes, 0=no)'
if icu_col in df.columns:
    contingency = pd.crosstab(df['Cluster'], df[icu_col].fillna(0).astype(int))
    chi2, chi_p, _, _ = chi2_contingency(contingency)
    print(f"Chi-square p-value for Cluster vs ICU: {chi_p:.4g}")
    display(contingency)
else:
    print(f"ICU column not found: {icu_col}")



One-way ANOVA (top biomarkers):


Unnamed: 0,Feature,Cluster 0 Mean,Cluster 1 Mean,Cluster 2 Mean,p_value_fmt,Significance
4,Urea,-0.002845,13.872796,-0.453214,< 0.001,*
3,Creatinine,0.063287,4.571409,-1.517085,< 0.001,*
1,Leukocytes,-0.060292,2.233926,1.201259,< 0.001,*
0,Proteina C reativa mg/dL,-0.029149,2.137217,0.541601,0.001,*
2,Platelets,-0.030778,-0.241048,0.664375,0.002,*


Chi-square p-value for Cluster vs ICU: 2.134e-13


"Patient addmited to intensive care unit (1=yes, 0=no)",0,4
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,555,20
1,0,1
2,19,8
