In [None]:
import os
import pandas as pd
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test

# Input tables: per-sample predictions and the survival table.
pred_path = "secondary_fold_predictions.csv"
km_path = "km_mvb.csv"
# Fall back to the copy under "urine/" when the survival table is not in the
# current working directory.
if not os.path.exists(km_path) and os.path.exists(os.path.join("urine", km_path)):
    km_path = os.path.join("urine", km_path)
preds = pd.read_csv(pred_path)
km = pd.read_csv(km_path)
# Strip the "_CrAdj" marker from the sample names so they can be joined to the
# predictions' "sample_id" column.
km["id"] = km["Sample"].str.replace("_CrAdj", "", regex=False)

# Inner join: only samples present in both tables are analysed.
df = preds.merge(km, left_on="sample_id", right_on="id", how="inner")
# Unparseable durations / event flags become NaN and are dropped just below.
df["Survival (days)"] = pd.to_numeric(df["Survival (days)"], errors="coerce")
df["Event"] = pd.to_numeric(df["Event"], errors="coerce")
df = df.dropna(subset=["Survival (days)", "Event", "pred_label"])

# Split the cohort by predicted label (0 -> "low", 1 -> "high"); cast once and
# reuse the result for both masks.
pred_int = df["pred_label"].astype(int)
low = pred_int == 0
high = pred_int == 1

# Log-rank test comparing the survival distributions of the two groups.
test = logrank_test(
    df.loc[low, "Survival (days)"],
    df.loc[high, "Survival (days)"],
    event_observed_A=df.loc[low, "Event"],
    event_observed_B=df.loc[high, "Event"],
)
# The same fitter is re-fit per group; only the median is read after each fit.
kmf = KaplanMeierFitter()
med_low = kmf.fit(df.loc[low, "Survival (days)"], df.loc[low, "Event"]).median_survival_time_
med_high = kmf.fit(df.loc[high, "Survival (days)"], df.loc[high, "Event"]).median_survival_time_


def _median_or_none(value):
    """Return *value* as a float, or None when it is NaN or infinite.

    lifelines reports a median that was never reached (the survival curve
    stays above 0.5) as infinity rather than NaN, so a ``pd.isna`` check alone
    lets ``inf`` through into the summary.
    """
    value = float(value)
    if pd.isna(value) or abs(value) == float("inf"):
        return None
    return value


# Summary of the comparison; medians are None when they are undefined.
out = {
    "n": int(len(df)),
    "n_low": int(low.sum()),
    "n_high": int(high.sum()),
    "logrank_p": float(test.p_value),
    "median_low_days": _median_or_none(med_low),
    "median_high_days": _median_or_none(med_high),
}
out