In [32]:
# ============================================
# ILD(–ò–ó–õ) vs COPD(–•–û–ë–õ): –Ω–µ–ø–∞—Ä–∞–º–µ—Ç—Ä–∏—á–µ—Å–∫–∏–π –∞–Ω–∞–ª–∏–∑ –≤ Colab
# - –∑–∞–≥—Ä—É–∑–∫–∞ —Ñ–∞–π–ª–∞ –∏–∑ –¥–∏–∞–ª–æ–≥–∞/Drive
# - long/wide –∞–≤—Ç–æ–æ–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ
# - Wilcoxon (–ø–∞—Ä–Ω—ã–µ) –≤–Ω—É—Ç—Ä–∏ –≥—Ä—É–ø–ø –ø–æ –ø–∞—Ä–∞–º –ª–µ—Ç
# - Mann‚ÄìWhitney –º–µ–∂–¥—É –≥—Ä—É–ø–ø–∞–º–∏ (–ø–æ –≥–æ–¥–∞–º –∏ –ø–æ –¥–µ–ª—å—Ç–∞–º)
# - FDR(BH) –æ—Ç–¥–µ–ª—å–Ω–æ –¥–ª—è within/between, —ç—Ñ—Ñ–µ–∫—Ç—ã (rank-biserial, Cliff‚Äôs Œ¥)
# - —ç–∫—Å–ø–æ—Ä—Ç Excel/CSV + PNG-–≥—Ä–∞—Ñ–∏–∫–∏, —Å–∫–∞—á–∏–≤–∞–Ω–∏–µ ZIP
# ============================================

# !pip -q install pandas numpy scipy statsmodels matplotlib openpyxl python-slugify

import os, re, glob, warnings, shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from scipy import stats
from statsmodels.stats.multitest import multipletests
from slugify import slugify

warnings.filterwarnings("ignore")

# ---------- –ù–ê–°–¢–†–û–ô–ö–ò ----------
YEARS      = ["2023", "2024", "2025"]   # –æ–∂–∏–¥–∞–µ–º—ã–µ –≥–æ–¥–∞ (–±—É–¥—É—Ç –∞–≤—Ç–æ-–¥–µ—Ç–µ–∫—Ç–∏—Ç—å—Å—è –ø—Ä–∏ long)
MIN_PAIRED = 5                          # –º–∏–Ω–∏–º—É–º –ø–∞—Ä –¥–ª—è Wilcoxon
AUTO_DOWNLOAD = True                    # –∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∏ —Å–∫–∞—á–∞—Ç—å ZIP —Å —Ä–µ–∑—É–ª—å—Ç–∞—Ç–∞–º–∏

# –ï—Å–ª–∏ —Ö–æ—Ç–∏—Ç–µ —Ä—É–∫–∞–º–∏ –∑–∞–¥–∞—Ç—å –ø—É—Ç—å (–Ω–∞–ø—Ä–∏–º–µ—Ä –∏–∑ Google Drive) ‚Äî —Ä–∞—Å–∫–æ–º–º–µ–Ω—Ç–∏—Ä—É–π—Ç–µ:
# INFILE = "/content/drive/MyDrive/–ø–∞–ø–∫–∞/pulmo_polarity_aligned –∏—Ç–æ–≥–æ–≤—ã–π —Ñ–∞–π–ª.xlsx"

# ---------- –£–¢–ò–õ–ò–¢–´ ----------
def clean_names(cols):
    out=[]
    for c in cols:
        c = str(c)
        c = re.sub(r"[\u00A0\u2009]", " ", c).strip()
        c = re.sub(r"\s+", " ", c)
        out.append(c)
    return out

def map_group(x):
    s = str(x).strip()
    if s in {"1","–ò–õ–î","–ò–ó–õ","ILD","ild","Izl","IZL"}: return "–ò–ó–õ"
    if s in {"2","–•–û–ë–õ","COPD","copd","Copd","–•–æ–±–ª"}: return "–•–û–ë–õ"
    return np.nan

def read_any(path, sheet=0):
    ext = Path(path).suffix.lower()
    if ext in [".xlsx",".xls"]:
        return pd.read_excel(path, sheet_name=sheet)
    elif ext == ".csv":
        try:
            return pd.read_csv(path, encoding="utf-8")
        except Exception:
            return pd.read_csv(path, encoding="cp1251", sep=";")
    else:
        raise ValueError("–ü–æ–¥–¥–µ—Ä–∂–∏–≤–∞—é—Ç—Å—è .xlsx/.xls/.csv")

def find_col(df, cands):
    low = [c.lower() for c in df.columns]
    for cand in cands:
        if cand.lower() in low:
            return df.columns[low.index(cand.lower())]
    return None

def cliffs_delta(x, y):
    x = np.asarray(x, float); y = np.asarray(y, float)
    x = x[~np.isnan(x)]; y = y[~np.isnan(y)]
    if len(x)==0 or len(y)==0: return np.nan
    gt = sum((xi > y).sum() for xi in x)
    lt = sum((xi < y).sum() for xi in x)
    return (gt - lt) / (len(x)*len(y))

def paired_rank_biserial(a, b):
    a = np.asarray(a, float); b = np.asarray(b, float)
    d = b - a
    pos = np.sum(d > 0); neg = np.sum(d < 0)
    nz  = pos + neg
    return 0.0 if nz == 0 else (pos - neg) / nz

def add_fdr(df):
    if df is None or df.empty:
        return df
    q = multipletests(df["p_raw"].values, method="fdr_bh")[1]
    out = df.copy()
    out["q_fdr_bh"] = q
    return out

# ---------- –ó–ê–ì–†–£–ó–ö–ê –§–ê–ô–õ–ê (–¥–∏–∞–ª–æ–≥ –∏–ª–∏ Drive) ----------
try:
    INFILE
except NameError:
    INFILE = None

if INFILE is None:
    print("‚¨ÜÔ∏è –í—ã–±–µ—Ä–∏—Ç–µ Excel/CSV —Ñ–∞–π–ª –¥–ª—è –∑–∞–≥—Ä—É–∑–∫–∏‚Ä¶")
    try:
        from google.colab import files
        uploaded = files.upload()
        if uploaded:
            INFILE = next(iter(uploaded.keys()))
            print("‚úÖ –ó–∞–≥—Ä—É–∂–µ–Ω —Ñ–∞–π–ª:", INFILE)
    except Exception as e:
        print("‚ö†Ô∏è –ù–µ –≤ Colab –∏–ª–∏ –¥–∏–∞–ª–æ–≥ –Ω–µ —Å—Ä–∞–±–æ—Ç–∞–ª:", e)

if INFILE is None:
    # –ü–æ–ø—ã—Ç–∫–∞ –Ω–∞–π—Ç–∏ –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã–π —Ñ–∞–π–ª –ø–æ —á–∞—Å—Ç–∏ –∏–º–µ–Ω–∏
    cand = [p for p in os.listdir(".") if p.lower().endswith((".xlsx",".xls",".csv"))]
    if cand:
        INFILE = cand[0]
        print("‚ÑπÔ∏è –ò—Å–ø–æ–ª—å–∑—É—é –Ω–∞–π–¥–µ–Ω–Ω—ã–π —Ñ–∞–π–ª:", INFILE)

assert INFILE is not None and Path(INFILE).exists(), f"–§–∞–π–ª –Ω–µ –Ω–∞–π–¥–µ–Ω: {INFILE}"

# ---------- –ß–¢–ï–ù–ò–ï –ò –ü–†–ò–í–ï–î–ï–ù–ò–ï –ö LONG ----------
SHEET_NAME = 0  # –ø—Ä–∏ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ—Å—Ç–∏ –∑–∞–º–µ–Ω–∏—Ç–µ –Ω–∞ –∏–º—è –ª–∏—Å—Ç–∞
df0 = read_any(INFILE, SHEET_NAME)
df0.columns = clean_names(df0.columns)

ID_CANDS    = ["id","patient_id","–ø–∞—Ü–∏–µ–Ω—Ç","‚Ññ","–∏–¥","–∫–æ–¥","uid"]
GROUP_CANDS = ["group","–≥—Ä—É–ø–ø–∞","–¥–∏–∞–≥–Ω–æ–∑"]
YEAR_CANDS  = ["year","–≥–æ–¥","time","–≤—Ä–µ–º—è"]
MARK_CANDS  = ["marker","–ø–æ–∫–∞–∑–∞—Ç–µ–ª—å","–ø–∞—Ä–∞–º–µ—Ç—Ä","–∞–Ω–∞–ª–∏–∑","variable","feature","–º–∞—Ä–∫–µ—Ä"]
VAL_CANDS   = ["value","–∑–Ω–∞—á–µ–Ω–∏–µ","val","y"]

id_col   = find_col(df0, ID_CANDS)
grp_col  = find_col(df0, GROUP_CANDS)
year_col = find_col(df0, YEAR_CANDS)
mark_col = find_col(df0, MARK_CANDS)
val_col  = find_col(df0, VAL_CANDS)

is_long = all(c is not None for c in [id_col, grp_col, year_col, mark_col, val_col])

if is_long:
    print("‚úÖ –û–±–Ω–∞—Ä—É–∂–µ–Ω –¥–ª–∏–Ω–Ω—ã–π —Ñ–æ—Ä–º–∞—Ç.")
    dat = df0.rename(columns={id_col:"id", grp_col:"group", year_col:"year",
                              mark_col:"marker", val_col:"value"}).copy()
    # –∞–≤—Ç–æ-–¥–µ—Ç–µ–∫—Ç –ª–µ—Ç, –µ—Å–ª–∏ –≤ –¥–∞–Ω–Ω—ã—Ö –µ—Å—Ç—å —á—Ç–æ-—Ç–æ –æ—Ç–ª–∏—á–Ω–æ–µ –æ—Ç YEARS
    years_detected = sorted(list(map(str, pd.Series(dat["year"]).astype(str).unique())))
    if set(years_detected).issuperset(set(YEARS)) or len(years_detected) <= len(YEARS):
        YEARS = [y for y in YEARS if y in years_detected] or years_detected
else:
    print("‚ÑπÔ∏è –ü–æ—Ö–æ–∂–µ –Ω–∞ —à–∏—Ä–æ–∫–∏–π —Ñ–æ—Ä–º–∞—Ç. –ü—Ä–µ–æ–±—Ä–∞–∑—É—é –ø–æ —Å—É—Ñ—Ñ–∏–∫—Å–∞–º _–ì–û–î.")
    # –≤–æ–∑—å–º—ë–º –≤–æ–∑–º–æ–∂–Ω—ã–µ id/group
    id_vals  = df0[id_col].astype(str) if id_col else pd.Series(range(1, len(df0)+1), index=df0.index, dtype=str)
    grp_vals = df0[grp_col] if grp_col else pd.Series([np.nan]*len(df0))
    # regex: –±–∞–∑–æ–≤–æ–µ –∏–º—è + –≥–æ–¥ –≤ –∫–æ–Ω—Ü–µ
    rx = re.compile(r"^(.*?)[\._\s]?(\d{4})$")
    rows=[]
    years_found = set()
    for col in df0.columns:
        m = rx.match(col)
        if not m:
            continue
        base, year = m.group(1).strip(), m.group(2)
        years_found.add(year)
        vals = pd.to_numeric(df0[col], errors="coerce")
        rows.append(pd.DataFrame({
            "id": id_vals.values,
            "group": grp_vals.values,
            "year": year,
            "marker": base or col,
            "value": vals
        }))
    if not rows:
        raise RuntimeError("–ù–µ –Ω–∞—à—ë–ª –Ω–∏ –æ–¥–Ω–æ–≥–æ —Å—Ç–æ–ª–±—Ü–∞ —Å –≥–æ–¥–∞–º–∏ (‚Ä¶_2023/‚Ä¶ 2024/‚Ä¶2025). –ü—Ä–æ–≤–µ—Ä—å—Ç–µ –∏–º–µ–Ω–∞.")
    YEARS = sorted(list(years_found))
    dat = pd.concat(rows, ignore_index=True)

# –ø—Ä–∏–≤–µ–¥–µ–Ω–∏–µ —Ç–∏–ø–æ–≤, –æ—Ç—á–∏—Å—Ç–∫–∞
dat["group"]  = dat["group"].map(map_group)
dat["year"]   = dat["year"].astype(str).str.strip()
dat["marker"] = dat["marker"].astype(str)
dat = dat.dropna(subset=["value","marker"])
dat = dat[dat["year"].isin(YEARS)]
dat["id"] = dat["id"].astype(str)

# –Ω–∞ —Å–ª—É—á–∞–π –¥—É–±–ª–µ–π: –∞–≥—Ä–µ–≥–∏—Ä—É–µ–º –ø–æ (id, group, marker, year) –º–µ–¥–∏–∞–Ω–æ–π
dat = (dat.groupby(["id","group","marker","year"], as_index=False)["value"]
          .median())

print(f"–ú–∞—Ä–∫–µ—Ä–æ–≤: {dat['marker'].nunique()}, –ª–µ—Ç: {YEARS}, –∑–∞–ø–∏—Å–µ–π: {len(dat)}")

# ---------- –°–í–û–î–ù–´–ï –ú–ï–î–ò–ê–ù–´ ----------
summary_medians = (
    dat.groupby(["marker","group","year"], dropna=False)
       .agg(N=("value","size"),
            median=("value","median"),
            Q1=("value", lambda x: np.quantile(x, 0.25)),
            Q3=("value", lambda x: np.quantile(x, 0.75)))
       .reset_index()
       .sort_values(["marker","group","year"])
)

# ---------- –í–ù–£–¢–†–ò–ì–†–£–ü–ü–û–í–´–ï: Wilcoxon (–ø–∞—Ä–Ω—ã–µ) ----------
pairs = []
if "2023" in YEARS and "2024" in YEARS: pairs.append(("2024","2023"))
if "2024" in YEARS and "2025" in YEARS: pairs.append(("2025","2024"))
if "2023" in YEARS and "2025" in YEARS: pairs.append(("2025","2023"))

within_rows = []
for mk in dat["marker"].unique():
    for gr in ["–ò–ó–õ","–•–û–ë–õ"]:
        d = dat[(dat["marker"]==mk) & (dat["group"]==gr)]
        for y2, y1 in pairs:
            w = (d[d["year"]==y1][["id","value"]]
                 .merge(d[d["year"]==y2][["id","value"]], on="id", suffixes=(f"_{y1}", f"_{y2}"))
                 .dropna())
            if len(w) >= MIN_PAIRED:
                a = w[f"value_{y1}"].to_numpy()
                b = w[f"value_{y2}"].to_numpy()
                try:
                    p = stats.wilcoxon(b, a, zero_method="wilcox", alternative="two-sided", method="auto").pvalue
                except Exception:
                    p = stats.wilcoxon(b, a, zero_method="wilcox", alternative="two-sided").pvalue
                rb = paired_rank_biserial(a, b)
                delta = b - a
                within_rows.append(dict(
                    family="within", marker=mk, group=gr, period=f"Œî({y2}‚Äì{y1})",
                    N_pairs=len(w),
                    median_change=float(np.median(delta)),
                    Q1_change=float(np.quantile(delta, 0.25)),
                    Q3_change=float(np.quantile(delta, 0.75)),
                    p_raw=float(p), effect=float(rb), effect_type="rank-biserial"
                ))
within_tbl = pd.DataFrame(within_rows)

# ---------- –ú–ï–ñ–ì–†–£–ü–ü–û–í–´–ï: Mann‚ÄìWhitney (–ø–æ –≥–æ–¥–∞–º –∏ –ø–æ Œî) ----------
between_rows = []

# –ø–æ –≥–æ–¥–∞–º
for mk in dat["marker"].unique():
    for yr in YEARS:
        d_yr = dat[(dat["marker"]==mk) & (dat["year"]==yr)]
        x = d_yr.loc[d_yr["group"]=="–ò–ó–õ","value"].to_numpy()
        y = d_yr.loc[d_yr["group"]=="–•–û–ë–õ","value"].to_numpy()
        if len(x) >= 5 and len(y) >= 5:
            p = stats.mannwhitneyu(x, y, alternative="two-sided").pvalue
            cd = cliffs_delta(x, y)
            between_rows.append(dict(
                family="between", marker=mk, contrast="–ò–õ–î_vs_–•–û–ë–õ",
                period=yr, N_ILD=int(len(x)), N_COPD=int(len(y)),
                p_raw=float(p), effect=float(cd), effect_type="Cliff_delta"
            ))

# –ø–æ –¥–µ–ª—å—Ç–∞–º (—É—Å—Ç–æ–π—á–∏–≤–æ –∫ –æ—Ç—Å—É—Ç—Å—Ç–≤–∏—é –≥–æ–¥–æ–≤)
for mk in dat["marker"].unique():
    d_m = dat[dat["marker"]==mk]
    for y2, y1 in pairs:
        ild = (d_m[(d_m["group"]=="–ò–ó–õ") & (d_m["year"].isin([y1, y2]))]
               .pivot_table(index="id", columns="year", values="value", aggfunc="first"))
        copd = (d_m[(d_m["group"]=="–•–û–ë–õ") & (d_m["year"].isin([y1, y2]))]
                .pivot_table(index="id", columns="year", values="value", aggfunc="first"))
        for df_ in (ild, copd):
            df_.columns = df_.columns.astype(str)
            for col in [y1, y2]:
                if col not in df_.columns: df_[col] = np.nan
        ild = ild.dropna(subset=[y1, y2], how="any")
        copd = copd.dropna(subset=[y1, y2], how="any")
        if len(ild) >= 5 and len(copd) >= 5:
            di = (ild[y2] - ild[y1]).to_numpy()
            dc = (copd[y2] - copd[y1]).to_numpy()
            p  = stats.mannwhitneyu(di, dc, alternative="two-sided").pvalue
            cd = cliffs_delta(di, dc)
            between_rows.append(dict(
                family="between", marker=mk, contrast="Œî(–ò–õ–î)_vs_Œî(–•–û–ë–õ)",
                period=f"Œî({y2}‚Äì{y1})", N_ILD=int(len(di)), N_COPD=int(len(dc)),
                p_raw=float(p), effect=float(cd), effect_type="Cliff_delta"
            ))

between_tbl = pd.DataFrame(between_rows)

# ---------- FDR(BH) –û–¢–î–ï–õ–¨–ù–û ----------
within_tbl  = add_fdr(within_tbl)
between_tbl = add_fdr(between_tbl)

# ---------- –≠–ö–°–ü–û–†–¢ ----------
Path("outputs").mkdir(exist_ok=True)

summary_out = summary_medians.copy()
summary_out.to_csv("outputs/medians_by_group_year.csv", index=False)

if within_tbl is not None and not within_tbl.empty:
    within_out = within_tbl[["marker","group","period","N_pairs",
                             "median_change","Q1_change","Q3_change",
                             "p_raw","q_fdr_bh","effect","effect_type"]] \
                             .sort_values(["marker","group","period"])
    within_out.to_csv("outputs/within_results.csv", index=False)
else:
    within_out = pd.DataFrame()

if between_tbl is not None and not between_tbl.empty:
    between_out = between_tbl[["marker","contrast","period","N_ILD","N_COPD",
                               "p_raw","q_fdr_bh","effect","effect_type"]] \
                               .sort_values(["marker","period","contrast"])
    between_out.to_csv("outputs/between_results.csv", index=False)
else:
    between_out = pd.DataFrame()

with pd.ExcelWriter("outputs/results_all.xlsx", engine="openpyxl") as xlw:
    summary_out.to_excel(xlw, sheet_name="medians", index=False)
    if not within_out.empty:  within_out.to_excel(xlw, sheet_name="within",  index=False)
    if not between_out.empty: between_out.to_excel(xlw, sheet_name="between", index=False)

print("\n‚úÖ –ì–æ—Ç–æ–≤–æ.")
print(f"–í—Å–µ–≥–æ –º–∞—Ä–∫–µ—Ä–æ–≤: {dat['marker'].nunique()}")
print(f"–í–Ω—É—Ç—Ä–∏–≥—Ä—É–ø–ø–æ–≤—ã—Ö —Å—Ä–∞–≤–Ω–µ–Ω–∏–π: {0 if within_tbl is None else len(within_tbl)}")
print(f"–ú–µ–∂–≥—Ä—É–ø–ø–æ–≤—ã—Ö —Å—Ä–∞–≤–Ω–µ–Ω–∏–π:    {0 if between_tbl is None else len(between_tbl)}")
print("–§–∞–π–ª—ã: outputs/results_all.xlsx, outputs/*.csv")

# ---------- –ì–†–ê–§–ò–ö–ò ----------
Path("plots").mkdir(exist_ok=True)

# –∑–≤—ë–∑–¥–æ—á–∫–∏ –º–µ–∂–≥—Ä—É–ø–ø–æ–≤–æ–π –∑–Ω–∞—á–∏–º–æ—Å—Ç–∏ –ø–æ –≥–æ–¥–∞–º (–ø–æ—Å–ª–µ FDR)
if between_tbl is not None and not between_tbl.empty:
    sig_year = (between_tbl
                .loc[(between_tbl["contrast"]=="–ò–õ–î_vs_–•–û–ë–õ") & (between_tbl["period"].isin(YEARS)),
                     ["marker","period","q_fdr_bh"]]
                .rename(columns={"period":"year"}))
    def star(q):
        return "***" if q < 1e-3 else ("**" if q < 1e-2 else ("*" if q < 0.05 else ""))
    sig_year["star"] = sig_year["q_fdr_bh"].map(star)
else:
    sig_year = pd.DataFrame(columns=["marker","year","q_fdr_bh","star"])

for mk in dat["marker"].unique():
    d = dat[dat["marker"]==mk]
    if d.empty:
        continue

    fig = plt.figure(figsize=(10,6))
    ax = plt.gca()

    xpos = np.arange(len(YEARS))
    width = 0.38

    for i, yr in enumerate(YEARS):
        for j, gr in enumerate(["–ò–ó–õ","–•–û–ë–õ"]):
            vals = d[(d["year"]==yr) & (d["group"]==gr)]["value"].dropna().to_numpy()
            if len(vals)==0:
                continue
            x = i + (j-0.5)*width
            bp = ax.boxplot(vals, positions=[x], widths=width*0.9, patch_artist=True, manage_ticks=False)
            ax.scatter(np.full_like(vals, x, dtype=float), vals, s=6, alpha=0.35)

    ax.set_xticks(np.arange(len(YEARS)))
    ax.set_xticklabels(YEARS)
    ax.set_title(mk)
    ax.set_xlabel("")
    ax.set_ylabel("–ó–Ω–∞—á–µ–Ω–∏–µ")
    ax.legend(["–ò–ó–õ","–•–û–ë–õ"], loc="upper right")

    # –∑–≤—ë–∑–¥–æ—á–∫–∏
    ann = sig_year[sig_year["marker"]==mk]
    if not ann.empty:
        y_max = d.groupby("year")["value"].max().reindex(YEARS)
        y_span = (np.nanmax(d["value"]) - np.nanmin(d["value"])) or 1.0
        for _, row in ann.iterrows():
            if row["year"] in y_max.index and not pd.isna(y_max.loc[row["year"]]):
                ax.text(YEARS.index(row["year"]), y_max.loc[row["year"]] + 0.05*y_span,
                        row["star"], ha="center", va="bottom", fontsize=14)

    plt.tight_layout()
    fname = f"plots/box_{slugify(mk)}.png"
    plt.savefig(fname, dpi=140)
    plt.close()

print("üñºÔ∏è –ì—Ä–∞—Ñ–∏–∫–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ –ø–∞–ø–∫–µ plots/")

# ---------- –°–ö–ê–ß–ê–¢–¨ ZIP ----------
zip_path = "/content/ILD_COPD_outputs.zip"
if os.path.exists(zip_path):
    os.remove(zip_path)
shutil.make_archive("/content/ILD_COPD_outputs", "zip", "/content")
print("ZIP:", zip_path)

if AUTO_DOWNLOAD:
    try:
        from google.colab import files
        files.download(zip_path)
    except Exception as e:
        print("‚ÑπÔ∏è –ù–µ —É–¥–∞–ª–æ—Å—å –∞–≤—Ç–æ-—Å–∫–∞—á–∞—Ç—å:", e, "\n–°–∫–∞—á–∞–π—Ç–µ ZIP —á–µ—Ä–µ–∑ –ø–∞–Ω–µ–ª—å Files —Å–ª–µ–≤–∞.")


‚ÑπÔ∏è –ü–æ—Ö–æ–∂–µ –Ω–∞ —à–∏—Ä–æ–∫–∏–π —Ñ–æ—Ä–º–∞—Ç. –ü—Ä–µ–æ–±—Ä–∞–∑—É—é –ø–æ —Å—É—Ñ—Ñ–∏–∫—Å–∞–º _–ì–û–î.
–ú–∞—Ä–∫–µ—Ä–æ–≤: 0, –ª–µ—Ç: ['2023', '2024', '2025'], –∑–∞–ø–∏—Å–µ–π: 0

‚úÖ –ì–æ—Ç–æ–≤–æ.
–í—Å–µ–≥–æ –º–∞—Ä–∫–µ—Ä–æ–≤: 0
–í–Ω—É—Ç—Ä–∏–≥—Ä—É–ø–ø–æ–≤—ã—Ö —Å—Ä–∞–≤–Ω–µ–Ω–∏–π: 0
–ú–µ–∂–≥—Ä—É–ø–ø–æ–≤—ã—Ö —Å—Ä–∞–≤–Ω–µ–Ω–∏–π:    0
–§–∞–π–ª—ã: outputs/results_all.xlsx, outputs/*.csv
üñºÔ∏è –ì—Ä–∞—Ñ–∏–∫–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ –ø–∞–ø–∫–µ plots/


KeyboardInterrupt: 

In [35]:
# -*- coding: utf-8 -*-
# Colab script: Pulmo profile page (medians/IQR, within/between p, FDR)
!pip -q install openpyxl statsmodels scipy pandas

import re
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu, wilcoxon
from statsmodels.stats.multitest import multipletests

# ==== 1) –ó–ê–ì–†–£–ó–ö–ê –§–ê–ô–õ–ê ====
try:
    from google.colab import files  # type: ignore
    uploaded = files.upload()       # –≤—ã–±–µ—Ä–∏—Ç–µ –≤–∞—à .xlsx
    infile = next(iter(uploaded.keys()))
except Exception:
    # –ï—Å–ª–∏ –∑–∞–ø—É—Å–∫–∞–µ—Ç–µ –Ω–µ –≤ Colab ‚Äî —É–∫–∞–∂–∏—Ç–µ –ø—É—Ç—å –≤—Ä—É—á–Ω—É—é
    infile = "/content/—Ñ–∞–π–ª_–¥–ª—è_—Ä–∞—Å—á–µ—Ç–∞.xlsx"

df = pd.read_excel(infile)

# ==== 2) –ù–ê–°–¢–†–û–ô–ö–ò ====
GROUP_COL_CANDS = ["–¥–∏–∞–≥–Ω–æ–∑", "–¥–∏–∞–≥–Ωo–∑", "group", "–ì—Ä—É–ø–ø–∞", "–≥—Ä—É–ø–ø–∞"]
group_col = next((c for c in GROUP_COL_CANDS if c in df.columns), None)
if group_col is None:
    raise RuntimeError("–ù–µ –Ω–∞—à—ë–ª –∫–æ–ª–æ–Ω–∫—É —Å –≥—Ä—É–ø–ø–æ–π. –î–æ–±–∞–≤—å—Ç–µ —Å—Ç–æ–ª–±–µ—Ü '–¥–∏–∞–≥–Ω–æ–∑' (1=–ò–ó–õ, 2=–•–û–ë–õ).")

GROUP_LABELS = {1: "–ò–ó–õ", 2: "–•–û–ë–õ"}   # –ø—Ä–∏ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ—Å—Ç–∏: {1: "–ò–ó–õ-–°–°–î", 2: "–•–û–ë–õ"}

DECIMALS = 3  # —Å–∫–æ–ª—å–∫–æ –∑–Ω–∞–∫–æ–≤ –ø–æ–∫–∞–∑—ã–≤–∞—Ç—å –≤ –ú–ï/Q1/Q3 –∏ p/q

# ==== 3) –ü–û–ò–°–ö –ú–ê–†–ö–ï–†–û–í –ü–£–õ–¨–ú–û–ü–†–û–§–ò–õ–Ø ====
year_re = re.compile(r"_(\d{4})$")
marker_cols = [c for c in df.columns if year_re.search(str(c))]
if not marker_cols:
    raise RuntimeError("–ù–µ –Ω–∞—à—ë–ª —Å—Ç–æ–ª–±—Ü–æ–≤ –≤–∏–¥–∞ '<–ø–æ–∫–∞–∑–∞—Ç–µ–ª—å>_–ì–û–î'.")

all_markers = sorted(set(re.sub(year_re, "", c) for c in marker_cols))
years = sorted(set(int(year_re.search(c).group(1)) for c in marker_cols))

# –ù–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏—è –¥–ª—è —Ä–æ–±–∫–∏—Ö —Å–æ–ø–æ—Å—Ç–∞–≤–ª–µ–Ω–∏–π (–ª–∞—Ç–∏–Ω–∏—Ü–∞‚Üî–∫–∏—Ä–∏–ª–ª–∏—Ü–∞ –∏ —É–¥–∞–ª–µ–Ω–∏–µ '_'/' ')
def norm_text(s: str) -> str:
    mapping = {
        "–°":"C","—Å":"c","–û":"O","–æ":"o","–¢":"T","—Ç":"t","–ú":"M","–º":"m",
        "–•":"X","—Ö":"x","–í":"B","–≤":"b","–†":"P","—Ä":"p","–ù":"H","–Ω":"h",
        "–ö":"K","–∫":"k","–ê":"A","–∞":"a","–ï":"E","–µ":"e","–£":"Y","—É":"y",
        "–Å":"E","—ë":"e","–ô":"I","–π":"i","–õ":"L","–ª":"l","–î":"D","–¥":"d"
    }
    s2 = "".join(mapping.get(ch, ch) for ch in str(s))
    s2 = s2.lower().replace("_", "").replace(" ", "")
    return s2

# –ö–ª—é—á–∏ –ø—É–ª—å–º–æ–ø—Ä–æ—Ñ–∏–ª—è (–ø–æ—Å–ª–µ norm_text)
pulmo_keys = [
    # 6-–º–∏–Ω—É—Ç–Ω–∞—è —Ö–æ–¥—å–±–∞, –ß–î (–¥–æ/–ø–æ—Å–ª–µ), —Ä–∞–∑–Ω–∏—Ü–∞
    "–¥–∏—Å—Ç–∞–Ω—Ü–∏—è—Ç6–º—Ö","t6–º—Ö","t6mx","—Ç6–º—Ö",
    "—á–¥–≤–º–∏–Ω—É—Ç—É","—Ä–∞–∑–Ω–∏—Ü–∞—á–¥–¥—Ç6–º—Ö",
    # SaO2 –∏ –ë–æ—Ä–≥
    "sao2pct","sao2","sa–æ2pct","—à–∫–∞–ª–∞–±–æ—Ä–≥–∞",
    # —Å–ø–∏—Ä–æ–º–µ—Ç—Ä–∏—è
    "—Ñ–∂–µ–ª","–æ—Ñ–≤1","o—Ñ–≤1","ofv1",
    # –≥–∞–∑—ã –∏ –ª–∞–∫—Ç–∞—Ç –≤—ã–¥–æ—Ö–∞/–∫—Ä–æ–≤–∏
    "co2–≤–≤—ã–¥—ã—Ö–∞–µ–º–æ–º–≤–æ–∑–¥—É—Ö–µ","ph","po2","pco2","lac",
    # –ö–¢ –∏ –æ–±—ä—ë–º—ã
    "–æ–±—ä–µ–º–ª–µ–≥–∫–∏—Ö","–æ–±—å–µ–º–ª–µ–≥–∫–∏—Ö",
    "–¥–µ–Ω—Å–∏—Ç–æ–º–µ—Ç—Ä–∏—á–µ—Åkie–ø–æ–∫–∞–∑–∞—Ç–µ–ª–∏","–¥–µ–Ω—Å–∏—Ç–æ–º–µ—Ç—Ä–∏—á–µ—Å–∫–∏–µ–ø–æ–∫–∞–∑–∞—Ç–µ–ª–∏","–¥–µ–Ω—Å–∏—Ç–æ–º–µ—Ç—Ä–∏—á–µ—Å–∫–∏–µ–ø–æ–∫–∞–∑–∞—Ç–µ–ª–∏",
    "–ø–ª–æ—Ç–Ω–æ—Å—Ç—å—Ñ–∏–±—Ä–æ–∑–∞","–ø–ª–æ—Ç–Ω–æ—Å—Ç—å—Ñ–∏–±—Ä–æ–∑–∞"
]

pulmo_markers = [m for m in all_markers if any(k in norm_text(m) for k in pulmo_keys)]
if not pulmo_markers:  # –Ω–∞ –≤—Å—è–∫–∏–π —Å–ª—É—á–∞–π ‚Äî –µ—Å–ª–∏ —Å–ø–∏—Å–æ–∫ –ø—É—Å—Ç–æ–π, –±–µ—Ä—ë–º –≤—Å–µ
    pulmo_markers = all_markers.copy()

# ==== 4) LONG-–§–û–†–ú–ê–¢ ====
rows_long = []
for m in pulmo_markers:
    for y in years:
        col = f"{m}_{y}"
        if col in df.columns:
            vals = pd.to_numeric(df[col], errors="coerce")
            rows_long.append(pd.DataFrame({
                "marker": m,
                "year": y,
                "group": df[group_col].map(GROUP_LABELS).fillna(df[group_col].astype(str)),
                "value": vals
            }))
long_df = pd.concat(rows_long, ignore_index=True)

# ==== 5) –í–°–ü–û–ú–û–ì–ê–¢–ï–õ–¨–ù–´–ï –§–£–ù–ö–¶–ò–ò ====
def med_q1_q3(x):
    x = pd.to_numeric(pd.Series(x), errors="coerce").dropna()
    if len(x) == 0:
        return np.nan, np.nan, np.nan, 0
    return (float(np.nanmedian(x)),
            float(np.nanpercentile(x, 25)),
            float(np.nanpercentile(x, 75)),
            int(x.notna().sum()))

def fmt_num(v):
    return "NA" if pd.isna(v) else f"{v:.{DECIMALS}g}"

def fmt_stat(med, q1, q3, n):
    return f"{fmt_num(med)} [{fmt_num(q1)}; {fmt_num(q3)}] (n={n})"

# ==== 6) –ú–ï–ñ–ì–†–£–ü–ü–û–í–´–ï P –ü–û –ì–û–î–ê–ú (Mann‚ÄìWhitney) + FDR ====
between_rows = []
for m in pulmo_markers:
    for y in years:
        sub = long_df[(long_df["marker"] == m) & (long_df["year"] == y)]
        v_ild  = sub[sub["group"] == "–ò–ó–õ"]["value"].dropna().values
        v_copd = sub[sub["group"] == "–•–û–ë–õ"]["value"].dropna().values
        if len(v_ild) > 0 and len(v_copd) > 0:
            try:
                _, p = mannwhitneyu(v_ild, v_copd, alternative="two-sided")
            except ValueError:
                p = np.nan
        else:
            p = np.nan
        between_rows.append({"marker": m, "year": y, "p_between": p})

between_df = pd.DataFrame(between_rows)
if not between_df.empty:
    mask = between_df["p_between"].notna()
    q = np.full(len(between_df), np.nan)
    if mask.any():
        _, qvals, _, _ = multipletests(between_df.loc[mask, "p_between"], method="fdr_bh")
        q[mask.values] = qvals
    between_df["q_between"] = q

# ==== 7) –í–ù–£–¢–†–ò–ì–†–£–ü–ü–û–í–´–ï P (Wilcoxon) –¢–û–õ–¨–ö–û –°–û–°–ï–î–ù–ò–ï –ì–û–î–´ + FDR ====
year_pairs = {years[i+1]: years[i] for i in range(len(years)-1)}  # {2024:2023, 2025:2024, ...}
within_rows = []
for m in pulmo_markers:
    for g_code, g_name in GROUP_LABELS.items():
        sub_idx = df[group_col] == g_code
        wide = {}
        for y in years:
            col = f"{m}_{y}"
            if col in df.columns:
                wide[y] = pd.to_numeric(df.loc[sub_idx, col], errors="coerce").reset_index(drop=True)
        if not wide:
            continue
        wide_df = pd.DataFrame(wide)
        for y2, y1 in year_pairs.items():
            if y1 in wide_df and y2 in wide_df:
                v1, v2 = wide_df[y1], wide_df[y2]
                mask = v1.notna() & v2.notna()
                if mask.sum() > 1:
                    try:
                        _, p = wilcoxon(v1[mask], v2[mask], alternative="two-sided", zero_method="wilcox")
                    except ValueError:
                        p = np.nan
                else:
                    p = np.nan
                within_rows.append({"marker": m, "group": g_name, "year": y2, "prev_year": y1, "p_within": p})

within_df = pd.DataFrame(within_rows)
if not within_df.empty:
    mask = within_df["p_within"].notna()
    q = np.full(len(within_df), np.nan)
    if mask.any():
        _, qvals, _, _ = multipletests(within_df.loc[mask, "p_within"], method="fdr_bh")
        q[mask.values] = qvals
    within_df["q_within"] = q

# ==== 8) –°–ë–û–†–ö–ê ¬´–°–¢–†–ê–ù–ò–¶–´¬ª –ü–£–õ–¨–ú–û–ü–†–û–§–ò–õ–Ø ====
label_map = {y: (["a)","b)","c)","d)"][i] if i < 4 else f"{i+1})") for i, y in enumerate(years)}

rows = []
for m in pulmo_markers:
    for y in years:
        sub = long_df[(long_df["marker"] == m) & (long_df["year"] == y)]
        med_i, q1_i, q3_i, n_i = med_q1_q3(sub[sub["group"] == "–ò–ó–õ"]["value"])
        med_c, q1_c, q3_c, n_c = med_q1_q3(sub[sub["group"] == "–•–û–ë–õ"]["value"])

        rb = between_df[(between_df["marker"] == m) & (between_df["year"] == y)]
        p_b = rb["p_between"].iloc[0] if not rb.empty else np.nan
        q_b = rb["q_between"].iloc[0] if not rb.empty else np.nan

        rw_i = within_df[(within_df["marker"] == m) & (within_df["group"] == "–ò–ó–õ") & (within_df["year"] == y)]
        p_w_i = rw_i["p_within"].iloc[0] if not rw_i.empty else np.nan
        q_w_i = rw_i["q_within"].iloc[0] if not rw_i.empty else np.nan

        rw_c = within_df[(within_df["marker"] == m) & (within_df["group"] == "–•–û–ë–õ") & (within_df["year"] == y)]
        p_w_c = rw_c["p_within"].iloc[0] if not rw_c.empty else np.nan
        q_w_c = rw_c["q_within"].iloc[0] if not rw_c.empty else np.nan

        rows.append({
            "Variables": m,
            "—Å—Ç—Ä–æ–∫–∞": f"{label_map[y]} {y} –≥–æ–¥",
            "–ú–ï [Q1;Q3] –ò–ó–õ": fmt_stat(med_i, q1_i, q3_i, n_i),
            "–† –≤–Ω—É—Ç—Ä–∏–≥—Ä—É–ø–ø–æ–≤–æ–π (–ò–ó–õ)": (np.nan if y == years[0] else p_w_i),
            "–† —Å–∫–æ—Ä—Ä–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã–π (–ò–ó–õ)": (np.nan if y == years[0] else q_w_i),
            "–ú–ï [Q1;Q3] –•–û–ë–õ": fmt_stat(med_c, q1_c, q3_c, n_c),
            "–† –≤–Ω—É—Ç—Ä–∏–≥—Ä—É–ø–ø–æ–≤–æ–π (–•–û–ë–õ)": (np.nan if y == years[0] else p_w_c),
            "–† —Å–∫–æ—Ä—Ä–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã–π (–•–û–ë–õ)": (np.nan if y == years[0] else q_w_c),
            "–† –º–µ–∂–≥—Ä—É–ø–ø–æ–≤–æ–π": p_b,
            "–† —Å–∫–æ—Ä—Ä–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã–π (–º–µ–∂–≥—Ä—É–ø–ø–æ–≤–æ–π)": q_b
        })

pulmo_page = pd.DataFrame(rows)
pulmo_page["year"] = pulmo_page["—Å—Ç—Ä–æ–∫–∞"].str.extract(r"(\d{4})").astype(int)
pulmo_page = pulmo_page.sort_values(["Variables", "year"]).drop(columns=["year"]).reset_index(drop=True)

# ==== 9) –°–û–•–†–ê–ù–ï–ù–ò–ï ====
out_xlsx = "pulmo_profile_page.xlsx"
with pd.ExcelWriter(out_xlsx, engine="openpyxl") as wr:
    pulmo_page.to_excel(wr, sheet_name="pulmo_profile", index=False)
    within_df.to_excel(wr, sheet_name="within_raw", index=False)
    between_df.to_excel(wr, sheet_name="between_raw", index=False)

print("Saved:", out_xlsx)
try:
    from google.colab import files  # type: ignore
    files.download(out_xlsx)
except Exception:
    pass


Saving —Ñ–∞–π–ª –¥–ª—è —Ä–∞—Å—Å—á–µ—Ç–∞.xlsx to —Ñ–∞–π–ª –¥–ª—è —Ä–∞—Å—Å—á–µ—Ç–∞.xlsx
Saved: pulmo_profile_page.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [42]:
# -*- coding: utf-8 -*-
# Pulmo profile ‚Üí formatted Excel + Word (—Å —Ä–∞—Å—à–∏—Ä–µ–Ω–Ω—ã–º –ø–æ–¥–±–æ—Ä–æ–º –ø–æ–∫–∞–∑–∞—Ç–µ–ª–µ–π)
!pip -q install pandas scipy statsmodels openpyxl python-docx
out_xlsx = "pulmo_profile_formatted.xlsx"
with pd.ExcelWriter(out_xlsx, engine="openpyxl") as writer:
    pulmo_page.to_excel(writer, sheet_name="pulmo_profile", index=False)
    within_df.to_excel(writer, sheet_name="within_raw", index=False)
    between_df.to_excel(writer, sheet_name="between_raw", index=False)

import re, numpy as np, pandas as pd
from scipy.stats import mannwhitneyu, wilcoxon
from statsmodels.stats.multitest import multipletests
from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from openpyxl.formatting.rule import CellIsRule
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.shared import Pt
from docx.oxml.ns import qn

# ===== 1) –ó–∞–≥—Ä—É–∑–∫–∞ =====
try:
    from google.colab import files  # type: ignore
    uploaded = files.upload()       # –≤—ã–±–µ—Ä–∏—Ç–µ .xlsx
    infile = next(iter(uploaded.keys()))
except Exception:
    infile = "/content/—Ñ–∞–π–ª_–¥–ª—è_—Ä–∞—Å—á–µ—Ç–∞.xlsx"

df = pd.read_excel(infile)

# ===== 2) –ù–∞—Å—Ç—Ä–æ–π–∫–∏ =====
GROUP_COL_CANDS = ["–¥–∏–∞–≥–Ω–æ–∑", "–¥–∏–∞–≥–Ωo–∑", "group", "–ì—Ä—É–ø–ø–∞", "–≥—Ä—É–ø–ø–∞"]
group_col = next((c for c in GROUP_COL_CANDS if c in df.columns), None)
if group_col is None:
    raise RuntimeError("–î–æ–±–∞–≤—å—Ç–µ —Å—Ç–æ–ª–±–µ—Ü '–¥–∏–∞–≥–Ω–æ–∑' (1=–ò–ó–õ, 2=–•–û–ë–õ).")

GROUP_LABELS = {1: "–ò–ó–õ", 2: "–•–û–ë–õ"}
DEC = 3

year_re = re.compile(r"_(\d{4})$")
marker_cols = [c for c in df.columns if year_re.search(str(c))]
if not marker_cols:
    raise RuntimeError("–ù–µ—Ç —Å—Ç–æ–ª–±—Ü–æ–≤ –≤–∏–¥–∞ '<–ø–æ–∫–∞–∑–∞—Ç–µ–ª—å>_–ì–û–î'.")

all_markers = sorted(set(re.sub(year_re, "", c) for c in marker_cols))
years = sorted(set(int(year_re.search(c).group(1)) for c in marker_cols))

# –ù–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏—è: –ø—Ä–∏–≤–æ–¥–∏–º –∫ –Ω–∏–∂–Ω–µ–º—É —Ä–µ–≥–∏—Å—Ç—Ä—É, —É–±–∏—Ä–∞–µ–º _ –∏ –ø—Ä–æ–±–µ–ª—ã,
# –∑–∞–º–µ–Ω—è–µ–º –∫–∏—Ä–∏–ª–ª–∏—á–µ—Å–∫–∏–µ ¬´–ø–æ—Ö–æ–∂–∏–µ¬ª –Ω–∞ –ª–∞—Ç–∏–Ω–∏—Ü—É (–∏ –Ω–∞–æ–±–æ—Ä–æ—Ç –Ω–µ —Ç—Ä–µ–±—É–µ—Ç—Å—è).
def norm_text(s: str) -> str:
    m = {"–°":"C","—Å":"c","–û":"O","–æ":"o","–¢":"T","—Ç":"t","–ú":"M","–º":"m",
         "–•":"X","—Ö":"x","–í":"B","–≤":"b","–†":"P","—Ä":"p","–ù":"H","–Ω":"h",
         "–ö":"K","–∫":"k","–ê":"A","–∞":"a","–ï":"E","–µ":"e","–£":"Y","—É":"y",
         "–Å":"E","—ë":"e","–ô":"I","–π":"i","–õ":"L","–ª":"l","–î":"D","–¥":"d"}
    s2 = "".join(m.get(ch, ch) for ch in str(s))
    return s2.lower().replace("_", "").replace(" ", "")

# ---- –†–ê–°–®–ò–†–ï–ù–ù–´–ô —Å–ª–æ–≤–∞—Ä—å –∫–ª—é—á–µ–π –ø—É–ª—å–º–æ–ø—Ä–æ—Ñ–∏–ª—è (–≤ ¬´—Å—ã—Ä–æ–º¬ª –≤–∏–¥–µ) ----
# –í–ê–ñ–ù–û: –º—ã –Ω–æ—Ä–º–∞–ª–∏–∑—É–µ–º –∫–ª—é—á–∏ —Ç–æ–π –∂–µ —Ñ—É–Ω–∫—Ü–∏–µ–π –∏ —Å—Ä–∞–≤–Ω–∏–≤–∞–µ–º —É–∂–µ –Ω–æ—Ä–º–∞–ª–∏–∑–æ–≤–∞–Ω–Ω—ã–µ —Å—Ç—Ä–æ–∫–∏.
pulmo_keys_raw = [
    # 6MWT, –ß–î, —Ä–∞–∑–Ω–∏—Ü–∞
    "–¥–∏—Å—Ç–∞–Ω—Ü–∏—è —Ç6–º—Ö", "t6–º—Ö", "t6mx", "—Ç6–º—Ö",
    "—á–¥ –≤ –º–∏–Ω—É—Ç—É", "—Ä–∞–∑–Ω–∏—Ü–∞ —á–¥–¥ —Ç6–º—Ö",
    # SaO2, –ë–æ—Ä–≥, —Å–ø–∏—Ä–æ–º–µ—Ç—Ä–∏—è
    "sao2 pct", "sao2", "—à–∫–∞–ª–∞ –±–æ—Ä–≥–∞",
    "—Ñ–∂–µ–ª", "–æ—Ñ–≤1", "ofv1",
    # –≥–∞–∑—ã –∏ –ª–∞–∫—Ç–∞—Ç
    "co2 –≤ –≤—ã–¥—ã—Ö–∞–µ–º–æ–º –≤–æ–∑–¥—É—Ö–µ", "ph", "po2", "pco2", "lac",
    # >>> –ü–†–û–ë–õ–ï–ú–ù–´–ï –ò–ó –°–û–û–ë–©–ï–ù–ò–Ø:
    # –æ–±—ä–µ–º –ª—ë–≥–∫–∏—Ö ‚Äî —É—á–∏—Ç—ã–≤–∞–µ–º ¬´–æ–±–™–µ–º/–æ–±–ï–º¬ª, a/–æ –∏ —Ç. –ø.
    "–æ–±—ä–µ–º –ª–µ–≥–∫–∏—Ö", "–æ–±—å–µ–º –ª–µ–≥–∫–∏—Ö", "–æ–±–µ–º –ª–µ–≥–∫–∏—Ö",
    # –¥–µ–Ω—Å–∏—Ç–æ–º–µ—Ç—Ä–∏—á–µ—Å–∫–∏–µ –ø–æ–∫–∞–∑–∞—Ç–µ–ª–∏ ‚Äî –ª—é–±—ã–µ –≤–∞—Ä–∏–∞—Ü–∏–∏ ¬´–¥–µ–Ω—Å–∏—Ç–æ‚Ä¶ –ø–æ–∫–∞–∑–∞—Ç‚Ä¶¬ª
    "–¥–µ–Ω—Å–∏—Ç–æ–º–µ—Ç—Ä–∏—á–µ—Å–∫–∏–µ –ø–æ–∫–∞–∑–∞—Ç–µ–ª–∏", "–¥–µ–Ω—Å–∏—Ç–æ–º–µ—Ç—Ä–∏—á–µc–∫–∏–µ –ø–æ–∫–∞–∑–∞—Ç–µ–ª–∏",
    "–¥–µ–Ω—Å–∏—Ç–æ–º–µ—Ç—Ä–∏—á–µ—Åkie –ø–æ–∫–∞–∑–∞—Ç–µ–ª–∏", "–¥–µ–Ω—Å–∏—Ç–æ",
    # –ø–ª–æ—Ç–Ω–æ—Å—Ç—å —Ñ–∏–±—Ä–æ–∑–∞ / –æ–±—â–∞—è ¬´–ø–ª–æ—Ç–Ω–æ—Å—Ç—å¬ª
    "–ø–ª–æ—Ç–Ω–æ—Å—Ç—å —Ñ–∏–±—Ä–æ–∑–∞", "–ø–ª–æ—Ç–Ω–æ—Å—Ç—å"
]

# –ù–æ—Ä–º–∞–ª–∏–∑—É–µ–º –∫–ª—é—á–∏ –∏ –∏–º–µ–Ω–∞ –º–∞—Ä–∫–µ—Ä–æ–≤, –ø–æ—Ç–æ–º —Å—Ä–∞–≤–Ω–∏–≤–∞–µ–º –ø–æ–¥—Å—Ç—Ä–æ–∫–∏
keys_norm = [norm_text(k) for k in pulmo_keys_raw]
def is_pulmo_marker(name: str) -> bool:
    n = norm_text(name)
    return any(kn in n for kn in keys_norm)

pulmo_markers = [m for m in all_markers if is_pulmo_marker(m)]
# –ï—Å–ª–∏ –≤–¥—Ä—É–≥ –ø—É—Å—Ç–æ ‚Äî –±–µ—Ä—ë–º –≤—Å—ë (–Ω–æ –æ–±—ã—á–Ω–æ —ç—Ç–æ –Ω–µ –ø–æ–Ω–∞–¥–æ–±–∏—Ç—Å—è)
if not pulmo_markers:
    pulmo_markers = all_markers.copy()

# ===== 3) LONG-—Ç–∞–±–ª–∏—Ü–∞ =====
rows = []
for m in pulmo_markers:
    for y in years:
        col = f"{m}_{y}"
        if col in df.columns:
            vals = pd.to_numeric(df[col], errors="coerce")
            rows.append(pd.DataFrame({
                "marker": m, "year": y,
                "group": df[group_col].map(GROUP_LABELS).fillna(df[group_col].astype(str)),
                "value": vals
            }))
long_df = pd.concat(rows, ignore_index=True)

def med_q1_q3(x):
    x = pd.to_numeric(pd.Series(x), errors="coerce").dropna()
    if len(x)==0: return np.nan, np.nan, np.nan, 0
    return float(np.nanmedian(x)), float(np.nanpercentile(x,25)), float(np.nanpercentile(x,75)), int(x.notna().sum())

def fnum(v):  return "NA" if pd.isna(v) else f"{v:.{DEC}g}"
def fstat(m,q1,q3,n,comma=False):
    t = (lambda s: s.replace(".", ",")) if comma else (lambda s: s)
    return f"{t(fnum(m))} [{t(fnum(q1))}; {t(fnum(q3))}] (n={n})"

# ===== 4) P-values =====
# –ú–µ–∂–≥—Ä—É–ø–ø–æ–≤—ã–µ (–ø–æ –≥–æ–¥–∞–º)
bt = []
for m in pulmo_markers:
    for y in years:
        sub = long_df[(long_df.marker==m)&(long_df.year==y)]
        v1 = sub[sub.group=="–ò–ó–õ"].value.dropna().values
        v2 = sub[sub.group=="–•–û–ë–õ"].value.dropna().values
        p = mannwhitneyu(v1, v2, alternative="two-sided").pvalue if len(v1)>0 and len(v2)>0 else np.nan
        bt.append({"marker":m,"year":y,"p_between":p})
between_df = pd.DataFrame(bt)
if not between_df.empty:
    ms = between_df.p_between.notna()
    q = np.full(len(between_df), np.nan)
    if ms.any(): q[ms.values] = multipletests(between_df.loc[ms,"p_between"], method="fdr_bh")[1]
    between_df["q_between"] = q

# –í–Ω—É—Ç—Ä–∏–≥—Ä—É–ø–ø–æ–≤—ã–µ (—Ç–æ–ª—å–∫–æ —Å–æ—Å–µ–¥–Ω–∏–µ –≥–æ–¥—ã)
pairs = {years[i+1]: years[i] for i in range(len(years)-1)}
wt = []
for m in pulmo_markers:
    for code,name in GROUP_LABELS.items():
        sub_idx = df[group_col]==code
        wide = {y: pd.to_numeric(df.loc[sub_idx, f"{m}_{y}"], errors="coerce")
                for y in years if f"{m}_{y}" in df.columns}
        if not wide: continue
        wide = pd.DataFrame(wide)
        for y2,y1 in pairs.items():
            if y2 in wide and y1 in wide:
                a,b = wide[y1],wide[y2]
                mask = a.notna() & b.notna()
                p = wilcoxon(a[mask], b[mask], zero_method="wilcox").pvalue if mask.sum()>1 else np.nan
                wt.append({"marker":m,"group":name,"year":y2,"prev_year":y1,"p_within":p})
within_df = pd.DataFrame(wt)
if not within_df.empty:
    ms = within_df.p_within.notna()
    q = np.full(len(within_df), np.nan)
    if ms.any(): q[ms.values] = multipletests(within_df.loc[ms,"p_within"], method="fdr_bh")[1]
    within_df["q_within"] = q

# ===== 5) –°–±–æ—Ä–∫–∞ ¬´—Å—Ç—Ä–∞–Ω–∏—Ü—ã¬ª =====
label = {y:(["a)","b)","c)","d)"][i] if i<4 else f"{i+1})") for i,y in enumerate(years)}
rows = []
for m in pulmo_markers:
    for y in years:
        sub = long_df[(long_df.marker==m)&(long_df.year==y)]
        mi,q1i,q3i,ni = med_q1_q3(sub[sub.group=="–ò–ó–õ"].value)
        mc,q1c,q3c,nc = med_q1_q3(sub[sub.group=="–•–û–ë–õ"].value)
        rb = between_df[(between_df.marker==m)&(between_df.year==y)]
        pb,qb = (rb.p_between.item(), rb.q_between.item()) if not rb.empty else (np.nan,np.nan)
        rwi = within_df[(within_df.marker==m)&(within_df.group=="–ò–ó–õ")&(within_df.year==y)]
        pwi,qwi = (rwi.p_within.item(), rwi.q_within.item()) if not rwi.empty else (np.nan,np.nan)
        rwc = within_df[(within_df.marker==m)&(within_df.group=="–•–û–ë–õ")&(within_df.year==y)]
        pwc,qwc = (rwc.p_within.item(), rwc.q_within.item()) if not rwc.empty else (np.nan,np.nan)
        rows.append({
            "Variables": m,
            "—Å—Ç—Ä–æ–∫–∞": f"{label[y]} {y} –≥–æ–¥",
            "–ú–ï [Q1;Q3] –ò–ó–õ": fstat(mi,q1i,q3i,ni),
            "–† –≤–Ω—É—Ç—Ä–∏–≥—Ä—É–ø–ø–æ–≤–æ–π (–ò–ó–õ)": (np.nan if y==years[0] else pwi),
            "–† —Å–∫–æ—Ä—Ä–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã–π (–ò–ó–õ)": (np.nan if y==years[0] else qwi),
            "–ú–ï [Q1;Q3] –•–û–ë–õ": fstat(mc,q1c,q3c,nc),
            "–† –≤–Ω—É—Ç—Ä–∏–≥—Ä—É–ø–ø–æ–≤–æ–π (–•–û–ë–õ)": (np.nan if y==years[0] else pwc),
            "–† —Å–∫–æ—Ä—Ä–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã–π (–•–û–ë–õ)": (np.nan if y==years[0] else qwc),
            "–† –º–µ–∂–≥—Ä—É–ø–ø–æ–≤–æ–π": pb,
            "–† —Å–∫–æ—Ä—Ä–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã–π (–º–µ–∂–≥—Ä—É–ø–ø–æ–≤–æ–π)": qb
        })
pulmo_page = pd.DataFrame(rows)
pulmo_page["year"] = pulmo_page["—Å—Ç—Ä–æ–∫–∞"].str.extract(r"(\d{4})").astype(int)
pulmo_page = pulmo_page.sort_values(["Variables","year"]).drop(columns=["year"]).reset_index(drop=True)

# ===== 6) Excel (–æ—Ñ–æ—Ä–º–ª–µ–Ω–∏–µ) =====
out_xlsx = "pulmo_profile_formatted.xlsx"
out_xlsx = "pulmo_profile_formatted.xlsx"
with pd.ExcelWriter(out_xlsx, engine="openpyxl") as writer:
    pulmo_page.to_excel(writer, sheet_name="pulmo_profile", index=False)
    within_df.to_excel(writer, sheet_name="within_raw", index=False)
    between_df.to_excel(writer, sheet_name="between_raw", index=False)

wb = load_workbook(out_xlsx); ws = wb["pulmo_profile"]
header_font = Font(bold=True, color="FFFFFF")
header_fill = PatternFill("solid", fgColor="4F81BD")
center = Alignment(horizontal="center", vertical="center", wrap_text=True)
left   = Alignment(horizontal="left",   vertical="center", wrap_text=True)
thin   = Side(border_style="thin", color="D9D9D9")
border = Border(left=thin, right=thin, top=thin, bottom=thin)
widths = {1:34,2:12,3:22,4:14,5:16,6:22,7:14,8:16,9:14,10:16}
for i,w in widths.items(): ws.column_dimensions[chr(ord('A')+i-1)].width = w
for c in ws[1]: c.font=header_font; c.fill=header_fill; c.alignment=center; c.border=border
ws.freeze_panes="A2"
for row in ws.iter_rows(min_row=2, max_row=ws.max_row, min_col=1, max_col=ws.max_column):
    for j,cell in enumerate(row, start=1):
        cell.alignment = left if j in (1,2,3,6) else center
        cell.border = border
for col in [4,5,7,8,9,10]:
    L = chr(ord('A')+col-1)
    for r in range(2, ws.max_row+1):
        cell = ws.cell(row=r, column=col)
        if isinstance(cell.value,float) and np.isnan(cell.value): cell.value=None
        cell.number_format="0.0000"
    for color,thr in [("FFF2CC",0.05),("FCE4D6",0.01),("F8CBAD",0.001)]:
        ws.conditional_formatting.add(f"{L}2:{L}{ws.max_row}",
            CellIsRule(operator='lessThan', formula=[str(thr)],
                       fill=PatternFill("solid", fgColor=color)))
wb.save(out_xlsx)

# ===== 7) Word (—Ç–∞–±–ª–∏—Ü–∞) =====
doc = Document()
style = doc.styles['Normal']; style.font.name='Times New Roman'
style._element.rPr.rFonts.set(qn('w:eastAsia'),'Times New Roman'); style.font.size=Pt(11)
p = doc.add_paragraph("–ü—É–ª—å–º–æ–Ω–æ–ª–æ–≥–∏—á–µ—Å–∫–∏–π –ø—Ä–æ—Ñ–∏–ª—å"); p.alignment = WD_ALIGN_PARAGRAPH.CENTER; p.runs[0].bold=True
n_ild, n_copd = int((df[group_col]==1).sum()), int((df[group_col]==2).sum())
doc.add_paragraph(f"–ò–ó–õ: n={n_ild};  –•–û–ë–õ: n={n_copd}")
headers = ["Variables","–ì–æ–¥","–ú–ï [Q1;Q3] –ò–ó–õ","p –≤–Ω—É—Ç—Ä–∏–≥—Ä. (–ò–ó–õ)","q (–ò–ó–õ)",
           "–ú–ï [Q1;Q3] –•–û–ë–õ","p –≤–Ω—É—Ç—Ä–∏–≥—Ä. (–•–û–ë–õ)","q (–•–û–ë–õ)","p –º–µ–∂–≥—Ä.","q –º–µ–∂–≥—Ä."]
tbl = doc.add_table(rows=1, cols=len(headers)); tbl.alignment=WD_TABLE_ALIGNMENT.CENTER; tbl.style='Table Grid'
for i,h in enumerate(headers): tbl.rows[0].cells[i].text = h
for m in pulmo_page["Variables"].unique():
    block = pulmo_page[pulmo_page["Variables"]==m]
    start = len(tbl.rows)
    for _,row in block.iterrows():
        tr = tbl.add_row().cells
        tr[0].text = m
        tr[1].text = row["—Å—Ç—Ä–æ–∫–∞"]
        tr[2].text = str(row["–ú–ï [Q1;Q3] –ò–ó–õ"]).replace(".", ",")
        tr[3].text = "" if pd.isna(row["–† –≤–Ω—É—Ç—Ä–∏–≥—Ä—É–ø–ø–æ–≤–æ–π (–ò–ó–õ)"]) else f"{row['–† –≤–Ω—É—Ç—Ä–∏–≥—Ä—É–ø–ø–æ–≤–æ–π (–ò–ó–õ)']:.4f}".replace(".", ",")
        tr[4].text = "" if pd.isna(row["–† —Å–∫–æ—Ä—Ä–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã–π (–ò–ó–õ)"]) else f"{row['–† —Å–∫–æ—Ä—Ä–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã–π (–ò–ó–õ)']:.4f}".replace(".", ",")
        tr[5].text = str(row["–ú–ï [Q1;Q3] –•–û–ë–õ"]).replace(".", ",")
        tr[6].text = "" if pd.isna(row["–† –≤–Ω—É—Ç—Ä–∏–≥—Ä—É–ø–ø–æ–≤–æ–π (–•–û–ë–õ)"]) else f"{row['–† –≤–Ω—É—Ç—Ä–∏–≥—Ä—É–ø–ø–æ–≤–æ–π (–•–û–ë–õ)']:.4f}".replace(".", ",")
        tr[7].text = "" if pd.isna(row["–† —Å–∫–æ—Ä—Ä–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã–π (–•–û–ë–õ)"]) else f"{row['–† —Å–∫–æ—Ä—Ä–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã–π (–•–û–ë–õ)']:.4f}".replace(".", ",")
        tr[8].text = "" if pd.isna(row["–† –º–µ–∂–≥—Ä—É–ø–ø–æ–≤–æ–π"]) else f"{row['–† –º–µ–∂–≥—Ä—É–ø–ø–æ–≤–æ–π']:.4f}".replace(".", ",")
        tr[9].text = "" if pd.isna(row["–† —Å–∫–æ—Ä—Ä–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã–π (–º–µ–∂–≥—Ä—É–ø–ø–æ–≤–æ–π)"]) else f"{row['–† —Å–∫–æ—Ä—Ä–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã–π (–º–µ–∂–≥—Ä—É–ø–ø–æ–≤–æ–π)']:.4f}".replace(".", ",")
    end = len(tbl.rows)-1
    if end >= start:
        tbl.cell(start,0).merge(tbl.cell(end,0)).text = m
doc.add_paragraph("–ü—Ä–∏–º–µ—á–∞–Ω–∏–µ: Wilcoxon ‚Äî —Å–æ—Å–µ–¥–Ω–∏–µ –≥–æ–¥—ã (2024 vs 2023; 2025 vs 2024); Mann‚ÄìWhitney ‚Äî –ø–æ –≥–æ–¥–∞–º; q ‚Äî FDR (BH).")
out_xlsx, out_docx = "pulmo_profile_formatted.xlsx", "pulmo_profile_table.docx"
doc.save(out_docx)

print("–ì–æ—Ç–æ–≤–æ:", out_xlsx, "–∏", out_docx)
try:
    from google.colab import files  # type: ignore
    files.download(out_xlsx); files.download(out_docx)
except Exception:
    pass


Saving pulmo_polarity_aligned –∏—Ç–æ–≥–æ–≤—ã–π —Ñ–∞–π–ª.xlsx to pulmo_polarity_aligned –∏—Ç–æ–≥–æ–≤—ã–π —Ñ–∞–π–ª (15).xlsx
–ì–æ—Ç–æ–≤–æ: pulmo_profile_formatted.xlsx –∏ pulmo_profile_table.docx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# –ù–æ–≤—ã–π —Ä–∞–∑–¥–µ–ª