In [9]:
import pandas as pd

paths = {
    "Death": r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\death_model_matrix_imputed_v1.csv",
    "Hospitalization": r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\hospitalization_model_matrix_imputed_v1.csv",
    "Severe_ADR": r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\severe_adr_model_matrix_imputed_v1.csv",
}

for name, path in paths.items():
    df = pd.read_csv(path)
    print(f"\n{name} matrix")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print(df.columns.tolist())



Death matrix
Rows: 406, Columns: 75
['patient_id', 'death_outcome', 'survival_days', 'age', 'age_group', 'gender', 'ethnicity', 'education_level', 'bmi_category', 'employment_status', 'alcohol_consumption', 'smoking_status_detail', 'tumor_type', 'molecular_alterations', 'mutations_present', 'genotipo_DPYD_type', 'surgical_intervention', 'oncology_treatment_lines_n', 'radiotherapy_status', 'hypertension', 'aortic_insufficiency', 'dyslipidemia', 'IPB', 'obesity_comorbidity', 'ischemic_heart_disease', 'atrial_fibrillation', 'copd', 'asthma', 'diabete_tipo_II', 'gastroesophageal_reflux_full', 'hypertensive_heart_disease', 'renal_insufficiency', 'depressive_syndrome', 'anemia_comorbidity', 'psychiatric_disorders', 'cardiovascular_disorders', 'gastrointestinal_disorders', 'cerebrovascular_disorders', 'adr_description', 'adr_n_tot', 'adr_n_grado1', 'adr_n_grado2', 'adr_n_grado3', 'adr_n_grado4', 'adr_n_grado5', 'white_blood_cells_range', 'red_blood_cells_range', 'hemoglobin_range', 'neutroph

In [8]:
for name, path in paths.items():
    df = pd.read_csv(path)
    print(f"\n{name} matrix – dtypes")
    print(df.dtypes.value_counts())



Death matrix – dtypes
object     41
float64    33
int64       1
Name: count, dtype: int64

Hospitalization matrix – dtypes
object     38
float64     9
Name: count, dtype: int64

Severe_ADR matrix – dtypes
object     38
float64     8
Name: count, dtype: int64


In [13]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ----------------------------
# EDIT THESE PATHS
# ----------------------------
DATASETS = {
    "Death": r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\death_model_matrix_imputed_v1.csv",
    "Hospitalization": r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\hospitalization_model_matrix_imputed_v1.csv",
    "Severe_ADR": r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\severe_adr_model_matrix_imputed_v1.csv",
}

OUT_BASE = r"C:\Users\HP\OneDrive\Desktop\Phase 2\plots"

OUTCOME_COL = {
    "Death": "death_outcome",
    "Hospitalization": "hospitalization_flag",
    "Severe_ADR": "severe_adr_flag",
}

AGE_COL = "age_group"
AGE_ORDER = ["<= 65 years", "> 65 years"]

# Controls
MAX_LEVELS_CATEGORICAL = 12
MIN_NON_MISSING = 20
NUMERIC_PLOT_MODE = "violin"   # "violin" or "hist"
DPI = 220

# Styling
sns.set_style("whitegrid")
plt.rcParams["figure.dpi"] = DPI

MISSING_PAT = re.compile(r"(missing|unknown|nan|none|null|not known|not available)", re.I)

# ----------------------------
# Helpers
# ----------------------------
def safe_filename(name: str) -> str:
    name = str(name).strip()
    name = re.sub(r"[<>:\"/\\|?*]", "_", name)
    name = re.sub(r"\s+", "_", name)
    return name

def is_missing_like(x) -> bool:
    if pd.isna(x):
        return True
    s = str(x).strip()
    if s == "":
        return True
    return bool(MISSING_PAT.search(s))

def clean_categorical(s: pd.Series) -> pd.Series:
    s2 = s.astype(str).str.strip()
    s2 = s2[~s2.apply(is_missing_like)]
    return s2

def clean_numeric(s: pd.Series) -> pd.Series:
    return pd.to_numeric(s, errors="coerce")

def coerce_binary_outcome(s: pd.Series) -> pd.Series:
    # numeric 0/1
    if pd.api.types.is_numeric_dtype(s):
        out = pd.to_numeric(s, errors="coerce")
        return out.where(out.isin([0, 1]), np.nan)

    x = s.astype(str).str.strip().str.lower()
    mapping = {
        "present / yes": 1, "absent / no": 0,
        "yes": 1, "no": 0,
        "true": 1, "false": 0,
        "1": 1, "0": 0,
    }
    return x.map(mapping)

def is_numeric_series(s: pd.Series) -> bool:
    if pd.api.types.is_numeric_dtype(s):
        return True
    tmp = pd.to_numeric(s, errors="coerce")
    # treat as numeric if at least half of non-missing can be parsed
    denom = max(1, s.notna().sum())
    return (tmp.notna().sum() / denom) >= 0.5


# ----------------------------
# Plotters (overlayed age groups)
# ----------------------------
def plot_categorical_eventrate_overlay(df, var, outcome_col, dataset_name, out_dir):
    """
    Grouped bar chart: event rate (%) by category, hue=age_group
    """
    tmp = df[[var, AGE_COL, outcome_col]].copy()
    tmp[var] = clean_categorical(tmp[var])
    tmp[outcome_col] = coerce_binary_outcome(tmp[outcome_col])

    tmp = tmp.dropna(subset=[var, AGE_COL, outcome_col]).copy()
    if len(tmp) < MIN_NON_MISSING or tmp[var].nunique() < 2:
        return False

    # limit categories to top N by frequency (keeps plots readable)
    top_levels = tmp[var].value_counts().index[:MAX_LEVELS_CATEGORICAL]
    tmp = tmp[tmp[var].isin(top_levels)].copy()

    # compute event rate
    # seaborn can do this directly with estimator=np.mean
    plt.figure(figsize=(12, 5))
    ax = sns.barplot(
        data=tmp,
        x=var,
        y=outcome_col,
        hue=AGE_COL,
        hue_order=AGE_ORDER,
        estimator=np.mean,
        errorbar=None
    )

    ax.set_ylabel("Event rate")
    ax.set_xlabel("")
    ax.set_ylim(0, 1)
    ax.set_title(f"{dataset_name}: {var} vs outcome (event rate), overlaid by age group")

    # show as %
    yticks = ax.get_yticks()
    ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])

    plt.xticks(rotation=30, ha="right")

    # legend top-left (inside)
    ax.legend(title="Age group", loc="upper left", frameon=True)

    plt.tight_layout()
    out_path = os.path.join(out_dir, f"{safe_filename(var)}_bar_eventrate_overlay.png")
    plt.savefig(out_path, dpi=DPI)
    plt.close()
    return True


def plot_numeric_overlay(df, var, outcome_col, dataset_name, out_dir, mode="violin"):
    """
    Numeric overlay:
    - violin: distribution by outcome (0/1) with hue=age_group
    - hist: overlay histograms by age_group (optionally restrict to outcome=1)
    """
    tmp = df[[var, AGE_COL, outcome_col]].copy()
    tmp[var] = clean_numeric(tmp[var])
    tmp[outcome_col] = coerce_binary_outcome(tmp[outcome_col])

    tmp = tmp.dropna(subset=[var, AGE_COL, outcome_col]).copy()
    if len(tmp) < MIN_NON_MISSING:
        return False

    tmp[outcome_col] = tmp[outcome_col].astype(int).astype(str)  # "0"/"1" for seaborn
    tmp[AGE_COL] = tmp[AGE_COL].astype(str).str.strip()
    tmp = tmp[tmp[AGE_COL].isin(AGE_ORDER)].copy()

    if mode == "hist":
        # Overlay histograms for both age groups. Split by outcome: create 2 rows (0,1)
        g = sns.FacetGrid(tmp, row=outcome_col, height=3, aspect=2, sharex=True, sharey=False)
        g.map_dataframe(
            sns.histplot, x=var, hue=AGE_COL, hue_order=AGE_ORDER,
            element="step", stat="count", common_norm=False, alpha=0.35
        )
        g.add_legend(title="Age group")
        g.fig.suptitle(f"{dataset_name}: {var} distribution, overlaid by age group (by outcome)", y=1.02)

        # legend position (top-left-ish)
        leg = g._legend
        if leg:
            leg.set_bbox_to_anchor((0.12, 0.98))
            leg._loc = 2  # upper left

        out_path = os.path.join(out_dir, f"{safe_filename(var)}_hist_overlay_by_outcome.png")
        plt.tight_layout()
        plt.savefig(out_path, dpi=DPI, bbox_inches="tight")
        plt.close()
        return True

    # default: violin
    plt.figure(figsize=(10, 5))
    ax = sns.violinplot(
        data=tmp,
        x=outcome_col,          # 0 vs 1
        y=var,
        hue=AGE_COL,
        hue_order=AGE_ORDER,
        split=False,
        cut=0,
        inner="quartile"
    )

    ax.set_xlabel("Outcome (0=No event, 1=Event)")
    ax.set_title(f"{dataset_name}: {var} by outcome, overlaid by age group")

    # legend top-left (inside)
    ax.legend(title="Age group", loc="upper left", frameon=True)

    plt.tight_layout()
    out_path = os.path.join(out_dir, f"{safe_filename(var)}_violin_overlay.png")
    plt.savefig(out_path, dpi=DPI)
    plt.close()
    return True


# ----------------------------
# Runner
# ----------------------------
def run_dataset(dataset_name, path):
    df = pd.read_csv(path)
    df.columns = [str(c).strip() for c in df.columns]

    if AGE_COL not in df.columns:
        raise ValueError(f"[{dataset_name}] missing {AGE_COL}")

    outcome_col = OUTCOME_COL[dataset_name]
    if outcome_col not in df.columns:
        raise ValueError(f"[{dataset_name}] missing outcome col: {outcome_col}")

    # age group is already clean in your case, but we still enforce allowed labels
    df[AGE_COL] = df[AGE_COL].astype(str).str.strip()
    df = df[df[AGE_COL].isin(AGE_ORDER)].copy()

    out_dir = os.path.join(OUT_BASE, dataset_name)
    os.makedirs(out_dir, exist_ok=True)

    exclude = {"patient_id", AGE_COL, outcome_col}
    predictors = [c for c in df.columns if c not in exclude]

    saved, skipped = 0, 0

    for var in predictors:
        s = df[var]
        if s.notna().sum() < MIN_NON_MISSING or s.dropna().nunique() < 2:
            skipped += 1
            continue

        try:
            if is_numeric_series(s):
                ok = plot_numeric_overlay(df, var, outcome_col, dataset_name, out_dir, mode=NUMERIC_PLOT_MODE)
            else:
                # avoid insane cardinality plots
                cats = clean_categorical(s)
                if cats.nunique() > MAX_LEVELS_CATEGORICAL * 4:
                    skipped += 1
                    continue
                ok = plot_categorical_eventrate_overlay(df, var, outcome_col, dataset_name, out_dir)

            if ok:
                saved += 1
            else:
                skipped += 1

        except Exception:
            skipped += 1

    print(f"[{dataset_name}] Saved {saved} plots to: {out_dir} (skipped: {skipped})")


for name, p in DATASETS.items():
    run_dataset(name, p)

print("Done.")


  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*

  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])


[Death] Saved 72 plots to: C:\Users\HP\OneDrive\Desktop\Phase 2\plots\Death (skipped: 0)


  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*

  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])


[Hospitalization] Saved 44 plots to: C:\Users\HP\OneDrive\Desktop\Phase 2\plots\Hospitalization (skipped: 0)


  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])
  ax.set_yticklabels([f"{int(y*

  ax.set_yticklabels([f"{int(y*100)}%" for y in yticks])


[Severe_ADR] Saved 42 plots to: C:\Users\HP\OneDrive\Desktop\Phase 2\plots\Severe_ADR (skipped: 1)
Done.
