In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import numpy as np

# -------------------------------
# CONFIGURATION
# -------------------------------

# Folder holding the cleaned workbook; the plot folder is created inside it.
DATA_DIR = Path(r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data")
INPUT_FILE = DATA_DIR / "codige_master_clean__v2.xlsx"

OUTPUT_DIR = DATA_DIR / "Phase1_Univariate_Plots"
# Create the target folder up front so the savefig calls further down can rely on it.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Non-informative values to ignore (lower-case spellings)
INVALID_VALUES = {"unknown", "missing", "nan", "none", ""}

# Plot style. set_theme() is seaborn's preferred entry point; sns.set() is only
# an alias for it that the seaborn documentation flags for possible removal.
sns.set_theme(style="whitegrid")
plt.rcParams["figure.dpi"] = 150


# -------------------------------
# LOAD DATA
# -------------------------------

# Read the workbook (pandas loads the first sheet by default) and report its size.
df = pd.read_excel(INPUT_FILE)

n_rows, n_cols = df.shape
print(f"Dataset loaded: {n_rows} rows, {n_cols} columns")


# -------------------------------
# HELPER FUNCTIONS
# -------------------------------

def clean_series(series):
    """Remove NaN and non-informative string values.

    Parameters
    ----------
    series : pd.Series
        One column of the dataset.

    Returns
    -------
    pd.Series
        ``series`` without NaN entries. For object/string columns, entries whose
        text (whitespace-stripped, case-insensitive) is listed in
        ``INVALID_VALUES`` are removed as well. The remaining values keep their
        original type; only the comparison uses the text form.
    """
    s = series.dropna()
    is_text_like = pd.api.types.is_object_dtype(s.dtype) or pd.api.types.is_string_dtype(s.dtype)
    if is_text_like:
        # Go through astype(str) instead of using .str on the raw column:
        # the .str accessor raises AttributeError when an object column holds
        # only non-string values (e.g. dates or integers).
        # Stripping lets padded markers such as " Unknown " match too.
        normalized = s.astype(str).str.strip().str.lower()
        s = s[~normalized.isin(INVALID_VALUES)]
    return s


def plot_numeric(series, colname):
    """Histogram for numeric variables.

    Draws a 30-bin count histogram of ``series`` and saves it as
    ``<colname>_hist.png`` inside ``OUTPUT_DIR``.

    Parameters
    ----------
    series : pd.Series
        Numeric values to plot (already cleaned by the caller).
    colname : str
        Column name, used for the x-axis label, the title and the file name.
    """
    import re  # local import: this cell's import block does not bring in `re`

    # Column names come straight from the workbook header. Characters that are
    # not allowed in file names (or that act as path separators) are replaced
    # so savefig cannot fail or write outside OUTPUT_DIR. Names without such
    # characters produce exactly the same file name as before.
    file_stem = re.sub(r'[<>:"/\\|?*]', "_", str(colname))

    plt.figure()
    try:
        sns.histplot(series, bins=30, kde=False)
        plt.xlabel(colname)
        plt.ylabel("Count")
        plt.title(f"Distribution of {colname}")
        plt.tight_layout()
        plt.savefig(OUTPUT_DIR / f"{file_stem}_hist.png")
    finally:
        # Always release the figure, even when drawing or saving raises;
        # the caller invokes this once per column.
        plt.close()


def plot_categorical(series, colname):
    """Bar plot for categorical variables.

    Draws one horizontal bar per distinct value of ``series`` (most frequent
    first) and saves the figure as ``<colname>_bar.png`` inside ``OUTPUT_DIR``.

    Parameters
    ----------
    series : pd.Series
        Categorical values to count (already cleaned by the caller).
    colname : str
        Column name, used for the y-axis label, the title and the file name.
    """
    import re  # local import: this cell's import block does not bring in `re`

    counts = series.value_counts().sort_values(ascending=False)

    # Column names come straight from the workbook header. Characters that are
    # not allowed in file names (or that act as path separators) are replaced
    # so savefig cannot fail or write outside OUTPUT_DIR. Names without such
    # characters produce exactly the same file name as before.
    file_stem = re.sub(r'[<>:"/\\|?*]', "_", str(colname))

    # Figure height grows with the number of categories, never below 4 inches.
    plt.figure(figsize=(8, max(4, len(counts) * 0.3)))
    try:
        sns.barplot(x=counts.values, y=counts.index)
        plt.xlabel("Count")
        plt.ylabel(colname)
        plt.title(f"Distribution of {colname}")
        plt.tight_layout()
        plt.savefig(OUTPUT_DIR / f"{file_stem}_bar.png")
    finally:
        # Always release the figure, even when drawing or saving raises;
        # the caller invokes this once per column.
        plt.close()


# -------------------------------
# UNIVARIATE PLOTTING LOOP
# -------------------------------

# One plot per column: histogram for numeric data, bar chart for everything else.
for col in df.columns:
    print(f"Processing: {col}")

    series = clean_series(df[col])

    # A column that is empty or constant after cleaning has nothing to show.
    has_usable_data = (not series.empty) and series.nunique() >= 2
    if not has_usable_data:
        print("  Skipped (no usable data)")
        continue

    # Pick the plotting routine from the dtype of the cleaned values.
    if pd.api.types.is_numeric_dtype(series):
        plotter = plot_numeric
    else:
        plotter = plot_categorical
    plotter(series, col)

print("Phase 1 univariate plots completed.")
print(f"Plots saved to: {OUTPUT_DIR}")


Dataset loaded: 406 rows, 137 columns
Processing: patient_id
Processing: birth_date
Processing: age
Processing: age_group
Processing: gender
Processing: ethnicity
Processing: education_level
Processing: bmi_value
Processing: bmi_category
Processing: employment_status
Processing: alcohol_consumption
Processing: smoking_status_binary
Processing: smoking_status_detail
Processing: smoking_years
Processing: observation_start_date
Processing: observation_end_date
Processing: observation_end_reason
Processing: tumor_diagnosis_date
Processing: Oncology Unit Intake Date
Processing: tumor_type
Processing: breast_cancer_subtype
  Skipped (no usable data)
Processing: colon_cancer_location
Processing: stadio_TNM
Processing: tumor_stage_roman
Processing: histological_grade
Processing: molecular_alterations
Processing: mutations_present
Processing: genotipo_DPYD
Processing: genotipo_DPYD_type
Processing: surgical_intervention
Processing: surgery_date
Processing: surgery_type
Processing: surgery_type_

  plt.tight_layout()


Processing: prior_surgery
Processing: prior_radiotherapy
Processing: Number of Previous Treatment Lines
Processing: Other Surgical Intervention
Processing: surgery_complications
Processing: reoperation_for_complication
Processing: hospitalization_for_surgery_complication
Processing: oncology_treatment_lines_n
Processing: radiotherapy_status
Processing: radiotherapy_start_date
Processing: radiotherapy_end_date
Processing: transfusion_received
Processing: transfusions_total_n
Processing: hypertension
Processing: aortic_insufficiency
Processing: dyslipidemia
Processing: IPB
Processing: obesity_comorbidity
Processing: ischemic_heart_disease
Processing: atrial_fibrillation
Processing: copd
Processing: asthma
Processing: diabete_tipo_II
Processing: gastroesophageal_reflux_full
Processing: hypertensive_heart_disease
Processing: renal_insufficiency
Processing: depressive_syndrome
Processing: anemia_comorbidity
Processing: psychiatric_disorders
Processing: other_comorbidities
Processing: cardio

  plt.tight_layout()


Processing: cci_score
Processing: n_treatment_lines
Processing: total_chemo_cycles
Processing: received_chemo
Processing: treatment_duration_days
Processing: any_dose_reduction
Processing: any_toxicity
Processing: end_due_to_progression
Processing: max_combo_regimen_size
Processing: total_unique_active_drugs
Processing: time_from_diagnosis_to_first_treatment_days
Processing: received_cardiotoxic_drug
Processing: received_nephrotoxic_drug
Processing: received_neurotoxic_drug
Processing: received_hematotoxic_drug
Processing: received_targeted_therapy
Processing: n_unique_active_principles
Processing: n_chronic_drugs
  Skipped (no usable data)
Processing: polypharmacy_flag
Processing: chronic_polypharmacy_flag
  Skipped (no usable data)
Processing: n_med_events
Processing: medication_duration_days
Processing: uses_antithrombotics
Processing: uses_antidiabetics_atc
Processing: uses_renin_angiotensin_drugs
Processing: uses_diuretics
Processing: uses_lipid_lowering_drugs
Processing: uses_car

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import numpy as np
import re

# -------------------------------
# CONFIGURATION
# -------------------------------
# Folder holding the cleaned workbook; the plot folder is created inside it.
DATA_DIR = Path(r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data")
INPUT_FILE = DATA_DIR / "codige_master_clean__v2.xlsx"

OUTPUT_DIR = DATA_DIR / "Phase1_Univariate_Plots_AgeGroupsTogether"
# Create the target folder up front so the savefig calls further down can rely on it.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Column that splits the cohort, and the two labels that are compared.
# Rows carrying any other label are dropped by clean_age_group().
AGE_GROUP_COL = "age_group"
AGE_GROUPS_KEEP = ["<= 65 years", "> 65 years"]

# Lower-case spellings that count as "no information".
INVALID_VALUES = {
    "unknown",
    "missing",
    "nan",
    "none",
    "",
    "missing/unknown",
    "not known",
    "not available",
}

# Plot style. set_theme() is seaborn's preferred entry point; sns.set() is only
# an alias for it that the seaborn documentation flags for possible removal.
sns.set_theme(style="whitegrid")
plt.rcParams["figure.dpi"] = 150

# -------------------------------
# HELPERS
# -------------------------------
def safe_filename(name: str) -> str:
    """Turn an arbitrary label into text that can be used as a file name.

    The value is converted to ``str`` and stripped of surrounding whitespace.
    Each of the characters ``<``, ``>``, ``:``, ``"``, ``/``, backslash, ``|``,
    ``?`` and ``*`` is replaced by ``_``, then every run of whitespace is
    collapsed into a single ``_``. The result is cut to at most 180 characters.
    """
    text = str(name).strip()
    # Order matters only for readability here: reserved characters first,
    # whitespace runs second.
    for pattern in (r'[<>:"/\\|?*]', r"\s+"):
        text = re.sub(pattern, "_", text)
    return text[:180]

def is_missing_like(x) -> bool:
    """Tell whether a single cell value carries no usable information.

    Returns True when ``x`` is NA according to ``pd.isna``, when its stripped,
    lower-cased text form is listed in ``INVALID_VALUES``, or when that text
    contains the substring "missing" or "unknown". Returns False otherwise.
    """
    if pd.isna(x):
        return True

    text = str(x).strip().lower()
    # Exact placeholder spellings first, then the looser substring match.
    return text in INVALID_VALUES or any(
        marker in text for marker in ("missing", "unknown")
    )

def clean_series(series: pd.Series) -> pd.Series:
    """Drop NaN entries and, for object columns, missing-like placeholders.

    Non-object columns are returned with only the NaN entries removed. For
    object columns every remaining value is also tested with
    ``is_missing_like`` and removed when that test is True.
    """
    non_null = series.dropna()
    if non_null.dtype != object:
        return non_null

    placeholder_mask = non_null.apply(is_missing_like)
    return non_null[~placeholder_mask]

def clean_age_group(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` restricted to rows with a usable age group.

    The ``AGE_GROUP_COL`` column is converted to stripped text. Rows whose
    label is missing-like are removed, then only rows whose label is one of
    ``AGE_GROUPS_KEEP`` are kept. The input frame is not modified.
    """
    frame = df.copy()
    frame[AGE_GROUP_COL] = frame[AGE_GROUP_COL].astype(str).str.strip()

    # Step 1: discard placeholder labels (NaN became the text "nan" above).
    looks_missing = frame[AGE_GROUP_COL].apply(is_missing_like)
    frame = frame[~looks_missing].copy()

    # Step 2: keep only the two groups that are compared.
    in_kept_groups = frame[AGE_GROUP_COL].isin(AGE_GROUPS_KEEP)
    return frame[in_kept_groups].copy()

def is_boolean_like(series: pd.Series) -> bool:
    """Tell whether a column behaves like a yes/no flag.

    A column is boolean-like when any of the following holds:

    * its dtype is boolean;
    * after cleaning, all of its values, as stripped lower-case text, are
      among "true", "false", "t", "f", "yes" and "no";
    * after cleaning and numeric coercion, all of its values are 0 or 1.

    A column with no values left after cleaning is not boolean-like (unless
    its dtype is boolean).
    """
    # Real boolean dtype: nothing else to check.
    if pd.api.types.is_bool_dtype(series):
        return True

    # Flags stored as text.
    tokens = clean_series(series).astype(str).str.strip().str.lower()
    if tokens.empty:
        return False
    if set(tokens.unique()) <= {"true", "false", "t", "f", "yes", "no"}:
        return True

    # 0/1 indicator columns: these should get a bar chart, not a histogram.
    as_numbers = pd.to_numeric(clean_series(series), errors="coerce").dropna()
    only_zero_one = (
        (not as_numbers.empty)
        and set(as_numbers.unique()).issubset({0, 1})
        and as_numbers.nunique() <= 2
    )
    return bool(only_zero_one)

def is_numeric_for_hist(series: pd.Series) -> bool:
    """Decide whether a column should be drawn as a histogram.

    True only when the column has a numeric dtype, is not boolean-like, and
    has at least 5 distinct numeric values after cleaning.
    """
    if not pd.api.types.is_numeric_dtype(series) or is_boolean_like(series):
        return False

    values = pd.to_numeric(clean_series(series), errors="coerce").dropna()
    # With fewer than 5 distinct values a bar chart reads better.
    return (len(values) > 0) and (values.nunique() >= 5)

# -------------------------------
# LOAD DATA
# -------------------------------
df = pd.read_excel(INPUT_FILE)
# Turn every header label into stripped text so that lookups by name are reliable.
df.columns = df.columns.map(lambda label: str(label).strip())

# The whole comparison hinges on this column, so fail early if it is absent.
has_age_group = AGE_GROUP_COL in df.columns
if not has_age_group:
    raise ValueError(f"'{AGE_GROUP_COL}' not found. Columns: {df.columns.tolist()}")

# From here on `df` holds only the rows that belong to one of the kept age groups.
df = clean_age_group(df)

age_group_counts = df[AGE_GROUP_COL].value_counts().to_dict()
print("Loaded:", df.shape)
print("Age groups:", age_group_counts)
print("Saving plots to:", OUTPUT_DIR)

# -------------------------------
# PLOTTING FUNCTIONS
# -------------------------------
def plot_numeric_two_groups(df, col, out_path, bins=30):
    """Overlay the histograms of ``col`` for the two age groups and save the figure.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing ``AGE_GROUP_COL`` and ``col``.
    col : str
        Name of the numeric column to plot.
    out_path : path-like
        Where the PNG is written.
    bins : int, optional
        Bin specification handed to ``np.histogram_bin_edges`` (default 30,
        the value that was previously hard-coded).

    Returns
    -------
    bool
        True when a figure was saved. False when neither group has any numeric
        value, or when neither group has at least two distinct values.
    """

    def _group_values(group_label):
        # Values of `col` for one age group, cleaned and coerced to numbers.
        raw = clean_series(df.loc[df[AGE_GROUP_COL] == group_label, col])
        return pd.to_numeric(raw, errors="coerce").dropna()

    s1 = _group_values(AGE_GROUPS_KEEP[0])
    s2 = _group_values(AGE_GROUPS_KEEP[1])

    if s1.empty and s2.empty:
        return False
    if s1.nunique() < 2 and s2.nunique() < 2:
        return False

    # Both histograms must share the same bin edges, otherwise bar heights of
    # the two groups refer to different intervals and cannot be compared.
    # The edges are therefore computed once from the pooled values.
    all_vals = pd.concat([s1, s2], axis=0)
    bin_edges = np.histogram_bin_edges(all_vals.to_numpy(dtype=float), bins=bins)

    plt.figure(figsize=(8, 5))
    try:
        plt.hist(s1.to_numpy(dtype=float), bins=bin_edges, alpha=0.55, label=AGE_GROUPS_KEEP[0])
        plt.hist(s2.to_numpy(dtype=float), bins=bin_edges, alpha=0.55, label=AGE_GROUPS_KEEP[1])

        plt.xlabel(col)
        plt.ylabel("Count")
        plt.title(f"{col}: distribution by age group")
        plt.legend(title="Age group", frameon=False)
        plt.tight_layout()
        plt.savefig(out_path, dpi=200)
    finally:
        # Release the figure even if drawing or saving fails; this function is
        # called once per column.
        plt.close()
    return True

def plot_categorical_two_groups(df, col, out_path, top_k=30):
    """Draw grouped horizontal bars of ``col`` counts per age group and save them.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing ``AGE_GROUP_COL`` and ``col``.
    col : str
        Name of the column whose levels are counted.
    out_path : path-like
        Where the PNG is written.
    top_k : int, optional
        Only the ``top_k`` most frequent levels (both groups pooled) are drawn.

    Returns
    -------
    bool
        True when a figure was saved. False when no usable value remains after
        removing missing-like entries, or when fewer than two levels remain.
    """
    pairs = df[[AGE_GROUP_COL, col]].copy()
    pairs[col] = pairs[col].astype(object)

    pairs = pairs[~pairs[col].apply(is_missing_like)].copy()
    if pairs.empty:
        return False

    # Treat every level as a text label. Numeric or boolean flag columns
    # (e.g. 0/1) also end up here. With a numeric `count` on x and numeric
    # levels on y, seaborn's orientation inference would pick `count` as the
    # categorical axis and draw the wrong chart. Text labels also keep the
    # grouping from choking on columns that mix value types.
    pairs[col] = pairs[col].astype(str)

    ct = (
        pairs.groupby([col, AGE_GROUP_COL])
        .size()
        .reset_index(name="count")
    )

    # Rank levels by their pooled frequency and keep the most common ones.
    totals = ct.groupby(col)["count"].sum().sort_values(ascending=False)
    keep_levels = totals.head(top_k).index.tolist()
    ct = ct[ct[col].isin(keep_levels)].copy()

    if ct.empty or ct[col].nunique() < 2:
        return False

    # Figure height grows with the number of levels, never below 5 inches.
    plt.figure(figsize=(10, max(5, 0.35 * len(keep_levels))))
    try:
        sns.barplot(
            data=ct,
            x="count",
            y=col,
            hue=AGE_GROUP_COL,
            order=keep_levels,
            hue_order=AGE_GROUPS_KEEP,
            orient="h",  # explicit: levels on y, counts on x
        )

        plt.xlabel("Count")
        plt.ylabel(col)
        plt.title(f"{col}: distribution by age group")
        plt.legend(title="Age group", frameon=False, loc="best")
        plt.tight_layout()
        plt.savefig(out_path, dpi=200)
    finally:
        # Release the figure even if drawing or saving fails; this function is
        # called once per column.
        plt.close()
    return True

# -------------------------------
# LOOP THROUGH VARIABLES
# -------------------------------
skipped = []
saved = 0

for col in df.columns:
    # The grouping column itself is never plotted.
    if col == AGE_GROUP_COL:
        continue

    series = clean_series(df[col])

    ok = False
    # Only columns with at least two distinct usable values are worth plotting.
    if not series.empty and series.nunique() >= 2:
        out_path = OUTPUT_DIR / f"{safe_filename(col)}_agegroup_compare.png"

        # Histogram for genuinely numeric columns, grouped bars otherwise.
        if is_numeric_for_hist(series):
            plot_fn = plot_numeric_two_groups
        else:
            plot_fn = plot_categorical_two_groups
        ok = plot_fn(df, col, out_path)

    if ok:
        saved += 1
    else:
        skipped.append(col)

print(f"Done. Saved {saved} plots.")
print(f"Skipped {len(skipped)} variables (empty/constant/too sparse).")
print("Output folder:", OUTPUT_DIR)


Loaded: (406, 137)
Age groups: {'<= 65 years': 207, '> 65 years': 199}
Saving plots to: C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\Phase1_Univariate_Plots_AgeGroupsTogether
Done. Saved 133 plots.
Skipped 3 variables (empty/constant/too sparse).
Output folder: C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\Phase1_Univariate_Plots_AgeGroupsTogether
