In [1]:
# %% [markdown]
# 02 - STANDARDIZATION
# - Clean text tokens and unify missing markers
# - Normalize key categorical labels (e.g., gender)
# - Robust date parsing (handles strings, Excel serials, tz)
# - Soft numeric coercion for mostly-numeric object columns
# - Derive useful variables (age_at_observation_start, observation_length_days_calc, smoking_pack_index_like)
# - Drop duplicate patient_id rows (keep first)
# - Save standardized dataset

# %%
import os
import re
import pandas as pd
import numpy as np

# ------------------ CONFIG ------------------
# Workbook read by this step.
INPUT_PATH = "C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/data/processed/merged_codige_wide_english_values_translated.xlsx"
# Folder that receives this step's output; created below when it is absent.
OUT_DIR = r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\outputs\standardization"
os.makedirs(name=OUT_DIR, exist_ok=True)

# ------------------ LOAD ------------------
# No sheet is named, so pandas falls back to its default worksheet.
df = pd.read_excel(io=INPUT_PATH)

# ------------------ 1) BASIC STRING CLEANUPS ------------------
# Each text-like column is cast to str and stripped, then:
#   * textual stand-ins for "no value" become real NaN
#   * "missing / unknown" phrasings collapse to one shared label
missing_aliases = {
    "nan": np.nan, "NaN": np.nan, "NONE": np.nan, "None": np.nan, "": np.nan,
    "missing": "Missing / Not Known", "Missing": "Missing / Not Known",
    "Non noto": "Missing / Not Known", "non noto": "Missing / Not Known",
    "Non noto / Non conosciuto": "Missing / Not Known",
    "Non disponibile": "Missing / Not Known",
}

# Second pass applied after the alias table: the text astype(str) leaves behind
# for missing cells, plus cells that are empty once stripped.
_leftover_null_tokens = dict.fromkeys(["NaT", "NaN", "None", "nan", ""], np.nan)

obj_like = df.select_dtypes(include=["object", "category"]).columns.tolist()
for c in obj_like:
    # Casting to str first keeps the .str accessor usable on mixed-type columns.
    stripped = df[c].astype(str).str.strip()
    df[c] = stripped.replace(missing_aliases).replace(_leftover_null_tokens)

# ------------------ 2) KEY CATEGORICAL NORMALIZATION ------------------
# Gender: fold the known codes and spellings into "Male" / "Female".
# Values outside the table are left as they are; missing cells stay missing.
if "gender" in df.columns:
    gender_labels = {}
    for canonical, variants in (
        ("Male", ("1", "Maschio", "M", "male")),
        ("Female", ("2", "Femmina", "F", "female")),
    ):
        gender_labels.update(dict.fromkeys(variants, canonical))

    raw_gender = df["gender"]
    relabelled = raw_gender.replace(gender_labels)
    df["gender"] = relabelled.where(raw_gender.notna(), other=np.nan)

# If you need other categorical normalizations, add here:
# e.g., yes/no variants → Present/Absent
binary_alias_map = {
    "si": "Present / Yes",
    "sì": "Present / Yes",
    "yes": "Present / Yes",
    "no": "Absent / No",
}

# Lookup keyed by lower-cased text. The two canonical labels are included so
# that differently-cased spellings of them are still folded to one form.
_binary_lookup = {
    **binary_alias_map,
    "present / yes": "Present / Yes",
    "absent / no": "Absent / No",
}

for c in obj_like:
    col = df[c]
    # Only apply to small-cardinality columns (avoid damaging free text)
    if col.nunique(dropna=True) > 6:
        continue

    # The lower-cased copy is used ONLY as a lookup key. Writing it back would
    # turn NaN into the literal string "nan" and lower-case every other label
    # in the column (e.g. "Male" -> "male", "Missing / Not Known" -> "missing / not known").
    lookup_key = col.astype(str).str.strip().str.lower()
    canonical = lookup_key.map(_binary_lookup)

    # Replace just the cells that matched an alias; everything else, including
    # missing values, keeps its original content.
    df[c] = col.where(canonical.isna(), canonical)

# ------------------ 3) ROBUST DATE PARSING ------------------
def parse_to_datetime_series_strict(s: pd.Series) -> pd.Series:
    """
    Robustly coerce a series to datetime64[ns] with NaT for failures.
    - Tries parsing with dayfirst=True and dayfirst=False, then combines.
    - Converts Excel serial numbers to dates (origin 1899-12-30).
    - Normalizes any timezone-aware stamps to naive.
    """
    # First passes: strings, existing datetimes
    p1 = pd.to_datetime(s, errors="coerce", dayfirst=True)
    p2 = pd.to_datetime(s, errors="coerce", dayfirst=False)

    # Excel serials
    numeric = pd.to_numeric(s, errors="coerce")
    serial_dt = pd.Series(pd.NaT, index=s.index, dtype="datetime64[ns]")
    mask = numeric.notna()
    if mask.any():
        serial_dt.loc[mask] = pd.to_datetime("1899-12-30") + pd.to_timedelta(numeric.loc[mask], unit="D")

    parsed = p1.combine_first(p2).combine_first(serial_dt)

    # Normalize timezone (if any) to naive
    try:
        parsed = parsed.dt.tz_convert(None)
    except Exception:
        try:
            parsed = parsed.dt.tz_localize(None)
        except Exception:
            pass

    parsed = pd.to_datetime(parsed, errors="coerce")
    return parsed

# A column is treated as a date when its lower-cased name ends in "_date" or
# "_dt", or contains "date" anywhere.
date_candidates = [
    col_name
    for col_name in df.columns
    if col_name.lower().endswith(("_date", "_dt")) or "date" in col_name.lower()
]
for col_name in date_candidates:
    df[col_name] = parse_to_datetime_series_strict(df[col_name])

# ------------------ 4) SOFT NUMERIC COERCION (ROBUST) ------------------
# Text columns whose values are overwhelmingly number-like (a comma is accepted
# as decimal separator) are converted to real numbers. ID-like columns are
# never touched.

id_like_cols = {"patient_id"}  # extend if needed (e.g., {'patient_id','mrn','internal_id'})
obj_or_cat = df.select_dtypes(include=["object", "category"]).columns.tolist()
coerced_cols = []

for c in obj_or_cat:
    if c in id_like_cols:
        continue

    present = df[c].dropna()
    if present.empty:
        continue
    present_text = present.astype(str)

    # More than 20% of values containing a letter: treat as categorical text.
    share_with_letters = present_text.str.contains(r"[A-Za-z]", regex=True).mean()
    if share_with_letters > 0.20:
        continue

    # Trial conversion on the non-missing values only.
    trial = pd.to_numeric(present_text.str.replace(",", ".", regex=False), errors="coerce")
    parse_rate = trial.notna().mean()
    if parse_rate > 0.80:
        # Convert the full column; values that still fail to parse become NaN.
        dotted = df[c].astype(str).str.replace(",", ".", regex=False)
        df[c] = pd.to_numeric(dotted, errors="coerce")
        coerced_cols.append(c)

if coerced_cols:
    print("Soft-coerced to numeric:", coerced_cols)

# ------------------ 5) DERIVED VARIABLES ------------------
# Each variable is added only when all of its source columns exist.
_columns_present = set(df.columns)

# 5.1 age_at_observation_start: whole days between birth and observation
#     start, divided by 365.25 and rounded to two decimals.
if {"birth_date", "observation_start_date"} <= _columns_present:
    lived = df["observation_start_date"].sub(df["birth_date"])
    df["age_at_observation_start"] = lived.dt.days.div(365.25).round(2)

# 5.2 observation_length_days_calc: whole days from observation start to end.
if {"observation_start_date", "observation_end_date"} <= _columns_present:
    observed = df["observation_end_date"].sub(df["observation_start_date"])
    df["observation_length_days_calc"] = observed.dt.days

# 5.3 smoking_pack_index_like (simple proxy): cigarettes per day times years
#     smoked. NOTE: a missing value on either side counts as 0, so the result
#     is 0 rather than missing whenever one input is unknown.
if {"cigarettes_per_day", "smoking_years"} <= _columns_present:
    daily_cigarettes = df["cigarettes_per_day"].fillna(0)
    years_smoked = df["smoking_years"].fillna(0)
    df["smoking_pack_index_like"] = daily_cigarettes.mul(years_smoked)

# ------------------ 6) DUPLICATES (CONSERVATIVE) ------------------
# Keep the first occurrence of each patient_id (if exists). If you need aggregation, change strategy here.
# Rows whose patient_id is missing are never treated as duplicates of each
# other: pandas considers NaN keys equal, so a plain drop_duplicates would
# silently keep only the first ID-less row.
if "patient_id" in df.columns:
    before = len(df)
    repeated_id = df["patient_id"].notna() & df.duplicated(subset=["patient_id"], keep="first")
    # .copy() yields an independent frame, so later column assignments do not
    # operate on a slice of the pre-deduplication data.
    df = df.loc[~repeated_id].copy()
    after = len(df)
    if before - after > 0:
        print(f"Dropped {before - after} duplicate patient_id rows (kept first).")

# ------------------ 7) SAVE ------------------
# Destination workbook inside OUT_DIR. Only the path is built here; the file
# is written by the df.to_excel call further down.
OUT_PATH = os.path.join(OUT_DIR, "standardized_dataset.xlsx")

# ------------------ TZ STRIP + COERCE TO DATE (YYYY-MM-DD) ------------------
import pandas as pd

def strip_tz_and_to_date(s: pd.Series) -> pd.Series:
    """
    Return ``s`` as timezone-naive datetimes with the time of day removed.

    Values are coerced to datetimes first (failures become NaT), any timezone
    is dropped, and each stamp is reduced to its calendar date.
    """
    stamps = pd.to_datetime(s, errors="coerce")

    # tz_convert handles tz-aware data; when it refuses, tz_localize is tried.
    # If both refuse, the stamps are used as they are.
    for tz_remover in ("tz_convert", "tz_localize"):
        try:
            stamps = getattr(stamps.dt, tz_remover)(None)
        except Exception:
            continue
        break

    calendar_days = stamps.dt.date
    return pd.to_datetime(calendar_days, errors="coerce")

# Every column that currently has a datetime dtype is reduced to a tz-naive
# calendar date before the workbook is written.
dt_cols = [
    col_name
    for col_name, col_dtype in df.dtypes.items()
    if pd.api.types.is_datetime64_any_dtype(col_dtype)
]
for col_name in dt_cols:
    df[col_name] = strip_tz_and_to_date(df[col_name])


df.to_excel(OUT_PATH, index=False)

print("✅ Standardization complete.")
print("Saved to:", OUT_PATH)

# Optional: quick sanity snapshot of the saved frame
row_count, column_count = df.shape
print("Rows:", row_count, "| Columns:", column_count)
num_cols = df.select_dtypes(include="number").columns
print("Numeric columns:", len(num_cols))
date_column_count = len(
    [col_name for col_name in df.columns if pd.api.types.is_datetime64_any_dtype(df[col_name])]
)
print("Date columns:", date_column_count)


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
  p1 = pd.to_datetime(s, errors="coerce", dayfirst=True)
  p2 = pd.to_datetime(s, errors="coerce", dayfirst=False)
  p1 = pd.to_datetime(s, errors="coerce", dayfirst=True)
  p2 = pd.to_datetime(s, errors="coerce", dayfirst=False)
  p1 = pd.to_datetime(s, errors="coerce", dayfirst=True)
  p2 = pd.to_datetime(s, errors="coerce", dayfirst=False)
  p1 = pd.to_datetime(s, errors="coerce", dayfirst=True)
  p2 = pd.to_datetime(s, errors="coerce", dayfirst=False)
  p1 = pd.to_datetime(s, errors="coerce", dayfirst=True)
  p2 = pd.to_datetime(s, errors="coerce", dayfirst=False)
  p1 = pd.to_datetime(s, errors="coerce", dayfirst=True)
  p2 = pd.to_datetime(s, errors="coerce", dayfirst=False)
  p1 = pd.to_datetime(s, errors="coerce", dayfirst=True)
  p2 = pd.to_datetime(s, errors="coerce", dayfirst=False)
  p1 = pd.to_datetime(s, errors="coerce", dayfirst=True)
  p1 = pd.to_datetime(s, errors="coerce", dayfir

  df["age_at_observation_start"] = ((df["observation_start_date"] - df["birth_date"]).dt.days / 365.25).round(2)
  df["observation_length_days_calc"] = (df["observation_end_date"] - df["observation_start_date"]).dt.days
  df.to_excel(OUT_PATH, index=False)


✅ Standardization complete.
Saved to: C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\outputs\standardization\standardized_dataset.xlsx
Rows: 403 | Columns: 137
Numeric columns: 37
Date columns: 14
