# Data Merging

In [13]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from pandas.api.types import is_numeric_dtype

# ============== CONFIG ==============
FILES = {
    "codige": r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\raw\codige.xlsx",
    "trattamento": r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\raw\codige_trattamento_oncologico.xlsx",
    "ricoveri": r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\raw\codige_ricoveri.xlsx",
    "comorbidita": r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\raw\codige_comorbidità.xlsx",
    "adr": r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\raw\codige_adr_clean.xlsx",
}

OUTPUT_PATH = r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\interim\merged_codige_dataset.xlsx"
ID_COL_STD = "id_paziente"

# ============== HELPERS ==============
def read_excel_safe(path: str) -> pd.DataFrame:
    """
    Read the first sheet of an Excel file with basic safeguards.
    """
    p = Path(os.path.normpath(path))
    if not p.exists():
        raise FileNotFoundError(f"Missing file: {p}")
    try:
        df = pd.read_excel(p)
        return df
    except Exception as e:
        raise RuntimeError(f"Failed to read {p}: {e}")

def make_unique_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Ensure column names are unique by appending suffixes like '.1', '.2' if needed.
    """
    cols = pd.Series(df.columns, dtype="object")
    for i in range(len(cols)):
        dup_count = (cols[:i] == cols[i]).sum()
        if dup_count > 0:
            cols[i] = f"{cols[i]}.{dup_count}"
    df.columns = cols
    return df

def find_and_standardize_id(df: pd.DataFrame, id_name_std: str = ID_COL_STD) -> pd.DataFrame:
    """
    Find the ID column case-insensitively and standardize its name and type.
    """
    candidates = [c for c in df.columns if str(c).strip().lower() == id_name_std.lower()]
    if not candidates:
        raise KeyError(f"Could not find '{id_name_std}' (case-insensitive) in columns: {list(df.columns)}")
    actual = candidates[0]
    if actual != id_name_std:
        df = df.rename(columns={actual: id_name_std})
    df[id_name_std] = df[id_name_std].astype(str).str.strip()
    return df

def clean_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Trim and lowercase column names. Then make them unique.
    """
    df = df.copy()
    df.columns = df.columns.astype(str).str.strip().str.lower()
    df = make_unique_columns(df)
    return df

def aggregate_by_id(df: pd.DataFrame, id_col: str = ID_COL_STD) -> pd.DataFrame:
    """
    Aggregate to one row per ID:
      - numeric columns: mean
      - non-numeric columns: unique, sorted, joined by '; '
    """
    df = df.copy()
    df = clean_columns(df)
    df = find_and_standardize_id(df, id_col)

    agg_funcs = {}
    for col in df.columns:
        if col == id_col:
            continue
        if is_numeric_dtype(df[col]):
            agg_funcs[col] = "mean"
        else:
            agg_funcs[col] = lambda s: (
                "; ".join(sorted(set(s.dropna().astype(str))))
                if s.notna().any() else np.nan
            )

    # If a file has only the ID column, preserve it as-is
    if not agg_funcs:
        return df.drop_duplicates(subset=[id_col], keep="first")

    df_agg = df.groupby(id_col, as_index=False).agg(agg_funcs)
    return df_agg

def left_merge(base: pd.DataFrame, right: pd.DataFrame, id_col: str, suffix: str) -> pd.DataFrame:
    """
    Left join with clear suffixing for overlapping columns (excluding ID).
    """
    common = [c for c in base.columns if c in right.columns and c != id_col]
    if common:
        # Suffix existing overlapping columns to avoid accidental overwrite
        base = base.rename(columns={c: f"{c}_left" for c in common})
    merged = pd.merge(base, right, on=id_col, how="left", suffixes=("", suffix))
    return merged

# ============== MAIN PIPELINE ==============
def main():
    # Load, clean, aggregate each file
    agg_dfs = {}
    for name, path in FILES.items():
        print(f"Reading and aggregating: {name}")
        df = read_excel_safe(path)
        df_agg = aggregate_by_id(df, ID_COL_STD)

        # basic info
        n_rows, n_cols = df.shape
        n_rows_agg, n_cols_agg = df_agg.shape
        print(f"  Original: {n_rows} rows, {n_cols} cols")
        print(f"  Aggregated: {n_rows_agg} rows, {n_cols_agg} cols\n")

        agg_dfs[name] = df_agg

    # Stepwise left merges
    order = ["codige", "trattamento", "ricoveri", "comorbidita", "adr"]
    if order[0] not in agg_dfs:
        raise KeyError(f"Expected base '{order[0]}' not found in aggregated dataframes.")

    merged = agg_dfs[order[0]].copy()
    for name in order[1:]:
        if name not in agg_dfs:
            print(f"Skipping '{name}' because it was not loaded.")
            continue
        print(f"Merging: base + {name}")
        merged = left_merge(merged, agg_dfs[name], ID_COL_STD, suffix=f"_{name}")
        print(f"  Merged shape: {merged.shape}\n")

    # Save
    out_path = Path(os.path.normpath(OUTPUT_PATH))
    out_path.parent.mkdir(parents=True, exist_ok=True)
    merged.to_excel(out_path, index=False)

    # Report
    print("✅ Merging completed successfully!")
    print(f"Saved: {out_path}")
    print(f"Final shape: {merged.shape}")

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print(f"❌ Error: {e}", file=sys.stderr)
        raise


Reading and aggregating: codige
  Original: 579 rows, 97 cols
  Aggregated: 403 rows, 97 cols

Reading and aggregating: trattamento
  Original: 2635 rows, 12 cols
  Aggregated: 406 rows, 12 cols

Reading and aggregating: ricoveri
  Original: 408 rows, 11 cols
  Aggregated: 156 rows, 11 cols

Reading and aggregating: comorbidita
  Original: 953 rows, 7 cols
  Aggregated: 249 rows, 7 cols

Reading and aggregating: adr
  Original: 2847 rows, 12 cols
  Aggregated: 288 rows, 12 cols

Merging: base + trattamento
  Merged shape: (403, 108)

Merging: base + ricoveri
  Merged shape: (403, 118)

Merging: base + comorbidita
  Merged shape: (403, 124)

Merging: base + adr
  Merged shape: (403, 135)



  merged.to_excel(out_path, index=False)


✅ Merging completed successfully!
Saved: C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\interim\merged_codige_dataset.xlsx
Final shape: (403, 135)


# Data Translation

In [14]:
import pandas as pd


try:
    df = pd.read_excel('C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/data/interim/merged_codige_dataset.xlsx')
except FileNotFoundError:
    print("Error: File not found. Check the path and file name.")

In [15]:
# Extract column names
cols = df.columns.tolist()

# Print in the desired format
print("italian_columns = [")
for c in cols:
    print(f"    '{c}',")
print("]")


italian_columns = [
    'id_paziente',
    'data_nascita',
    'eta',
    'eta_gruppo',
    'sesso',
    'etnia',
    'titolo_studio',
    'bmi',
    'bmi_fasce',
    'attivita_lavorativa',
    'alcool',
    'fumo',
    'fumo_dettaglio',
    'da_quanti_anni_fuma',
    'data_osservazione_fu',
    'fine_osservazione_codige_dt',
    'fine_osservazione_codige_motivo',
    'diagnosi_tumore_dt',
    'presa_incarico_uo_dt',
    'tipo_tumore',
    'kmammella_sottotipo',
    'kcolon_locazione',
    'stadio_tnm',
    'stadio',
    'grado_istologico',
    'alterazioni_molecolari',
    'mutazioni',
    'genotipo_dpyd',
    'genotipo_dpyd_type',
    'intervento_chirurgico',
    'intervento_chirurgico_dt',
    'intervento_chirurgico_tipo',
    'intervento_chirurgico_specificare',
    'pregresso_intervento',
    'pregresso_radioterapia',
    'pregresso_numero_linee_trattamento',
    'intervento_chirurgico_altro',
    'intervento_chirurgico_complicanze',
    'intervento_chirurgico_per_complicanze_inte

In [17]:
import pandas as pd
import numpy as np


#italian_columns = [
    #'id_paziente', 'unita_operativa', 'data_nascita', 'eta', 'eta_gruppo', 'sesso', 'etnia', 'titolo_studio',
    #'bmi', 'bmi fasce', 'attivita_lavorativa', 'alcool', 'fumo', 'fumo_dettaglio', 'da_quanti_anni_fuma',
    #'n_sigarette', 'data_osservazione_FU', 'fine_osservazione_codige_dt', 'fine_osservazione_codige_motivo',
    #'diagnosi_tumore_dt', 'presa_incarico_UO_dt', 'tipo_tumore', 'kmammella_sottotipo', 'kcolon_locazione',
    #'stadio', 'stadio_TNM', 'grado_istologico', 'alterazioni_molecolari', 'mutazioni', 'genotipo_DPYD',
    #'genotipo_DPYD_type', 'intervento_chirurgico', 'intervento_chirurgico_dt', 'intervento_chirurgico_tipo',
   # 'pregresso_intervento', 'intervento_chirurgico_complicanze',
   # 'intervento_chirurgico_per_complicanze_intervento', 'ricovero_per_complicanze_intervento',
    #'linee_trattamento_oncologico_n', 'pregresso_radioterapia', 'radioterapia_dt_inizio',
   # 'radioterapia_dt_fine', 'trasfusione', 'trasfusioni_ntot', 'ipertensione', 'insufficienza_aortica',
  #  'dislipidemie', 'IPB', 'obesita', 'cardiopatia_ischemica', 'fIbrillazione_atriale', 'bpco', 'asma',
   # 'diabete_tipoII', 'reflusso_gastro', 'insufficienza_renale', 'sindrome_depressiva', 'anemia',
   # 'patologie_psichiatriche', 'altre_patologie', 'patologie_cardiovascolari', 'patologie_gastrointestinali',
   # 'patologie_cerebrovascolari', 'ricoveri_n', 'ricoveri_ord_n', 'decesso', 'adr', 'adr_n_tot',
   # 'adr_n_grado1', 'adr_n_grado2', 'adr_n_grado3', 'adr_n_grado4', 'adr_n_grado5', 'giorni_osservazione',
   # 'data_decesso', 'glicemia', 'gb', 'gr', 'hb', 'neu', 'conta_piastrinica', 'creati', 'ast_got',
   # 'alt_gpt', 'azotemia', 'gamma_gt', 'bilirubina_tot', 'bilirubina_dir', 'albumina', 'fe', 'fe_cat',
   # 'chemio_linee', 'chemio_fine_progressione', 'chemio_fine_tossicità', 'chemio_fine_altro',
   # 'insorgenza_dt', 'tipo', 'grado', 'esito', 'correlazione_chemio', 'azione_chemio', 'provenienza',
   # 'ADR_clean', 'macrocategoria', 'accesso_dt', 'dimissione_dt', 'diagnosi_accesso', 'diagnosi_dimissione',
    #'modalita', 'durata_ps', 'correlazione', 'modifica_schema_onco', 'linea_tattamento_oncologico',
   # 'nome_schema_chemio', 'inizio_schema_chemio_dt', 'fine__schema_chemio_dt', 'cicli_n',
   # 'motivo_fine__schema_chemio', 'tossicita_tipo', 'principio_attivo', 'dose_ridotta',
   # 'principio_attivo_n', 'comorbidità_cat', 'comorbidità_dt', 'unita_operativa', 'data_nascita',
    #'intervention_type', 'intervention_specify', 'pregresso_numero_linee_trattamento', 'radioterapia',
   # 'cardiopatia_ipertensiva', 'reflusso_gastroesofageo'
#]



italian_columns = [
    'id_paziente',
    'data_nascita',
    'eta',
    'eta_gruppo',
    'sesso',
    'etnia',
    'titolo_studio',
    'bmi',
    'bmi_fasce',
    'attivita_lavorativa',
    'alcool',
    'fumo',
    'fumo_dettaglio',
    'da_quanti_anni_fuma',
    'data_osservazione_fu',
    'fine_osservazione_codige_dt',
    'fine_osservazione_codige_motivo',
    'diagnosi_tumore_dt',
    'presa_incarico_uo_dt',
    'tipo_tumore',
    'kmammella_sottotipo',
    'kcolon_locazione',
    'stadio_tnm',
    'stadio',
    'grado_istologico',
    'alterazioni_molecolari',
    'mutazioni',
    'genotipo_dpyd',
    'genotipo_dpyd_type',
    'intervento_chirurgico',
    'intervento_chirurgico_dt',
    'intervento_chirurgico_tipo',
    'intervento_chirurgico_specificare',
    'pregresso_intervento',
    'pregresso_radioterapia',
    'pregresso_numero_linee_trattamento',
    'intervento_chirurgico_altro',
    'intervento_chirurgico_complicanze',
    'intervento_chirurgico_per_complicanze_intervento',
    'ricovero_per_complicanze_intervento',
    'linee_trattamento_oncologico_n',
    'radioterapia',
    'radioterapia_dt_inizio',
    'radioterapia_dt_fine',
    'trasfusione',
    'trasfusioni_ntot',
    'ipertensione',
    'insufficienza_aortica',
    'dispilidemie',
    'ipb',
    'obesita',
    'cardiopatia_ischemica',
    'fibrillazione_atriale',
    'bpco',
    'asma',
    'diabete_tipo_ii',
    'reflusso_gastroesofageo',
    'cardiopatia_ipertensiva',
    'insufficienza_renale',
    'sindrome_depressiva',
    'anemia',
    'patologie_psichiatriche',
    'altre_patologie',
    'patologie_cardiovascolari',
    'patologie_gastrointestinali',
    'patologie_cerebrovascolari',
    'ricoveri_ord_n',
    'decesso',
    'adr_left',
    'adr_n_tot',
    'adr_n_grado1',
    'adr_n_grado2',
    'adr_n_grado3',
    'adr_n_grado4',
    'adr_n_grado5',
    'giorni_osservazione',
    'glicemia',
    'gb',
    'gr',
    'hb',
    'neu',
    'conta_piastrinica',
    'creati',
    'ast_got',
    'alt_gpt',
    'azotemia',
    'gamma_gt',
    'bilirubina_tot',
    'bilirubina_dir',
    'albumina',
    'fe',
    'fe_cat',
    'data_decesso',
    'chemio_fine_progressione',
    'chemio_fine_tossicita',
    'chemio_fine_altro',
    'farmaci_cat_n',
    'linea_tattamento_oncologico',
    'nome_schema_chemio',
    'inizio_schema_chemio_dt',
    'fine_schema_chemio_dt',
    'cicli_n',
    'motivo_fine_schema_chemio',
    'tossicita_tipo',
    'principio_attivo',
    'dose_ridotta',
    'principio_attivo_n',
    'linea_trattamento_oncologico',
    'ricovero_n',
    'accesso_dt',
    'dimissione_dt',
    'diagnosi_accesso',
    'diagnosi_dimissione',
    'modalita',
    'durata_ps',
    'tipo_left',
    'correlazione',
    'modifica_schema_onco',
    'comorbidita',
    'comorbilita_cat',
    'altro',
    'data',
    'comorbidita_cat',
    'altre_pat_n',
    'adr',
    'insorgenza_dt',
    'tipo',
    'grado',
    'esito',
    'correlazione_chemio',
    'azione_chemio',
    'provenienza',
    'adr_clean',
    'macrocategoria',
    'adr_clean.1',
]

import pandas as pd
import numpy as np

df = pd.DataFrame(columns=italian_columns)
# --- End of Demo Setup ---


# --- 2. Comprehensive Italian-to-English Column Translation Map ---
column_translation_map = {
    # Main Patient and Demographics
    'id_paziente': 'patient_id',
    'unita_operativa': 'operating_unit_name', # nome unità operativa
    'data_nascita': 'birth_date',
    'eta': 'age',
    'eta_gruppo': 'age_group',
    'sesso': 'gender',
    'etnia': 'ethnicity',
    'titolo_studio': 'education_level',
    'bmi': 'bmi_value',
    'bmi fasce': 'bmi_category',
    'attivita_lavorativa': 'employment_status',
    'alcool': 'alcohol_consumption',
    'fumo': 'smoking_status_binary', # 0 = NO, 1 = SI, missing
    'fumo_dettaglio': 'smoking_status_detail', # Never, Current, Ex-Smoker
    'da_quanti_anni_fuma': 'smoking_years',
    'n_sigarette': 'cigarettes_per_day',

    # Observation Period and Outcome
    'data_osservazione_FU': 'observation_start_date', # Data di inizio osservazione
    'fine_osservazione_codige_dt': 'observation_end_date',
    'fine_osservazione_codige_motivo': 'observation_end_reason',
    'giorni_osservazione': 'observation_days',
    'decesso': 'death_during_observation', # 0 = NO, 1 = SI
    'data_decesso': 'death_date',

    # Tumor Diagnosis and Staging
    'diagnosi_tumore_dt': 'tumor_diagnosis_date',
    'presa_incarico_UO_dt': 'oncology_unit_start_date', # data di inzio della presa in carico
    'tipo_tumore': 'tumor_type',
    'kmammella_sottotipo': 'breast_cancer_subtype', # Sottotipo per tumore alla mammella
    'kcolon_locazione': 'colon_cancer_location', # Sede per il tumore al colon
    'stadio': 'tumor_stage_roman', # I, II, III, IV
    'stadio_TNM': 'tumor_stage_tnm',
    'grado_istologico': 'histological_grade', # G1, G2, G3

    # Molecular/Genetic
    'alterazioni_molecolari': 'molecular_alterations', # 0=absent, 1=present
    'mutazioni': 'mutations_present', # 0=absent, 1=present
    'genotipo_DPYD': 'dpyd_genotype_known', # 0=not known, 1=known
    'genotipo_DPYD_type': 'dpyd_genotype_type', # Wild-type vs Polymorphism

    # Surgical Intervention (Intervento Chirurgico)
    'intervento_chirurgico': 'surgical_intervention', # 0=NO, 1=SI
    'intervento_chirurgico_dt': 'surgery_date',
    'intervento_chirurgico_tipo': 'surgery_type',
    'pregresso_intervento': 'prior_surgery', # If surgery was BEFORE observation start date
    'intervento_chirurgico_complicanze': 'surgery_complications', # 0=NO, 1=SI
    'intervento_chirurgico_per_complicanze_intervento': 'reoperation_for_complication', # 0=NO, 1=SI
    'ricovero_per_complicanze_intervento': 'hospitalization_for_surgery_complication', # 0=NO, 1=SI
    'intervention_type': 'surgery_type_detail',
    'intervention_specify': 'surgery_type_specify',
    'intervento_chirurgico_specificare': 'surgery_specify',
    'intervento_chirurgico_altro': 'surgery_other',


    # Oncology Treatment History (General)
    'linee_trattamento_oncologico_n': 'oncology_treatment_lines_n',
    'pregresso_radioterapia': 'prior_radiotherapy', # If radiotherapy was BEFORE observation start date
    'radioterapia': 'radiotherapy_status',
    'radioterapia_dt_inizio': 'radiotherapy_start_date',
    'radioterapia_dt_fine': 'radiotherapy_end_date',
    'trasfusione': 'transfusion_received', # 0=NO, 1=SI
    'trasfusioni_ntot': 'transfusions_total_n',
    'pregresso_numero_linee_trattamento': 'prior_treatment_lines_n',

    # Comorbidities (Patologie) - ALL are 0=NO, 1=SI BEFORE observation start date
    'ipertensione': 'hypertension',
    'insufficienza_aortica': 'aortic_insufficiency',
    'dislipidemie': 'dyslipidemia',
    'IPB': 'bph', # Benign Prostatic Hyperplasia (IPB)
    'obesita': 'obesity_comorbidity',
    'cardiopatia_ischemica': 'ischemic_heart_disease',
    'fIbrillazione_atriale': 'atrial_fibrillation',
    'bpco': 'copd',
    'asma': 'asthma',
    'diabete_tipoII': 'diabetes_type_ii',
    'reflusso_gastro': 'gastroesophageal_reflux',
    'cardiopatia_ipertensiva': 'hypertensive_heart_disease',
    'reflusso_gastroesofageo': 'gastroesophageal_reflux_full', # Appears as a duplicate or full name
    'insufficienza_renale': 'renal_insufficiency',
    'sindrome_depressiva': 'depressive_syndrome',
    'anemia': 'anemia_comorbidity',
    'patologie_psichiatriche': 'psychiatric_disorders',
    'altre_patologie': 'other_comorbidities',
    'patologie_cardiovascolari': 'cardiovascular_disorders', # Grouped
    'patologie_gastrointestinali': 'gastrointestinal_disorders', # Grouped
    'patologie_cerebrovascolari': 'cerebrovascular_disorders', # Grouped

    # Hospitalization (Ricovero) - from separate table
    'ricoveri_n': 'hospitalizations_n', # Total during observation
    'ricoveri_ord_n': 'ordinary_hospitalizations_n',
    'accesso_dt': 'hospital_admission_date',
    'dimissione_dt': 'hospital_discharge_date',
    'diagnosi_accesso': 'admission_diagnosis',
    'diagnosi_dimissione': 'discharge_diagnosis',
    'modalita': 'admission_mode', # Programmed vs ER
    'durata_ps': 'er_stay_duration', # <24h vs >24h
    'tipo': 'hospitalization_type', # Ordinary vs DH/Day Service
    'correlazione': 'hospitalization_cause', # Tumor, Onc. Treatment, Other
    'modifica_schema_onco': 'oncology_schema_modified', # 0=No, 1=Si due to hospitalization

    # Adverse Drug Reactions (ADR) - from separate table
    'adr': 'adr_description', # tipo di ard
    'insorgenza_dt': 'adr_onset_date',
    'grado': 'adr_ctcae_grade', # 1-5
    'esito': 'adr_outcome', # Complete Resolution, Worsening, Death, etc.
    'correlazione_chemio': 'adr_chemo_correlation', # Dubious, Probable, Possible, Highly Possible
    'azione_chemio': 'adr_chemo_action', # Treatment Interrupted, Dose Reduced, None, etc.
    'provenienza': 'adr_source_project', # onco22 or codige
    'ADR_clean': 'adr_description_clean',
    'macrocategoria': 'adr_macro_category',

    # Chemotherapy Treatment Lines - from separate table
    'linea_tattamento_oncologico': 'treatment_line_n',
    'nome_schema_chemio': 'chemo_schema_name',
    'inizio_schema_chemio_dt': 'chemo_schema_start_date',
    'fine__schema_chemio_dt': 'chemo_schema_end_date',
    'cicli_n': 'chemo_cycles_n',
    'motivo_fine__schema_chemio': 'chemo_schema_end_reason', # Progression, Toxicity, Other
    'tossicita_tipo': 'toxicity_type',
    'principio_attivo': 'active_principle',
    'dose_ridotta': 'dose_reduced', # 0=No, 1=Si
    'principio_attivo_n': 'active_principles_n',
    'chemio_linee': 'chemo_lines_total_n', # Total number of chemotherapy lines

    # Chemotherapy End Reasons Aggregation
    'chemio_fine_progressione': 'end_reason_progression_any_line', # 0=no, 1=si
    'chemio_fine_tossicità': 'end_reason_toxicity_any_line', # 0=no, 1=si
    'chemio_fine_altro': 'end_reason_other_any_line', # 0=no, 1=si

    # Comorbidity List/Date - from separate table
    'comorbidità_cat': 'comorbidity_category_list', # The list of 1-20 categories
    'comorbidità_dt': 'comorbidity_diagnosis_date',

    # Lab Values (Ranges) at Observation Start
    'glicemia': 'blood_glucose_range',
    'gb': 'white_blood_cells_range', # GB (Globuli Bianchi)
    'gr': 'red_blood_cells_range', # GR (Globuli Rossi)
    'hb': 'hemoglobin_range', # HB
    'neu': 'neutrophils_percent_range', # NEU (%)
    'conta_piastrinica': 'platelet_count_range',
    'creati': 'creatinine_range', # Creatininemia
    'ast_got': 'ast_got_range',
    'alt_gpt': 'alt_gpt_range',
    'azotemia': 'azotemia_range',
    'gamma_gt': 'gamma_gt_range',
    'bilirubina_tot': 'total_bilirubin_range',
    'bilirubina_dir': 'direct_bilirubin_range',
    'albumina': 'albumin_range',
    'fe': 'ejection_fraction_percent', # FE (Frazione di Eiezione)
    'fe_cat': 'ejection_fraction_category' # FE Category (Preserved, Media, Ridotta)
}


# --- 3. Apply the Translation to your DataFrame ---

# Applying the rename operation:
df.rename(columns=column_translation_map, inplace=True)

# You can print the first few columns to verify the change
# print(df.columns)
print(df.head())

print("Column names successfully translated and renamed to snake_case in the DataFrame.")
print("The DataFrame's columns are now:")
print(list(df.columns))
print("\nNext, you should focus on translating the cell values using similar mapping dictionaries.")


Empty DataFrame
Columns: [patient_id, birth_date, age, age_group, gender, ethnicity, education_level, bmi_value, bmi_fasce, employment_status, alcohol_consumption, smoking_status_binary, smoking_status_detail, smoking_years, data_osservazione_fu, observation_end_date, observation_end_reason, tumor_diagnosis_date, presa_incarico_uo_dt, tumor_type, breast_cancer_subtype, colon_cancer_location, stadio_tnm, tumor_stage_roman, histological_grade, molecular_alterations, mutations_present, genotipo_dpyd, genotipo_dpyd_type, surgical_intervention, surgery_date, surgery_type, surgery_specify, prior_surgery, prior_radiotherapy, prior_treatment_lines_n, surgery_other, surgery_complications, reoperation_for_complication, hospitalization_for_surgery_complication, oncology_treatment_lines_n, radiotherapy_status, radiotherapy_start_date, radiotherapy_end_date, transfusion_received, transfusions_total_n, hypertension, aortic_insufficiency, dispilidemie, ipb, obesity_comorbidity, ischemic_heart_disease

In [19]:
import pandas as pd

# Load the dataset
in_path = "C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/data/interim/merged_codige_dataset.xlsx"
merged_df = pd.read_excel(in_path)

# Column mapping based on the user's requested English names
col_map = {
    "id_paziente": "patient_id",
    "unita_operativa": "operating_unit_name",
    "data_nascita": "birth_date",
    "eta": "age",
    "eta_gruppo": "age_group",
    "sesso": "gender",
    "etnia": "ethnicity",
    "titolo_studio": "education_level",
    "bmi": "bmi_value",
    "bmi_fasce": "bmi_category",
    "attivita_lavorativa": "employment_status",
    "alcool": "alcohol_consumption",
    "fumo": "smoking_status_binary",
    "fumo_dettaglio": "smoking_status_detail",
    "da_quanti_anni_fuma": "smoking_years",
    "n_sigarette": "cigarettes_per_day",
    "data_osservazione_fu": "observation_start_date",
    "fine_osservazione_codige_dt": "observation_end_date",
    "fine_osservazione_codige_motivo": "observation_end_reason",
    "diagnosi_tumore_dt": "tumor_diagnosis_date",
    "presa_incarico_uo_dt": "oncology_unit_start_date",
    "tipo_tumore": "tumor_type",
    "kmammella_sottotipo": "breast_cancer_subtype",
    "kcolon_locazione": "colon_cancer_location",
    "stadio": "tumor_stage_roman",
    "stadio_tnm": "tumor_stage_tnm",
    "grado_istologico": "histological_grade",
    "alterazioni_molecolari": "molecular_alterations",
    "mutazioni": "mutations_present",
    "genotipo_dpyd": "dpyd_genotype_known",
    "genotipo_dpyd_type": "dpyd_genotype_type",
    "intervento_chirurgico": "surgical_intervention",
    "intervento_chirurgico_dt": "surgery_date",
    "intervento_chirurgico_tipo": "surgery_type",
    "intervento_chirurgico_specificare": "surgery_type_specify",
    "pregresso_intervento": "prior_surgery",
    "intervento_chirurgico_complicanze": "surgery_complications",
    "intervento_chirurgico_per_complicanze_intervento": "reoperation_for_complication",
    "ricovero_per_complicanze_intervento": "hospitalization_for_surgery_complication",
    "linee_trattamento_oncologico_n": "oncology_treatment_lines_n",
    "pregresso_radioterapia": "prior_radiotherapy",
    "radioterapia_dt_inizio": "radiotherapy_start_date",
    "radioterapia_dt_fine": "radiotherapy_end_date",
    "trasfusione": "transfusion_received",
    "trasfusioni_ntot": "transfusions_total_n",
    "ipertensione": "hypertension",
    "insufficienza_aortica": "aortic_insufficiency",
    "dislipidemie": "dyslipidemia",
    "dispilidemie": "dyslipidemia",
    "ipb": "bph",
    "obesita": "obesity_comorbidity",
    "cardiopatia_ischemica": "ischemic_heart_disease",
    "fibrillazione_atriale": "atrial_fibrillation",
    "bpco": "copd",
    "asma": "asthma",
    "diabete_tipo_ii": "diabetes_type_ii",
    "diabete_tipoII": "diabetes_type_ii",
    "reflusso_gastro": "gastroesophageal_reflux",
    "reflusso_gastroesofageo": "gastroesophageal_reflux_full",
    "insufficienza_renale": "renal_insufficiency",
    "sindrome_depressiva": "depressive_syndrome",
    "anemia": "anemia_comorbidity",
    "patologie_psichiatriche": "psychiatric_disorders",
    "altre_patologie": "other_comorbidities",
    "patologie_cardiovascolari": "cardiovascular_disorders",
    "patologie_gastrointestinali": "gastrointestinal_disorders",
    "patologie_cerebrovascolari": "cerebrovascular_disorders",
    "ricoveri_n": "hospitalizations_n",
    "ricoveri_ord_n": "ordinary_hospitalizations_n",
    "decesso": "death_during_observation",
    "adr": "adr_description",
    "adr_n_tot": "adr_n_tot",
    "adr_n_grado1": "adr_n_grado1",
    "adr_n_grado2": "adr_n_grado2",
    "adr_n_grado3": "adr_n_grado3",
    "adr_n_grado4": "adr_n_grado4",
    "adr_n_grado5": "adr_n_grado5",
    "giorni_osservazione": "observation_days",
    "data_decesso": "death_date",
    "glicemia": "blood_glucose_range",
    "gb": "white_blood_cells_range",
    "gr": "red_blood_cells_range",
    "hb": "hemoglobin_range",
    "neu": "neutrophils_percent_range",
    "conta_piastrinica": "platelet_count_range",
    "creati": "creatinine_range",
    "ast_got": "ast_got_range",
    "alt_gpt": "alt_gpt_range",
    "azotemia": "azotemia_range",
    "gamma_gt": "gamma_gt_range",
    "bilirubina_tot": "total_bilirubin_range",
    "bilirubina_dir": "direct_bilirubin_range",
    "albumina": "albumin_range",
    "fe": "ejection_fraction_percent",
    "fe_cat": "ejection_fraction_category",
    "chemio_linee": "chemo_lines_total_n",
    "chemio_fine_progressione": "end_reason_progression_any_line",
    "chemio_fine_tossicità": "end_reason_toxicity_any_line",
    "chemio_fine_altro": "end_reason_other_any_line",
    "insorgenza_dt": "adr_onset_date",
    "tipo": "hospitalization_type",
    "grado": "adr_ctcae_grade",
    "esito": "adr_outcome",
    "correlazione_chemio": "adr_chemo_correlation",
    "azione_chemio": "adr_chemo_action",
    "provenienza": "adr_source_project",
    "adr_clean": "adr_description_clean",
    "macrocategoria": "adr_macro_category",
    "accesso_dt": "hospital_admission_date",
    "dimissione_dt": "hospital_discharge_date",
    "diagnosi_accesso": "admission_diagnosis",
    "diagnosi_dimissione": "discharge_diagnosis",
    "modalita": "admission_mode",
    "durata_ps": "er_stay_duration",
    "correlazione": "hospitalization_cause",
    "modifica_schema_onco": "oncology_schema_modified",
    "linea_tattamento_oncologico": "treatment_line_n",
    "nome_schema_chemio": "chemo_schema_name",
    "inizio_schema_chemio_dt": "chemo_schema_start_date",
    "fine_schema_chemio_dt": "chemo_schema_end_date",
    "cicli_n": "chemo_cycles_n",
    "motivo_fine_schema_chemio": "chemo_schema_end_reason",
    "tossicita_tipo": "toxicity_type",
    "principio_attivo": "active_principle",
    "dose_ridotta": "dose_reduced",
    "principio_attivo_n": "active_principles_n",
    "comorbidita_cat": "comorbidity_category_list",
    "comorbidita_dt": "comorbidity_diagnosis_date",
    "cardiopatia_ipertensiva": "hypertensive_heart_disease",
    "radioterapia": "radiotherapy_status",
    # suffixed ricoveri2 columns
    "ricovero_n_ricoveri2": "hospitalizations_n_ricoveri2",
    "accesso_dt_ricoveri2": "hospital_admission_date_ricoveri2",
    "dimissione_dt_ricoveri2": "hospital_discharge_date_ricoveri2",
    "diagnosi_accesso_ricoveri2": "admission_diagnosis_ricoveri2",
    "diagnosi_dimissione_ricoveri2": "discharge_diagnosis_ricoveri2",
    "modalita_ricoveri2": "admission_mode_ricoveri2",
    "durata_ps_ricoveri2": "er_stay_duration_ricoveri2",
    "tipo_ricoveri2": "hospitalization_type_ricoveri2",
    "correlazione_ricoveri2": "hospitalization_cause_ricoveri2",
    "modifica_schema_onco_ricoveri2": "oncology_schema_modified_ricoveri2",
}

# Apply mapping only for columns that exist in the dataframe
existing_map = {k: v for k, v in col_map.items() if k in merged_df.columns}
renamed = merged_df.rename(columns=existing_map)

# Save output
out_path = "/mnt/data/merged_codige_wide_english.xlsx"
renamed.to_excel(out_path, index=False)

# Provide a quick summary for the user
summary = {
    "columns_total": len(merged_df.columns),
    "columns_renamed": len(existing_map),
    "unmapped_columns_example": [c for c in merged_df.columns if c not in existing_map][:10],
    "new_columns_preview": list(renamed.columns[:20]),
}
out_path, summary


  renamed.to_excel(out_path, index=False)


('/mnt/data/merged_codige_wide_english.xlsx',
 {'columns_total': 135,
  'columns_renamed': 121,
  'unmapped_columns_example': ['pregresso_numero_linee_trattamento',
   'intervento_chirurgico_altro',
   'adr_left',
   'chemio_fine_tossicita',
   'farmaci_cat_n',
   'linea_trattamento_oncologico',
   'ricovero_n',
   'tipo_left',
   'comorbidita',
   'comorbilita_cat'],
  'new_columns_preview': ['patient_id',
   'birth_date',
   'age',
   'age_group',
   'gender',
   'ethnicity',
   'education_level',
   'bmi_value',
   'bmi_category',
   'employment_status',
   'alcohol_consumption',
   'smoking_status_binary',
   'smoking_status_detail',
   'smoking_years',
   'observation_start_date',
   'observation_end_date',
   'observation_end_reason',
   'tumor_diagnosis_date',
   'oncology_unit_start_date',
   'tumor_type']})

In [20]:
import pandas as pd
import numpy as np

# --- 1. MAPPING DICTIONARIES ---
# These dictionaries map the Italian numerical/string codes to clear English values.

# Demographic & Lifestyle Mappings
AGE_GROUP_MAP = {
    1: '<= 65 years',
    2: '> 65 years'
}

GENDER_MAP = {
    1: 'Male',
    2: 'Female'
}

ETHNICITY_MAP = {
    0: 'Missing / Not Noted',
    1: 'African or African American',
    2: 'Asian',
    3: 'Caucasian'
}

EDUCATION_LEVEL_MAP = {
    0: 'Not Known / Missing',
    1: 'Elementary School',
    2: 'Middle School',
    3: 'High School',
    4: 'University Degree'
}

BMI_CATEGORY_MAP = {
    1: '<18.5 Underweight',
    2: '18.5-24.9 Normal Weight',
    3: '25-29.99 Overweight', # Preobeso -> Overweight
    4: '>=30 Obese'
}

EMPLOYMENT_STATUS_MAP = {
    0: 'Not Known / Missing',
    1: 'Unemployed',
    2: 'Homemaker',
    3: 'Worker',
    4: 'Retired'
}

ALCOHOL_CONSUMPTION_MAP = {
    0: 'Not Known / Missing',
    1: 'Non-drinker', # Astemio
    2: 'Yes, Moderate',
    3: 'Yes, High'
}

SMOKING_STATUS_DETAIL_MAP = {
    0: 'Not Known / Missing',
    1: 'Never Smoked',
    2: 'Current Smoker',
    3: 'Ex-Smoker'
}

# Observation & Outcome Mappings
OBSERVATION_END_REASON_MAP = {
    1: 'End of Treatment with Follow-Up',
    2: 'Lost to Follow-Up',
    3: 'End of Study Period (ONCO22 - 31/12/2023)',
    4: 'Death',
    5: 'Informed Consent Withdrawal',
    6: 'Other',
    7: 'Transferred',
    8: 'End of Study Period (Codige - 31/12/2024)'
}

DEATH_STATUS_MAP = {
    0: 'No',
    1: 'Yes'
}

# Tumor & Disease Mappings
BREAST_CANCER_SUBTYPE_MAP = {
    1: 'Luminal',
    2: 'HER2',
    3: 'Triple Negative'
}

COLON_CANCER_LOCATION_MAP = {
    1: 'Left',
    2: 'Right'
}

TUMOR_STAGE_ROMAN_MAP = {
    'I': 'Stage I',
    'II': 'Stage II',
    'III': 'Stage III',
    'IV': 'Stage IV'
}

HISTOLOGICAL_GRADE_MAP = {
    1: 'G1',
    2: 'G2',
    3: 'G3'
}

BINARY_PRESENCE_MAP = {
    0: 'Absent / No',
    1: 'Present / Yes'
}

DPYD_GENOTYPE_KNOWN_MAP = {
    0: 'Not Known',
    1: 'Known'
}

DPYD_GENOTYPE_TYPE_MAP = {
    1: 'Wild-Type Genotype',
    2: 'Polymorphism'
}

# Surgery Mappings
SURGERY_TYPE_MAP = {
    1: 'Partial Tumor Excision',
    2: 'Total Tumor Excision',
    3: 'Surgical Biopsy',
    4: 'Laparoscopy',
    5: 'Palliative Procedure (e.g., Debulking)',
    6: 'Other (Specify)'
}

# Comorbidities Mappings (All are 0=NO, 1=SI)
# For all comorbidity columns (hypertension, copd, diabetes_type_ii, etc.):
# Use the BINARY_PRESENCE_MAP above.

# Hospitalization Mappings
ADMISSION_MODE_MAP = {
    1: 'Programmed',
    2: 'Emergency Room (PS)'
}

ER_STAY_DURATION_MAP = {
    1: 'Less than 24 hours',
    2: 'More than 24 hours'
}

HOSPITALIZATION_TYPE_MAP = {
    1: 'Ordinary',
    2: 'Day Hospital / Day Service (PACC)'
}

HOSPITALIZATION_CAUSE_MAP = {
    'Tumore': 'Tumor-Related',
    'Trattamento oncologico': 'Oncology Treatment-Related',
    'Altro (Specificare)': 'Other (Specify)'
}

ONCOLOGY_SCHEMA_MODIFIED_MAP = {
    0: 'No',
    1: 'Yes'
}

# Adverse Drug Reaction (ADR) Mappings
ADR_TYPE_MAP = {
    1: 'Intermittent',
    2: 'Continuous'
}

ADR_GRADE_MAP = {
    1: 'Grade 1 Mild',
    2: 'Grade 2 Moderate',
    3: 'Grade 3 Severe',
    4: 'Grade 4 Very Severe',
    5: 'Grade 5 Fatal'
}

ADR_OUTCOME_MAP = {
    1: 'Complete Resolution',
    2: 'Resolution with Sequelae', # Postumi
    3: 'Improvement',
    4: 'Unchanged or Worsened Reaction',
    5: 'Death',
    6: 'Not Available'
}

ADR_CHEMO_CORRELATION_MAP = {
    1: 'Dubious',
    2: 'Probable',
    3: 'Possible',
    4: 'Highly Possible'
}

ADR_CHEMO_ACTION_MAP = {
    1: 'Chemotherapy Treatment Interrupted',
    2: 'Oncological Schema Modified',
    3: 'Dose Reduction of One or More Drugs',
    4: 'None',
    5: 'Not Known, Not Applicable',
    6: 'Other (Specify)'
}

# Chemotherapy Mappings
CHEMO_SCHEMA_END_REASON_MAP = {
    'Progressione della malattia': 'Disease Progression',
    'Tossicità (Specificare)': 'Toxicity (Specify)',
    'Altro (Specificare)': 'Other (Specify)'
}

DOSE_REDUCED_MAP = {
    0: 'No',
    1: 'Yes'
}

# Ejection Fraction Category
EJECTION_FRACTION_CATEGORY_MAP = {
    1: 'Preserved >=50%',
    2: 'Mid-Range 40-49%',
    3: 'Reduced <40%'
}


# --- 2. FUNCTION TO APPLY ALL MAPPINGS ---

def apply_categorical_translations(df):
    """
    Applies all categorical translation maps to the relevant columns
    in the DataFrame. Assumes columns were already renamed in the first step.
    """
    print("Starting categorical value translation...")

    # Define a list of columns and their corresponding map dictionaries
    column_map_pairs = [
        ('age_group', AGE_GROUP_MAP),
        ('gender', GENDER_MAP),
        ('ethnicity', ETHNICITY_MAP),
        ('education_level', EDUCATION_LEVEL_MAP),
        ('bmi_category', BMI_CATEGORY_MAP),
        ('employment_status', EMPLOYMENT_STATUS_MAP),
        ('alcohol_consumption', ALCOHOL_CONSUMPTION_MAP),
        ('smoking_status_detail', SMOKING_STATUS_DETAIL_MAP),
        ('observation_end_reason', OBSERVATION_END_REASON_MAP),
        ('death_during_observation', DEATH_STATUS_MAP),
        ('breast_cancer_subtype', BREAST_CANCER_SUBTYPE_MAP),
        ('colon_cancer_location', COLON_CANCER_LOCATION_MAP),
        ('tumor_stage_roman', TUMOR_STAGE_ROMAN_MAP),
        ('histological_grade', HISTOLOGICAL_GRADE_MAP),
        ('molecular_alterations', BINARY_PRESENCE_MAP),
        ('mutations_present', BINARY_PRESENCE_MAP),
        ('dpyd_genotype_known', DPYD_GENOTYPE_KNOWN_MAP),
        ('dpyd_genotype_type', DPYD_GENOTYPE_TYPE_MAP),
        ('surgical_intervention', BINARY_PRESENCE_MAP),
        ('surgery_type', SURGERY_TYPE_MAP),
        ('prior_surgery', BINARY_PRESENCE_MAP),
        ('surgery_complications', BINARY_PRESENCE_MAP),
        ('reoperation_for_complication', BINARY_PRESENCE_MAP),
        ('hospitalization_for_surgery_complication', BINARY_PRESENCE_MAP),
        ('prior_radiotherapy', BINARY_PRESENCE_MAP),
        ('radiotherapy_status', BINARY_PRESENCE_MAP),
        ('transfusion_received', BINARY_PRESENCE_MAP),
        ('admission_mode', ADMISSION_MODE_MAP),
        ('er_stay_duration', ER_STAY_DURATION_MAP),
        ('hospitalization_type', HOSPITALIZATION_TYPE_MAP),
        ('hospitalization_cause', HOSPITALIZATION_CAUSE_MAP),
        ('oncology_schema_modified', ONCOLOGY_SCHEMA_MODIFIED_MAP),
        ('adr_type', ADR_TYPE_MAP),
        ('adr_ctcae_grade', ADR_GRADE_MAP),
        ('adr_outcome', ADR_OUTCOME_MAP),
        ('adr_chemo_correlation', ADR_CHEMO_CORRELATION_MAP),
        ('adr_chemo_action', ADR_CHEMO_ACTION_MAP),
        ('chemo_schema_end_reason', CHEMO_SCHEMA_END_REASON_MAP),
        ('dose_reduced', DOSE_REDUCED_MAP),
        ('ejection_fraction_category', EJECTION_FRACTION_CATEGORY_MAP),
        ('end_reason_progression_any_line', BINARY_PRESENCE_MAP),
        ('end_reason_toxicity_any_line', BINARY_PRESENCE_MAP),
        ('end_reason_other_any_line', BINARY_PRESENCE_MAP),

        # Comorbidity binary columns (0=NO, 1=SI)
        ('hypertension', BINARY_PRESENCE_MAP),
        ('aortic_insufficiency', BINARY_PRESENCE_MAP),
        ('dyslipidemia', BINARY_PRESENCE_MAP),
        ('bph', BINARY_PRESENCE_MAP),
        ('obesity_comorbidity', BINARY_PRESENCE_MAP),
        ('ischemic_heart_disease', BINARY_PRESENCE_MAP),
        ('atrial_fibrillation', BINARY_PRESENCE_MAP),
        ('copd', BINARY_PRESENCE_MAP),
        ('asthma', BINARY_PRESENCE_MAP),
        ('diabetes_type_ii', BINARY_PRESENCE_MAP),
        ('gastroesophageal_reflux', BINARY_PRESENCE_MAP),
        ('renal_insufficiency', BINARY_PRESENCE_MAP),
        ('depressive_syndrome', BINARY_PRESENCE_MAP),
        ('anemia_comorbidity', BINARY_PRESENCE_MAP),
        ('psychiatric_disorders', BINARY_PRESENCE_MAP),
        ('other_comorbidities', BINARY_PRESENCE_MAP),
        ('cardiovascular_disorders', BINARY_PRESENCE_MAP),
        ('gastrointestinal_disorders', BINARY_PRESENCE_MAP),
        ('cerebrovascular_disorders', BINARY_PRESENCE_MAP),
    ]

    for col, mapping in column_map_pairs:
        # Check if the column exists (important since the dataset is merged from multiple sources)
        if col in df.columns:
            # Map the values. .astype(float) and .astype('Int64') help handle mixed types/NaNs sometimes
            try:
                # Attempt to map directly (useful for string keys or mixed types)
                df[col] = df[col].replace(mapping)
            except Exception:
                # Fallback to converting to a type that matches the map keys (like int for 1/2/3)
                df[col] = df[col].astype(str).str.strip().str.replace(r'\.0$', '', regex=True)
                df[col] = df[col].replace(mapping)

    print("Categorical value translation complete.")
    return df

In [21]:
import pandas as pd
import numpy as np

# Paths
in_path_raw = "C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/data/interim/merged_codige_dataset.xlsx"
out_path = "C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/data/processed/merged_codige_wide_english_values_translated.xlsx"

# Load raw (Italian or partially English) wide file
df = pd.read_excel(in_path_raw)

# 1) Apply the same column renaming mapping used previously
col_map = {
    "id_paziente": "patient_id",
    "unita_operativa": "operating_unit_name",
    "data_nascita": "birth_date",
    "eta": "age",
    "eta_gruppo": "age_group",
    "sesso": "gender",
    "etnia": "ethnicity",
    "titolo_studio": "education_level",
    "bmi": "bmi_value",
    "bmi_fasce": "bmi_category",
    "attivita_lavorativa": "employment_status",
    "alcool": "alcohol_consumption",
    "fumo": "smoking_status_binary",
    "fumo_dettaglio": "smoking_status_detail",
    "da_quanti_anni_fuma": "smoking_years",
    "n_sigarette": "cigarettes_per_day",
    "data_osservazione_fu": "observation_start_date",
    "fine_osservazione_codige_dt": "observation_end_date",
    "fine_osservazione_codige_motivo": "observation_end_reason",
    "diagnosi_tumore_dt": "tumor_diagnosis_date",
    "presa_incarico_uo_dt": "oncology_unit_start_date",
    "tipo_tumore": "tumor_type",
    "kmammella_sottotipo": "breast_cancer_subtype",
    "kcolon_locazione": "colon_cancer_location",
    "stadio": "tumor_stage_roman",
    "stadio_tnm": "tumor_stage_tnm",
    "grado_istologico": "histological_grade",
    "alterazioni_molecolari": "molecular_alterations",
    "mutazioni": "mutations_present",
    "genotipo_dpyd": "dpyd_genotype_known",
    "genotipo_dpyd_type": "dpyd_genotype_type",
    "intervento_chirurgico": "surgical_intervention",
    "intervento_chirurgico_dt": "surgery_date",
    "intervento_chirurgico_tipo": "surgery_type",
    "intervento_chirurgico_specificare": "surgery_type_specify",
    "pregresso_intervento": "prior_surgery",
    "intervento_chirurgico_complicanze": "surgery_complications",
    "intervento_chirurgico_per_complicanze_intervento": "reoperation_for_complication",
    "ricovero_per_complicanze_intervento": "hospitalization_for_surgery_complication",
    "linee_trattamento_oncologico_n": "oncology_treatment_lines_n",
    "pregresso_radioterapia": "prior_radiotherapy",
    "radioterapia_dt_inizio": "radiotherapy_start_date",
    "radioterapia_dt_fine": "radiotherapy_end_date",
    "trasfusione": "transfusion_received",
    "trasfusioni_ntot": "transfusions_total_n",
    "ipertensione": "hypertension",
    "insufficienza_aortica": "aortic_insufficiency",
    "dislipidemie": "dyslipidemia",
    "dispilidemie": "dyslipidemia",
    "ipb": "bph",
    "obesita": "obesity_comorbidity",
    "cardiopatia_ischemica": "ischemic_heart_disease",
    "fibrillazione_atriale": "atrial_fibrillation",
    "bpco": "copd",
    "asma": "asthma",
    "diabete_tipo_ii": "diabetes_type_ii",
    "diabete_tipoII": "diabetes_type_ii",
    "reflusso_gastro": "gastroesophageal_reflux",
    "reflusso_gastroesofageo": "gastroesophageal_reflux_full",
    "insufficienza_renale": "renal_insufficiency",
    "sindrome_depressiva": "depressive_syndrome",
    "anemia": "anemia_comorbidity",
    "patologie_psichiatriche": "psychiatric_disorders",
    "altre_patologie": "other_comorbidities",
    "patologie_cardiovascolari": "cardiovascular_disorders",
    "patologie_gastrointestinali": "gastrointestinal_disorders",
    "patologie_cerebrovascolari": "cerebrovascular_disorders",
    "ricoveri_n": "hospitalizations_n",
    "ricoveri_ord_n": "ordinary_hospitalizations_n",
    "decesso": "death_during_observation",
    "adr": "adr_description",
    "adr_n_tot": "adr_n_tot",
    "adr_n_grado1": "adr_n_grado1",
    "adr_n_grado2": "adr_n_grado2",
    "adr_n_grado3": "adr_n_grado3",
    "adr_n_grado4": "adr_n_grado4",
    "adr_n_grado5": "adr_n_grado5",
    "giorni_osservazione": "observation_days",
    "data_decesso": "death_date",
    "glicemia": "blood_glucose_range",
    "gb": "white_blood_cells_range",
    "gr": "red_blood_cells_range",
    "hb": "hemoglobin_range",
    "neu": "neutrophils_percent_range",
    "conta_piastrinica": "platelet_count_range",
    "creati": "creatinine_range",
    "ast_got": "ast_got_range",
    "alt_gpt": "alt_gpt_range",
    "azotemia": "azotemia_range",
    "gamma_gt": "gamma_gt_range",
    "bilirubina_tot": "total_bilirubin_range",
    "bilirubina_dir": "direct_bilirubin_range",
    "albumina": "albumin_range",
    "fe": "ejection_fraction_percent",
    "fe_cat": "ejection_fraction_category",
    "chemio_linee": "chemo_lines_total_n",
    "chemio_fine_progressione": "end_reason_progression_any_line",
    "chemio_fine_tossicità": "end_reason_toxicity_any_line",
    "chemio_fine_altro": "end_reason_other_any_line",
    "insorgenza_dt": "adr_onset_date",
    "tipo": "hospitalization_type",
    "grado": "adr_ctcae_grade",
    "esito": "adr_outcome",
    "correlazione_chemio": "adr_chemo_correlation",
    "azione_chemio": "adr_chemo_action",
    "provenienza": "adr_source_project",
    "adr_clean": "adr_description_clean",
    "macrocategoria": "adr_macro_category",
    "accesso_dt": "hospital_admission_date",
    "dimissione_dt": "hospital_discharge_date",
    "diagnosi_accesso": "admission_diagnosis",
    "diagnosi_dimissione": "discharge_diagnosis",
    "modalita": "admission_mode",
    "durata_ps": "er_stay_duration",
    "correlazione": "hospitalization_cause",
    "modifica_schema_onco": "oncology_schema_modified",
    "linea_tattamento_oncologico": "treatment_line_n",
    "nome_schema_chemio": "chemo_schema_name",
    "inizio_schema_chemio_dt": "chemo_schema_start_date",
    "fine_schema_chemio_dt": "chemo_schema_end_date",
    "cicli_n": "chemo_cycles_n",
    "motivo_fine_schema_chemio": "chemo_schema_end_reason",
    "tossicita_tipo": "toxicity_type",
    "principio_attivo": "active_principle",
    "dose_ridotta": "dose_reduced",
    "principio_attivo_n": "active_principles_n",
    "comorbidita_cat": "comorbidity_category_list",
    "comorbidita_dt": "comorbidity_diagnosis_date",
    "cardiopatia_ipertensiva": "hypertensive_heart_disease",
    "radioterapia": "radiotherapy_status",
    # suffixed ricoveri2 columns (if present)
    "ricovero_n_ricoveri2": "hospitalizations_n_ricoveri2",
    "accesso_dt_ricoveri2": "hospital_admission_date_ricoveri2",
    "dimissione_dt_ricoveri2": "hospital_discharge_date_ricoveri2",
    "diagnosi_accesso_ricoveri2": "admission_diagnosis_ricoveri2",
    "diagnosi_dimissione_ricoveri2": "discharge_diagnosis_ricoveri2",
    "modalita_ricoveri2": "admission_mode_ricoveri2",
    "durata_ps_ricoveri2": "er_stay_duration_ricoveri2",
    "tipo_ricoveri2": "hospitalization_type_ricoveri2",
    "correlazione_ricoveri2": "hospitalization_cause_ricoveri2",
    "modifica_schema_onco_ricoveri2": "oncology_schema_modified_ricoveri2",
}
existing_map = {k: v for k, v in col_map.items() if k in df.columns}
df.rename(columns=existing_map, inplace=True)

# 2) Apply categorical translations (reusing the user's maps)
BINARY_PRESENCE_MAP = {0: 'Absent / No', 1: 'Present / Yes', "0": 'Absent / No', "1": 'Present / Yes'}
GENDER_MAP_PRE_PROCESSING = {1: 'Male', 2: 'Female', "1": 'Male', "2": 'Female', "Maschio": "Male", "Femmina": "Female"}

STANDARD_CATEGORICAL_MAPS = {
    'age_group': {1: '<= 65 years', 2: '> 65 years', "1": '<= 65 years', "2": '> 65 years'},
    'ethnicity': {0: 'Missing / Not Noted', 1: 'African or African American', 2: 'Asian', 3: 'Caucasian',
                  "0": 'Missing / Not Noted', "1": 'African or African American', "2": 'Asian', "3": 'Caucasian'},
    'education_level': {0: 'Not Known / Missing', 1: 'Elementary School', 2: 'Middle School', 3: 'High School', 4: 'University Degree',
                        "0": 'Not Known / Missing', "1": 'Elementary School', "2": 'Middle School', "3": 'High School', "4": 'University Degree'},
    'bmi_category': {1: '<18.5 Underweight', 2: '18.5-24.9 Normal Weight', 3: '25-29.99 Overweight', 4: '>=30 Obese',
                     "1": '<18.5 Underweight', "2": '18.5-24.9 Normal Weight', "3": '25-29.99 Overweight', "4": '>=30 Obese'},
    'employment_status': {0: 'Not Known / Missing', 1: 'Unemployed', 2: 'Homemaker', 3: 'Worker', 4: 'Retired',
                          "0": 'Not Known / Missing', "1": 'Unemployed', "2": 'Homemaker', "3": 'Worker', "4": 'Retired'},
    'alcohol_consumption': {0: 'Not Known / Missing', 1: 'Non-drinker', 2: 'Yes, Moderate', 3: 'Yes, High',
                            "0": 'Not Known / Missing', "1": 'Non-drinker', "2": 'Yes, Moderate', "3": 'Yes, High'},
    'smoking_status_detail': {0: 'Not Known / Missing', 1: 'Never Smoked', 2: 'Current Smoker', 3: 'Ex-Smoker',
                              "0": 'Not Known / Missing', "1": 'Never Smoked', "2": 'Current Smoker', "3": 'Ex-Smoker'},
    'observation_end_reason': {
        1: 'End of Treatment with Follow-Up', 2: 'Lost to Follow-Up', 3: 'End of Study Period (ONCO22 - 31/12/2023)',
        4: 'Death', 5: 'Informed Consent Withdrawal', 6: 'Other', 7: 'Transferred', 8: 'End of Study Period (Codige - 31/12/2024)',
        "1": 'End of Treatment with Follow-Up', "2": 'Lost to Follow-Up', "3": 'End of Study Period (ONCO22 - 31/12/2023)',
        "4": 'Death', "5": 'Informed Consent Withdrawal', "6": 'Other', "7": 'Transferred', "8": 'End of Study Period (Codige - 31/12/2024)',
    },
    'breast_cancer_subtype': {1: 'Luminal', 2: 'HER2', 3: 'Triple Negative', "1": 'Luminal', "2": 'HER2', "3": 'Triple Negative'},
    'colon_cancer_location': {1: 'Left', 2: 'Right', "1": 'Left', "2": 'Right'},
    'tumor_stage_roman': {'I': 'Stage I', 'II': 'Stage II', 'III': 'Stage III', 'IV': 'Stage IV'},
    'histological_grade': {1: 'G1', 2: 'G2', 3: 'G3', "1": 'G1', "2": 'G2', "3": 'G3'},
    'surgery_type': {1: 'Partial Tumor Excision', 2: 'Total Tumor Excision', 3: 'Surgical Biopsy', 4: 'Laparoscopy', 5: 'Palliative Procedure', 6: 'Other (Specify)',
                     "1": 'Partial Tumor Excision', "2": 'Total Tumor Excision', "3": 'Surgical Biopsy', "4": 'Laparoscopy', "5": 'Palliative Procedure', "6": 'Other (Specify)'},
    'admission_mode': {1: 'Programmed', 2: 'Emergency Room (PS)', "1": 'Programmed', "2": 'Emergency Room (PS)'},
    'er_stay_duration': {1: 'Less than 24 hours', 2: 'More than 24 hours', "1": 'Less than 24 hours', "2": 'More than 24 hours'},
    'hospitalization_type': {1: 'Ordinary', 2: 'Day Hospital / Day Service (PACC)', "1": 'Ordinary', "2": 'Day Hospital / Day Service (PACC)'},
    'hospitalization_cause': {'Tumore': 'Tumor-Related', 'Trattamento oncologico': 'Oncology Treatment-Related', 'Altro (Specificare)': 'Other (Specify)'},
    'adr_event_present': BINARY_PRESENCE_MAP,
    'death_during_observation': BINARY_PRESENCE_MAP,
    'molecular_alterations': BINARY_PRESENCE_MAP,
    'mutations_present': BINARY_PRESENCE_MAP,
    'dpyd_genotype_known': {0: 'Not Known', 1: 'Known', "0": 'Not Known', "1": 'Known'},
    'dpyd_genotype_type': {1: 'Wild-Type Genotype', 2: 'Polymorphism', "1": 'Wild-Type Genotype', "2": 'Polymorphism'},
    'surgical_intervention': BINARY_PRESENCE_MAP,
    'prior_surgery': BINARY_PRESENCE_MAP,
    'surgery_complications': BINARY_PRESENCE_MAP,
    'reoperation_for_complication': BINARY_PRESENCE_MAP,
    'hospitalization_for_surgery_complication': BINARY_PRESENCE_MAP,
    'prior_radiotherapy': BINARY_PRESENCE_MAP,
    'transfusion_received': BINARY_PRESENCE_MAP,
    'oncology_schema_modified': BINARY_PRESENCE_MAP,
    'dose_reduced': BINARY_PRESENCE_MAP,
    'end_reason_progression_any_line': BINARY_PRESENCE_MAP,
    'end_reason_toxicity_any_line': BINARY_PRESENCE_MAP,
    'end_reason_other_any_line': BINARY_PRESENCE_MAP,
    'hypertension': BINARY_PRESENCE_MAP, 'aortic_insufficiency': BINARY_PRESENCE_MAP, 'dyslipidemia': BINARY_PRESENCE_MAP,
    'bph': BINARY_PRESENCE_MAP, 'obesity_comorbidity': BINARY_PRESENCE_MAP, 'ischemic_heart_disease': BINARY_PRESENCE_MAP,
    'atrial_fibrillation': BINARY_PRESENCE_MAP, 'copd': BINARY_PRESENCE_MAP, 'asthma': BINARY_PRESENCE_MAP,
    'diabetes_type_ii': BINARY_PRESENCE_MAP, 'gastroesophageal_reflux': BINARY_PRESENCE_MAP, 'renal_insufficiency': BINARY_PRESENCE_MAP,
    'depressive_syndrome': BINARY_PRESENCE_MAP, 'anemia_comorbidity': BINARY_PRESENCE_MAP, 'psychiatric_disorders': BINARY_PRESENCE_MAP,
    'other_comorbidities': BINARY_PRESENCE_MAP, 'cardiovascular_disorders': BINARY_PRESENCE_MAP, 'gastrointestinal_disorders': BINARY_PRESENCE_MAP,
    'cerebrovascular_disorders': BINARY_PRESENCE_MAP,
    'adr_type_or_hosp_type': {1: 'Intermittent', 2: 'Continuous', "1": 'Intermittent', "2": 'Continuous'},
    'adr_outcome': {1: 'Complete Resolution', 2: 'Resolution with Sequelae', 3: 'Improvement', 4: 'Unchanged or Worsened Reaction', 5: 'Death', 6: 'Not Available',
                    "1": 'Complete Resolution', "2": 'Resolution with Sequelae', "3": 'Improvement', "4": 'Unchanged or Worsened Reaction', "5": 'Death', "6": 'Not Available'},
    'adr_chemo_action': {
        1: 'Chemotherapy Treatment Interrupted', 2: 'Oncological Schema Modified', 3: 'Dose Reduction of One or More Drugs',
        4: 'None', 5: 'Not Known, Not Applicable', 6: 'Other (Specify)',
        "1": 'Chemotherapy Treatment Interrupted', "2": 'Oncological Schema Modified', "3": 'Dose Reduction of One or More Drugs',
        "4": 'None', "5": 'Not Known, Not Applicable', "6": 'Other (Specify)',
    },
    'chemo_schema_end_reason': {'Progressione della malattia': 'Disease Progression', 'Tossicità (Specificare)': 'Toxicity (Specify)', 'Altro (Specificare)': 'Other (Specify)'},
    'ejection_fraction_category': {1: 'Preserved >=50%', 2: 'Mid-Range 40-49%', 3: 'Reduced <40%', "1": 'Preserved >=50%', "2": 'Mid-Range 40-49%', "3": 'Reduced <40%'},
}

# Normalize gender early
if 'gender' in df.columns:
    df['gender'] = df['gender'].replace(GENDER_MAP_PRE_PROCESSING)

# Apply standard categorical maps
for col, cmap in STANDARD_CATEGORICAL_MAPS.items():
    if col in df.columns:
        df[col] = df[col].replace(cmap)

# Fallback replacements for common Italian strings
fallback_repl = {
    "Non noto": "Missing / Not Known",
    "Non nota": "Missing / Not Known",
    "non noto": "Missing / Not Known",
    "non nota": "Missing / Not Known",
    "Si": "Yes",
    "Sì": "Yes",
    "NO": "No",
    "SI": "Yes",
    "Maschio": "Male",
    "Femminile": "Female",
    "Femmina": "Female",
    "Programmato": "Programmed",
    "PS": "Emergency Room (PS)",
    "Altro (Specificare)": "Other (Specify)",
    "Tossicità (Specificare)": "Toxicity (Specify)",
    "Progressione della malattia": "Disease Progression",
}
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]) or pd.api.types.is_datetime64_any_dtype(df[col]):
        continue
    df[col] = df[col].replace(fallback_repl)

# 3) Apply lab maps
SIMPLE_LAB_MAPS = {
    'ast_got': {'<21': 'Normal (<21 U/L)', '≥21': 'High (>=21 U/L)', '>=21': 'High (>=21 U/L)', 'Non noto': 'Missing / Not Known'},
    'alt_gpt': {'<21': 'Normal (<21 U/L)', '≥21': 'High (>=21 U/L)', '>=21': 'High (>=21 U/L)', 'Non noto': 'Missing / Not Known'},
    'bun': {'<10': 'Low (<10 mg/dL)', '10-50': 'Normal (10-50 mg/dL)', '>50': 'High (>50 mg/dL)', 'Non noto': 'Missing / Not Known'},
    'total_bilirubin': {'<0.2': 'Low (<0.2 mg/dL)', '0.2-1': 'Normal (0.2-1 mg/dL)', '>1': 'High (>1 mg/dL)', 'Non noto': 'Missing / Not Known'},
    'direct_bilirubin': {'<0.2': 'Normal (<0.2 mg/dL)', '>0.2': 'High (>0.2 mg/dL)', 'Non noto': 'Missing / Not Known'},
    'albumin': {'<3.6': 'Low (<3.6 g/dL)', '3.6-4.9': 'Normal (3.6-4.9 g/dL)', '>4.9': 'High (>4.9 g/dL)', 'Non noto': 'Missing / Not Known'},
    'glucose': {'<65': 'Low (<65 mg/dL)', '65-110': 'Normal (65-110 mg/dL)', '>110': 'High (>110 mg/dL)', 'Non noto': 'Missing / Not Known'},
    'white_blood_cells': {'<4000': 'Low (<4000 cells/µL)', '4000-11000': 'Normal (4000-11000 cells/µL)', '>11000': 'High (>11000 cells/µL)', 'Non noto': 'Missing / Not Known'},
    'neutrophils': {'<40': 'Low (<40%)', '40-75': 'Normal (40-75%)', '>75': 'High (>75%)', 'Non noto': 'Missing / Not Known'},
    'platelet_count': {'<150000': 'Low (<150k /µL)', '150000-400000': 'Normal (150k-400k /µL)', '>400000': 'High (>400k /µL)', 'Non noto': 'Missing / Not Known'},
}
# map dataset columns to these lab keys
SIMPLE_DATASET_TO_KEYS = {
    'ast_got_range': 'ast_got',
    'alt_gpt_range': 'alt_gpt',
    'azotemia_range': 'bun',
    'total_bilirubin_range': 'total_bilirubin',
    'direct_bilirubin_range': 'direct_bilirubin',
    'albumin_range': 'albumin',
    'blood_glucose_range': 'glucose',
    'white_blood_cells_range': 'white_blood_cells',
    'neutrophils_percent_range': 'neutrophils',
    'platelet_count_range': 'platelet_count',
}
for ds_col, lab_key in SIMPLE_DATASET_TO_KEYS.items():
    if ds_col in df.columns:
        cmap = SIMPLE_LAB_MAPS[lab_key]
        df[ds_col] = df[ds_col].astype(str).str.strip().str.replace("≥", ">=", regex=False)
        df[ds_col] = df[ds_col].replace(cmap)

# gender-specific
GENDER_SPECIFIC_LAB_MAPS = {
    'creatinine': {
        'Male': {'<0.70': 'Low (<0.70 mg/dL)', '0.70-1.40': 'Normal (0.70-1.40 mg/dL)', '>1.40': 'High (>1.40 mg/dL)', 'Non noto': 'Missing / Not Known'},
        'Female': {'<0.70': 'Low (<0.70 mg/dL)', '0.70-1.20': 'Normal (0.70-1.20 mg/dL)', '>1.20': 'High (>1.20 mg/dL)', 'Non noto': 'Missing / Not Known'}
    },
    'red_blood_cells': {
        'Male': {'<4.5': 'Low (<4.5 mil/µL)', '4.5-5.9': 'Normal (4.5-5.9 mil/µL)', '>5.9': 'High (>5.9 mil/µL)', 'Non noto': 'Missing / Not Known'},
        'Female': {'<4.0': 'Low (<4.0 mil/µL)', '4.0-5.0': 'Normal (4.0-5.0 mil/µL)', '>5.0': 'High (>5.0 mil/µL)', 'Non noto': 'Missing / Not Known'}
    },
    'hemoglobin': {
        'Male': {'<13': 'Low (<13 g/dL)', '13-17': 'Normal (13-17 g/dL)', '>17': 'High (>17 g/dL)', 'Non noto': 'Missing / Not Known'},
        'Female': {'<12': 'Low (<12 g/dL)', '12-16': 'Normal (12-16 g/dL)', '>16': 'High (>16 g/dL)', 'Non noto': 'Missing / Not Known'}
    }
}
def map_gender_specific(value, gender_value, lab_key):
    if pd.isna(value):
        return value
    s = str(value).strip().replace("≥", ">=")
    if gender_value not in ("Male", "Female"):
        return s
    return GENDER_SPECIFIC_LAB_MAPS[lab_key].get(gender_value, {}).get(s, s)

if 'gender' in df.columns:
    for ds_col, lab_key in {'creatinine_range':'creatinine','red_blood_cells_range':'red_blood_cells','hemoglobin_range':'hemoglobin'}.items():
        if ds_col in df.columns:
            df[ds_col] = df.apply(lambda r: map_gender_specific(r[ds_col], r['gender'], lab_key), axis=1)

# Save translated values
with pd.ExcelWriter(out_path, engine="xlsxwriter") as writer:
    df.to_excel(writer, index=False, sheet_name="data")

out_path


'C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/data/processed/merged_codige_wide_english_values_translated.xlsx'

In [23]:
import pandas as pd
import re

# Load the translated dataset
path = "C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/data/processed/merged_codige_wide_english_values_translated.xlsx"
df = pd.read_excel(path)

# Identify object-type (categorical/text) columns
text_cols = [col for col in df.columns if df[col].dtype == 'object']

# Regex for detecting possible Italian words
italian_pattern = re.compile(r"\b(non|sì|si|maschio|femmina|altro|specificare|noto|tossicità|progressione|trattamento|tumore|oncologico|ps|programmato)\b", re.IGNORECASE)

# Scan for columns containing Italian-like text
results = []
for col in text_cols:
    sample_values = df[col].dropna().astype(str).unique().tolist()
    matches = [val for val in sample_values if italian_pattern.search(val)]
    if matches:
        counts = df[col].value_counts().loc[matches].to_dict()
        results.append({
            "column": col,
            "n_unique_matches": len(matches),
            "sample_values": matches[:5],
            "frequency": counts
        })

# Convert to a readable DataFrame
freq_report = pd.DataFrame(results)

# Save audit report
out_audit = "C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/data/raw/italian_frequency_audit.xlsx"
with pd.ExcelWriter(out_audit, engine="xlsxwriter") as writer:
    freq_report.to_excel(writer, index=False, sheet_name="italian_terms")

out_audit


'C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/data/raw/italian_frequency_audit.xlsx'

In [24]:
import pandas as pd
import numpy as np

# Paths
data_path = "C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/data/processed/merged_codige_wide_english_values_translated.xlsx"
mapping_book_path = "C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/outputs/profiling/column_index_mapping.xlsx"

# Load data
df = pd.read_excel(data_path)

# --- Create (or recreate) the column index mapping base sheet ---
def excel_col_letter(n):
    result = ""
    while n > 0:
        n, remainder = divmod(n - 1, 26)
        result = chr(65 + remainder) + result
    return result

col_map_df = pd.DataFrame({
    "Column Name": df.columns,
    "Excel Column Letter": [excel_col_letter(i+1) for i in range(len(df.columns))]
})

with pd.ExcelWriter(mapping_book_path, engine="xlsxwriter") as writer:
    col_map_df.to_excel(writer, sheet_name="column_index", index=False)

# ------------------ Build profiling pieces ------------------
# 1) Dtypes
dtypes_df = pd.DataFrame({
    "column": df.columns,
    "dtype": [str(t) for t in df.dtypes]
})

# 2) Missingness
missing_df = pd.DataFrame({
    "column": df.columns,
    "missing_count": df.isna().sum().values,
    "missing_pct": (df.isna().mean().values * 100).round(2)
}).sort_values("missing_pct", ascending=False)

# 3) Cardinality / Uniques
card_df = pd.DataFrame({
    "column": df.columns,
    "n_unique": [df[c].nunique(dropna=True) for c in df.columns]
}).sort_values("n_unique", ascending=False)

# 4) Numeric summary
num_cols = df.select_dtypes(include=["number"]).columns.tolist()
numeric_summary = df[num_cols].describe(percentiles=[0.25, 0.5, 0.75]).T if num_cols else pd.DataFrame()
if not numeric_summary.empty:
    numeric_summary = numeric_summary.rename(columns={
        "25%": "p25", "50%": "median", "75%": "p75"
    }).reset_index().rename(columns={"index": "column"})

# 5) Categorical summary (object columns)
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()
cat_rows = []
for c in cat_cols:
    s = df[c].astype("string")
    vc = s.value_counts(dropna=False)
    top = vc.index[0] if len(vc) else np.nan
    top_freq = int(vc.iloc[0]) if len(vc) else np.nan
    cat_rows.append({"column": c, "n_unique": int(s.nunique(dropna=True)), "top": top, "top_freq": top_freq})
categorical_summary = pd.DataFrame(cat_rows).sort_values("n_unique", ascending=False)

# 6) Datetime summary (if any)
dt_cols = df.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]"]).columns.tolist()
dt_rows = []
for c in dt_cols:
    s = pd.to_datetime(df[c], errors="coerce")
    dt_rows.append({
        "column": c,
        "min_date": s.min(),
        "max_date": s.max(),
        "n_missing": s.isna().sum(),
        "n_unique": s.nunique(dropna=True)
    })
datetime_summary = pd.DataFrame(dt_rows)

# 7) Overview
overview = pd.DataFrame({
    "n_rows": [len(df)],
    "n_columns": [df.shape[1]],
    "n_numeric": [len(num_cols)],
    "n_categorical": [len(cat_cols)],
    "n_datetime": [len(dt_cols)],
    "missing_any_cols": [(missing_df["missing_count"] > 0).sum()],
})

# Append profiling sheets
with pd.ExcelWriter(mapping_book_path, mode="a", engine="openpyxl", if_sheet_exists="replace") as writer:
    overview.to_excel(writer, sheet_name="overview", index=False)
    dtypes_df.to_excel(writer, sheet_name="dtypes", index=False)
    missing_df.to_excel(writer, sheet_name="missingness", index=False)
    card_df.to_excel(writer, sheet_name="cardinality", index=False)
    if not numeric_summary.empty:
        numeric_summary.to_excel(writer, sheet_name="numeric_summary", index=False)
    categorical_summary.to_excel(writer, sheet_name="categorical_summary", index=False)
    if not datetime_summary.empty:
        datetime_summary.to_excel(writer, sheet_name="datetime_summary", index=False)

mapping_book_path


'C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/outputs/profiling/column_index_mapping.xlsx'

# Visuals

In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# ------------------ CONFIG ------------------
input_path = "C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/data/processed/merged_codige_wide_english_values_translated.xlsx"
output_dir = r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\outputs\plots"

# Create output folder if not exists
os.makedirs(output_dir, exist_ok=True)

# Load dataset
df = pd.read_excel(input_path)

# ------------------ TYPE DETECTION ------------------
num_cols = df.select_dtypes(include=['number']).columns.tolist()
cat_cols = df.select_dtypes(exclude=['number', 'datetime']).columns.tolist()

# ------------------ VISUALIZE CATEGORICAL ------------------
for col in cat_cols:
    # Skip columns with too many unique values
    if df[col].nunique(dropna=False) > 1 and df[col].nunique(dropna=False) < 50:
        plt.figure(figsize=(10, 5))
        df[col] = df[col].fillna("Missing")  # handle missing as explicit category
        df[col].value_counts(dropna=False).plot(kind='bar', color='steelblue', edgecolor='black')
        plt.title(f'Distribution of {col}', fontsize=13)
        plt.xlabel(col)
        plt.ylabel('Count')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f"{col}_bar.png"))
        plt.close()

# ------------------ VISUALIZE NUMERIC ------------------
for col in num_cols:
    if df[col].nunique(dropna=True) > 1:
        plt.figure(figsize=(8, 5))
        df[col].dropna().plot(kind='hist', bins=30, color='orange', edgecolor='black')
        plt.title(f'Histogram of {col}', fontsize=13)
        plt.xlabel(col)
        plt.ylabel('Frequency')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f"{col}_hist.png"))
        plt.close()

# ------------------ SUMMARY ------------------
print("✅ Visualization complete.")
print(f"Bar charts and histograms saved to: {output_dir}")


  plt.tight_layout()


✅ Visualization complete.
Bar charts and histograms saved to: C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\outputs\plots
