In [2]:
import pandas as pd

# Load the two phenotype-annotated tables produced in Phase 3.
death = pd.read_csv(
    r"C:\Users\HP\OneDrive\Desktop\Phase 3\merged_with_phenotypes\death_with_phenotype.csv"
)
master = pd.read_csv(
    r"C:\Users\HP\OneDrive\Desktop\Phase 3\merged_with_phenotypes\master_with_phenotype.csv"
)

# Work on the column names as sets so the comparison ignores column order.
death_cols, master_cols = set(death.columns), set(master.columns)

# Sorted so the printed lists are stable between runs.
only_in_death = sorted(death_cols.difference(master_cols))
only_in_master = sorted(master_cols.difference(death_cols))
common_cols = sorted(death_cols.intersection(master_cols))

# Each report is a heading line followed by the list on its own line.
print("Only in death_with_phenotype:", only_in_death, sep="\n")
print("\nOnly in master_with_phenotype:", only_in_master, sep="\n")
print("\nCommon columns:", common_cols, sep="\n")


Only in death_with_phenotype:
[]

Only in master_with_phenotype:
['Number of Previous Treatment Lines', 'Oncology Unit Intake Date', 'Other Surgical Intervention', 'albumin_range', 'azotemia_range', 'birth_date', 'blood_glucose_range', 'bmi_value', 'breast_cancer_subtype', 'cci_groups', 'chronic_polypharmacy_flag', 'colon_cancer_location', 'death_date', 'ejection_fraction_category', 'ejection_fraction_percent', 'gamma_gt_range', 'genotipo_DPYD', 'histological_grade', 'hospitalization_for_surgery_complication', 'medication_duration_days', 'n_analgesic_drugs', 'n_cardiovascular_drugs', 'n_chronic_drugs', 'n_cns_drugs', 'n_gastrointestinal_drugs', 'n_med_events', 'n_metabolic_drugs', 'n_respiratory_drugs', 'n_unique_active_principles', 'observation_end_date', 'observation_end_reason', 'observation_start_date', 'ordinary_hospitalizations_n', 'other_comorbidities', 'polypharmacy_flag', 'prior_radiotherapy', 'prior_surgery', 'radiotherapy_end_date', 'radiotherapy_start_date', 'reoperation_fo

In [3]:
def audit_columns(df, id_cols=None):
    """Summarise every non-identifier column of ``df`` for a variable audit.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to audit.
    id_cols : str or iterable of str, optional
        Column name(s) to leave out of the audit. ``None`` skips nothing.
        A single string is treated as one column name.

    Returns
    -------
    pandas.DataFrame
        One row per audited column, with the columns ``variable``, ``dtype``,
        ``missing_pct`` (percent of missing values, 2 decimals), ``n_unique``
        (distinct non-missing values), ``top_level`` (most frequent
        non-missing value) and ``top_level_pct`` (its share of the
        non-missing values, in percent, 2 decimals).
        ``top_level`` / ``top_level_pct`` are only filled for label-like
        columns (object, string, categorical or boolean dtype) that have at
        least one non-missing value; otherwise they are empty.
        The six columns are always present, even when nothing was audited.
    """
    out_columns = [
        "variable", "dtype", "missing_pct", "n_unique", "top_level", "top_level_pct",
    ]

    if id_cols is None:
        id_cols = []
    elif isinstance(id_cols, str):
        # A bare string would otherwise be matched by substring (`col in "patient_id"`).
        id_cols = [id_cols]
    skipped = set(id_cols)

    rows = []

    for col in df.columns:
        if col in skipped:
            continue

        s = df[col]
        missing = s.isna().mean()
        nunique = s.nunique(dropna=True)

        # Detect label-like columns by dtype family: comparing against "object"
        # alone misses categorical, dedicated-string and boolean columns.
        label_like = (
            isinstance(s.dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(s)
            or pd.api.types.is_string_dtype(s)
            or pd.api.types.is_bool_dtype(s)
        )

        top_level = None
        top_prop = None
        if label_like:
            vc = s.value_counts(dropna=True, normalize=True)
            # The `> 0` test rejects an all-missing categorical, whose unused
            # categories still show up in value_counts with an undefined share.
            if len(vc) and vc.iloc[0] > 0:
                top_level = vc.index[0]
                top_prop = float(vc.iloc[0])

        rows.append({
            "variable": col,
            "dtype": str(s.dtype),
            "missing_pct": round(100 * missing, 2),
            "n_unique": nunique,
            "top_level": top_level,
            "top_level_pct": round(100 * top_prop, 2) if top_prop is not None else None,
        })

    # Passing `columns` keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=out_columns)


In [4]:
# Audit both tables the same way, leaving the identifier column out.
death_audit, master_audit = (
    audit_columns(table, id_cols=["patient_id"]) for table in (death, master)
)

# Show the ten death-table variables whose most frequent level is most dominant.
death_audit.sort_values(by="top_level_pct", ascending=False).head(10)


Unnamed: 0,variable,dtype,missing_pct,n_unique,top_level,top_level_pct
33,psychiatric_disorders,object,0.0,2,Absent / No,99.75
19,aortic_insufficiency,object,0.0,2,Absent / No,99.51
26,asthma,object,0.0,2,Absent / No,99.26
5,ethnicity,object,0.0,4,Caucasian,99.01
36,cerebrovascular_disorders,object,0.0,2,Absent / No,99.01
30,renal_insufficiency,object,0.0,2,Absent / No,98.77
22,obesity_comorbidity,object,0.0,2,Absent / No,97.78
31,depressive_syndrome,object,0.0,2,Absent / No,97.78
24,atrial_fibrillation,object,0.0,2,Absent / No,97.54
32,anemia_comorbidity,object,0.0,2,Absent / No,97.29


In [5]:
# Write both audit tables to the working directory, without the row index.
master_audit.to_csv(path_or_buf="master_table_audit.csv", index=False)
death_audit.to_csv(path_or_buf="death_table_audit.csv", index=False)


In [6]:
import pandas as pd
import numpy as np

# ----------------------------
# 1) Load data
# ----------------------------
df = pd.read_csv(r"C:\Users\HP\OneDrive\Desktop\Phase 3\merged_with_phenotypes\master_with_phenotype.csv")
# Strip header whitespace so name-based lookups (e.g. ID_COLS) match exactly.
df.columns = [c.strip() for c in df.columns]

ID_COLS = ["patient_id"]

N = len(df)

# Thresholds behind the review flags (values unchanged from the original cell).
MAX_LEVELS_FOR_DOMINANCE = 15    # non-text columns with fewer distinct values also get a dominant level
DOMINANCE_PCT_THRESHOLD = 95     # dominant level share (of non-missing values) that counts as extreme
MISSINGNESS_PCT_THRESHOLD = 40   # missing share that counts as high

# ----------------------------
# 2) Audit loop (one record per non-identifier column)
# ----------------------------
rows = []

for col in df.columns:
    if col in ID_COLS:
        continue

    s = df[col]

    missing_pct = 100 * s.isna().mean()
    nunique = s.nunique(dropna=True)

    dominant_level = None
    dominant_pct = None

    # Detect text-like columns by dtype family instead of `dtype == "object"`:
    # pandas can store text in a dedicated string dtype (or as categorical),
    # and such columns would otherwise lose their dominance figure once they
    # have MAX_LEVELS_FOR_DOMINANCE or more levels.
    is_text_like = (
        isinstance(s.dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(s)
        or pd.api.types.is_string_dtype(s)
    )

    if is_text_like or nunique < MAX_LEVELS_FOR_DOMINANCE:
        # Shares are computed over non-missing values only, so a column that is
        # almost entirely missing can still report a 100% dominant level.
        vc = s.value_counts(dropna=True, normalize=True)
        if len(vc) > 0:
            dominant_level = str(vc.index[0])
            dominant_pct = 100 * vc.iloc[0]

    near_zero_variance = nunique <= 1
    extreme_dominance = dominant_pct is not None and dominant_pct >= DOMINANCE_PCT_THRESHOLD
    high_missingness = missing_pct >= MISSINGNESS_PCT_THRESHOLD

    rows.append({
        "variable": col,
        "dtype": str(s.dtype),
        "missing_pct": round(missing_pct, 2),
        "n_unique": nunique,
        "dominant_level": dominant_level,
        "dominant_level_pct": round(dominant_pct, 2) if dominant_pct is not None else None,
        "flag_near_zero_variance": near_zero_variance,
        "flag_extreme_dominance": extreme_dominance,
        "flag_high_missingness": high_missingness,
        "audit_flagged": near_zero_variance or extreme_dominance or high_missingness,
        "decision": "",      # YOU fill: KEEP / DROP / COLLAPSE
        "notes": ""          # YOU justify
    })

audit_df = pd.DataFrame(rows)

# ----------------------------
# 3) Sort for human review
# ----------------------------
# Flagged variables first, then by strongest dominance, then by missingness.
audit_df = audit_df.sort_values(
    by=["audit_flagged", "dominant_level_pct", "missing_pct"],
    ascending=[False, False, False]
)

# ----------------------------
# 4) Save
# ----------------------------
audit_path = "phase5_variable_audit_table.csv"
audit_df.to_csv(audit_path, index=False)

print("Saved audit table:", audit_path)
print("\nFlagged variables (top 10):")
print(
    audit_df[audit_df["audit_flagged"]]
    .head(10)[
        ["variable", "missing_pct", "dominant_level_pct",
         "flag_near_zero_variance", "flag_extreme_dominance", "flag_high_missingness"]
    ]
)


Saved audit table: phase5_variable_audit_table.csv

Flagged variables (top 10):
                       variable  missing_pct  dominant_level_pct  \
19        breast_cancer_subtype        99.75              100.00   
114             n_chronic_drugs        54.68              100.00   
116   chronic_polypharmacy_flag        54.68              100.00   
60        psychiatric_disorders         0.00               99.75   
73                 adr_n_grado5         0.00               99.75   
46         aortic_insufficiency         0.00               99.51   
100              received_chemo         0.00               99.51   
53                       asthma         0.00               99.26   
56   hypertensive_heart_disease         0.00               99.01   
64    cerebrovascular_disorders         0.00               99.01   

     flag_near_zero_variance  flag_extreme_dominance  flag_high_missingness  
19                      True                    True                   True  
114            

In [7]:
# List all column names
cols = death.columns.tolist()

print(f"Number of columns: {len(cols)}\n")
for i, c in enumerate(cols, 1):
    print(f"{i:02d}. {c}")
    
    
cols



Number of columns: 76

01. patient_id
02. death_outcome
03. survival_days
04. age
05. age_group
06. gender
07. ethnicity
08. education_level
09. bmi_category
10. employment_status
11. alcohol_consumption
12. smoking_status_detail
13. tumor_type
14. molecular_alterations
15. mutations_present
16. genotipo_DPYD_type
17. surgical_intervention
18. oncology_treatment_lines_n
19. radiotherapy_status
20. hypertension
21. aortic_insufficiency
22. dyslipidemia
23. IPB
24. obesity_comorbidity
25. ischemic_heart_disease
26. atrial_fibrillation
27. copd
28. asthma
29. diabete_tipo_II
30. gastroesophageal_reflux_full
31. hypertensive_heart_disease
32. renal_insufficiency
33. depressive_syndrome
34. anemia_comorbidity
35. psychiatric_disorders
36. cardiovascular_disorders
37. gastrointestinal_disorders
38. cerebrovascular_disorders
39. adr_description
40. adr_n_tot
41. adr_n_grado1
42. adr_n_grado2
43. adr_n_grado3
44. adr_n_grado4
45. adr_n_grado5
46. white_blood_cells_range
47. red_blood_cells_ran

['patient_id',
 'death_outcome',
 'survival_days',
 'age',
 'age_group',
 'gender',
 'ethnicity',
 'education_level',
 'bmi_category',
 'employment_status',
 'alcohol_consumption',
 'smoking_status_detail',
 'tumor_type',
 'molecular_alterations',
 'mutations_present',
 'genotipo_DPYD_type',
 'surgical_intervention',
 'oncology_treatment_lines_n',
 'radiotherapy_status',
 'hypertension',
 'aortic_insufficiency',
 'dyslipidemia',
 'IPB',
 'obesity_comorbidity',
 'ischemic_heart_disease',
 'atrial_fibrillation',
 'copd',
 'asthma',
 'diabete_tipo_II',
 'gastroesophageal_reflux_full',
 'hypertensive_heart_disease',
 'renal_insufficiency',
 'depressive_syndrome',
 'anemia_comorbidity',
 'psychiatric_disorders',
 'cardiovascular_disorders',
 'gastrointestinal_disorders',
 'cerebrovascular_disorders',
 'adr_description',
 'adr_n_tot',
 'adr_n_grado1',
 'adr_n_grado2',
 'adr_n_grado3',
 'adr_n_grado4',
 'adr_n_grado5',
 'white_blood_cells_range',
 'red_blood_cells_range',
 'hemoglobin_range',

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer


# Reuse `death` when an earlier cell already defined it; otherwise read it from disk.
if "death" not in globals():
    death = pd.read_csv("/mnt/data/death_with_phenotype.csv")

# Strip stray whitespace from the headers so the name checks below match exactly.
death.columns = [name.strip() for name in death.columns]

# ----------------------------
# 1) Locked drops (leakage/outcomes/admin/redundant ADR grades)
# ----------------------------
DROP_COLS = [
    "patient_id",
    "death_outcome",
    "survival_days",
    "end_due_to_progression",
    "end_reason_progression_any_line",
    "end_reason_other_any_line",
    "chemio_fine_tossicita",
    "adr_description",
    "adr_n_grado1",
    "adr_n_grado2",
    "adr_n_grado3",
    "adr_n_grado4",
    "adr_n_grado5",
]

# Column holding the label to predict.
TARGET_COL = "phenotype"

# ----------------------------
# 2) Frozen Phase 5 feature list (as agreed)
# ----------------------------
# One name per line; the order of this list is the order of the model frame columns.
FEATURE_COLS = [
    # Demographics & Socio-economic
    "age",
    "age_group",
    "gender",
    "ethnicity",
    "education_level",
    "employment_status",
    "alcohol_consumption",
    "smoking_status_detail",

    # Tumor & Molecular Context
    "tumor_type",
    "molecular_alterations",
    "mutations_present",
    "genotipo_DPYD_type",

    # Treatment Exposure (Baseline)
    "surgical_intervention",
    "radiotherapy_status",
    "received_chemo",
    "received_targeted_therapy",
    "oncology_treatment_lines_n",
    "n_treatment_lines",
    "max_combo_regimen_size",
    "total_chemo_cycles",
    "treatment_duration_days",

    # Comorbidities & Clinical Conditions
    "hypertension",
    "dyslipidemia",
    "ischemic_heart_disease",
    "atrial_fibrillation",
    "hypertensive_heart_disease",
    "diabete_tipo_II",
    "obesity_comorbidity",
    "copd",
    "asthma",
    "renal_insufficiency",
    "anemia_comorbidity",
    "depressive_syndrome",
    "psychiatric_disorders",
    "cerebrovascular_disorders",
    "gastroesophageal_reflux_full",
    "gastrointestinal_disorders",
    "cardiovascular_disorders",

    # Frailty & Burden Indices
    "cci_score",
    "IPB",
    "farmaci_cat_n",
    "total_unique_active_drugs",

    # Laboratory Ranges
    "white_blood_cells_range",
    "red_blood_cells_range",
    "hemoglobin_range",
    "neutrophils_percent_range",
    "platelet_count_range",
    "creatinine_range",
    "ast_got_range",
    "alt_gpt_range",
    "total_bilirubin_range",
    "direct_bilirubin_range",

    # ADR Summary (single representation)
    "adr_n_tot",
]

# ----------------------------
# 3) Sanity checks: confirm required columns exist
# ----------------------------
# Enforce the locked drop list. DROP_COLS is otherwise never applied, because
# the modelling frame is built from FEATURE_COLS only; this guard stops a
# banned column (or the target itself) from slipping into the feature list.
banned_features = [c for c in FEATURE_COLS if c in DROP_COLS or c == TARGET_COL]
if banned_features:
    raise ValueError(
        "These frozen Phase 5 features are on the locked drop list (or are the target):\n"
        + "\n".join(banned_features)
    )

# Every frozen feature must be present in the loaded table.
missing_features = [c for c in FEATURE_COLS if c not in death.columns]
if missing_features:
    raise ValueError(
        "These frozen Phase 5 features are missing from `death_with_phenotype`:\n"
        + "\n".join(missing_features)
    )

if TARGET_COL not in death.columns:
    raise ValueError(f"Target column '{TARGET_COL}' is missing from dataset.")

# Confirm age_group labels (as you insisted)
# Only warns: extra labels are tolerated, but both expected labels should be present.
expected_age = {"<= 65 years", "> 65 years"}
age_vals = set(death["age_group"].dropna().astype(str).str.strip().unique().tolist())
if not expected_age.issubset(age_vals):
    print("WARNING: age_group does not contain both expected labels.")
    print("Found labels:", sorted(age_vals))

# ----------------------------
# 4) Build modeling frame: drop banned columns + keep frozen features + target
# ----------------------------
# Selecting exactly the frozen features plus the target leaves every other column behind.
use_cols = [*FEATURE_COLS, TARGET_COL]
df = death.loc[:, use_cols].copy()

# Clean target: coerce to numeric (unparseable values become NaN),
# then keep only the rows whose target is exactly 0 or 1.
df[TARGET_COL] = pd.to_numeric(df[TARGET_COL], errors="coerce")
df = df.loc[df[TARGET_COL].isin([0, 1])].copy()

# ----------------------------
# 5) Quick audit summary (missingness + dtype)
# ----------------------------
# One record per frozen feature, computed on the cleaned modelling frame.
audit = pd.DataFrame(
    [
        {
            "variable": name,
            "dtype": str(df[name].dtype),
            "missing_pct": round(100 * df[name].isna().mean(), 2),
            "n_unique": df[name].nunique(dropna=True),
        }
        for name in FEATURE_COLS
    ],
    columns=["variable", "dtype", "missing_pct", "n_unique"],
)
audit.to_csv("phase5_feature_audit_death_with_phenotype.csv", index=False)

# Save frozen feature list for traceability
pd.DataFrame(FEATURE_COLS, columns=["feature"]).to_csv("phase5_frozen_features.csv", index=False)

print(
    "Saved:",
    " - phase5_feature_audit_death_with_phenotype.csv",
    " - phase5_frozen_features.csv",
    sep="\n",
)

# ----------------------------
# 6) Split X and y
# ----------------------------
X = df.loc[:, FEATURE_COLS].copy()
y = df[TARGET_COL].astype(int).copy()

# If you want to keep only your two age_group labels strictly:
# X = X[X["age_group"].isin(["<= 65 years", "> 65 years"])]
# y = y.loc[X.index]

# 80/20 hold-out, stratified on the target, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

print("\nSplit summary:")
print("Train size:", X_train.shape, "| Test size:", X_test.shape)
for part_name, part_target in (("train", y_train), ("test", y_test)):
    print(
        f"Phenotype distribution ({part_name}):\n",
        part_target.value_counts(normalize=True).round(3),
    )

# ----------------------------
# 7) Preprocessing (no model yet)
# ----------------------------
# Route each feature by its dtype in the training split: numeric dtypes go to the
# numeric branch (even when low-cardinality), everything else to the categorical branch.
numeric_cols, categorical_cols = [], []
for feature in FEATURE_COLS:
    if pd.api.types.is_numeric_dtype(X_train[feature]):
        numeric_cols.append(feature)
    else:
        categorical_cols.append(feature)

# Numeric branch: median imputation only.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

# Categorical branch: mode imputation, then dense one-hot encoding
# that ignores levels not seen during fit.
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

preprocessor = ColumnTransformer(
    [
        ("num", numeric_pipe, numeric_cols),
        ("cat", categorical_pipe, categorical_cols),
    ],
    remainder="drop",
)

# Fit the preprocessor on train only (prevents leakage)
preprocessor.fit(X_train)

# Transform to matrices (ready for modeling)
X_train_mat, X_test_mat = (preprocessor.transform(part) for part in (X_train, X_test))

print("\nMatrix shapes after preprocessing:")
print("X_train_mat:", X_train_mat.shape)
print("X_test_mat:", X_test_mat.shape)

# Optional: get feature names after one-hot
# Best-effort export: any failure is reported as a note instead of stopping the cell.
try:
    onehot_step = preprocessor.named_transformers_["cat"].named_steps["onehot"]
    cat_feature_names = onehot_step.get_feature_names_out(categorical_cols)
    # Numeric names first, then the expanded one-hot names (same order as the transformer list).
    feature_names = np.concatenate((np.array(numeric_cols), cat_feature_names))
    pd.DataFrame({"feature": feature_names}).to_csv("phase5_model_matrix_feature_names.csv", index=False)
    print("Saved: phase5_model_matrix_feature_names.csv")
except Exception as e:
    print("Note: Could not export one-hot feature names:", e)




Saved:
 - phase5_feature_audit_death_with_phenotype.csv
 - phase5_frozen_features.csv

Split summary:
Train size: (324, 53) | Test size: (82, 53)
Phenotype distribution (train):
 phenotype
0    0.676
1    0.324
Name: proportion, dtype: float64
Phenotype distribution (test):
 phenotype
0    0.671
1    0.329
Name: proportion, dtype: float64

Matrix shapes after preprocessing:
X_train_mat: (324, 128)
X_test_mat: (82, 128)
Saved: phase5_model_matrix_feature_names.csv
