
# Phase 3 - Feature Integration and Selection Notebook

This notebook reproduces the integration step for Phase 3:
- Load cleaned analytic dataset and Phase 2 outputs
- Normalize feature names
- Merge robust signals from RI significance, univariate screening, and model coefficients
- Score and select top features (default: 150)
- Export the integrated dataset ready for Gower distance, UMAP, PCA, and clustering


In [1]:

import pandas as pd
import numpy as np
import re
from pathlib import Path

# Parameters
TOP_N = 150  # number of features to keep
BASE = Path("C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_3")
BASE1 = Path("C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/data/processed")
BASE2 = Path("C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_2/results/tables")
ART_DIR = BASE / "phase3_outputs"
ART_DIR.mkdir(exist_ok=True)

print("Artifacts will be written to:", ART_DIR)


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


Artifacts will be written to: C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_3\phase3_outputs


In [2]:

def norm(s):
    """Lightweight normalization for feature keys."""
    if pd.isna(s):
        return np.nan
    s = str(s).strip().replace("\n", " ")
    s = re.sub(r"\s+", " ", s)
    return s.lower()

def rank01(series):
    """Convert a numeric series to percentile ranks in [0,1], tolerant to missing."""
    s = series.copy()
    s = s.replace([np.inf, -np.inf], np.nan)
    if s.notna().sum() < 3:
        return pd.Series(np.nan, index=s.index)
    return s.rank(pct=True)

def prep_coef(df):
    """Prepare coefficient tables to expose a unified feature_norm and coefficient column."""
    cols = {c.lower(): c for c in df.columns}
    feat_col = cols.get("feature_expanded", None)
    if feat_col is None:
        candidates = [c for c in df.columns if "feature" in c.lower()]
        feat_col = candidates[0] if candidates else df.columns[0]
    df = df.copy()
    df["feature_norm"] = df[feat_col].map(norm)
    coef_col = cols.get("coefficient", None)
    if coef_col is None:
        alt = [c for c in df.columns if "coef" in c.lower()]
        coef_col = alt[0] if alt else None
    if coef_col:
        df["coefficient"] = pd.to_numeric(df[coef_col], errors="coerce")
    else:
        df["coefficient"] = np.nan
    return df


In [3]:

# Core data
clean = pd.read_excel(BASE1 / "cleaned_data.xlsx")
types = pd.read_excel(BASE2 / "features_data_types.xlsx")

# Phase 2 outputs
ri_sig = pd.read_excel(BASE2 / "RI_Significance_Table_robust.xlsx")
uni_top = pd.read_excel(BASE2 / "univariate_summary_top25.xlsx")
coef_frailty = pd.read_excel(BASE2 / "coefficients_Frailty_Category.xlsx")
coef_adr = pd.read_excel(BASE2 / "coefficients_Severe_ADRs.xlsx")
coef_readm = pd.read_excel(BASE2 / "coefficients_readmission_flag.xlsx")

clean.shape, types.shape, ri_sig.shape, uni_top.shape, coef_frailty.shape, coef_adr.shape, coef_readm.shape


((403, 109), (114, 2), (112, 16), (75, 7), (81, 3), (83, 3), (83, 3))

In [4]:

# Build original->normalized column map
clean_cols = pd.Series(clean.columns, name="feature_orig")
col_map = pd.DataFrame({
    "feature_orig": clean_cols,
    "feature_norm": clean_cols.map(norm)
})

# Prepare typing table
if "Feature" in types.columns:
    types = types.rename(columns={"Feature": "feature_orig"})
    types["feature_norm"] = types["feature_orig"].map(norm)
elif "feature" in types.columns:
    types = types.rename(columns={"feature": "feature_orig"})
    types["feature_norm"] = types["feature_orig"].map(norm)
else:
    # fallback to dtype introspection
    types = pd.DataFrame({
        "feature_orig": clean.columns,
        "feature_norm": col_map["feature_norm"],
        "Data Type": clean.dtypes.astype(str).values
    })

# RI significance (robust)
ri_sig_cols = {c.lower(): c for c in ri_sig.columns}
if "variable" in ri_sig_cols:
    var_col = ri_sig_cols["variable"]
else:
    var_candidates = [c for c in ri_sig.columns if "var" in c.lower()]
    var_col = var_candidates[0] if var_candidates else ri_sig.columns[0]
ri_sig = ri_sig.copy()
ri_sig["feature_norm"] = ri_sig[var_col].map(norm)

# Univariate summary
uni_cols = {c.lower(): c for c in uni_top.columns}
feat_col_uni = uni_cols.get("feature", list(uni_top.columns)[0])
p_col_uni = uni_cols.get("p-value", None)

eff_col_uni = None
for k in ["effect size", "effect_size", "effsize"]:
    if k in uni_cols:
        eff_col_uni = uni_cols[k]
        break

uni_top = uni_top.copy()
uni_top["feature_norm"] = uni_top[feat_col_uni].map(norm)
if p_col_uni and p_col_uni in uni_top.columns:
    uni_top["p_value"] = pd.to_numeric(uni_top[p_col_uni], errors="coerce")
else:
    uni_top["p_value"] = np.nan

if eff_col_uni and eff_col_uni in uni_top.columns:
    uni_top["effect_size"] = pd.to_numeric(uni_top[eff_col_uni], errors="coerce")
else:
    uni_top["effect_size"] = np.nan

# Coefficient tables
coef_frailty_p = prep_coef(coef_frailty)
coef_adr_p = prep_coef(coef_adr)
coef_readm_p = prep_coef(coef_readm)


In [5]:

# Initialize meta with all columns from cleaned data
meta = pd.DataFrame({"feature_norm": col_map["feature_norm"].unique()})
meta = meta.merge(col_map.groupby("feature_norm").agg(feature_orig=("feature_orig","first")).reset_index(),
                  on="feature_norm", how="left")

# Attach data type
if "Data Type" in types.columns:
    meta = meta.merge(types[["feature_norm","Data Type"]], on="feature_norm", how="left")
else:
    dtype_map = clean.dtypes.astype(str).to_dict()
    meta["Data Type"] = meta["feature_orig"].map(dtype_map)

# Attach RI metrics (when available)
for col in ["Combined_RI","Mean_log_effect","Mean_sd","Mean_sel"]:
    src = [c for c in ri_sig.columns if c.lower()==col.lower()]
    if src:
        meta = meta.merge(ri_sig[["feature_norm", src[0]]].rename(columns={src[0]: col}), on="feature_norm", how="left")

# Univariate aggregation across outcomes
agg_uni = (uni_top
           .groupby("feature_norm", as_index=False)
           .agg(min_p_value=("p_value","min"),
                max_abs_effect=("effect_size", lambda s: np.nanmax(np.abs(s)))))
meta = meta.merge(agg_uni, on="feature_norm", how="left")

# Coefficient magnitudes (max abs per feature across families)
def attach_coef(meta_df, coef_df, name):
    agg = (coef_df.groupby("feature_norm", as_index=False)
           .agg(**{f"{name}_abs_coef": ("coefficient", lambda s: np.nanmax(np.abs(s)))}))
    return meta_df.merge(agg, on="feature_norm", how="left")

meta = attach_coef(meta, coef_frailty_p, "frailty")
meta = attach_coef(meta, coef_adr_p, "adr")
meta = attach_coef(meta, coef_readm_p, "readm")

# Scores (percentile ranks)
meta["score_ri"] = rank01(meta["Combined_RI"])
meta["score_logeff"] = rank01(meta["Mean_log_effect"].abs())
meta["score_p"] = rank01(-np.log10(meta["min_p_value"]))  # smaller p => larger score

for nm in ["frailty_abs_coef","adr_abs_coef","readm_abs_coef","max_abs_effect"]:
    if nm in meta.columns:
        meta[f"score_{nm}"] = rank01(meta[nm])

# Weighted ensemble
weights = {
    "score_ri": 0.35,
    "score_logeff": 0.15,
    "score_p": 0.20,
    "score_frailty_abs_coef": 0.10,
    "score_adr_abs_coef": 0.10,
    "score_readm_abs_coef": 0.10
}

def weighted_sum(row):
    total, wsum = 0.0, 0.0
    for k, w in weights.items():
        if k in row.index and pd.notna(row[k]):
            total += w * row[k]
            wsum += w
    return total / wsum if wsum > 0 else np.nan

meta["feature_score"] = meta.apply(weighted_sum, axis=1)
meta.head()


Unnamed: 0,feature_norm,feature_orig,Data Type,Combined_RI,Mean_log_effect,Mean_sd,Mean_sel,min_p_value,max_abs_effect,frailty_abs_coef,adr_abs_coef,readm_abs_coef,score_ri,score_logeff,score_p,score_frailty_abs_coef,score_adr_abs_coef,score_readm_abs_coef,score_max_abs_effect,feature_score
0,patient_id,patient_id,object,,,,,,,,,,,,,,,,,
1,birth_date,birth_date,object,,,,,,,,,,,,,,,,,
2,age,age,int64,0.254108,0.047614,0.25,0.5,0.073006,0.393446,,,,0.432749,0.333333,0.011364,,,,0.414773,0.29105
3,age_group,age_group,object,,,,,0.112358,0.079088,,,,,,0.005682,,,,0.005682,0.005682
4,gender,gender,object,,,,,,,,,,,,,,,,,


In [6]:
meta

Unnamed: 0,feature_norm,feature_orig,Data Type,Combined_RI,Mean_log_effect,Mean_sd,Mean_sel,min_p_value,max_abs_effect,frailty_abs_coef,adr_abs_coef,readm_abs_coef,score_ri,score_logeff,score_p,score_frailty_abs_coef,score_adr_abs_coef,score_readm_abs_coef,score_max_abs_effect,feature_score
0,patient_id,patient_id,object,,,,,,,,,,,,,,,,,
1,birth_date,birth_date,object,,,,,,,,,,,,,,,,,
2,age,age,int64,0.254108,0.047614,0.25,0.5,0.073006,0.393446,,,,0.432749,0.333333,0.011364,,,,0.414773,0.291050
3,age_group,age_group,object,,,,,0.112358,0.079088,,,,,,0.005682,,,,0.005682,0.005682
4,gender,gender,object,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,adr_chemo_correlation,adr_chemo_correlation,float64,,,,,,,,,,,,,,,,,
240,adr_chemo_action,adr_chemo_action,object,,,,,,,,,,,,,,,,,
241,adr_source_project,adr_source_project,object,,,,,,,,,,,,,,,,,
242,adr_macro_category,adr_macro_category,object,,,,,,,,,,,,,,,,,


In [7]:

# Selection logic
candidates = meta.sort_values("feature_score", ascending=False)

q75 = candidates["feature_score"].quantile(0.75)
sel_high = candidates[candidates["feature_score"] >= q75].copy()
sel_p = candidates[(candidates["min_p_value"] <= 0.05)].copy()

selected = pd.concat([sel_high, sel_p], axis=0).drop_duplicates(subset=["feature_norm"])
selected = selected.sort_values("feature_score", ascending=False).head(TOP_N).copy()

# Retain only columns that exist in the cleaned dataset
selected = selected[selected["feature_orig"].isin(clean.columns)]

selected.shape, selected.head(10)


((45, 20),
                 feature_norm             feature_orig Data Type  Combined_RI  \
 81      transfusion_received     transfusion_received    object     1.000000   
 89       atrial_fibrillation      atrial_fibrillation    object     0.720105   
 87       obesity_comorbidity      obesity_comorbidity    object     0.605841   
 86                       bph                      bph    object     0.576195   
 85              dyslipidemia             dyslipidemia    object     0.518550   
 189  white_blood_cells_range  white_blood_cells_range    object          NaN   
 237          adr_ctcae_grade          adr_ctcae_grade   float64     0.411648   
 83              hypertension             hypertension    object     0.995395   
 204         treatment_line_n         treatment_line_n   float64     1.000000   
 31    observation_end_reason   observation_end_reason    object          NaN   
 
      Mean_log_effect  Mean_sd  Mean_sel   min_p_value  max_abs_effect  \
 81          0.196648 

In [8]:
selected.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45 entries, 81 to 170
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   feature_norm            45 non-null     object 
 1   feature_orig            45 non-null     object 
 2   Data Type               45 non-null     object 
 3   Combined_RI             24 non-null     float64
 4   Mean_log_effect         24 non-null     float64
 5   Mean_sd                 24 non-null     float64
 6   Mean_sel                24 non-null     float64
 7   min_p_value             38 non-null     float64
 8   max_abs_effect          38 non-null     float64
 9   frailty_abs_coef        11 non-null     float64
 10  adr_abs_coef            11 non-null     float64
 11  readm_abs_coef          11 non-null     float64
 12  score_ri                24 non-null     float64
 13  score_logeff            24 non-null     float64
 14  score_p                 38 non-null     float64

In [9]:

# Try to locate a patient id column
id_candidates = [c for c in clean.columns if re.search(r"(patient.*id|id.*patient|^id$)", c, flags=re.I)]
id_col = id_candidates[0] if id_candidates else None

keep_cols = ([id_col] if id_col else []) + selected["feature_orig"].tolist()
integrated = clean[keep_cols].copy()

integrated.shape, id_col, integrated.head(3)


((403, 46),
 'patient_id',
                            patient_id transfusion_received  \
 0                       10_AO San Pio          Absent / No   
 1               10_AORN A. Cardarelli          Absent / No   
 2  10_AORN Monaldi – Cotugno - C.T.O.          Absent / No   
 
   atrial_fibrillation obesity_comorbidity          bph dyslipidemia  \
 0       Present / Yes         Absent / No  Absent / No  Absent / No   
 1         Absent / No         Absent / No  Absent / No  Absent / No   
 2         Absent / No         Absent / No  Absent / No  Absent / No   
 
         white_blood_cells_range  adr_ctcae_grade   hypertension  \
 0  Normal (4000-11000 cells/µL)         1.500000  Present / Yes   
 1  Normal (4000-11000 cells/µL)         1.285714  Present / Yes   
 2  Normal (4000-11000 cells/µL)         2.333333  Present / Yes   
 
    treatment_line_n  ...  ethnicity oncology_treatment_lines_n  \
 0          1.857143  ...  Caucasian                        3.0   
 1          1.000000 

In [10]:
integrated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403 entries, 0 to 402
Data columns (total 46 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   patient_id                       403 non-null    object 
 1   transfusion_received             403 non-null    object 
 2   atrial_fibrillation              403 non-null    object 
 3   obesity_comorbidity              403 non-null    object 
 4   bph                              403 non-null    object 
 5   dyslipidemia                     403 non-null    object 
 6   white_blood_cells_range          389 non-null    object 
 7   adr_ctcae_grade                  403 non-null    float64
 8   hypertension                     403 non-null    object 
 9   treatment_line_n                 403 non-null    float64
 10  observation_end_reason           403 non-null    object 
 11  death_during_observation         403 non-null    object 
 12  surgical_intervention 

In [12]:

meta.to_csv(ART_DIR / "feature_metadata_scores.csv", index=False)
selected.to_csv(ART_DIR / "selected_features_manifest.csv", index=False)
integrated.to_csv(ART_DIR / "phase3_integrated_data.csv", index=False)

print("Artifacts saved:")
print(" -", ART_DIR / "feature_metadata_scores.csv")
print(" -", ART_DIR / "selected_features_manifest.csv")
print(" -", ART_DIR / "phase3_integrated_data.csv")


Artifacts saved:
 - C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_3\phase3_outputs\feature_metadata_scores.csv
 - C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_3\phase3_outputs\selected_features_manifest.csv
 - C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_3\phase3_outputs\phase3_integrated_data.csv
