In [2]:
import pandas as pd
from pathlib import Path

base = Path("C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/new_data")

master = pd.read_excel(base / "codige_with_cci.xlsx")
treat = pd.read_excel(base / "codige_trattamento_oncologico_english.xlsx")

# Clean IDs and dates
master["patient_id"] = master["patient_id"].astype(str).str.strip()
treat["patient_id"] = treat["patient_id"].astype(str).str.strip()
for col in ["chemo_schema_start_date", "chemo_schema_end_date"]:
    treat[col] = pd.to_datetime(treat[col], errors="coerce")

# Line-level summary
line_group_cols = ["patient_id", "linea_trattamento_oncologico"]
line_summary = treat.groupby(line_group_cols).agg(
    first_schema_name=("chemo_schema_name", "first"),
    line_start_date=("chemo_schema_start_date", "min"),
    line_end_date=("chemo_schema_end_date", "max"),
    line_cycles=("chemo_cycles_n", "max"),
    line_end_reason=("chemo_schema_end_reason", lambda x: x.dropna().iloc[0] if x.dropna().size > 0 else pd.NA),
    line_any_toxicity=("toxicity_type", lambda x: x.notna().any()),
    line_any_dose_reduction=("dose_reduced", lambda x: (x == "Present / Yes").any()),
    line_n_active_drugs=("active_principles_n", "max"),
)
line_summary.reset_index(inplace=True)

# Build base oncology predictors DataFrame with unique patient_ids from treatment data
oncology_predictors = pd.DataFrame({"patient_id": treat["patient_id"].unique()})
oncology_predictors.set_index("patient_id", inplace=True)

# Helper to safely add a series (aligning on patient_id)
def add_series(df, series, name):
    s = series.copy()
    s.name = name
    return df.join(s, how="left")

# 1) n_treatment_lines
n_treatment_lines = line_summary.groupby("patient_id")["linea_trattamento_oncologico"].nunique()
oncology_predictors = add_series(oncology_predictors, n_treatment_lines, "n_treatment_lines")

# 2) total_chemo_cycles
total_chemo_cycles = line_summary.groupby("patient_id")["line_cycles"].sum(min_count=1)
oncology_predictors = add_series(oncology_predictors, total_chemo_cycles, "total_chemo_cycles")

# 3) received_chemo
received_chemo = (total_chemo_cycles > 0)
oncology_predictors = add_series(oncology_predictors, received_chemo, "received_chemo")

# 4) treatment_duration_days
patient_treatment_start = line_summary.groupby("patient_id")["line_start_date"].min()
patient_treatment_end = line_summary.groupby("patient_id")["line_end_date"].max()
treatment_duration_days = (patient_treatment_end - patient_treatment_start).dt.days
oncology_predictors = add_series(oncology_predictors, treatment_duration_days, "treatment_duration_days")

# 5) any_dose_reduction
any_dose_reduction = line_summary.groupby("patient_id")["line_any_dose_reduction"].any()
oncology_predictors = add_series(oncology_predictors, any_dose_reduction, "any_dose_reduction")

# 6) any_toxicity
any_toxicity = line_summary.groupby("patient_id")["line_any_toxicity"].any()
oncology_predictors = add_series(oncology_predictors, any_toxicity, "any_toxicity")

# 7) end_due_to_progression
end_due_to_progression = line_summary.groupby("patient_id")["line_end_reason"].apply(
    lambda x: x.astype(str).str.contains("Disease Progression", case=False, na=False).any()
)
oncology_predictors = add_series(oncology_predictors, end_due_to_progression, "end_due_to_progression")

# 8) max_combo_regimen_size
max_combo_regimen_size = line_summary.groupby("patient_id")["line_n_active_drugs"].max()
oncology_predictors = add_series(oncology_predictors, max_combo_regimen_size, "max_combo_regimen_size")

# 9) total_unique_active_drugs
total_unique_active_drugs = treat.groupby("patient_id")["active_principle"].apply(
    lambda x: len(set([s.strip().lower() for s in x.dropna()]))
)
oncology_predictors = add_series(oncology_predictors, total_unique_active_drugs, "total_unique_active_drugs")

# 10) time_from_diagnosis_to_first_treatment_days (if tumor_diagnosis_date exists)
if "tumor_diagnosis_date" in master.columns:
    master["tumor_diagnosis_date"] = pd.to_datetime(master["tumor_diagnosis_date"], errors="coerce")
    tmp = patient_treatment_start.to_frame(name="first_treatment_date").reset_index()
    tmp = tmp.merge(master[["patient_id", "tumor_diagnosis_date"]], on="patient_id", how="left")
    tmp["time_from_diagnosis_to_first_treatment_days"] = (
        tmp["first_treatment_date"] - tmp["tumor_diagnosis_date"]
    ).dt.days
    tfd = tmp.set_index("patient_id")["time_from_diagnosis_to_first_treatment_days"]
    oncology_predictors = add_series(oncology_predictors, tfd, "time_from_diagnosis_to_first_treatment_days")

# 11) Drug toxicity class flags
CARDIOTOXIC = {
    "doxorubicina", "epirubicina", "trastuzumab", "pertuzumab",
    "epirubicin", "doxorubicin"
}
NEPHROTOXIC = {
    "cisplatino", "cisplatin", "oxaliplatino", "oxaliplatin", "carboplatino", "carboplatin"
}
NEUROTOXIC = {
    "oxaliplatino", "oxaliplatin", "paclitaxel", "docetaxel"
}
HEMATOTOXIC = {
    "5-fluorouracile", "5-fluorouracile bolo", "5-fluorouracile infusione continua",
    "5-flourouracile", "gemcitabina", "gemcitabine", "irinotecan"
}
TARGETED = {
    "bevacizumab", "panitumumab", "nivolumab", "ramucirumab", "cetuximab", "trastuzumab", "alfibercept"
}

def classify_patient_drugs(drugs_series):
    drugs = {s.strip().lower() for s in drugs_series.dropna()}
    return pd.Series({
        "received_cardiotoxic_drug": any(d in CARDIOTOXIC for d in drugs),
        "received_nephrotoxic_drug": any(d in NEPHROTOXIC for d in drugs),
        "received_neurotoxic_drug": any(d in NEUROTOXIC for d in drugs),
        "received_hematotoxic_drug": any(d in HEMATOTOXIC for d in drugs),
        "received_targeted_therapy": any(d in TARGETED for d in drugs),
    })

drug_flags_df = treat.groupby("patient_id")["active_principle"].apply(classify_patient_drugs)
drug_flags_df = drug_flags_df.reset_index().set_index("patient_id")

oncology_predictors = oncology_predictors.join(drug_flags_df, how="left")

oncology_predictors.head(), oncology_predictors.shape


(              n_treatment_lines  total_chemo_cycles  received_chemo  \
 patient_id                                                            
 1_AO San Pio                  1                44.0            True   
 1_AO San Pio                  1                44.0            True   
 1_AO San Pio                  1                44.0            True   
 1_AO San Pio                  1                44.0            True   
 1_AO San Pio                  1                44.0            True   
 
               treatment_duration_days  any_dose_reduction  any_toxicity  \
 patient_id                                                                
 1_AO San Pio                    994.0               False         False   
 1_AO San Pio                    994.0               False         False   
 1_AO San Pio                    994.0               False         False   
 1_AO San Pio                    994.0               False         False   
 1_AO San Pio                    994.0

In [3]:

#  building oncology_predictors with unique patient_ids
oncology_predictors = pd.DataFrame(sorted(treat["patient_id"].unique()), columns=["patient_id"]).set_index("patient_id")

# Add main numeric/binary predictors
oncology_predictors = add_series(oncology_predictors, n_treatment_lines, "n_treatment_lines")
oncology_predictors = add_series(oncology_predictors, total_chemo_cycles, "total_chemo_cycles")
oncology_predictors = add_series(oncology_predictors, received_chemo, "received_chemo")
oncology_predictors = add_series(oncology_predictors, treatment_duration_days, "treatment_duration_days")
oncology_predictors = add_series(oncology_predictors, any_dose_reduction, "any_dose_reduction")
oncology_predictors = add_series(oncology_predictors, any_toxicity, "any_toxicity")
oncology_predictors = add_series(oncology_predictors, end_due_to_progression, "end_due_to_progression")
oncology_predictors = add_series(oncology_predictors, max_combo_regimen_size, "max_combo_regimen_size")
oncology_predictors = add_series(oncology_predictors, total_unique_active_drugs, "total_unique_active_drugs")
if "tfd" in locals():
    oncology_predictors = add_series(oncology_predictors, tfd, "time_from_diagnosis_to_first_treatment_days")

# Now build drug flags cleanly: one row per patient_id
flags_rows = []
for pid, grp in treat.groupby("patient_id"):
    flags = classify_patient_drugs(grp["active_principle"])
    row = {"patient_id": pid}
    row.update(flags.to_dict())
    flags_rows.append(row)

drug_flags_df_clean = pd.DataFrame(flags_rows).set_index("patient_id")

oncology_predictors = oncology_predictors.join(drug_flags_df_clean, how="left")

oncology_predictors.head(), oncology_predictors.shape


(                                    n_treatment_lines  total_chemo_cycles  \
 patient_id                                                                  
 10_AO San Pio                                       3                40.0   
 10_AORN A. Cardarelli                               1                 6.0   
 10_AORN Monaldi – Cotugno - C.T.O.                  1                 4.0   
 10_AORN San Giuseppe Moscati                        2                 4.0   
 10_AORN Sant’Anna e San Sebastiano                  1                13.0   
 
                                     received_chemo  treatment_duration_days  \
 patient_id                                                                    
 10_AO San Pio                                 True                    644.0   
 10_AORN A. Cardarelli                         True                     87.0   
 10_AORN Monaldi – Cotugno - C.T.O.            True                     66.0   
 10_AORN San Giuseppe Moscati                  True 

In [4]:
# Merge oncology_predictors into master dataset on patient_id
master_with_onco = master.merge(
    oncology_predictors.reset_index(),
    on="patient_id",
    how="left"
)

output_path = base / "codige_with_cci_and_oncology_predictors.xlsx"
master_with_onco.to_excel(output_path, index=False)

output_path


  master_with_onco.to_excel(output_path, index=False)


WindowsPath('C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/new_data/codige_with_cci_and_oncology_predictors.xlsx')