In [2]:
import pandas as pd
from pathlib import Path

base = Path("C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/new_data")
base, list(base.iterdir())

# Load master with CCI and oncology predictors
master = pd.read_excel(base / "codige_with_cci_and_oncology_predictors.xlsx")
farmaco = pd.read_excel(base / "codige_tabella_farmaco_english.xlsx")

master.shape, master.columns[:15], farmaco.shape, farmaco.columns

((412, 114),
 Index(['patient_id', 'birth_date', 'age', 'age_group', 'gender', 'ethnicity',
        'education_level', 'bmi_value', 'bmi_category', 'employment_status',
        'alcohol_consumption', 'smoking_status_binary', 'smoking_status_detail',
        'smoking_years', 'observation_start_date'],
       dtype='object'),
 (5504, 8),
 Index(['patient_id', 'farmaco', 'data', 'cod_atc', 'active_principle',
        'categoria', 'adr_macro_category', 'cronico'],
       dtype='object'))

In [3]:
# Clean IDs and dates
master["patient_id"] = master["patient_id"].astype(str).str.strip()
farmaco["patient_id"] = farmaco["patient_id"].astype(str).str.strip()
farmaco["data"] = pd.to_datetime(farmaco["data"], errors="coerce")

# Bring observation window into med table
obs = master[["patient_id", "observation_start_date", "observation_end_date"]].copy()
obs["observation_start_date"] = pd.to_datetime(obs["observation_start_date"], errors="coerce")
obs["observation_end_date"] = pd.to_datetime(obs["observation_end_date"], errors="coerce")

farm_merged = farmaco.merge(obs, on="patient_id", how="left")

farm_merged["use_in_window"] = (
    farm_merged["data"].notna()
    & farm_merged["observation_start_date"].notna()
    & farm_merged["observation_end_date"].notna()
    & (farm_merged["data"] >= farm_merged["observation_start_date"])
    & (farm_merged["data"] <= farm_merged["observation_end_date"])
)

farm_in_window = farm_merged[farm_merged["use_in_window"]].copy()
farmaco.shape, farm_in_window.shape


((5504, 8), (3088, 11))

In [4]:
# Quick look at macro categories and categoria
farm_in_window["adr_macro_category"].value_counts(dropna=False).head(20), farm_in_window["categoria"].value_counts(dropna=False).head(20)

(adr_macro_category
 Gastrointestinali                                 687
 Antipertensivi                                    632
 Antitrombotici                                    459
 Integratori                                       224
 Steroidi                                          169
 Fattore di crescita                               146
 Antidislipidemici                                 134
 Antidiabetici                                      99
 Analgesici                                         83
 Respiratori                                        58
 Ormoni                                             58
 IPB                                                54
 Antibiotici                                        50
 Altri Farmaci                                      43
 Falati                                             29
 SNC                                                29
 Antiinfiammatori                                   26
 Dermatologici                               

In [5]:
# Build patient-level medication predictors from farm_in_window

f = farm_in_window.copy()

# Basic polypharmacy measures
patient_groups = f.groupby("patient_id")

n_unique_active = patient_groups["active_principle"].nunique()
n_unique_active.name = "n_unique_active_principles"

n_chronic_drugs = patient_groups.apply(
    lambda g: g.loc[g["cronico"] == 1, "active_principle"].nunique()
)
n_chronic_drugs.name = "n_chronic_drugs"

# Define polypharmacy flags based on n_unique_active
def poly_flag(n):
    if pd.isna(n):
        return pd.NA
    n = int(n)
    if n >= 10:
        return "Excessive polypharmacy (>=10)"
    elif n >= 5:
        return "Polypharmacy (5-9)"
    else:
        return "No polypharmacy (0-4)"

polypharmacy_flag = n_unique_active.map(poly_flag)
polypharmacy_flag.name = "polypharmacy_flag"

def chronic_poly_flag(n):
    if pd.isna(n):
        return pd.NA
    n = int(n)
    if n >= 10:
        return "Excessive chronic polypharmacy (>=10)"
    elif n >= 5:
        return "Chronic polypharmacy (5-9)"
    else:
        return "No chronic polypharmacy (0-4)"

chronic_polypharmacy_flag = n_chronic_drugs.map(chronic_poly_flag)
chronic_polypharmacy_flag.name = "chronic_polypharmacy_flag"

# System-level burdens using adr_macro_category
# We'll create flags and counts for key systems
def has_category(g, target):
    return g["adr_macro_category"].astype(str).str.contains(target, case=False, na=False).any()

def count_category(g, target):
    return g.loc[g["adr_macro_category"].astype(str).str.contains(target, case=False, na=False), "active_principle"].nunique()

system_configs = {
    "cardiovascular": ["Antipertensivi", "Antitrombotici", "Antidislipidemici"],
    "metabolic": ["Antidiabetici"],
    "respiratory": ["Respiratori"],
    "cns": ["SNC"],
    "gastrointestinal": ["Gastrointestinali"],
    "analgesic": ["Analgesici", "oppioidi"],
}

system_flags = {}
system_counts = {}

for sys_name, keywords in system_configs.items():
    # Flag: uses any drug in those macro categories
    flag_series = patient_groups.apply(
        lambda g, kws=keywords: g["adr_macro_category"].astype(str).str.contains("|".join(kws), case=False, na=False).any()
    )
    flag_series.name = f"uses_{sys_name}_drugs"
    system_flags[sys_name] = flag_series
    
    # Count: number of distinct active_principles in those categories
    count_series = patient_groups.apply(
        lambda g, kws=keywords: g.loc[
            g["adr_macro_category"].astype(str).str.contains("|".join(kws), case=False, na=False),
            "active_principle"
        ].nunique()
    )
    count_series.name = f"n_{sys_name}_drugs"
    system_counts[sys_name] = count_series

# ATC-based flags
def atc_flag(starts):
    return patient_groups["cod_atc"].apply(
        lambda s, prefix=starts: s.astype(str).str.startswith(prefix).any()
    )

uses_antithrombotics = atc_flag("B01")
uses_antithrombotics.name = "uses_antithrombotics"

uses_antidiabetics_atc = atc_flag("A10")
uses_antidiabetics_atc.name = "uses_antidiabetics_atc"

uses_ras_drugs = atc_flag("C09")
uses_ras_drugs.name = "uses_renin_angiotensin_drugs"

uses_diuretics = atc_flag("C03")
uses_diuretics.name = "uses_diuretics"

uses_lipid_lowering = atc_flag("C10")
uses_lipid_lowering.name = "uses_lipid_lowering_drugs"

# Dynamics: n_med_events and duration
n_med_events = patient_groups.size()
n_med_events.name = "n_med_events"

first_med_date = patient_groups["data"].min()
last_med_date = patient_groups["data"].max()
medication_duration_days = (last_med_date - first_med_date).dt.days
medication_duration_days.name = "medication_duration_days"

# Combine all predictors into one DataFrame
med_predictors = pd.DataFrame({
    "n_unique_active_principles": n_unique_active,
    "n_chronic_drugs": n_chronic_drugs,
    "polypharmacy_flag": polypharmacy_flag,
    "chronic_polypharmacy_flag": chronic_polypharmacy_flag,
    "n_med_events": n_med_events,
    "medication_duration_days": medication_duration_days,
    "uses_antithrombotics": uses_antithrombotics,
    "uses_antidiabetics_atc": uses_antidiabetics_atc,
    "uses_renin_angiotensin_drugs": uses_ras_drugs,
    "uses_diuretics": uses_diuretics,
    "uses_lipid_lowering_drugs": uses_lipid_lowering,
})

# Add system flags and counts
for sys_name, ser in system_flags.items():
    med_predictors[ser.name] = ser
for sys_name, ser in system_counts.items():
    med_predictors[ser.name] = ser

med_predictors.head(), med_predictors.shape


  n_chronic_drugs = patient_groups.apply(
  flag_series = patient_groups.apply(
  count_series = patient_groups.apply(
  flag_series = patient_groups.apply(
  count_series = patient_groups.apply(
  flag_series = patient_groups.apply(
  count_series = patient_groups.apply(
  flag_series = patient_groups.apply(
  count_series = patient_groups.apply(
  flag_series = patient_groups.apply(
  count_series = patient_groups.apply(
  flag_series = patient_groups.apply(
  count_series = patient_groups.apply(


(                                             n_unique_active_principles  \
 patient_id                                                                
 10_AO San Pio                                                         8   
 10_AORN Monaldi – Cotugno - C.T.O.                                    8   
 10_AORN San Giuseppe Moscati                                          3   
 10_AOU San Giovanni di Dio Ruggi di Aragona                           4   
 10_Ospedale del mare                                                  1   
 
                                              n_chronic_drugs  \
 patient_id                                                     
 10_AO San Pio                                              0   
 10_AORN Monaldi – Cotugno - C.T.O.                         0   
 10_AORN San Giuseppe Moscati                               0   
 10_AOU San Giovanni di Dio Ruggi di Aragona                0   
 10_Ospedale del mare                                       0   
 
         

In [6]:
# Merge medication predictors into master_with_cci_and_oncology_predictors
master["patient_id"] = master["patient_id"].astype(str).str.strip()
med_predictors = med_predictors.reset_index()  # bring patient_id as column

master_full = master.merge(med_predictors, on="patient_id", how="left")

output_path = base / "codige_with_cci_oncology_farmaco_predictors.xlsx"
master_full.to_excel(output_path, index=False)

output_path


  master_full.to_excel(output_path, index=False)


WindowsPath('C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/new_data/codige_with_cci_oncology_farmaco_predictors.xlsx')