In [30]:
import pandas as pd
import numpy as np
from pathlib import Path

# Load cleaned_data.xlsx
path = Path("C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/data/processed/cleaned_data.xlsx")
df = pd.read_excel(path)

df.shape, df.columns[:40]


((403, 112),
 Index(['patient_id', 'birth_date', 'age', 'age_group', 'gender', 'ethnicity',
        'education_level', 'bmi_value', 'bmi_category', 'employment_status',
        'alcohol_consumption', 'smoking_status_binary', 'smoking_status_detail',
        'smoking_years', 'observation_start_date', 'observation_end_date',
        'observation_end_reason', 'tumor_diagnosis_date',
        'oncology_unit_start_date', 'tumor_type', 'tumor_stage_tnm',
        'tumor_stage_roman', 'histological_grade', 'molecular_alterations',
        'mutations_present', 'dpyd_genotype_known', 'dpyd_genotype_type',
        'surgical_intervention', 'surgery_date', 'surgery_type',
        'prior_surgery', 'previous number of treatment lines',
        'other surgical intervention', 'surgery_complications',
        'oncology_treatment_lines_n', 'radiotherapy_status',
        'transfusion_received', 'transfusions_total_n', 'hypertension',
        'aortic_insufficiency'],
       dtype='object'))

In [31]:
df.columns[40:90]

Index(['dyslipidemia', 'bph', 'obesity_comorbidity', 'ischemic_heart_disease',
       'atrial_fibrillation', 'copd', 'asthma', 'diabetes_type_ii',
       'gastroesophageal_reflux_full', 'hypertensive_heart_disease',
       'renal_insufficiency', 'depressive_syndrome', 'anemia_comorbidity',
       'psychiatric_disorders', 'other_comorbidities',
       'cardiovascular_disorders', 'gastrointestinal_disorders',
       'cerebrovascular_disorders', 'ordinary_hospitalizations_n',
       'death_during_observation', 'adr_left', 'adr_n_tot', 'adr_n_grado1',
       'adr_n_grado2', 'adr_n_grado3', 'adr_n_grado4', 'adr_n_grado5',
       'observation_days', 'blood_glucose_range', 'white_blood_cells_range',
       'red_blood_cells_range', 'hemoglobin_range',
       'neutrophils_percent_range', 'platelet_count_range', 'creatinine_range',
       'ast_got_range', 'alt_gpt_range', 'total_bilirubin_range',
       'direct_bilirubin_range', 'death_date',
       'end_reason_progression_any_line', 'chemio_fin

In [32]:
df.columns[90:]

Index(['active_principles_n', 'Oncologic treatment line',
       'Hospitalizations count', 'side effect', 'comorbidities',
       'comobidity_cat', 'other', 'data', 'comorbidity_category_list',
       'number of other pathologies', 'adr_description', 'adr_onset_date',
       'adr_ctcae_grade', 'adr_outcome', 'adr_chemo_correlation',
       'adr_chemo_action', 'adr_source_project', 'adr_macro_category',
       'adr_clean.1', 'albumin_range', 'hospital_admission_date',
       'hospital_discharge_date'],
      dtype='object')

In [33]:
# Quick look at some key fields user mentioned
summary = {
    "smoking_status_binary": df["smoking_status_binary"].value_counts(dropna=False),
    "bmi_category": df["bmi_category"].value_counts(dropna=False),
    "smoking_status_detail": df["smoking_status_detail"].value_counts(dropna=False),
    "tumor_stage_roman": df["tumor_stage_roman"].value_counts(dropna=False),
    "adr_ctcae_grade": df["adr_ctcae_grade"].describe(),
    "albumin_range": df["albumin_range"].value_counts(dropna=False) if "albumin_range" in df.columns else "MISSING",
}
summary

{'smoking_status_binary': smoking_status_binary
 1    301
 0    102
 Name: count, dtype: int64,
 'bmi_category': bmi_category
 18.5-24.9 Normal Weight    235
 25-29.99 Overweight        112
 >=30 Obese                  41
 <18.5 Underweight           15
 Name: count, dtype: int64,
 'smoking_status_detail': smoking_status_detail
 Not Known / Missing    134
 Ex-Smoker              128
 Never Smoked           102
 Current Smoker          39
 Name: count, dtype: int64,
 'tumor_stage_roman': tumor_stage_roman
 Not Known / Missing    135
 Stage IV               123
 Stage III               96
 Stage II                45
 Stage I                  3
 I                        1
 Name: count, dtype: int64,
 'adr_ctcae_grade': count    403.000000
 mean       1.410220
 std        0.478587
 min        1.000000
 25%        1.142857
 50%        1.285714
 75%        1.500000
 max        4.000000
 Name: adr_ctcae_grade, dtype: float64,
 'albumin_range': albumin_range
 Unknown                  186
 Norm

In [34]:
# Check comorbidity-like columns
comorb_cols = [
    'hypertension', 'aortic_insufficiency', 'dyslipidemia', 'bph',
    'obesity_comorbidity', 'ischemic_heart_disease', 'atrial_fibrillation',
    'copd', 'asthma', 'diabetes_type_ii', 'gastroesophageal_reflux_full',
    'hypertensive_heart_disease', 'renal_insufficiency', 'depressive_syndrome',
    'anemia_comorbidity', 'psychiatric_disorders', 'other_comorbidities',
    'cardiovascular_disorders', 'gastrointestinal_disorders',
    'cerebrovascular_disorders'
]
present_comorb = [c for c in comorb_cols if c in df.columns]
present_comorb


['hypertension',
 'aortic_insufficiency',
 'dyslipidemia',
 'bph',
 'obesity_comorbidity',
 'ischemic_heart_disease',
 'atrial_fibrillation',
 'copd',
 'asthma',
 'diabetes_type_ii',
 'gastroesophageal_reflux_full',
 'hypertensive_heart_disease',
 'renal_insufficiency',
 'depressive_syndrome',
 'anemia_comorbidity',
 'psychiatric_disorders',
 'other_comorbidities',
 'cardiovascular_disorders',
 'gastrointestinal_disorders',
 'cerebrovascular_disorders']

In [35]:
# Peek at a couple of comorbidity distributions to see coding ("Present / Yes" etc.)
comorb_preview = {c: df[c].value_counts(dropna=False).head() for c in present_comorb[:6]}
comorb_preview

{'hypertension': hypertension
 Absent / No      269
 Present / Yes    134
 Name: count, dtype: int64,
 'aortic_insufficiency': aortic_insufficiency
 Absent / No      401
 Present / Yes      2
 Name: count, dtype: int64,
 'dyslipidemia': dyslipidemia
 Absent / No      362
 Present / Yes     41
 Name: count, dtype: int64,
 'bph': bph
 Absent / No      386
 Present / Yes     17
 Name: count, dtype: int64,
 'obesity_comorbidity': obesity_comorbidity
 Absent / No      394
 Present / Yes      9
 Name: count, dtype: int64,
 'ischemic_heart_disease': ischemic_heart_disease
 Absent / No      376
 Present / Yes     27
 Name: count, dtype: int64}

In [36]:
# Inspect key treatment and date columns mentioned
key_cols = [
    "surgical_intervention", "oncology_treatment_lines_n", "chemo_cycles_n",
    "ordinary_hospitalizations_n", "dose_reduced",
    "observation_start_date", "observation_end_date", "observation_end_reason",
    "hospital_admission_date", "hospital_discharge_date",
    "adr_onset_date", "death_date"
]
{c: df[c].head() for c in key_cols if c in df.columns}

{'surgical_intervention': 0      Absent / No
 1      Absent / No
 2      Absent / No
 3    Present / Yes
 4      Absent / No
 Name: surgical_intervention, dtype: object,
 'oncology_treatment_lines_n': 0    3.0
 1    1.0
 2    1.0
 3    2.0
 4    1.0
 Name: oncology_treatment_lines_n, dtype: float64,
 'chemo_cycles_n': 0    13.400000
 1     6.000000
 2     4.000000
 3     2.714286
 4    13.000000
 Name: chemo_cycles_n, dtype: float64,
 'ordinary_hospitalizations_n': 0    0
 1    0
 2    0
 3    1
 4    0
 Name: ordinary_hospitalizations_n, dtype: int64,
 'dose_reduced': 0    0.000000
 1    0.666667
 2    1.000000
 3    0.000000
 4    0.000000
 Name: dose_reduced, dtype: float64,
 'observation_start_date': 0    2023-01-20 00:00:00; 2023-01-20 00:00:00.000
 1                             2022-04-22 00:00:00
 2    2023-03-31 00:00:00; 2023-03-31 00:00:00.000
 3    2023-08-08 00:00:00; 2023-08-08 00:00:00.000
 4                             2022-09-14 00:00:00
 Name: observation_start_date, d

In [37]:
# 1. Map comorbidity columns to 0/1 flags
def map_yes_no(series):
    return series.replace({
        "Present / Yes": 1,
        "Absent / No": 0
    })

binary_comorbid_cols = {}
for c in present_comorb:
    binary_col = c + "_bin"
    df[binary_col] = map_yes_no(df[c])
    binary_comorbid_cols[c] = binary_col

binary_comorbid_cols


  return series.replace({
  return series.replace({
  return series.replace({
  return series.replace({
  return series.replace({
  return series.replace({
  return series.replace({
  return series.replace({
  return series.replace({
  return series.replace({
  return series.replace({
  return series.replace({
  return series.replace({
  return series.replace({
  return series.replace({
  return series.replace({
  return series.replace({
  return series.replace({


{'hypertension': 'hypertension_bin',
 'aortic_insufficiency': 'aortic_insufficiency_bin',
 'dyslipidemia': 'dyslipidemia_bin',
 'bph': 'bph_bin',
 'obesity_comorbidity': 'obesity_comorbidity_bin',
 'ischemic_heart_disease': 'ischemic_heart_disease_bin',
 'atrial_fibrillation': 'atrial_fibrillation_bin',
 'copd': 'copd_bin',
 'asthma': 'asthma_bin',
 'diabetes_type_ii': 'diabetes_type_ii_bin',
 'gastroesophageal_reflux_full': 'gastroesophageal_reflux_full_bin',
 'hypertensive_heart_disease': 'hypertensive_heart_disease_bin',
 'renal_insufficiency': 'renal_insufficiency_bin',
 'depressive_syndrome': 'depressive_syndrome_bin',
 'anemia_comorbidity': 'anemia_comorbidity_bin',
 'psychiatric_disorders': 'psychiatric_disorders_bin',
 'other_comorbidities': 'other_comorbidities_bin',
 'cardiovascular_disorders': 'cardiovascular_disorders_bin',
 'gastrointestinal_disorders': 'gastrointestinal_disorders_bin',
 'cerebrovascular_disorders': 'cerebrovascular_disorders_bin'}

In [38]:
# 2. Construct Charlson Comorbidity Index (CCI) with pragmatic mapping

def compute_cci(row):
    cci = 0
    
    # Diabetes (without complications)
    if 'diabetes_type_ii_bin' in row and row['diabetes_type_ii_bin'] == 1:
        cci += 1
    
    # Cardiovascular disease: MI/ischemic, atrial fibrillation, hypertension-related heart disease
    cardio_sources = [
        'ischemic_heart_disease_bin',
        'cardiovascular_disorders_bin',
        'atrial_fibrillation_bin',
        'hypertensive_heart_disease_bin'
    ]
    if any((col in row and row[col] == 1) for col in cardio_sources):
        cci += 1  # lumped as 1-point cardiovascular category
    
    # Cerebrovascular disease
    if 'cerebrovascular_disorders_bin' in row and row['cerebrovascular_disorders_bin'] == 1:
        cci += 1
    
    # Chronic pulmonary disease (COPD/asthma)
    if (('copd_bin' in row and row['copd_bin'] == 1) or 
        ('asthma_bin' in row and row['asthma_bin'] == 1)):
        cci += 1
    
    # Renal disease (moderate/severe CKD proxy)
    if 'renal_insufficiency_bin' in row and row['renal_insufficiency_bin'] == 1:
        cci += 2
    
    # Solid tumor / metastatic tumor using tumor_stage_roman as proxy
    stage = row.get('tumor_stage_roman', np.nan)
    if pd.notna(stage):
        s = str(stage).strip().upper()
        if "IV" in s:
            cci += 6  # metastatic solid tumor
        elif "I" in s or "II" in s or "III" in s:
            # any non metastatic solid tumor
            cci += 2
    
    return cci

df["CCI_score"] = df.apply(compute_cci, axis=1)

# 3. ACCI: add age-based points
def compute_age_points(age):
    # Standard ACCI categorisation:
    # <50: 0, 50-59: +1, 60-69: +2, 70-79: +3, >=80: +4
    if pd.isna(age):
        return 0
    try:
        a = float(age)
    except Exception:
        return 0
    if a < 50:
        return 0
    elif a < 60:
        return 1
    elif a < 70:
        return 2
    elif a < 80:
        return 3
    else:
        return 4

df["age_points_acci"] = df["age"].apply(compute_age_points)
df["ACCI_score"] = df["CCI_score"] + df["age_points_acci"]

df[["age", "CCI_score", "age_points_acci", "ACCI_score"]].head()


Unnamed: 0,age,CCI_score,age_points_acci,ACCI_score
0,73,7,3,10
1,58,2,1,3
2,84,6,4,10
3,76,3,3,6
4,70,6,3,9


In [39]:
import numpy as np
import pandas as pd

# If your main dataframe is called "dataset", uncomment this:
# df = dataset.copy()

# 1. List of comorbidity variables in the cleaned data
comorbidity_cols = [
    "hypertension",
    "aortic_insufficiency",
    "dyslipidemia",
    "bph",
    "obesity_comorbidity",
    "ischemic_heart_disease",
    "atrial_fibrillation",
    "copd",
    "asthma",
    "diabetes_type_ii",
    "gastroesophageal_reflux_full",
    "hypertensive_heart_disease",
    "renal_insufficiency",
    "depressive_syndrome",
    "anemia_comorbidity",
    "psychiatric_disorders",
    "other_comorbidities",
    "cardiovascular_disorders",
    "gastrointestinal_disorders",
    "cerebrovascular_disorders",
]

# 2. Create 0/1 binary versions for each comorbidity: "<name>_bin"
for col in comorbidity_cols:
    if col in df.columns:
        bin_col = col + "_bin"
        df[bin_col] = (
            df[col]
            .astype(str)
            .str.strip()
            .replace(
                {
                    "Present / Yes": 1,
                    "Absent / No": 0,
                    "present / yes": 1,
                    "absent / no": 0,
                    "Yes": 1,
                    "No": 0,
                    "yes": 1,
                    "no": 0,
                }
            )
        )
        # force numeric and keep NaN where mapping failed
        df[bin_col] = pd.to_numeric(df[bin_col], errors="coerce")

# 3. Collect all *_bin columns that actually exist
bin_cols = [c for c in df.columns if c.endswith("_bin")]
print("Binary comorbidity columns used:", bin_cols)

# 4. Compute comorbidity_count as sum of all binary comorbidity flags
df["comorbidity_count"] = (
    df[bin_cols]
    .apply(pd.to_numeric, errors="coerce")  # make sure everything is numeric
    .fillna(0)
    .sum(axis=1)
)

# 5. Categorise into 0-2 / 3-5 / >=6
def comorbidity_cat_fn(n):
    if pd.isna(n):
        return np.nan
    n = float(n)
    if n <= 2:
        return "0-2"
    elif n <= 5:
        return "3-5"
    else:
        return ">=6"

df["comorbidity_burden_cat"] = df["comorbidity_count"].apply(comorbidity_cat_fn)

# 6. Quick sanity checks
print(df["comorbidity_count"].describe())
print(df["comorbidity_burden_cat"].value_counts(dropna=False))
df[["comorbidity_count", "comorbidity_burden_cat"]].head()


Binary comorbidity columns used: ['hypertension_bin', 'aortic_insufficiency_bin', 'dyslipidemia_bin', 'bph_bin', 'obesity_comorbidity_bin', 'ischemic_heart_disease_bin', 'atrial_fibrillation_bin', 'copd_bin', 'asthma_bin', 'diabetes_type_ii_bin', 'gastroesophageal_reflux_full_bin', 'hypertensive_heart_disease_bin', 'renal_insufficiency_bin', 'depressive_syndrome_bin', 'anemia_comorbidity_bin', 'psychiatric_disorders_bin', 'other_comorbidities_bin', 'cardiovascular_disorders_bin', 'gastrointestinal_disorders_bin', 'cerebrovascular_disorders_bin']
count    403.000000
mean       1.019851
std        1.376636
min        0.000000
25%        0.000000
50%        0.000000
75%        2.000000
max        9.000000
Name: comorbidity_count, dtype: float64
comorbidity_burden_cat
0-2    345
3-5     56
>=6      2
Name: count, dtype: int64


  df[col]
  df[col]
  df[col]
  df[col]
  df[col]
  df[col]
  df[col]
  df[col]
  df[col]
  df[col]
  df[col]
  df[col]
  df[col]
  df[col]
  df[col]
  df[col]
  df[col]
  df[col]
  df[col]


Unnamed: 0,comorbidity_count,comorbidity_burden_cat
0,3.0,3-5
1,1.0,0-2
2,1.0,0-2
3,2.0,0-2
4,0.0,0-2


In [40]:
# 5. Tumor stage grouping: Stage I-II, Stage III, Stage IV

def stage_group_fn(stage):
    if pd.isna(stage):
        return np.nan
    s = str(stage).strip().upper()
    # Harmonise variants like "Stage II" vs "II"
    if "IV" in s:
        return "Stage IV"
    elif "III" in s:
        return "Stage III"
    elif "II" in s or "I" in s:
        return "Stage I-II"
    else:
        return np.nan

df["tumor_stage_group"] = df["tumor_stage_roman"].apply(stage_group_fn)
df["tumor_stage_group"].value_counts(dropna=False)


tumor_stage_group
Stage I-II    184
Stage IV      123
Stage III      96
Name: count, dtype: int64

In [41]:
# 6. BMI categories: collapse labels into four canonical groups

def bmi_group_fn(label):
    if pd.isna(label):
        return np.nan
    s = str(label).lower()
    if "underweight" in s or "<18.5" in s:
        return "Underweight"
    if "normal" in s or "18.5-24.9" in s:
        return "Normal weight"
    if "overweight" in s or "25-29.99" in s:
        return "Overweight"
    if "obese" in s or ">=30" in s:
        return "Obese"
    return np.nan

df["bmi_group"] = df["bmi_category"].apply(bmi_group_fn)
df["bmi_group"].value_counts(dropna=False)


bmi_group
Normal weight    235
Overweight       112
Obese             41
Underweight       15
Name: count, dtype: int64

In [42]:
# 7. Ordinary hospitalisations categories

def hosp_cat_fn(n):
    if pd.isna(n):
        return np.nan
    try:
        n_val = int(n)
    except Exception:
        return np.nan
    if n_val == 0:
        return "0"
    elif n_val <= 2:
        return "1-2"
    else:
        return ">=3"

df["ordinary_hosp_cat"] = df["ordinary_hospitalizations_n"].apply(hosp_cat_fn)
df["ordinary_hosp_cat"].value_counts(dropna=False)


ordinary_hosp_cat
0      293
1-2     99
>=3     11
Name: count, dtype: int64

In [43]:
import pandas as pd
import numpy as np
from pathlib import Path

# Reload to be safe after previous error
path = Path('C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/data/processed/cleaned_data.xlsx')
df = pd.read_excel(path)

df.shape


(403, 112)

In [46]:
# Build a compact variable dictionary: original -> derived
var_dict_rows = []

# ACCI / CCI
var_dict_rows.append({"source_column": "age + comorbidities + tumor_stage_roman", "analysis_variable": "CCI_score"})
var_dict_rows.append({"source_column": "CCI_score + age", "analysis_variable": "ACCI_score"})

# Comorbidity burden
var_dict_rows.append({"source_column": ", ".join(present_comorb), "analysis_variable": "comorbidity_count"})
var_dict_rows.append({"source_column": "comorbidity_count", "analysis_variable": "comorbidity_burden_cat (0-2, 3-5, >=6)"})

# Tumor stage
var_dict_rows.append({"source_column": "tumor_stage_roman", "analysis_variable": "tumor_stage_group (Stage I-II, Stage III, Stage IV)"})

# BMI
var_dict_rows.append({"source_column": "bmi_category", "analysis_variable": "bmi_group (Underweight, Normal weight, Overweight, Obese)"})

# Ordinary hospitalisations
var_dict_rows.append({"source_column": "ordinary_hospitalizations_n", "analysis_variable": "ordinary_hosp_cat (0, 1-2, >=3)"})

# Lab abnormalities
if "hemoglobin_range" in df.columns:
    var_dict_rows.append({"source_column": "hemoglobin_range", "analysis_variable": "low_hemoglobin (1=low, 0=normal/high)"})
if "creatinine_range" in df.columns:
    var_dict_rows.append({"source_column": "creatinine_range", "analysis_variable": "elevated_creatinine (1=high, 0=normal/low)"})
if "albumin_range" in df.columns:
    var_dict_rows.append({"source_column": "albumin_range", "analysis_variable": "low_albumin (1=low, 0=normal/high) (currently all missing)"})

var_dict = pd.DataFrame(var_dict_rows)

# Save enriched dataset and variable dictionary
out_data_path = Path("C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_2/results/hospital requirement/cleaned_data_phase1_enriched.xlsx")
out_dict_path = Path("C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_2/results/hospital requirement/variable_dictionary_phase1.xlsx")
df.to_excel(out_data_path, index=False)
var_dict.to_excel(out_dict_path, index=False)

out_data_path, out_dict_path


  df.to_excel(out_data_path, index=False)
  var_dict.to_excel(out_dict_path, index=False)


(WindowsPath('C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_2/results/hospital requirement/cleaned_data_phase1_enriched.xlsx'),
 WindowsPath('C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_2/results/hospital requirement/variable_dictionary_phase1.xlsx'))

In [47]:
import numpy as np
import pandas as pd

import pandas as pd
import numpy as np
from pathlib import Path

# Load cleaned_data.xlsx
path = Path("C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/data/processed/cleaned_data.xlsx")
df = pd.read_excel(path)

# ---------------------------------------------------
# 1) DEFINE COMORBIDITY COLUMNS AND BINARY MAPPING
# ---------------------------------------------------

# Helper to map yes/no-like strings to 0/1
def map_yes_no_series(s):
    if s is None:
        return s
    return s.replace({
        "Present / Yes": 1,
        "Absent / No": 0,
        "present / yes": 1,
        "absent / no": 0,
        "Yes": 1,
        "No": 0,
        "yes": 1,
        "no": 0
    })

# List of comorbidity columns we want to use (only those that exist will be used)
comorb_cols = [
    "hypertension",
    "aortic_insufficiency",
    "dyslipidemia",
    "bph",
    "obesity_comorbidity",
    "ischemic_heart_disease",
    "atrial_fibrillation",
    "copd",
    "asthma",
    "diabetes_type_ii",
    "gastroesophageal_reflux_full",
    "hypertensive_heart_disease",
    "renal_insufficiency",
    "depressive_syndrome",
    "anemia_comorbidity",
    "psychiatric_disorders",
    "other_comorbidities",
    "cardiovascular_disorders",
    "gastrointestinal_disorders",
    "cerebrovascular_disorders"
]

present_comorb = [c for c in comorb_cols if c in df.columns]

binary_comorbid_cols = {}

for c in present_comorb:
    binary_col = c + "_bin"

    # Start from the original column
    col_series = df[c]

    # Step 1: if it is object/string-like, map yes/no strings
    if col_series.dtype == "object":
        col_series = map_yes_no_series(col_series)

    # Step 2: force numeric (anything non numeric -> NaN)
    col_series = pd.to_numeric(col_series, errors="coerce")

    # Step 3: standardize to 0/1 (anything >=1 becomes 1, rest 0, NaN remains NaN)
    col_bin = np.where(col_series >= 1, 1,
                       np.where(col_series == 0, 0, np.nan))

    df[binary_col] = col_bin
    binary_comorbid_cols[c] = binary_col

print("Binary comorbidity columns created:")
print(list(binary_comorbid_cols.values()))

# ---------------------------------------------------
# 2) CHARLSON COMORBIDITY INDEX (CCI) AND ACCI
# ---------------------------------------------------

def compute_cci(row):
    cci = 0

    # Diabetes (without complications)
    if row.get("diabetes_type_ii_bin", 0) == 1:
        cci += 1

    # Cardiovascular disease cluster (count as 1 block)
    cardio_sources = [
        "ischemic_heart_disease_bin",
        "cardiovascular_disorders_bin",
        "atrial_fibrillation_bin",
        "hypertensive_heart_disease_bin"
    ]
    if any(row.get(col, 0) == 1 for col in cardio_sources):
        cci += 1

    # Cerebrovascular disease
    if row.get("cerebrovascular_disorders_bin", 0) == 1:
        cci += 1

    # Chronic pulmonary disease (COPD or asthma)
    if (row.get("copd_bin", 0) == 1) or (row.get("asthma_bin", 0) == 1):
        cci += 1

    # Renal disease
    if row.get("renal_insufficiency_bin", 0) == 1:
        cci += 2

    # Solid vs metastatic tumor using tumor_stage_roman
    stage = row.get("tumor_stage_roman", np.nan)
    if pd.notna(stage):
        s = str(stage).strip().upper()
        # Very simple rule: IV = metastatic, I/II/III = solid tumor
        if "IV" in s:
            cci += 6   # metastatic solid tumor
        elif any(k in s for k in ["I", "II", "III"]):
            cci += 2   # non metastatic solid tumor

    return cci

df["CCI_score"] = df.apply(compute_cci, axis=1)

def compute_age_points(age):
    if pd.isna(age):
        return 0
    try:
        a = float(age)
    except Exception:
        return 0
    if a < 50:
        return 0
    elif a < 60:
        return 1
    elif a < 70:
        return 2
    elif a < 80:
        return 3
    else:
        return 4

df["age_points_acci"] = df["age"].apply(compute_age_points)
df["ACCI_score"] = df["CCI_score"] + df["age_points_acci"]

# ---------------------------------------------------
# 3) COMORBIDITY BURDEN CATEGORIES
# ---------------------------------------------------

# Take the list of binary columns we actually created
bin_cols = [col for col in binary_comorbid_cols.values() if col in df.columns]

# Clean them again just to be 100 percent sure they are numeric
df[bin_cols] = df[bin_cols].apply(pd.to_numeric, errors="coerce")

# Now safely compute the count
df["comorbidity_count"] = df[bin_cols].fillna(0).sum(axis=1)

def comorbidity_cat_fn(n):
    if pd.isna(n):
        return np.nan
    n = float(n)
    if n <= 2:
        return "0-2"
    elif n <= 5:
        return "3-5"
    else:
        return ">=6"

df["comorbidity_burden_cat"] = df["comorbidity_count"].apply(comorbidity_cat_fn)

# ---------------------------------------------------
# 4) TUMOR STAGE GROUPING
# ---------------------------------------------------

def stage_group_fn(stage):
    if pd.isna(stage):
        return np.nan
    s = str(stage).strip().upper()
    if "IV" in s:
        return "Stage IV"
    elif "III" in s:
        return "Stage III"
    elif "II" in s or "I" in s:
        return "Stage I-II"
    else:
        return np.nan

df["tumor_stage_group"] = df["tumor_stage_roman"].apply(stage_group_fn)

# ---------------------------------------------------
# 5) BMI GROUPING (NORMAL / UNDER / OVER / OBESE)
# ---------------------------------------------------

def bmi_group_fn(label):
    if pd.isna(label):
        return np.nan
    s = str(label).lower()
    if "underweight" in s or "<18.5" in s:
        return "Underweight"
    if "normal" in s or "18.5-24.9" in s:
        return "Normal weight"
    if "overweight" in s or "25-29.99" in s:
        return "Overweight"
    if "obese" in s or ">=30" in s:
        return "Obese"
    return np.nan

df["bmi_group"] = df["bmi_category"].apply(bmi_group_fn)

# ---------------------------------------------------
# 6) ORDINARY HOSPITALISATIONS CATEGORIES (0 / 1-2 / >=3)
# ---------------------------------------------------

def hosp_cat_fn(n):
    if pd.isna(n):
        return np.nan
    try:
        n_val = int(n)
    except Exception:
        return np.nan
    if n_val == 0:
        return "0"
    elif n_val <= 2:
        return "1-2"
    else:
        return ">=3"

df["ordinary_hosp_cat"] = df["ordinary_hospitalizations_n"].apply(hosp_cat_fn)

# ---------------------------------------------------
# 7) LAB ABNORMALITY FLAGS
# ---------------------------------------------------

# Low hemoglobin
if "hemoglobin_range" in df.columns:
    def low_hb_fn(cat):
        if pd.isna(cat):
            return np.nan
        s = str(cat).lower()
        return 1 if "low" in s else 0
    df["low_hemoglobin"] = df["hemoglobin_range"].apply(low_hb_fn)
else:
    df["low_hemoglobin"] = np.nan

# Elevated creatinine
if "creatinine_range" in df.columns:
    def elevated_creat_fn(cat):
        if pd.isna(cat):
            return np.nan
        s = str(cat).lower()
        return 1 if ("high" in s or "elevated" in s) else 0
    df["elevated_creatinine"] = df["creatinine_range"].apply(elevated_creat_fn)
else:
    df["elevated_creatinine"] = np.nan

# Low albumin
if "albumin_range" in df.columns:
    def low_alb_fn(cat):
        if pd.isna(cat):
            return np.nan
        s = str(cat).lower()
        return 1 if "low" in s else 0
    df["low_albumin"] = df["albumin_range"].apply(low_alb_fn)
else:
    df["low_albumin"] = np.nan

# ---------------------------------------------------
# 8) QUICK SANITY CHECKS
# ---------------------------------------------------

print("\n=== CCI and ACCI ===")
print(df[["CCI_score", "ACCI_score"]].describe())

print("\n=== Comorbidity burden categories ===")
print(df["comorbidity_burden_cat"].value_counts(dropna=False))

print("\n=== Tumor stage group ===")
print(df["tumor_stage_group"].value_counts(dropna=False))

print("\n=== BMI group ===")
print(df["bmi_group"].value_counts(dropna=False))

print("\n=== Ordinary hospitalisations cat ===")
print(df["ordinary_hosp_cat"].value_counts(dropna=False))

print("\n=== Lab flags ===")
print("low_hemoglobin:")
print(df["low_hemoglobin"].value_counts(dropna=False))
print("\nelevated_creatinine:")
print(df["elevated_creatinine"].value_counts(dropna=False))
print("\nlow_albumin:")
print(df["low_albumin"].value_counts(dropna=False))


Binary comorbidity columns created:
['hypertension_bin', 'aortic_insufficiency_bin', 'dyslipidemia_bin', 'bph_bin', 'obesity_comorbidity_bin', 'ischemic_heart_disease_bin', 'atrial_fibrillation_bin', 'copd_bin', 'asthma_bin', 'diabetes_type_ii_bin', 'gastroesophageal_reflux_full_bin', 'hypertensive_heart_disease_bin', 'renal_insufficiency_bin', 'depressive_syndrome_bin', 'anemia_comorbidity_bin', 'psychiatric_disorders_bin', 'other_comorbidities_bin', 'cardiovascular_disorders_bin', 'gastrointestinal_disorders_bin', 'cerebrovascular_disorders_bin']

=== CCI and ACCI ===
        CCI_score  ACCI_score
count  403.000000  403.000000
mean     3.573201    5.585608
std      1.939759    2.324323
min      2.000000    2.000000
25%      2.000000    4.000000
50%      2.000000    5.000000
75%      6.000000    7.000000
max      8.000000   11.000000

=== Comorbidity burden categories ===
comorbidity_burden_cat
0-2    345
3-5     56
>=6      2
Name: count, dtype: int64

=== Tumor stage group ===
tumor

  return s.replace({
  return s.replace({
  return s.replace({
  return s.replace({
  return s.replace({
  return s.replace({
  return s.replace({
  return s.replace({
  return s.replace({
  return s.replace({
  return s.replace({
  return s.replace({
  return s.replace({
  return s.replace({
  return s.replace({
  return s.replace({
  return s.replace({
  return s.replace({
  return s.replace({


In [48]:
import numpy as np
import pandas as pd

# ---------------------------------------------------
# Ensure observation dates are parsed
# ---------------------------------------------------
date_cols_to_parse = ["observation_start_date", "observation_end_date"]

for col in date_cols_to_parse:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce")
    else:
        print(f"WARNING: column {col} not found in df")

# ---------------------------------------------------
# Helper to extract the earliest ADR date
# ---------------------------------------------------
def first_date_from_string(val):
    """
    Handles strings like:
    - '2023-07-18'
    - '2023-01-10; 2023-01-31; 2023-02-21'
    Returns the earliest valid datetime or NaT.
    """
    if pd.isna(val):
        return pd.NaT
    s = str(val).strip()
    if s == "":
        return pd.NaT
    
    parts = [p.strip() for p in s.split(";")]
    # Parse each part to datetime
    parsed = pd.to_datetime(parts, errors="coerce")
    
    # parsed is a DatetimeIndex/array
    if hasattr(parsed, "min"):
        return parsed.min()
    else:
        # Defensive fallback (should rarely be needed)
        valid_dates = [d for d in parsed if not pd.isna(d)]
        return min(valid_dates) if valid_dates else pd.NaT

if "adr_onset_date" in df.columns:
    df["adr_first_date"] = df["adr_onset_date"].apply(first_date_from_string)
else:
    print("WARNING: adr_onset_date column not found")
    df["adr_first_date"] = pd.NaT

# ---------------------------------------------------
# Severe ADR event flag (Grade >= 3)
# ---------------------------------------------------
if "adr_ctcae_grade" in df.columns:
    df["severe_adr_event"] = np.where(
        (df["adr_ctcae_grade"].notna()) & (df["adr_ctcae_grade"] >= 3),
        1,
        0
    )
else:
    print("WARNING: adr_ctcae_grade column not found")
    df["severe_adr_event"] = np.nan

# ---------------------------------------------------
# Define ADR end date = event date (if event) or observation_end_date
# ---------------------------------------------------
def adr_end_date_fn(row):
    # if severe ADR event, use earliest ADR date
    if row.get("severe_adr_event", 0) == 1:
        # if adr_first_date is missing for some reason, fall back to observation_end_date
        if pd.notna(row.get("adr_first_date", pd.NaT)):
            return row["adr_first_date"]
        else:
            return row["observation_end_date"]
    else:
        # no severe event: censored at observation_end_date
        return row["observation_end_date"]

df["adr_end_date"] = df.apply(adr_end_date_fn, axis=1)
df["adr_end_date"] = pd.to_datetime(df["adr_end_date"], errors="coerce")

# ---------------------------------------------------
# ADR follow-up time in days and person-years
# ---------------------------------------------------
df["adr_followup_days"] = (
    df["adr_end_date"] - df["observation_start_date"]
).dt.days

# Avoid zero or negative times (just in case)
df["adr_followup_days"] = df["adr_followup_days"].clip(lower=1)

df["adr_followup_py"] = df["adr_followup_days"] / 365.25

# ---------------------------------------------------
# Quick sanity checks
# ---------------------------------------------------
print("\n=== Severe ADR event flag ===")
print(df["severe_adr_event"].value_counts(dropna=False))

print("\n=== ADR follow-up (days) summary ===")
print(df["adr_followup_days"].describe())

print("\n=== ADR follow-up (person-years) summary ===")
print(df["adr_followup_py"].describe())

# Optional: check a few rows to ensure dates make sense
print("\nSample of ADR-related columns:")
print(
    df[
        [
            "observation_start_date",
            "adr_onset_date",
            "adr_first_date",
            "observation_end_date",
            "adr_end_date",
            "severe_adr_event",
            "adr_followup_days"
        ]
    ].head(10)
)


  df[col] = pd.to_datetime(df[col], errors="coerce")
  df[col] = pd.to_datetime(df[col], errors="coerce")



=== Severe ADR event flag ===
severe_adr_event
0    390
1     13
Name: count, dtype: int64

=== ADR follow-up (days) summary ===
count     231.000000
mean      496.229437
std       322.882647
min         6.000000
25%       197.500000
50%       476.000000
75%       822.500000
max      1085.000000
Name: adr_followup_days, dtype: float64

=== ADR follow-up (person-years) summary ===
count    231.000000
mean       1.358602
std        0.884005
min        0.016427
25%        0.540726
50%        1.303217
75%        2.251882
max        2.970568
Name: adr_followup_py, dtype: float64

Sample of ADR-related columns:
  observation_start_date                                     adr_onset_date  \
0                    NaT                             2023-01-29; 2023-03-15   
1             2022-04-22                                                NaN   
2                    NaT                             2023-04-05; 2023-05-13   
3                    NaT                                              

In [49]:
import numpy as np
import pandas as pd

# ---------------------------------------------------
# 0. Parse relevant date columns safely
# ---------------------------------------------------
date_cols = [
    "observation_start_date",
    "observation_end_date",
    "hospital_admission_date",
    "hospital_discharge_date",
]

for col in date_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce")
    else:
        print(f"WARNING: column {col} not found")


# ---------------------------------------------------
# 1. Extract earliest ADR date from adr_onset_date
# ---------------------------------------------------
def first_date_from_string(val):
    if pd.isna(val):
        return pd.NaT
    s = str(val).strip()
    if s == "":
        return pd.NaT
    parts = [p.strip() for p in s.split(";")]
    parsed = pd.to_datetime(parts, errors="coerce")
    valid_dates = parsed[~pd.isna(parsed)]
    return valid_dates.min() if len(valid_dates) > 0 else pd.NaT

df["adr_first_date"] = df["adr_onset_date"].apply(first_date_from_string)


# ---------------------------------------------------
# 2. Severe ADR event flag (Grade >= 3)
# ---------------------------------------------------
df["severe_adr_event"] = np.where(
    (df["adr_ctcae_grade"].notna()) & (df["adr_ctcae_grade"] >= 3),
    1,
    0
)


# ---------------------------------------------------
# 3. Build ADR START DATE with fallback:
#    observation_start_date first, else hospital_admission_date
# ---------------------------------------------------
def adr_start_fn(row):
    if pd.notna(row.get("observation_start_date", pd.NaT)):
        return row["observation_start_date"]
    else:
        return row.get("hospital_admission_date", pd.NaT)

df["adr_start_date"] = df.apply(adr_start_fn, axis=1)


# ---------------------------------------------------
# 4. ADR END DATE:
#    - If severe ADR event → use earliest ADR date
#    - Else → use observation_end_date
# ---------------------------------------------------
def adr_end_fn(row):
    if row.get("severe_adr_event", 0) == 1:
        if pd.notna(row.get("adr_first_date", pd.NaT)):
            return row["adr_first_date"]
        else:
            return row.get("observation_end_date", pd.NaT)
    else:
        return row.get("observation_end_date", pd.NaT)

df["adr_end_date"] = df.apply(adr_end_fn, axis=1)
df["adr_end_date"] = pd.to_datetime(df["adr_end_date"], errors="coerce")


# ---------------------------------------------------
# 5. Follow-up time for ADR outcome (days + person-years)
# ---------------------------------------------------
df["adr_followup_days"] = (
    df["adr_end_date"] - df["adr_start_date"]
).dt.days

# Clip negatives (if end < start)
df["adr_followup_days"] = df["adr_followup_days"].clip(lower=1)

df["adr_followup_py"] = df["adr_followup_days"] / 365.25


# ---------------------------------------------------
# 6. Diagnostics
# ---------------------------------------------------
print("\n=== Severe ADR event flag ===")
print(df["severe_adr_event"].value_counts(dropna=False))

print("\n=== Missing ADR start dates after fallback ===")
print(df["adr_start_date"].isna().sum())

print("\n=== Missing ADR end dates ===")
print(df["adr_end_date"].isna().sum())

print("\n=== ADR follow-up (days) summary ===")
print(df["adr_followup_days"].describe())

print("\n=== ADR follow-up (person-years) summary ===")
print(df["adr_followup_py"].describe())

# Show sample rows
print("\nSample ADR window rows:")
print(
    df[
        [
            "adr_start_date",
            "adr_end_date",
            "severe_adr_event",
            "adr_followup_days",
            "observation_start_date",
            "hospital_admission_date"
        ]
    ].head(15)
)



=== Severe ADR event flag ===
severe_adr_event
0    390
1     13
Name: count, dtype: int64

=== Missing ADR start dates after fallback ===
138

=== Missing ADR end dates ===
164

=== ADR follow-up (days) summary ===
count     234.000000
mean      490.230769
std       325.135030
min         1.000000
25%       193.500000
50%       472.000000
75%       816.500000
max      1085.000000
Name: adr_followup_days, dtype: float64

=== ADR follow-up (person-years) summary ===
count    234.000000
mean       1.342179
std        0.890171
min        0.002738
25%        0.529774
50%        1.292266
75%        2.235455
max        2.970568
Name: adr_followup_py, dtype: float64

Sample ADR window rows:
   adr_start_date adr_end_date  severe_adr_event  adr_followup_days  \
0             NaT          NaT                 0                NaN   
1      2022-04-22   2022-07-18                 0               87.0   
2             NaT          NaT                 0                NaN   
3      2024-07-02     

In [51]:
import numpy as np
import pandas as pd

# 1. Make sure date columns are parsed (re-run safely)
date_cols = [
    "observation_start_date",
    "observation_end_date",
    "hospital_admission_date",
    "hospital_discharge_date",
]
for col in date_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce")

# 2. Rebuild adr_first_date (idempotent, safe to overwrite)
def first_date_from_string(val):
    if pd.isna(val):
        return pd.NaT
    s = str(val).strip()
    if s == "":
        return pd.NaT
    parts = [p.strip() for p in s.split(";")]
    parsed = pd.to_datetime(parts, errors="coerce")
    valid_dates = parsed[~pd.isna(parsed)]
    return valid_dates.min() if len(valid_dates) > 0 else pd.NaT

df["adr_first_date"] = df["adr_onset_date"].apply(first_date_from_string)

# 3. Severe ADR flag (keep as before)
df["severe_adr_event"] = np.where(
    (df["adr_ctcae_grade"].notna()) & (df["adr_ctcae_grade"] >= 3),
    1,
    0
)

# 4. New ADR START DATE with extended fallback
def adr_start_fn2(row):
    # Order of preference
    for col in [
        "observation_start_date",
        "hospital_admission_date",
        "hospital_discharge_date",
        "adr_first_date"  # last resort
    ]:
        val = row.get(col, pd.NaT)
        if pd.notna(val):
            return val
    return pd.NaT

df["adr_start_date"] = df.apply(adr_start_fn2, axis=1)

# 5. New ADR END DATE with fallback
def adr_end_fn2(row):
    # If severe ADR event, prefer earliest ADR date
    if row.get("severe_adr_event", 0) == 1:
        if pd.notna(row.get("adr_first_date", pd.NaT)):
            return row["adr_first_date"]
        # fallback if adr_first_date missing
        for col in ["observation_end_date", "hospital_discharge_date"]:
            val = row.get(col, pd.NaT)
            if pd.notna(val):
                return val
        return pd.NaT
    else:
        # No severe ADR event: censor at end of observation or discharge
        for col in ["observation_end_date", "hospital_discharge_date"]:
            val = row.get(col, pd.NaT)
            if pd.notna(val):
                return val
        return pd.NaT

df["adr_end_date"] = df.apply(adr_end_fn2, axis=1)
df["adr_end_date"] = pd.to_datetime(df["adr_end_date"], errors="coerce")

# 6. Follow-up time in days and person-years
df["adr_followup_days"] = (df["adr_end_date"] - df["adr_start_date"]).dt.days

# remove negatives (if any)
df["adr_followup_days"] = df["adr_followup_days"].clip(lower=1)

df["adr_followup_py"] = df["adr_followup_days"] / 365.25

# 7. Diagnostics
print("=== Severe ADR event flag ===")
print(df["severe_adr_event"].value_counts(dropna=False), "\n")

print("Missing ADR start dates:", df["adr_start_date"].isna().sum())
print("Missing ADR end dates:", df["adr_end_date"].isna().sum(), "\n")

print("=== ADR follow-up (days) summary ===")
print(df["adr_followup_days"].describe(), "\n")

print("=== ADR follow-up (person-years) summary ===")
print(df["adr_followup_py"].describe(), "\n")

print("Sample ADR window rows:")
print(
    df[
        [
            "adr_start_date",
            "adr_end_date",
            "severe_adr_event",
            "adr_followup_days",
            "observation_start_date",
            "hospital_admission_date",
            "hospital_discharge_date",
            "adr_first_date"
        ]
    ].head(15)
)


=== Severe ADR event flag ===
severe_adr_event
0    390
1     13
Name: count, dtype: int64 

Missing ADR start dates: 36
Missing ADR end dates: 133 

=== ADR follow-up (days) summary ===
count     270.000000
mean      427.348148
std       343.141085
min         1.000000
25%       109.500000
50%       353.000000
75%       724.250000
max      1085.000000
Name: adr_followup_days, dtype: float64 

=== ADR follow-up (person-years) summary ===
count    270.000000
mean       1.170015
std        0.939469
min        0.002738
25%        0.299795
50%        0.966461
75%        1.982888
max        2.970568
Name: adr_followup_py, dtype: float64 

Sample ADR window rows:
   adr_start_date adr_end_date  severe_adr_event  adr_followup_days  \
0      2023-01-29          NaT                 0                NaN   
1      2022-04-22   2022-07-18                 0               87.0   
2      2023-04-05          NaT                 0                NaN   
3      2024-07-02   2024-07-05                 0  

In [52]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# 1) Restrict to patients with non-missing follow-up and defined event
df_adr = df.copy()

df_adr = df_adr[
    df_adr["adr_followup_py"].notna() &
    (df_adr["adr_followup_py"] > 0) &
    df_adr["severe_adr_event"].notna()
].copy()

print("N included in ADR IRR analysis:", len(df_adr))

# 2) Create age group (<=65 vs >65) for the table
def make_age_group(a):
    try:
        if pd.isna(a):
            return np.nan
        a = float(a)
        return "<=65" if a <= 65 else ">65"
    except Exception:
        return np.nan

df_adr["age_group_65"] = df_adr["age"].apply(make_age_group)

print(df_adr["age_group_65"].value_counts(dropna=False))


N included in ADR IRR analysis: 270
age_group_65
<=65    146
>65     124
Name: count, dtype: int64


In [61]:
# Age group
df_adr["age_group_65"] = df_adr["age_group_65"].astype("category")
df_adr["age_group_65"] = df_adr["age_group_65"].cat.reorder_categories(
    ["<=65", ">65"], ordered=True
)

# Comorbidity burden
if "comorbidity_burden_cat" in df_adr.columns:
    df_adr["comorbidity_burden_cat"] = df_adr["comorbidity_burden_cat"].astype("category")
    df_adr["comorbidity_burden_cat"] = df_adr["comorbidity_burden_cat"].cat.reorder_categories(
        ["0-2", "3-5", ">=6"], ordered=True
    )

# Tumor stage group
df_adr["tumor_stage_group"] = df_adr["tumor_stage_group"].astype("category")
df_adr["tumor_stage_group"] = df_adr["tumor_stage_group"].cat.reorder_categories(
    ["Stage I-II", "Stage III", "Stage IV"], ordered=True
)

# BMI group
df_adr["bmi_group"] = df_adr["bmi_group"].astype("category")
df_adr["bmi_group"] = df_adr["bmi_group"].cat.reorder_categories(
    ["Normal weight", "Underweight", "Overweight", "Obese"], ordered=True
)

# Ordinary hospitalisations category (0, 1-2, >=3) if used later
if "ordinary_hosp_cat" in df_adr.columns:
    df_adr["ordinary_hosp_cat"] = df_adr["ordinary_hosp_cat"].astype("category")
    df_adr["ordinary_hosp_cat"] = df_adr["ordinary_hosp_cat"].cat.reorder_categories(
        ["0", "1-2", ">=3"], ordered=True
    )

# Lab flags to numeric 0/1 (if not already)
for lab_col in ["low_hemoglobin", "elevated_creatinine", "low_albumin"]:
    if lab_col in df_adr.columns:
        df_adr[lab_col] = df_adr[lab_col].astype("float")


In [62]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy.linalg as npl

def fit_poisson_irr(formula, data, offset_col):
    """
    Fits a Poisson model with log(offset_col) and returns IRR table.
    - Uses robust HC0 covariance by default.
    - If robust covariance fails with singular matrix, falls back to non-robust.
    - If the model is still singular (e.g. no variation in predictor), returns (None, None).
    """

    # Drop rows with missing outcome, offset or any vars in the formula
    model = smf.glm(
        formula=formula,
        data=data,
        family=sm.families.Poisson(),
        offset=np.log(data[offset_col])
    )

    try:
        res = model.fit(cov_type="HC0")
    except npl.LinAlgError:
        print(f"Warning: robust covariance failed (singular matrix) for:\n  {formula}\nFalling back to non-robust covariance.")
        try:
            res = model.fit()
        except npl.LinAlgError:
            print(f"Error: even non-robust fit failed (singular matrix) for:\n  {formula}\nReturning None.")
            return None, None

    coef = res.params
    se = res.bse

    irr = np.exp(coef)
    lower = np.exp(coef - 1.96 * se)
    upper = np.exp(coef + 1.96 * se)

    out = pd.DataFrame({
        "term": coef.index,
        "coef": coef,
        "IRR": irr,
        "CI_lower": lower,
        "CI_upper": upper,
        "p_value": res.pvalues
    })

    return res, out


In [63]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy.linalg as npl

# -------------------------------------------------------------------
# Helper: check if a predictor has at least 2 distinct non-missing values
# -------------------------------------------------------------------
def has_variation(series):
    return series.dropna().nunique() >= 2

# -------------------------------------------------------------------
# Helper: fit Poisson with offset, return IRR table, robust to singularities
# -------------------------------------------------------------------
def fit_poisson_irr(formula, data, offset_col):
    """
    Fits a Poisson model with log(offset_col) as offset and returns:
      - res: fitted statsmodels results
      - out: DataFrame with term, coef, IRR, CI_lower, CI_upper, p_value

    Handles:
      - robust HC0 covariance by default
      - falls back to non-robust covariance if HC0 fails (singular matrix)
      - returns (None, None) if both fail
    """

    # Drop rows with missing offset first
    df = data.copy()
    df = df.loc[df[offset_col].notna()].copy()

    if df.empty:
        print(f"Skipping model (no data after dropping missing offset): {formula}")
        return None, None

    # Build model
    model = smf.glm(
        formula=formula,
        data=df,
        family=sm.families.Poisson(),
        offset = np.log(df[offset_col])
    )

    # Try robust covariance
    try:
        res = model.fit(cov_type="HC0")
    except npl.LinAlgError:
        print(f"Warning: robust covariance failed (singular matrix) for:\n  {formula}\nFalling back to non-robust covariance.")
        try:
            res = model.fit()
        except npl.LinAlgError:
            print(f"Error: even non-robust fit failed (singular matrix) for:\n  {formula}\nReturning None.")
            return None, None

    coef = res.params
    se = res.bse

    irr = np.exp(coef)
    lower = np.exp(coef - 1.96 * se)
    upper = np.exp(coef + 1.96 * se)

    out = pd.DataFrame({
        "term": coef.index,
        "coef": coef,
        "IRR": irr,
        "CI_lower": lower,
        "CI_upper": upper,
        "p_value": res.pvalues
    })

    return res, out

# -------------------------------------------------------------------
# Unadjusted Poisson models for severe_adr_event (incidence rate)
# -------------------------------------------------------------------

unadj_results = {}

# 1) Age group (<=65 vs >65)
formula_age_group_unadj = "severe_adr_event ~ C(age_group_65)"
res_age_group_u, tab_age_group_u = fit_poisson_irr(formula_age_group_unadj, df_adr, "adr_followup_py")
unadj_results["age_group"] = tab_age_group_u

# 2) Age continuous (per year)
formula_age_cont_unadj = "severe_adr_event ~ age"
res_age_cont_u, tab_age_cont_u = fit_poisson_irr(formula_age_cont_unadj, df_adr, "adr_followup_py")
unadj_results["age_cont"] = tab_age_cont_u

# 3) ACCI (per 1-point increase)
formula_acci_unadj = "severe_adr_event ~ ACCI_score"
res_acci_u, tab_acci_u = fit_poisson_irr(formula_acci_unadj, df_adr, "adr_followup_py")
unadj_results["ACCI"] = tab_acci_u

# 4) CCI (per 1-point increase)
formula_cci_unadj = "severe_adr_event ~ CCI_score"
res_cci_u, tab_cci_u = fit_poisson_irr(formula_cci_unadj, df_adr, "adr_followup_py")
unadj_results["CCI"] = tab_cci_u

# 5) Comorbidity burden categories (0-2 ref)
formula_comb_unadj = "severe_adr_event ~ C(comorbidity_burden_cat)"
res_comb_u, tab_comb_u = fit_poisson_irr(formula_comb_unadj, df_adr, "adr_followup_py")
unadj_results["comorbidity_burden"] = tab_comb_u

# 6) Tumor stage group (Stage I-II ref)
formula_stage_unadj = "severe_adr_event ~ C(tumor_stage_group)"
res_stage_u, tab_stage_u = fit_poisson_irr(formula_stage_unadj, df_adr, "adr_followup_py")
unadj_results["tumor_stage"] = tab_stage_u

# 7) Chemo cycles (per additional cycle)
# Make sure 'chemo_cycles_n' is the right column name in df_adr
formula_chemo_unadj = "severe_adr_event ~ chemo_cycles_n"
res_chemo_u, tab_chemo_u = fit_poisson_irr(formula_chemo_unadj, df_adr, "adr_followup_py")
unadj_results["chemo_cycles"] = tab_chemo_u

# 8) BMI category (Normal weight ref)
formula_bmi_unadj = "severe_adr_event ~ C(bmi_group)"
res_bmi_u, tab_bmi_u = fit_poisson_irr(formula_bmi_unadj, df_adr, "adr_followup_py")
unadj_results["bmi_group"] = tab_bmi_u

# 9) Lab abnormalities (one at a time), with variation checks

# low hemoglobin
if "low_hemoglobin" in df_adr.columns and has_variation(df_adr["low_hemoglobin"]):
    formula_hb_unadj = "severe_adr_event ~ low_hemoglobin"
    res_hb_u, tab_hb_u = fit_poisson_irr(formula_hb_unadj, df_adr, "adr_followup_py")
    unadj_results["low_hb"] = tab_hb_u
else:
    print("Skipping low_hemoglobin model: no variation or column missing in ADR subset.")
    unadj_results["low_hb"] = None

# elevated creatinine
if "elevated_creatinine" in df_adr.columns and has_variation(df_adr["elevated_creatinine"]):
    formula_creat_unadj = "severe_adr_event ~ elevated_creatinine"
    res_creat_u, tab_creat_u = fit_poisson_irr(formula_creat_unadj, df_adr, "adr_followup_py")
    unadj_results["elev_creat"] = tab_creat_u
else:
    print("Skipping elevated_creatinine model: no variation or column missing in ADR subset.")
    unadj_results["elev_creat"] = None

# low albumin
if "low_albumin" in df_adr.columns and has_variation(df_adr["low_albumin"]):
    formula_alb_unadj = "severe_adr_event ~ low_albumin"
    res_alb_u, tab_alb_u = fit_poisson_irr(formula_alb_unadj, df_adr, "adr_followup_py")
    unadj_results["low_alb"] = tab_alb_u
else:
    print("Skipping low_albumin model: no variation or column missing in ADR subset.")
    unadj_results["low_alb"] = None

# Example: inspect one
tab_age_group_u


Skipping low_hemoglobin model: no variation or column missing in ADR subset.
Skipping elevated_creatinine model: no variation or column missing in ADR subset.


Unnamed: 0,term,coef,IRR,CI_lower,CI_upper,p_value
Intercept,Intercept,-4.467975,0.011471,0.002835,0.046412,3.724647e-10
C(age_group_65)[T.>65],C(age_group_65)[T.>65],1.712588,5.543289,1.168132,26.305298,0.0311144


In [64]:
# See which models ran successfully
{key: (val is not None) for key, val in unadj_results.items()}

# Look at age group IRRs
unadj_results["age_group"]


Unnamed: 0,term,coef,IRR,CI_lower,CI_upper,p_value
Intercept,Intercept,-4.467975,0.011471,0.002835,0.046412,3.724647e-10
C(age_group_65)[T.>65],C(age_group_65)[T.>65],1.712588,5.543289,1.168132,26.305298,0.0311144


In [65]:
# Adjusted model for severe ADRs (Grade ≥3)
# Adjusted for:
# - age_group_65 (<=65 vs >65)
# - ACCI_score
# - tumor_stage_group
# - chemo_cycles_n
# - bmi_group
# - low_hemoglobin, elevated_creatinine, low_albumin

formula_adr_adj = """
severe_adr_event ~ C(age_group_65)
                  + ACCI_score
                  + C(tumor_stage_group)
                  + chemo_cycles_n
                  + C(bmi_group)
                  + low_hemoglobin
                  + elevated_creatinine
                  + low_albumin
"""

res_adr_adj, tab_adr_adj = fit_poisson_irr(formula_adr_adj, df_adr, "adr_followup_py")

tab_adr_adj


  
severe_adr_event ~ C(age_group_65)
                  + ACCI_score
                  + C(tumor_stage_group)
                  + chemo_cycles_n
                  + C(bmi_group)
                  + low_hemoglobin
                  + elevated_creatinine
                  + low_albumin

Falling back to non-robust covariance.


  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,term,coef,IRR,CI_lower,CI_upper,p_value
Intercept,Intercept,-1.928152,0.1454167,0.005846,3.617197,0.239634
C(age_group_65)[T.>65],C(age_group_65)[T.>65],0.969942,2.637792,0.321107,21.668631,0.366668
C(tumor_stage_group)[T.Stage III],C(tumor_stage_group)[T.Stage III],0.984935,2.677637,0.366333,19.571661,0.331796
C(tumor_stage_group)[T.Stage IV],C(tumor_stage_group)[T.Stage IV],-0.012374,0.9877018,0.057671,16.91581,0.993188
C(bmi_group)[T.Underweight],C(bmi_group)[T.Underweight],-17.266702,3.170784e-08,0.0,inf,0.998787
C(bmi_group)[T.Overweight],C(bmi_group)[T.Overweight],0.821275,2.273397,0.43805,11.798501,0.328306
C(bmi_group)[T.Obese],C(bmi_group)[T.Obese],0.932457,2.540744,0.268887,24.00782,0.415789
ACCI_score,ACCI_score,0.329438,1.390187,0.742824,2.601718,0.302888
chemo_cycles_n,chemo_cycles_n,-0.692382,0.5003828,0.34638,0.722855,0.000225
low_hemoglobin,low_hemoglobin,0.0,1.0,1.0,1.0,


In [66]:
age_row_adj = tab_adr_adj.loc[tab_adr_adj["term"] == "C(age_group_65)[T.>65]"].copy()
age_row_adj


Unnamed: 0,term,coef,IRR,CI_lower,CI_upper,p_value
C(age_group_65)[T.>65],C(age_group_65)[T.>65],0.969942,2.637792,0.321107,21.668631,0.366668


In [67]:
age_row_unadj = tab_age_group_u.loc[tab_age_group_u["term"] == "C(age_group_65)[T.>65]"].copy()
age_row_unadj


Unnamed: 0,term,coef,IRR,CI_lower,CI_upper,p_value
C(age_group_65)[T.>65],C(age_group_65)[T.>65],1.712588,5.543289,1.168132,26.305298,0.031114


In [68]:
def format_row(label, category, tab_unadj, term_unadj, tab_adj=None, term_adj=None):
    """
    Build one row for the final IRR table.
    tab_unadj / tab_adj are the IRR result tables.
    term_unadj / term_adj are the 'term' labels to pick in each.
    """
    # Unadjusted
    row_u = tab_unadj.loc[tab_unadj["term"] == term_unadj].iloc[0]
    irr_u = row_u["IRR"]
    l_u = row_u["CI_lower"]
    u_u = row_u["CI_upper"]

    if (tab_adj is not None) and (term_adj is not None):
        row_a = tab_adj.loc[tab_adj["term"] == term_adj].iloc[0]
        irr_a = row_a["IRR"]
        l_a = row_a["CI_lower"]
        u_a = row_a["CI_upper"]
    else:
        irr_a = l_a = u_a = np.nan

    return {
        "Variable": label,
        "Category": category,
        "Unadj_IRR": irr_u,
        "Unadj_95CI": f"{l_u:.2f}-{u_u:.2f}",
        "Adj_IRR": irr_a,
        "Adj_95CI": f"{l_a:.2f}-{u_a:.2f}" if not np.isnan(irr_a) else ""
    }

# Example: Age group (>65 vs ≤65)
row_age = format_row(
    label="Age group",
    category=">65 vs ≤65",
    tab_unadj=tab_age_group_u,
    term_unadj="C(age_group_65)[T.>65]",
    tab_adj=tab_adr_adj,
    term_adj="C(age_group_65)[T.>65]"
)

row_age


{'Variable': 'Age group',
 'Category': '>65 vs ≤65',
 'Unadj_IRR': 5.543289038472696,
 'Unadj_95CI': '1.17-26.31',
 'Adj_IRR': 2.6377920064728198,
 'Adj_95CI': '0.32-21.67'}

In [69]:
table2_rows = []

# Age group example
table2_rows.append(row_age)

# Example ACCI (per point), assuming you want unadj from tab_acci_u and adj from tab_adr_adj
row_acci = format_row(
    label="ACCI score",
    category="Per 1-point increase",
    tab_unadj=tab_acci_u,
    term_unadj="ACCI_score",
    tab_adj=tab_adr_adj,
    term_adj="ACCI_score"
)
table2_rows.append(row_acci)

# ...add more rows for tumor stage, chemo cycles, BMI, labs...

table2_df = pd.DataFrame(table2_rows)
table2_df


Unnamed: 0,Variable,Category,Unadj_IRR,Unadj_95CI,Adj_IRR,Adj_95CI
0,Age group,>65 vs ≤65,5.543289,1.17-26.31,2.637792,0.32-21.67
1,ACCI score,Per 1-point increase,1.478646,1.17-1.86,1.390187,0.74-2.60


In [70]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Keep only rows with valid follow-up and outcome
df_adr = df.copy()
df_adr = df_adr[
    (df_adr["adr_followup_py"] > 0) &
    df_adr["severe_adr_event"].isin([0, 1])
].copy()

# Make sure age_group_65 exists (<=65 vs >65)
if "age_group_65" not in df_adr.columns:
    df_adr["age_group_65"] = np.where(df_adr["age"] > 65, ">65", "<=65")

def fit_poisson_irr(formula, data, offset_col):
    """
    Fit a Poisson regression with log(offset_col) and
    return (results, IRR_table)
    """
    model = smf.glm(
        formula=formula,
        data=data,
        family=sm.families.Poisson(),
        offset=np.log(data[offset_col])
    )
    try:
        res = model.fit(cov_type="HC0")  # robust SE
    except np.linalg.LinAlgError:
        # fallback to default covariance if robust fails
        res = model.fit()
    
    coef = res.params
    se = res.bse
    irr = np.exp(coef)
    ci_lower = np.exp(coef - 1.96 * se)
    ci_upper = np.exp(coef + 1.96 * se)
    pvals = res.pvalues

    tab = pd.DataFrame({
        "term": coef.index,
        "coef": coef.values,
        "IRR": irr.values,
        "CI_lower": ci_lower.values,
        "CI_upper": ci_upper.values,
        "p_value": pvals.values
    })
    return res, tab


In [71]:
unadj_results = {}

# Age group (<=65 vs >65)
res_age_group_u, tab_age_group_u = fit_poisson_irr(
    "severe_adr_event ~ C(age_group_65)",
    df_adr, "adr_followup_py"
)
unadj_results["age_group"] = tab_age_group_u

# Age continuous (per year)
res_age_cont_u, tab_age_cont_u = fit_poisson_irr(
    "severe_adr_event ~ age",
    df_adr, "adr_followup_py"
)
unadj_results["age_cont"] = tab_age_cont_u

# ACCI (per 1-point increase)
res_acci_u, tab_acci_u = fit_poisson_irr(
    "severe_adr_event ~ ACCI_score",
    df_adr, "adr_followup_py"
)
unadj_results["ACCI"] = tab_acci_u

# CCI (per 1-point increase)
res_cci_u, tab_cci_u = fit_poisson_irr(
    "severe_adr_event ~ CCI_score",
    df_adr, "adr_followup_py"
)
unadj_results["CCI"] = tab_cci_u

# Comorbidity burden categories (0-2 ref)
res_comb_u, tab_comb_u = fit_poisson_irr(
    "severe_adr_event ~ C(comorbidity_burden_cat)",
    df_adr, "adr_followup_py"
)
unadj_results["comorbidity_burden"] = tab_comb_u

# Tumor stage group (Stage I-II ref)
res_stage_u, tab_stage_u = fit_poisson_irr(
    "severe_adr_event ~ C(tumor_stage_group)",
    df_adr, "adr_followup_py"
)
unadj_results["tumor_stage"] = tab_stage_u

# Chemo cycles (per cycle)
res_chemo_u, tab_chemo_u = fit_poisson_irr(
    "severe_adr_event ~ chemo_cycles_n",
    df_adr, "adr_followup_py"
)
unadj_results["chemo_cycles"] = tab_chemo_u

# BMI category (Normal weight ref)
res_bmi_u, tab_bmi_u = fit_poisson_irr(
    "severe_adr_event ~ C(bmi_group)",
    df_adr, "adr_followup_py"
)
unadj_results["bmi_group"] = tab_bmi_u

# Lab abnormalities (one at a time)
res_hb_u, tab_hb_u = fit_poisson_irr(
    "severe_adr_event ~ low_hemoglobin",
    df_adr, "adr_followup_py"
)
unadj_results["low_hb"] = tab_hb_u

res_creat_u, tab_creat_u = fit_poisson_irr(
    "severe_adr_event ~ elevated_creatinine",
    df_adr, "adr_followup_py"
)
unadj_results["elev_creat"] = tab_creat_u

res_alb_u, tab_alb_u = fit_poisson_irr(
    "severe_adr_event ~ low_albumin",
    df_adr, "adr_followup_py"
)
unadj_results["low_alb"] = tab_alb_u

# Cardiovascular comorbidity (Yes vs No)
res_cardio_u, tab_cardio_u = fit_poisson_irr(
    "severe_adr_event ~ cardiovascular_disorders_bin",
    df_adr, "adr_followup_py"
)
unadj_results["cardio"] = tab_cardio_u

# Diabetes type II (Yes vs No)
res_dm_u, tab_dm_u = fit_poisson_irr(
    "severe_adr_event ~ diabetes_type_ii_bin",
    df_adr, "adr_followup_py"
)
unadj_results["diabetes"] = tab_dm_u

# Dose reduction performed (Yes vs No)
# If your variable name is different (e.g. 'dose_reduced_bin'), change here
res_dose_u, tab_dose_u = fit_poisson_irr(
    "severe_adr_event ~ dose_reduced",
    df_adr, "adr_followup_py"
)
unadj_results["dose_reduced"] = tab_dose_u


In [72]:
# Main adjusted model – hospital specification
formula_adr_adj = """
severe_adr_event ~ C(age_group_65)
                  + ACCI_score
                  + C(tumor_stage_group)
                  + chemo_cycles_n
                  + C(bmi_group)
                  + low_hemoglobin
                  + elevated_creatinine
                  + low_albumin
"""
res_adr_adj, tab_adr_adj = fit_poisson_irr(
    formula_adr_adj, df_adr, "adr_followup_py"
)


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [73]:
# Adjusted model for CCI (replace ACCI with CCI)
formula_cci_adj = """
severe_adr_event ~ CCI_score
                  + C(age_group_65)
                  + C(tumor_stage_group)
                  + chemo_cycles_n
                  + C(bmi_group)
                  + low_hemoglobin
                  + elevated_creatinine
                  + low_albumin
"""
res_cci_adj, tab_cci_adj = fit_poisson_irr(
    formula_cci_adj, df_adr, "adr_followup_py"
)

# Cardiovascular comorbidity adjusted
formula_cardio_adj = """
severe_adr_event ~ cardiovascular_disorders_bin
                  + C(age_group_65)
                  + ACCI_score
                  + C(tumor_stage_group)
                  + chemo_cycles_n
                  + C(bmi_group)
                  + low_hemoglobin
                  + elevated_creatinine
                  + low_albumin
"""
res_cardio_adj, tab_cardio_adj = fit_poisson_irr(
    formula_cardio_adj, df_adr, "adr_followup_py"
)

# Diabetes type II adjusted
formula_dm_adj = """
severe_adr_event ~ diabetes_type_ii_bin
                  + C(age_group_65)
                  + ACCI_score
                  + C(tumor_stage_group)
                  + chemo_cycles_n
                  + C(bmi_group)
                  + low_hemoglobin
                  + elevated_creatinine
                  + low_albumin
"""
res_dm_adj, tab_dm_adj = fit_poisson_irr(
    formula_dm_adj, df_adr, "adr_followup_py"
)

# Dose reduction adjusted
formula_dose_adj = """
severe_adr_event ~ dose_reduced
                  + C(age_group_65)
                  + ACCI_score
                  + C(tumor_stage_group)
                  + chemo_cycles_n
                  + C(bmi_group)
                  + low_hemoglobin
                  + elevated_creatinine
                  + low_albumin
"""
res_dose_adj, tab_dose_adj = fit_poisson_irr(
    formula_dose_adj, df_adr, "adr_followup_py"
)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [74]:
def get_term(tab, term_name):
    """
    Extract IRR and CI for a single term.
    Returns (IRR, CI_lower, CI_upper) or (np.nan, np.nan, np.nan) if missing.
    """
    row = tab.loc[tab["term"] == term_name]
    if row.empty:
        return np.nan, np.nan, np.nan
    r = row.iloc[0]
    return r["IRR"], r["CI_lower"], r["CI_upper"]

def make_row(variable, category,
             tab_unadj=None, term_unadj=None,
             tab_adj=None, term_adj=None,
             ref=False):
    if ref:
        return {
            "Variable": variable,
            "Category": category,
            "Unadj_IRR": 1.00,
            "Unadj_95CI": "",
            "Adj_IRR": 1.00,
            "Adj_95CI": ""
        }
    
    irr_u, l_u, u_u = (np.nan, np.nan, np.nan)
    irr_a, l_a, u_a = (np.nan, np.nan, np.nan)

    if (tab_unadj is not None) and (term_unadj is not None):
        irr_u, l_u, u_u = get_term(tab_unadj, term_unadj)
    
    if (tab_adj is not None) and (term_adj is not None):
        irr_a, l_a, u_a = get_term(tab_adj, term_adj)
    
    return {
        "Variable": variable,
        "Category": category,
        "Unadj_IRR": irr_u,
        "Unadj_95CI": "" if np.isnan(irr_u) else f"{l_u:.2f}-{u_u:.2f}",
        "Adj_IRR": irr_a,
        "Adj_95CI": "" if np.isnan(irr_a) else f"{l_a:.2f}-{u_a:.2f}",
    }


In [75]:
rows = []

# 1. Age group (<=65 vs >65)
rows.append(make_row("Age group", "≤65", ref=True))

rows.append(
    make_row(
        "Age group", ">65",
        tab_unadj=tab_age_group_u,
        term_unadj="C(age_group_65)[T.>65]",
        tab_adj=tab_adr_adj,
        term_adj="C(age_group_65)[T.>65]"
    )
)

# 2. Age continuous (per year)
rows.append(
    make_row(
        "Age (continuous, per year)", "Per 1-year increase",
        tab_unadj=tab_age_cont_u,
        term_unadj="age",
        tab_adj=tab_adr_adj,    # age is not in the main adj formula; if you added it there, this will work
        term_adj="age"
    )
)

# 3. ACCI score
rows.append(
    make_row(
        "ACCI score", "Per 1-point increase",
        tab_unadj=tab_acci_u,
        term_unadj="ACCI_score",
        tab_adj=tab_adr_adj,
        term_adj="ACCI_score"
    )
)

# 4. CCI score
rows.append(
    make_row(
        "CCI score", "Per 1-point increase",
        tab_unadj=tab_cci_u,
        term_unadj="CCI_score",
        tab_adj=tab_cci_adj,
        term_adj="CCI_score"
    )
)

# 5. Comorbidity burden
rows.append(make_row("Comorbidity burden", "0-2 comorbidities", ref=True))

rows.append(
    make_row(
        "Comorbidity burden", "3-5 comorbidities",
        tab_unadj=tab_comb_u,
        term_unadj="C(comorbidity_burden_cat)[T.3-5]"
        # no adjusted term here because it's not in main adj model;
        # you can add a model and term if you decide to adjust
    )
)

rows.append(
    make_row(
        "Comorbidity burden", "≥6 comorbidities",
        tab_unadj=tab_comb_u,
        term_unadj="C(comorbidity_burden_cat)[T.>=6]"
    )
)

# 6. Tumor Stage
rows.append(make_row("Tumor Stage", "Stage I-II", ref=True))

rows.append(
    make_row(
        "Tumor Stage", "Stage III",
        tab_unadj=tab_stage_u,
        term_unadj="C(tumor_stage_group)[T.Stage III]",
        tab_adj=tab_adr_adj,
        term_adj="C(tumor_stage_group)[T.Stage III]"
    )
)

rows.append(
    make_row(
        "Tumor Stage", "Stage IV",
        tab_unadj=tab_stage_u,
        term_unadj="C(tumor_stage_group)[T.Stage IV]",
        tab_adj=tab_adr_adj,
        term_adj="C(tumor_stage_group)[T.Stage IV]"
    )
)

# 7. Chemotherapy cycles
rows.append(
    make_row(
        "Chemotherapy cycles", "Per additional cycle",
        tab_unadj=tab_chemo_u,
        term_unadj="chemo_cycles_n",
        tab_adj=tab_adr_adj,
        term_adj="chemo_cycles_n"
    )
)

# 8. BMI category
rows.append(make_row("BMI category", "Normal weight (reference)", ref=True))

rows.append(
    make_row(
        "BMI category", "Underweight",
        tab_unadj=tab_bmi_u,
        term_unadj="C(bmi_group)[T.Underweight]",
        tab_adj=tab_adr_adj,
        term_adj="C(bmi_group)[T.Underweight]"
    )
)

rows.append(
    make_row(
        "BMI category", "Overweight",
        tab_unadj=tab_bmi_u,
        term_unadj="C(bmi_group)[T.Overweight]",
        tab_adj=tab_adr_adj,
        term_adj="C(bmi_group)[T.Overweight]"
    )
)

rows.append(
    make_row(
        "BMI category", "Obese",
        tab_unadj=tab_bmi_u,
        term_unadj="C(bmi_group)[T.Obese]",
        tab_adj=tab_adr_adj,
        term_adj="C(bmi_group)[T.Obese]"
    )
)

# 9. Laboratory abnormalities
rows.append(
    make_row(
        "Laboratory abnormalities", "Low hemoglobin",
        tab_unadj=tab_hb_u,
        term_unadj="low_hemoglobin",
        tab_adj=tab_adr_adj,
        term_adj="low_hemoglobin"
    )
)

rows.append(
    make_row(
        "Laboratory abnormalities", "Elevated creatinine",
        tab_unadj=tab_creat_u,
        term_unadj="elevated_creatinine",
        tab_adj=tab_adr_adj,
        term_adj="elevated_creatinine"
    )
)

rows.append(
    make_row(
        "Laboratory abnormalities", "Low albumin",
        tab_unadj=tab_alb_u,
        term_unadj="low_albumin",
        tab_adj=tab_adr_adj,
        term_adj="low_albumin"
    )
)

# 10. Cardiovascular comorbidity
rows.append(make_row("Cardiovascular comorbidity", "No", ref=True))

rows.append(
    make_row(
        "Cardiovascular comorbidity", "Yes",
        tab_unadj=tab_cardio_u,
        term_unadj="cardiovascular_disorders_bin",
        tab_adj=tab_cardio_adj,
        term_adj="cardiovascular_disorders_bin"
    )
)

# 11. Diabetes type II
rows.append(make_row("Diabetes type II", "No", ref=True))

rows.append(
    make_row(
        "Diabetes type II", "Yes",
        tab_unadj=tab_dm_u,
        term_unadj="diabetes_type_ii_bin",
        tab_adj=tab_dm_adj,
        term_adj="diabetes_type_ii_bin"
    )
)

# 12. Dose reduction performed
rows.append(make_row("Dose reduction performed", "No", ref=True))

rows.append(
    make_row(
        "Dose reduction performed", "Yes",
        tab_unadj=tab_dose_u,
        term_unadj="dose_reduced",
        tab_adj=tab_dose_adj,
        term_adj="dose_reduced"
    )
)

table2_df = pd.DataFrame(rows)

# Optional: round IRRs for readability
table2_df["Unadj_IRR"] = table2_df["Unadj_IRR"].round(2)
table2_df["Adj_IRR"] = table2_df["Adj_IRR"].round(2)

table2_df


Unnamed: 0,Variable,Category,Unadj_IRR,Unadj_95CI,Adj_IRR,Adj_95CI
0,Age group,≤65,1.0,,1.0,
1,Age group,>65,5.54,1.17-26.31,2.64,0.32-21.67
2,"Age (continuous, per year)",Per 1-year increase,1.18,1.06-1.32,,
3,ACCI score,Per 1-point increase,1.48,1.17-1.86,1.39,0.74-2.60
4,CCI score,Per 1-point increase,1.29,0.96-1.75,0.44,0.12-1.54
5,Comorbidity burden,0-2 comorbidities,1.0,,1.0,
6,Comorbidity burden,3-5 comorbidities,0.0,0.00-0.00,,
7,Comorbidity burden,≥6 comorbidities,0.0,0.00-0.00,,
8,Tumor Stage,Stage I-II,1.0,,1.0,
9,Tumor Stage,Stage III,1.94,0.34-11.06,2.68,0.37-19.57


In [76]:
output_path = "table2_severe_adr_irr.xlsx"
table2_df.to_excel(output_path, index=False)
print(f"Table 2 exported to: {output_path}")

Table 2 exported to: table2_severe_adr_irr.xlsx


  table2_df.to_excel(output_path, index=False)


In [87]:
import numpy as np
import pandas as pd
from lifelines import CoxPHFitter
from lifelines.exceptions import ConvergenceError

# ============================================================
# 1. Start from your main dataset
# ============================================================

dataset = df.copy()

In [88]:
# ============================================================
# 2. Make date columns proper datetimes and tz-naive
# ============================================================

for dcol in ["observation_start_date", "observation_end_date", "death_date"]:
    if dcol in dataset.columns:
        dataset[dcol] = pd.to_datetime(dataset[dcol], errors="coerce")
        try:
            dataset[dcol] = dataset[dcol].dt.tz_localize(None)
        except Exception:
            # already tz-naive
            pass

In [89]:
# ============================================================
# 3. Overall survival event indicator: 1 = death, 0 = censored
# ============================================================

if "Overall_Survival" not in dataset.columns:
    if "death_date" not in dataset.columns:
        raise ValueError("death_date column is required to compute Overall_Survival.")
    dataset["Overall_Survival"] = dataset["death_date"].apply(
        lambda x: 1 if pd.notna(x) else 0
    )

In [90]:
# ============================================================
# 4. Survival time in days from observation_start_date
# ============================================================

if "survival_days" not in dataset.columns:
    if "observation_start_date" not in dataset.columns:
        raise ValueError("observation_start_date is required to compute survival_days.")
    
    # Censoring date: observation_end_date if present, otherwise today
    if "observation_end_date" in dataset.columns:
        censor_date = dataset["observation_end_date"].where(
            dataset["observation_end_date"].notna(), pd.Timestamp.today()
        )
    else:
        censor_date = pd.Series(pd.Timestamp.today(), index=dataset.index)
    
    # Event date: death_date if present, else censor_date
    if "death_date" in dataset.columns:
        event_date = dataset["death_date"].where(
            dataset["death_date"].notna(), censor_date
        )
    else:
        event_date = censor_date
    
    dataset["survival_days"] = (event_date - dataset["observation_start_date"]).dt.days

# Remove non-positive times
dataset.loc[dataset["survival_days"] <= 0, "survival_days"] = np.nan

In [91]:
# ============================================================
# 5. Age group 65 (<=65 vs >65)
# ============================================================

if "age_group_65" not in dataset.columns:
    if "age" not in dataset.columns:
        raise ValueError("age column is required to derive age_group_65.")
    dataset["age_group_65"] = np.where(dataset["age"] > 65, ">65", "≤65")

# Binary version: 1 = >65, 0 = ≤65
dataset["age_gt65"] = np.where(dataset["age_group_65"] == ">65", 1, 0)

print("age_group_65 counts:")
print(dataset["age_group_65"].value_counts(dropna=False))

age_group_65 counts:
age_group_65
≤65    206
>65    197
Name: count, dtype: int64


In [92]:
# ============================================================
# 6. Build df_surv with required columns
# ============================================================

needed_cols = [
    "survival_days",
    "Overall_Survival",
    "age",
    "age_group_65",
    "age_gt65",
    "ACCI_score",
    "CCI_score",
    "comorbidity_burden_cat",
    "severe_adr_event",
    "tumor_stage_group",
    "ordinary_hosp_cat",
    "chemo_cycles_n",
    "bmi_group",
    "surgical_intervention",
    "low_hemoglobin",
    "elevated_creatinine",
    "low_albumin",
    "cardiovascular_disorders_bin",
    "diabetes_type_ii_bin",
]

missing_needed = [c for c in needed_cols if c not in dataset.columns]
if missing_needed:
    print("WARNING - missing expected columns:", missing_needed)

keep_cols = [c for c in needed_cols if c in dataset.columns]
df_surv = dataset[keep_cols].copy()

# Keep rows with valid survival info
df_surv = df_surv[
    df_surv["survival_days"].notna() &
    df_surv["Overall_Survival"].isin([0, 1])
].copy()

print("\nSurvival dataset shape:", df_surv.shape)
print(df_surv[["survival_days", "Overall_Survival"]].describe(include="all"))



Survival dataset shape: (231, 19)
       survival_days  Overall_Survival
count     231.000000        231.000000
mean      529.865801          0.290043
std       320.418957          0.454767
min         6.000000          0.000000
25%       229.000000          0.000000
50%       503.000000          0.000000
75%       832.500000          1.000000
max      1270.000000          1.000000


In [99]:
#import numpy as np
import pandas as pd
from lifelines import CoxPHFitter
from lifelines.exceptions import ConvergenceError

# ============================================================
# 7. Create explicit dummy/binary covariates (updated)
# ============================================================

# Comorbidity burden: 0-2 is reference
df_surv["comb_3_5"] = np.where(df_surv["comorbidity_burden_cat"] == "3-5", 1, 0)
df_surv["comb_ge6"] = np.where(df_surv["comorbidity_burden_cat"] == ">=6", 1, 0)

# Tumor stage: Stage I-II reference
df_surv["stage_III"] = np.where(df_surv["tumor_stage_group"] == "Stage III", 1, 0)
df_surv["stage_IV"] = np.where(df_surv["tumor_stage_group"] == "Stage IV", 1, 0)

# Ordinary hospitalizations: 0 reference
df_surv["hosp_1_2"] = np.where(df_surv["ordinary_hosp_cat"] == "1-2", 1, 0)
df_surv["hosp_ge3"] = np.where(df_surv["ordinary_hosp_cat"] == ">=3", 1, 0)

# BMI category: Normal weight reference
df_surv["bmi_underweight"] = np.where(df_surv["bmi_group"] == "Underweight", 1, 0)
df_surv["bmi_overweight"] = np.where(df_surv["bmi_group"] == "Overweight", 1, 0)
df_surv["bmi_obese"] = np.where(df_surv["bmi_group"] == "Obese", 1, 0)

# Age group binary (we already created age_gt65 earlier, but just to be safe)
df_surv["age_gt65"] = np.where(df_surv["age_group_65"] == ">65", 1, 0)

# Surgical intervention: map string to 0/1
# Adjust these strings if your data uses slightly different labels.
def map_surg(x):
    if isinstance(x, str):
        x = x.strip()
        if x.lower() in ["present / yes", "yes", "present"]:
            return 1
        if x.lower() in ["absent / no", "no", "absent"]:
            return 0
    return np.nan

df_surv["surg_bin"] = df_surv["surgical_intervention"].apply(map_surg)

print("surg_bin value counts:")
print(df_surv["surg_bin"].value_counts(dropna=False))

# Ensure lab variables are numeric
for lab_var in ["low_hemoglobin", "elevated_creatinine", "low_albumin"]:
    if lab_var in df_surv.columns:
        df_surv[lab_var] = pd.to_numeric(df_surv[lab_var], errors="coerce")

surg_bin value counts:
surg_bin
0    117
1    114
Name: count, dtype: int64


In [100]:
# ============================================================
# 8. Helper functions (same as before, but using vars lists)
# ============================================================

def has_variation(data, var):
    s = data[var].dropna() if var in data.columns else pd.Series(dtype=float)
    return s.nunique() > 1

def fit_cox_vars(vars_list, data, penalizer=0.1, name_for_log=""):
    cols = ["survival_days", "Overall_Survival"] + vars_list
    cols = [c for c in cols if c in data.columns]
    df_sub = data[cols].copy().dropna()

    if df_sub["Overall_Survival"].sum() == 0:
        raise ValueError(f"No events in subset for {name_for_log or vars_list}")

    for v in vars_list:
        if v in df_sub.columns and df_sub[v].dropna().nunique() < 2:
            raise ValueError(f"Variable {v} has no variation in subset for {name_for_log}")

    cph = CoxPHFitter(penalizer=penalizer)
    cph.fit(df_sub, duration_col="survival_days", event_col="Overall_Survival")

    summ = cph.summary.copy()
    summ["term"] = summ.index
    summ["HR"] = np.exp(summ["coef"])
    summ["CI_lower"] = np.exp(summ["coef"] - 1.96 * summ["se(coef)"])
    summ["CI_upper"] = np.exp(summ["coef"] + 1.96 * summ["se(coef)"])
    summ = summ.rename(columns={"p": "p_value"})
    tab = summ[["term", "coef", "HR", "CI_lower", "CI_upper", "p_value"]].copy()
    return cph, tab

def safe_fit_cox_vars(vars_list, data, penalizer=0.1, name_for_log=""):
    try:
        return fit_cox_vars(vars_list, data, penalizer=penalizer, name_for_log=name_for_log)
    except ConvergenceError as e:
        print(f"[ConvergenceError] {name_for_log or vars_list}: {e}")
        return None, None
    except ValueError as e:
        print(f"[ValueError] {name_for_log or vars_list}: {e}")
        return None, None

def extract_term(tab, term):
    if tab is None:
        return np.nan, np.nan, np.nan
    m = tab[tab["term"] == term]
    if m.empty:
        return np.nan, np.nan, np.nan
    r = m.iloc[0]
    return r["HR"], r["CI_lower"], r["CI_upper"]

def format_ci(lo, hi):
    if np.isnan(lo) or np.isnan(hi):
        return ""
    return f"{lo:.2f}-{hi:.2f}"

def make_row(variable, category, tab_unadj=None, term_unadj=None,
             tab_adj=None, term_adj=None, ref=False):
    if ref:
        return {
            "Variable": variable,
            "Category": category,
            "Unadj_HR": 1.00,
            "Unadj_CI": "",
            "Adj_HR": 1.00,
            "Adj_CI": "",
        }
    hr_u, lo_u, hi_u = extract_term(tab_unadj, term_unadj) if tab_unadj is not None else (np.nan, np.nan, np.nan)
    hr_a, lo_a, hi_a = extract_term(tab_adj, term_adj) if tab_adj is not None else (np.nan, np.nan, np.nan)

    return {
        "Variable": variable,
        "Category": category,
        "Unadj_HR": None if np.isnan(hr_u) else round(hr_u, 2),
        "Unadj_CI": format_ci(lo_u, hi_u),
        "Adj_HR": None if np.isnan(hr_a) else round(hr_a, 2),
        "Adj_CI": format_ci(hi_a if False else lo_a, hi_a),  # just format lo–hi
    }

In [101]:
# ============================================================
# 9. UNADJUSTED models (re-run with surg_bin)
# ============================================================

unadj = {}

cph_agegrp_u, tab_agegrp_u = safe_fit_cox_vars(["age_gt65"], df_surv, name_for_log="age_gt65")
unadj["age_group"] = tab_agegrp_u

cph_age_u, tab_age_u = safe_fit_cox_vars(["age"], df_surv, name_for_log="age")
unadj["age_cont"] = tab_age_u

cph_acci_u, tab_acci_u = safe_fit_cox_vars(["ACCI_score"], df_surv, name_for_log="ACCI")
unadj["ACCI"] = tab_acci_u

cph_cci_u, tab_cci_u = safe_fit_cox_vars(["CCI_score"], df_surv, name_for_log="CCI")
unadj["CCI"] = tab_cci_u

cph_comb_u, tab_comb_u = safe_fit_cox_vars(["comb_3_5", "comb_ge6"], df_surv, name_for_log="comb_burden")
unadj["comorbidity_burden"] = tab_comb_u

cph_sevadr_u, tab_sevadr_u = safe_fit_cox_vars(["severe_adr_event"], df_surv, name_for_log="severe_adr")
unadj["severe_adr"] = tab_sevadr_u

cph_stage_u, tab_stage_u = safe_fit_cox_vars(["stage_III", "stage_IV"], df_surv, name_for_log="tumor_stage")
unadj["tumor_stage"] = tab_stage_u

cph_hosp_u, tab_hosp_u = safe_fit_cox_vars(["hosp_1_2", "hosp_ge3"], df_surv, name_for_log="ordinary_hosp")
unadj["ordinary_hosp"] = tab_hosp_u

cph_chemo_u, tab_chemo_u = safe_fit_cox_vars(["chemo_cycles_n"], df_surv, name_for_log="chemo_cycles")
unadj["chemo_cycles"] = tab_chemo_u

cph_bmi_u, tab_bmi_u = safe_fit_cox_vars(
    ["bmi_underweight", "bmi_overweight", "bmi_obese"], df_surv, name_for_log="bmi_group"
)
unadj["bmi_group"] = tab_bmi_u

cph_surg_u, tab_surg_u = safe_fit_cox_vars(["surg_bin"], df_surv, name_for_log="surgical")
unadj["surgical"] = tab_surg_u

# labs: skip if no variation
if has_variation(df_surv, "low_hemoglobin"):
    cph_hb_u, tab_hb_u = safe_fit_cox_vars(["low_hemoglobin"], df_surv, name_for_log="low_hemoglobin")
else:
    print("Skipping Cox for low_hemoglobin (no variation).")
    cph_hb_u, tab_hb_u = None, None
unadj["low_hb"] = tab_hb_u

if has_variation(df_surv, "elevated_creatinine"):
    cph_creat_u, tab_creat_u = safe_fit_cox_vars(["elevated_creatinine"], df_surv, name_for_log="elevated_creatinine")
else:
    print("Skipping Cox for elevated_creatinine (no variation).")
    cph_creat_u, tab_creat_u = None, None
unadj["elev_creat"] = tab_creat_u

if has_variation(df_surv, "low_albumin"):
    cph_alb_u, tab_alb_u = safe_fit_cox_vars(["low_albumin"], df_surv, name_for_log="low_albumin")
else:
    print("Skipping Cox for low_albumin (no variation).")
    cph_alb_u, tab_alb_u = None, None
unadj["low_alb"] = tab_alb_u

if has_variation(df_surv, "cardiovascular_disorders_bin"):
    cph_cardio_u, tab_cardio_u = safe_fit_cox_vars(
        ["cardiovascular_disorders_bin"], df_surv, name_for_log="cardio"
    )
else:
    cph_cardio_u, tab_cardio_u = None, None
unadj["cardio"] = tab_cardio_u

if has_variation(df_surv, "diabetes_type_ii_bin"):
    cph_dm_u, tab_dm_u = safe_fit_cox_vars(["diabetes_type_ii_bin"], df_surv, name_for_log="diabetes")
else:
    cph_dm_u, tab_dm_u = None, None
unadj["diabetes"] = tab_dm_u


Skipping Cox for low_hemoglobin (no variation).
Skipping Cox for elevated_creatinine (no variation).


In [102]:
# ============================================================
# 10. ADJUSTED models using ACCI core + surg_bin
# ============================================================

core_vars = [
    "age_gt65",
    "severe_adr_event",
    "stage_III",
    "stage_IV",
    "hosp_1_2",
    "hosp_ge3",
    "bmi_underweight",
    "bmi_overweight",
    "bmi_obese",
    "surg_bin",
]

for lab_var in ["low_hemoglobin", "elevated_creatinine", "low_albumin"]:
    if has_variation(df_surv, lab_var):
        core_vars.append(lab_var)
    else:
        print(f"{lab_var} not added to core (no variation).")

vars_acc_adj = core_vars + ["ACCI_score"]
cph_acc_adj, tab_acc_adj = safe_fit_cox_vars(vars_acc_adj, df_surv, name_for_log="ACCI adjusted")

vars_cci_adj = core_vars + ["CCI_score"]
cph_cci_adj, tab_cci_adj = safe_fit_cox_vars(vars_cci_adj, df_surv, name_for_log="CCI adjusted")

vars_comb_adj = core_vars + ["comb_3_5", "comb_ge6"]
cph_comb_adj, tab_comb_adj = safe_fit_cox_vars(vars_comb_adj, df_surv, name_for_log="comb burden adjusted")

core_no_age = [v for v in core_vars if v != "age_gt65"] + ["ACCI_score", "age"]
cph_age_adj, tab_age_adj = safe_fit_cox_vars(core_no_age, df_surv, name_for_log="age adjusted")

vars_chemo_adj = vars_acc_adj + ["chemo_cycles_n"]
cph_chemo_adj, tab_chemo_adj = safe_fit_cox_vars(vars_chemo_adj, df_surv, name_for_log="chemo adjusted")

if has_variation(df_surv, "cardiovascular_disorders_bin"):
    vars_cardio_adj = vars_acc_adj + ["cardiovascular_disorders_bin"]
    cph_cardio_adj, tab_cardio_adj = safe_fit_cox_vars(vars_cardio_adj, df_surv, name_for_log="cardio adjusted")
else:
    cph_cardio_adj, tab_cardio_adj = None, None

if has_variation(df_surv, "diabetes_type_ii_bin"):
    vars_dm_adj = vars_acc_adj + ["diabetes_type_ii_bin"]
    cph_dm_adj, tab_dm_adj = safe_fit_cox_vars(vars_dm_adj, df_surv, name_for_log="diabetes adjusted")
else:
    cph_dm_adj, tab_dm_adj = None, None


low_hemoglobin not added to core (no variation).
elevated_creatinine not added to core (no variation).


In [103]:
# ============================================================
# 11. Build Table 3 again (same structure as before, just with surg_bin)
# ============================================================

rows = []

rows.append(make_row("Age group", "≤65", ref=True))
rows.append(
    make_row("Age group", ">65",
             tab_unadj=tab_agegrp_u, term_unadj="age_gt65",
             tab_adj=tab_acc_adj, term_adj="age_gt65")
)

rows.append(
    make_row("Age (continuous, per year)", "Per 1-year increase",
             tab_unadj=tab_age_u, term_unadj="age",
             tab_adj=tab_age_adj, term_adj="age")
)

rows.append(
    make_row("ACCI score", "Per 1-point increase",
             tab_unadj=tab_acci_u, term_unadj="ACCI_score",
             tab_adj=tab_acc_adj, term_adj="ACCI_score")
)

rows.append(
    make_row("CCI score", "Per 1-point increase",
             tab_unadj=tab_cci_u, term_unadj="CCI_score",
             tab_adj=tab_cci_adj, term_adj="CCI_score")
)

rows.append(make_row("Comorbidity burden", "0-2 comorbidities", ref=True))
rows.append(
    make_row("Comorbidity burden", "3-5 comorbidities",
             tab_unadj=tab_comb_u, term_unadj="comb_3_5",
             tab_adj=tab_comb_adj, term_adj="comb_3_5")
)
rows.append(
    make_row("Comorbidity burden", "≥6 comorbidities",
             tab_unadj=tab_comb_u, term_unadj="comb_ge6",
             tab_adj=tab_comb_adj, term_adj="comb_ge6")
)

rows.append(make_row("Severe ADRs (Grade ≥3)", "No", ref=True))
rows.append(
    make_row("Severe ADRs (Grade ≥3)", "Yes",
             tab_unadj=tab_sevadr_u, term_unadj="severe_adr_event",
             tab_adj=tab_acc_adj, term_adj="severe_adr_event")
)

rows.append(make_row("Tumor Stage", "Stage I-II", ref=True))
rows.append(
    make_row("Tumor Stage", "Stage III",
             tab_unadj=tab_stage_u, term_unadj="stage_III",
             tab_adj=tab_acc_adj, term_adj="stage_III")
)
rows.append(
    make_row("Tumor Stage", "Stage IV",
             tab_unadj=tab_stage_u, term_unadj="stage_IV",
             tab_adj=tab_acc_adj, term_adj="stage_IV")
)

rows.append(make_row("Ordinary hospitalizations", "0", ref=True))
rows.append(
    make_row("Ordinary hospitalizations", "1-2",
             tab_unadj=tab_hosp_u, term_unadj="hosp_1_2",
             tab_adj=tab_acc_adj, term_adj="hosp_1_2")
)
rows.append(
    make_row("Ordinary hospitalizations", "≥3",
             tab_unadj=tab_hosp_u, term_unadj="hosp_ge3",
             tab_adj=tab_acc_adj, term_adj="hosp_ge3")
)

rows.append(
    make_row("Chemotherapy cycles", "Per additional cycle",
             tab_unadj=tab_chemo_u, term_unadj="chemo_cycles_n",
             tab_adj=tab_chemo_adj, term_adj="chemo_cycles_n")
)

rows.append(make_row("BMI category", "Normal weight (reference)", ref=True))
rows.append(
    make_row("BMI category", "Underweight",
             tab_unadj=tab_bmi_u, term_unadj="bmi_underweight",
             tab_adj=tab_acc_adj, term_adj="bmi_underweight")
)
rows.append(
    make_row("BMI category", "Overweight",
             tab_unadj=tab_bmi_u, term_unadj="bmi_overweight",
             tab_adj=tab_acc_adj, term_adj="bmi_overweight")
)
rows.append(
    make_row("BMI category", "Obese",
             tab_unadj=tab_bmi_u, term_unadj="bmi_obese",
             tab_adj=tab_acc_adj, term_adj="bmi_obese")
)

rows.append(make_row("Surgical intervention", "No", ref=True))
rows.append(
    make_row("Surgical intervention", "Yes",
             tab_unadj=tab_surg_u, term_unadj="surg_bin",
             tab_adj=tab_acc_adj, term_adj="surg_bin")
)

rows.append(
    make_row("Laboratory abnormalities", "Low hemoglobin",
             tab_unadj=tab_hb_u, term_unadj="low_hemoglobin",
             tab_adj=tab_acc_adj, term_adj="low_hemoglobin")
)
rows.append(
    make_row("Laboratory abnormalities", "Elevated creatinine",
             tab_unadj=tab_creat_u, term_unadj="elevated_creatinine",
             tab_adj=tab_acc_adj, term_adj="elevated_creatinine")
)
rows.append(
    make_row("Laboratory abnormalities", "Low albumin",
             tab_unadj=tab_alb_u, term_unadj="low_albumin",
             tab_adj=tab_acc_adj, term_adj="low_albumin")
)

rows.append(make_row("Cardiovascular disorders", "No", ref=True))
rows.append(
    make_row("Cardiovascular disorders", "Yes",
             tab_unadj=tab_cardio_u, term_unadj="cardiovascular_disorders_bin",
             tab_adj=tab_cardio_adj, term_adj="cardiovascular_disorders_bin")
)

rows.append(make_row("Diabetes type II", "No", ref=True))
rows.append(
    make_row("Diabetes type II", "Yes",
             tab_unadj=tab_dm_u, term_unadj="diabetes_type_ii_bin",
             tab_adj=tab_dm_adj, term_adj="diabetes_type_ii_bin")
)

table3_df = pd.DataFrame(rows)

output_path = "table3_cox_overall_survival.xlsx"
table3_df.to_excel(output_path, index=False)
print("Table 3 saved to:", output_path)
table3_df

Table 3 saved to: table3_cox_overall_survival.xlsx


  table3_df.to_excel(output_path, index=False)


Unnamed: 0,Variable,Category,Unadj_HR,Unadj_CI,Adj_HR,Adj_CI
0,Age group,≤65,1.0,,1.0,
1,Age group,>65,1.06,0.70-1.61,1.07,0.67-1.72
2,"Age (continuous, per year)",Per 1-year increase,1.0,0.98-1.02,1.0,0.98-1.03
3,ACCI score,Per 1-point increase,1.04,0.94-1.14,1.0,0.88-1.14
4,CCI score,Per 1-point increase,1.05,0.93-1.18,0.98,0.84-1.16
5,Comorbidity burden,0-2 comorbidities,1.0,,1.0,
6,Comorbidity burden,3-5 comorbidities,0.61,0.31-1.20,0.69,0.33-1.43
7,Comorbidity burden,≥6 comorbidities,0.37,0.00-29.97,0.36,0.00-28.81
8,Severe ADRs (Grade ≥3),No,1.0,,1.0,
9,Severe ADRs (Grade ≥3),Yes,2.06,0.31-13.71,1.96,0.29-13.28


In [104]:
# ============================================================
# 11. Build Table 3 again (same structure as before, just with surg_bin)
# ============================================================

rows = []

rows.append(make_row("Age group", "≤65", ref=True))
rows.append(
    make_row("Age group", ">65",
             tab_unadj=tab_agegrp_u, term_unadj="age_gt65",
             tab_adj=tab_acc_adj, term_adj="age_gt65")
)

rows.append(
    make_row("Age (continuous, per year)", "Per 1-year increase",
             tab_unadj=tab_age_u, term_unadj="age",
             tab_adj=tab_age_adj, term_adj="age")
)

rows.append(
    make_row("ACCI score", "Per 1-point increase",
             tab_unadj=tab_acci_u, term_unadj="ACCI_score",
             tab_adj=tab_acc_adj, term_adj="ACCI_score")
)

rows.append(
    make_row("CCI score", "Per 1-point increase",
             tab_unadj=tab_cci_u, term_unadj="CCI_score",
             tab_adj=tab_cci_adj, term_adj="CCI_score")
)

rows.append(make_row("Comorbidity burden", "0-2 comorbidities", ref=True))
rows.append(
    make_row("Comorbidity burden", "3-5 comorbidities",
             tab_unadj=tab_comb_u, term_unadj="comb_3_5",
             tab_adj=tab_comb_adj, term_adj="comb_3_5")
)
rows.append(
    make_row("Comorbidity burden", "≥6 comorbidities",
             tab_unadj=tab_comb_u, term_unadj="comb_ge6",
             tab_adj=tab_comb_adj, term_adj="comb_ge6")
)

rows.append(make_row("Severe ADRs (Grade ≥3)", "No", ref=True))
rows.append(
    make_row("Severe ADRs (Grade ≥3)", "Yes",
             tab_unadj=tab_sevadr_u, term_unadj="severe_adr_event",
             tab_adj=tab_acc_adj, term_adj="severe_adr_event")
)

rows.append(make_row("Tumor Stage", "Stage I-II", ref=True))
rows.append(
    make_row("Tumor Stage", "Stage III",
             tab_unadj=tab_stage_u, term_unadj="stage_III",
             tab_adj=tab_acc_adj, term_adj="stage_III")
)
rows.append(
    make_row("Tumor Stage", "Stage IV",
             tab_unadj=tab_stage_u, term_unadj="stage_IV",
             tab_adj=tab_acc_adj, term_adj="stage_IV")
)

rows.append(make_row("Ordinary hospitalizations", "0", ref=True))
rows.append(
    make_row("Ordinary hospitalizations", "1-2",
             tab_unadj=tab_hosp_u, term_unadj="hosp_1_2",
             tab_adj=tab_acc_adj, term_adj="hosp_1_2")
)
rows.append(
    make_row("Ordinary hospitalizations", "≥3",
             tab_unadj=tab_hosp_u, term_unadj="hosp_ge3",
             tab_adj=tab_acc_adj, term_adj="hosp_ge3")
)

rows.append(
    make_row("Chemotherapy cycles", "Per additional cycle",
             tab_unadj=tab_chemo_u, term_unadj="chemo_cycles_n",
             tab_adj=tab_chemo_adj, term_adj="chemo_cycles_n")
)

rows.append(make_row("BMI category", "Normal weight (reference)", ref=True))
rows.append(
    make_row("BMI category", "Underweight",
             tab_unadj=tab_bmi_u, term_unadj="bmi_underweight",
             tab_adj=tab_acc_adj, term_adj="bmi_underweight")
)
rows.append(
    make_row("BMI category", "Overweight",
             tab_unadj=tab_bmi_u, term_unadj="bmi_overweight",
             tab_adj=tab_acc_adj, term_adj="bmi_overweight")
)
rows.append(
    make_row("BMI category", "Obese",
             tab_unadj=tab_bmi_u, term_unadj="bmi_obese",
             tab_adj=tab_acc_adj, term_adj="bmi_obese")
)

rows.append(make_row("Surgical intervention", "No", ref=True))
rows.append(
    make_row("Surgical intervention", "Yes",
             tab_unadj=tab_surg_u, term_unadj="surg_bin",
             tab_adj=tab_acc_adj, term_adj="surg_bin")
)

rows.append(
    make_row("Laboratory abnormalities", "Low hemoglobin",
             tab_unadj=tab_hb_u, term_unadj="low_hemoglobin",
             tab_adj=tab_acc_adj, term_adj="low_hemoglobin")
)
rows.append(
    make_row("Laboratory abnormalities", "Elevated creatinine",
             tab_unadj=tab_creat_u, term_unadj="elevated_creatinine",
             tab_adj=tab_acc_adj, term_adj="elevated_creatinine")
)
rows.append(
    make_row("Laboratory abnormalities", "Low albumin",
             tab_unadj=tab_alb_u, term_unadj="low_albumin",
             tab_adj=tab_acc_adj, term_adj="low_albumin")
)

rows.append(make_row("Cardiovascular disorders", "No", ref=True))
rows.append(
    make_row("Cardiovascular disorders", "Yes",
             tab_unadj=tab_cardio_u, term_unadj="cardiovascular_disorders_bin",
             tab_adj=tab_cardio_adj, term_adj="cardiovascular_disorders_bin")
)

rows.append(make_row("Diabetes type II", "No", ref=True))
rows.append(
    make_row("Diabetes type II", "Yes",
             tab_unadj=tab_dm_u, term_unadj="diabetes_type_ii_bin",
             tab_adj=tab_dm_adj, term_adj="diabetes_type_ii_bin")
)

table3_df = pd.DataFrame(rows)

output_path = "table3_cox_overall_survival.xlsx"
table3_df.to_excel(output_path, index=False)
print("Table 3 saved to:", output_path)
table3_df


Table 3 saved to: table3_cox_overall_survival.xlsx


  table3_df.to_excel(output_path, index=False)


Unnamed: 0,Variable,Category,Unadj_HR,Unadj_CI,Adj_HR,Adj_CI
0,Age group,≤65,1.0,,1.0,
1,Age group,>65,1.06,0.70-1.61,1.07,0.67-1.72
2,"Age (continuous, per year)",Per 1-year increase,1.0,0.98-1.02,1.0,0.98-1.03
3,ACCI score,Per 1-point increase,1.04,0.94-1.14,1.0,0.88-1.14
4,CCI score,Per 1-point increase,1.05,0.93-1.18,0.98,0.84-1.16
5,Comorbidity burden,0-2 comorbidities,1.0,,1.0,
6,Comorbidity burden,3-5 comorbidities,0.61,0.31-1.20,0.69,0.33-1.43
7,Comorbidity burden,≥6 comorbidities,0.37,0.00-29.97,0.36,0.00-28.81
8,Severe ADRs (Grade ≥3),No,1.0,,1.0,
9,Severe ADRs (Grade ≥3),Yes,2.06,0.31-13.71,1.96,0.29-13.28
