In [1]:
import pandas as pd
import numpy as np

# -----------------------------------------------------------
# 1. LOAD DATA
# -----------------------------------------------------------

# Source workbook holding the predictor table; read into the working frame `df`.
input_path = "C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/new_data/codige_with_cci_oncology_farmaco_predictors.xlsx"
df = pd.read_excel(input_path)

# Report (rows, columns) as loaded, before any cleaning step runs.
print("Initial shape:", df.shape)

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


Initial shape: (412, 137)


In [2]:
# -----------------------------------------------------------
# 2. STANDARDIZE patient_id
# -----------------------------------------------------------

# Cast IDs to str, trim leading/trailing whitespace, and collapse every run of
# internal whitespace to a single space so that visually identical IDs compare equal.
# The pattern is a raw string: "\s" in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
# NOTE(review): astype(str) turns missing IDs into the literal text "nan" —
# confirm that is acceptable for the duplicate check that follows.
df['patient_id'] = (
    df['patient_id']
    .astype(str)
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
)

In [3]:
# -----------------------------------------------------------
# 3. IDENTIFY DUPLICATE patient IDs
# -----------------------------------------------------------

# keep=False flags every row whose patient_id occurs more than once (not just
# the repeats), so each group is exported in full, ordered by ID.
dup_ids = (
    df.loc[df.duplicated(subset="patient_id", keep=False)]
    .sort_values(by="patient_id")
)

dup_ids.to_excel(
    "C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/new_data/codige_master_duplicate_patient_ids.xlsx",
    index=False,
)

print("Duplicate patient IDs exported.")

  dup_ids.to_excel("C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/new_data/codige_master_duplicate_patient_ids.xlsx", index=False)


Duplicate patient IDs exported.


In [4]:
# -----------------------------------------------------------
# 4. DROP EXCESSIVE-MISSING COLUMNS
# (threshold can be adjusted — see MISSING_THRESHOLD below)
# -----------------------------------------------------------

# Single source of truth for the cut-off: a column is dropped when its fraction
# of missing values is strictly greater than this. The log message below is
# derived from the same constant so the two cannot drift apart.
MISSING_THRESHOLD = 0.45

# Fraction of missing cells per column, highest first; saved as a report
# before anything is dropped.
missing_pct = df.isna().mean().sort_values(ascending=False)
missing_pct.to_excel("C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/new_data/codige_master_missingness_report.xlsx")

cols_to_drop_missing = missing_pct[missing_pct > MISSING_THRESHOLD].index.tolist()
print(f"Dropping columns with >{MISSING_THRESHOLD:.0%} missing:", len(cols_to_drop_missing))

df = df.drop(columns=cols_to_drop_missing)

Dropping columns with >45% missing: 42


  missing_pct.to_excel("C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/new_data/codige_master_missingness_report.xlsx")


In [5]:
# -----------------------------------------------------------
# 5. DROP OUTCOME-LEAKAGE COLUMNS
# -----------------------------------------------------------

# Column names treated as outcome-related; any that are present are removed.
leakage_candidates = [
    "death_date",
    "death_outcome",
    "death_event_flag",
    "survival_days",
    "event_flag",
    "hospitalization_flag",
    "hospitalizations_count",
    "hospitalization_event_flag",
    "first_severe_adr_date",
    "severe_adr_flag",
]

# Keep only the candidates that still exist in the frame (earlier steps may
# already have removed some), preserving the order of the list above.
outcome_cols = [name for name in leakage_candidates if name in df.columns]

print("Removing outcome-related columns:", outcome_cols)
df = df.drop(columns=outcome_cols)

Removing outcome-related columns: ['death_outcome', 'survival_days']


In [6]:
# -----------------------------------------------------------
# 6. DROP PERFECT DUPLICATE COLUMNS
# -----------------------------------------------------------

def find_duplicate_columns(dataframe):
    """Map each column to the later columns whose content is identical to it.

    Columns are compared pairwise with ``Series.equals``. A column that has
    already been recorded as a duplicate of an earlier one is neither used as a
    base nor recorded again, so every redundant column appears exactly once in
    the result even when three or more columns share the same content.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are compared.

    Returns
    -------
    dict
        ``{base_column: [duplicate_column, ...]}`` where each base is the first
        column (in frame order) with that content. Empty when no two columns
        are identical.
    """
    duplicate_mapping = {}
    already_matched = set()  # columns already listed as a duplicate of some base
    cols = list(dataframe.columns)

    for position, base in enumerate(cols):
        if base in already_matched:
            # Its duplicates were all collected under the earlier base column.
            continue
        for other in cols[position + 1:]:
            if other in already_matched:
                continue
            if dataframe[base].equals(dataframe[other]):
                duplicate_mapping.setdefault(base, []).append(other)
                already_matched.add(other)

    return duplicate_mapping

duplicate_cols_map = find_duplicate_columns(df)

# Flatten the {base: [duplicates...]} mapping into one list of columns to
# remove; the base column of each group is kept.
duplicate_cols_to_drop = [
    dup_name
    for dup_group in duplicate_cols_map.values()
    for dup_name in dup_group
]

print("Dropping duplicate-content columns:", duplicate_cols_to_drop)

df = df.drop(columns=duplicate_cols_to_drop)

Dropping duplicate-content columns: ['n_treatment_lines']


In [7]:
# -----------------------------------------------------------
# 7. DROP ROW-LEVEL DUPLICATES JUST IN CASE
# -----------------------------------------------------------

# Row counts are taken on either side of the de-duplication so the number of
# removed rows can be reported.
before = len(df)
df = df.drop_duplicates()
after = len(df)

rows_removed = before - after
print(f"Removed {rows_removed} fully duplicated rows.")

Removed 2 fully duplicated rows.


In [8]:
# -----------------------------------------------------------
# 8. EXPORT CLEANED MASTER DATASET
# -----------------------------------------------------------

# Destination workbook for the cleaned frame; the index is not written.
clean_path = "C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/new_data/codige_master_clean_predictors_only.xlsx"
df.to_excel(excel_writer=clean_path, index=False)

print("Clean master dataset exported:", clean_path)

  df.to_excel(clean_path, index=False)


Clean master dataset exported: C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/new_data/codige_master_clean_predictors_only.xlsx


In [9]:
import pandas as pd

# Load master predictors dataset
# NOTE(review): this re-reads the raw workbook, so `df` no longer carries the
# cleaning applied in steps 2-7 and `patient_id` is matched exactly as stored
# in the file (no whitespace standardization) — confirm this is intended.
df = pd.read_excel("C:/Users/HP/OneDrive/Desktop/VERO_code/Phase_1/new_data/codige_with_cci_oncology_farmaco_predictors.xlsx")

# Patient IDs where we have too many rows
dup_ids = [
    "16_AORN San Giuseppe Moscati",
    "25_AOU San Giovanni di Dio Ruggi di Aragona",
    "2_AORN San Giuseppe Moscati",
]

# Maximum number of rows to keep for each patient_id listed above
MAX_ROWS_PER_ID = 2

# For each problematic patient_id, keep only the first MAX_ROWS_PER_ID rows
# (in file order) and drop the rest
for pid in dup_ids:
    idx = df.index[df["patient_id"] == pid]

    if len(idx) == 0:
        # An exact-match miss would otherwise pass silently (e.g. stray
        # whitespace in the raw ID), leaving the extra rows in place.
        print(f"Warning: patient_id not found, nothing trimmed: {pid!r}")
    elif len(idx) > MAX_ROWS_PER_ID:
        df = df.drop(idx[MAX_ROWS_PER_ID:])

# Reset index after row drops
df = df.reset_index(drop=True)

# Check final shape
print("Final master shape:", df.shape)

# Save cleaned version (relative path: written to the kernel's working directory)
df.to_excel("codige_master_clean__v2.xlsx", index=False)


Final master shape: (406, 137)


  df.to_excel("codige_master_clean__v2.xlsx", index=False)
