In [10]:
import pandas as pd

# Load the cleaned dataset and action plan
data_path = r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\processed\merged_codige_wide_english_values_translated.xlsx"
action_plan_path = r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\outputs\recommendations\variable_action_plan.xlsx"

# Load the cleaned data
df = pd.read_excel(data_path)

# Load the variable action plan
action_plan_df = pd.read_excel(action_plan_path, sheet_name="action_plan")


In [11]:
# Filter high-priority variables from the action plan
high_priority_vars = action_plan_df[action_plan_df['priority'] == 'high']['variable'].tolist()

# Extract high-priority variables from the dataframe
high_priority_df = df[high_priority_vars]

# Display high-priority variables
print(f"High-priority variables: {high_priority_vars}")


High-priority variables: ['breast_cancer_subtype', 'radiotherapy_end_date', 'prior_radiotherapy', 'radiotherapy_start_date', 'colon_cancer_location', 'hospital_discharge_date', 'ejection_fraction_percent', 'ejection_fraction_category', 'hospital_admission_date', 'gamma_gt_range', 'albumin_range', 'azotemia_range', 'adr_ctcae_grade', 'tumor_stage_roman', 'blood_glucose_range', 'direct_bilirubin_range', 'total_bilirubin_range', 'creatinine_range', 'ast_got_range', 'alt_gpt_range', 'white_blood_cells_range', 'neutrophils_percent_range', 'platelet_count_range', 'dpyd_genotype_type', 'red_blood_cells_range', 'hemoglobin_range', 'ethnicity', 'bmi_value', 'bmi_category', 'dpyd_genotype_known', 'chemo_cycles_n', 'age', 'age_group', 'gender', 'observation_start_date', 'observation_end_date', 'tumor_type', 'oncology_treatment_lines_n', 'transfusion_received', 'transfusions_total_n', 'hypertension', 'aortic_insufficiency', 'dyslipidemia', 'bph', 'obesity_comorbidity', 'ischemic_heart_disease', 'a

In [12]:
# Check missingness for high-priority variables
missing_high_priority = high_priority_df.isna().sum()
missing_high_priority_pct = (missing_high_priority / len(df)) * 100

# Display missingness summary
missing_high_priority_summary = pd.DataFrame({
    'missing_count': missing_high_priority,
    'missing_percentage': missing_high_priority_pct
}).sort_values(by='missing_percentage', ascending=False)

print(missing_high_priority_summary)


                         missing_count  missing_percentage
breast_cancer_subtype              402           99.751861
radiotherapy_end_date              370           91.811414
prior_radiotherapy                 368           91.315136
radiotherapy_start_date            368           91.315136
death_date                         306           75.930521
...                                ...                 ...
observation_end_date                 0            0.000000
observation_start_date               0            0.000000
gender                               0            0.000000
age                                  0            0.000000
active_principles_n                  0            0.000000

[64 rows x 2 columns]


In [13]:
# Define a function to apply imputation based on column type and action plan
def apply_imputation(df, action_plan_df):
    # Iterate through all high-priority columns
    for col in high_priority_vars:
        # Get the recommended action for the column from the action plan
        action = action_plan_df[action_plan_df['variable'] == col]['recommended_action'].values[0]

        # Check if the recommended action is 'Impute' and handle imputation based on column type
        if action == 'Impute':
            # For categorical variables, impute with the mode (most frequent value)
            if df[col].dtype == 'object':
                df[col] = df[col].fillna(df[col].mode()[0])
                print(f"Imputed {col} with mode (categorical variable)")

            # For numeric variables, impute with the median value
            elif df[col].dtype in ['int64', 'float64']:
                df[col] = df[col].fillna(df[col].median())
                print(f"Imputed {col} with median (numeric variable)")

            # You can add any additional rules for other variable types as needed

    return df

# Apply the imputation function
df = apply_imputation(df, action_plan_df)

In [14]:
# Handle columns with high missingness (drop or special handling)
high_missing_columns = missing_high_priority_pct[missing_high_priority_pct > 40].index.tolist()

# Ensure that 'death_date' is NOT dropped, even if it has > 40% missing values
if 'death_date' in high_missing_columns:
    high_missing_columns.remove('death_date', )
    # Impute missing 'death_date' values with a placeholder date (12/31/2099)
    df['death_date'] = df['death_date'].fillna(pd.to_datetime('2099-12-31'))

# Drop other columns with > 40% missing data
df.drop(columns=high_missing_columns, inplace=True)

print(f"Dropped columns with > 40% missing data, except 'death_date': {high_missing_columns}")


Dropped columns with > 40% missing data, except 'death_date': ['breast_cancer_subtype', 'radiotherapy_end_date', 'prior_radiotherapy', 'radiotherapy_start_date', 'colon_cancer_location', 'hospital_discharge_date', 'ejection_fraction_percent', 'ejection_fraction_category', 'hospital_admission_date', 'gamma_gt_range', 'albumin_range', 'azotemia_range']


In [15]:
# Define the output path where the cleaned data will be saved
output_path = r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\processed\cleaned_data.xlsx"

# Export the cleaned DataFrame to an Excel file
df.to_excel(output_path, index=False)

print(f"✅ Cleaned dataset has been saved successfully at: {output_path}")


  df.to_excel(output_path, index=False)


✅ Cleaned dataset has been saved successfully at: C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\processed\cleaned_data.xlsx


In [16]:
import pandas as pd

# Load the action plan that contains priority levels and recommended actions
action_plan_path = r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\outputs\recommendations\variable_action_plan.xlsx"
action_plan_df = pd.read_excel(action_plan_path, sheet_name="action_plan")

# Create a missingness summary for the cleaned data (df)
missing_summary = pd.DataFrame({
    "variable": df.columns,
    "missing_count": df.isna().sum(),
    "missing_pct": (df.isna().mean() * 100).round(2),
    "type": [df[col].dtype for col in df.columns]
})

# Merge the missingness summary with the action plan to get 'role', 'priority', and 'recommended_action'
missing_summary = missing_summary.merge(action_plan_df[['variable', 'role', 'priority', 'recommended_action']], 
                                         how='left', on='variable')

# Export the summary as a new Excel file
output_path = r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\raw\missingness_summary.xlsx"
missing_summary.to_excel(output_path, index=False)

print(f"✅ Missingness summary has been saved successfully at: {output_path}")


✅ Missingness summary has been saved successfully at: C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\raw\missingness_summary.xlsx


  missing_summary.to_excel(output_path, index=False)


In [17]:
# Calculate missing percentages for each column
missing_pct = df.isna().mean() * 100

# Identify columns with >= 40% missing data
columns_to_drop = missing_pct[missing_pct >= 40].index.tolist()

# Drop the identified columns
df.drop(columns=columns_to_drop, inplace=True)

# Print the dropped columns for reference
print(f"Dropped columns with >= 40% missing data: {columns_to_drop}")


Dropped columns with >= 40% missing data: ['smoking_years', 'tumor_stage_tnm', 'histological_grade', 'surgery_date', 'surgery_type', 'surgery_type_specify', 'prior_surgery', 'surgery_complications', 'reoperation_for_complication', 'hospitalization_for_surgery_complication', 'treatment_line_n', 'chemo_schema_end_reason', 'toxicity_type', 'ricovero_n', 'admission_diagnosis', 'discharge_diagnosis', 'admission_mode', 'er_stay_duration', 'tipo_left', 'hospitalization_cause', 'oncology_schema_modified', 'comorbilita_cat', 'altro', 'adr_onset_date', 'hospitalization_type', 'adr_chemo_correlation', 'adr_description_clean']


In [18]:
import pandas as pd
import numpy as np

# Load the action plan that contains priority levels and recommended actions
action_plan_path = r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\outputs\recommendations\variable_action_plan.xlsx"
action_plan_df = pd.read_excel(action_plan_path, sheet_name="action_plan")

# Loop through each column in the cleaned DataFrame
for col in df.columns:
    # Check if there are missing values in the column
    if df[col].isna().sum() > 0:
        
        # Get the recommended action for the column from the action plan
        action = action_plan_df[action_plan_df['variable'] == col]['recommended_action'].values[0]
        
        # Impute based on the recommended action
        if action == 'Impute':
            # Check if the column is categorical
            if df[col].dtype == 'object':
                # Impute missing values with the mode (most frequent value) for categorical columns
                df[col] = df[col].fillna(df[col].mode()[0])
                print(f"Imputed {col} using mode (for categorical variable)")

            # For numeric columns, impute with the median
            elif df[col].dtype in ['int64', 'float64']:
                df[col] = df[col].fillna(df[col].median())
                print(f"Imputed {col} using median (for numeric variable)")

            # You can add any additional rules for other variable types as needed

# Check the result of the imputation
missing_after_imputation = df.isna().sum()
print(f"Missing values after imputation: {missing_after_imputation[missing_after_imputation > 0]}")


Missing values after imputation: ethnicity                      5
education_level                5
bmi_value                      5
bmi_category                   5
employment_status              5
alcohol_consumption            5
smoking_status_binary        134
smoking_status_detail          5
tumor_diagnosis_date          25
oncology_unit_start_date     148
tumor_stage_roman            135
dpyd_genotype_known            4
dpyd_genotype_type            61
blood_glucose_range          126
white_blood_cells_range       66
red_blood_cells_range         61
hemoglobin_range              61
neutrophils_percent_range     66
platelet_count_range          65
creatinine_range              78
ast_got_range                 68
alt_gpt_range                 68
total_bilirubin_range         90
direct_bilirubin_range       111
chemo_schema_name              1
chemo_schema_end_date          2
chemo_cycles_n                 2
comorbidita                  156
data                         156
comorbidit

In [20]:
import pandas as pd
import numpy as np
from fancyimpute import IterativeImputer, KNN

# Load the action plan
action_plan_path = r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\outputs\recommendations\variable_action_plan.xlsx"

action_plan_df = pd.read_excel(action_plan_path, sheet_name="action_plan")

# Function to apply Bayesian or KNN imputation based on action plan
def apply_advanced_imputation(df, action_plan_df):
    for col in df.columns:
        if df[col].isna().sum() > 0:  # Check if the column has missing values
            # Get the recommended action for the column from the action plan
            action = action_plan_df[action_plan_df['variable'] == col]['recommended_action'].values[0]
            
            if action == 'Impute':
                # Check if Bayesian imputation is recommended
                bayesian_action = action_plan_df[action_plan_df['variable'] == col]['imputation_method'].values[0]
                
                if bayesian_action == 'Bayesian' and df[col].dtype in ['int64', 'float64']:
                    # Apply Bayesian imputation using fancyimpute's IterativeImputer (which uses Bayesian Ridge)
                    bayesian_imputer = IterativeImputer(estimator='bayesian_ridge', random_state=42)
                    df[col] = bayesian_imputer.fit_transform(df[[col]])  # Apply Bayesian Ridge
                    print(f"Imputed {col} using Bayesian Ridge (numeric variable)")
                
                # Check for KNN imputation if recommended or if Bayesian is not an option
                elif bayesian_action != 'Bayesian' and df[col].dtype in ['int64', 'float64']:
                    knn_imputer = KNN(k=5)  # Set number of neighbors for KNN imputation
                    df[col] = knn_imputer.fit_transform(df[[col]])  # Apply KNN imputation
                    print(f"Imputed {col} using KNN (numeric variable)")
                
                # For categorical variables, if KNN or Bayesian is recommended
                elif df[col].dtype == 'object':
                    knn_imputer = KNN(k=5)
                    df[col] = knn_imputer.fit_transform(df[[col]]).astype(str)  # KNN for categorical
                    print(f"Imputed {col} using KNN (categorical variable)")

    return df

# Apply advanced imputation
df = apply_advanced_imputation(df, action_plan_df)

# Check the result of the imputation
missing_after_imputation = df.isna().sum()
print(f"Missing values after advanced imputation: {missing_after_imputation[missing_after_imputation > 0]}")

# Define output path for the cleaned dataset
output_path = r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\interim\advanced_imputed_data.xlsx"
df.to_excel(output_path, index=False)

print(f"✅ Advanced imputation complete and saved to: {output_path}")


Missing values after advanced imputation: ethnicity                      5
education_level                5
bmi_value                      5
bmi_category                   5
employment_status              5
alcohol_consumption            5
smoking_status_binary        134
smoking_status_detail          5
tumor_diagnosis_date          25
oncology_unit_start_date     148
tumor_stage_roman            135
dpyd_genotype_known            4
dpyd_genotype_type            61
blood_glucose_range          126
white_blood_cells_range       66
red_blood_cells_range         61
hemoglobin_range              61
neutrophils_percent_range     66
platelet_count_range          65
creatinine_range              78
ast_got_range                 68
alt_gpt_range                 68
total_bilirubin_range         90
direct_bilirubin_range       111
chemo_schema_name              1
chemo_schema_end_date          2
chemo_cycles_n                 2
comorbidita                  156
data                         156
c

  df.to_excel(output_path, index=False)


✅ Advanced imputation complete and saved to: C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\interim\advanced_imputed_data.xlsx


In [37]:
import pandas as pd

# Define a placeholder value for missing critical variables (e.g., "Unknown", "Not Available")
critical_columns = [
    'smoking_status_binary', 'tumor_diagnosis_date', 'oncology_unit_start_date', 'white_blood_cells_range', 
    'red_blood_cells_range', 'hemoglobin_range', 'neutrophils_percent_range', 'platelet_count_range', 'creatinine_range', 
    'ast_got_range', 'alt_gpt_range', 'total_bilirubin_range', 'direct_bilirubin_range', 'adr_left', 'blood_glucose_range', 
    'linea_trattamento_oncologico', 'comorbidity_category_list', 'altre_pat_n', 'adr_clean.1'
]

# Impute critical columns with placeholders if they have missing values
for col in critical_columns:
    if df[col].isna().sum() > 0:
        if df[col].dtype == 'object':
            # Impute with a placeholder for categorical columns
            df[col] = df[col].fillna("Unknown")
            print(f"Imputed {col} with placeholder 'Unknown'")
        elif df[col].dtype in ['int64', 'float64']:
            # For numeric columns, we can impute with a reasonable placeholder like -1 or the median
            df[col] = df[col].fillna(-1)  # Placeholder for numeric values
            print(f"Imputed {col} with placeholder -1 for numeric values")

# Check the result of imputation
missing_after_imputation = df.isna().sum()
print(f"Missing values after manual imputation: {missing_after_imputation[missing_after_imputation > 0]}")

# Re-checking the data
# Export the cleaned DataFrame with imputed values to Excel
output_path = r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\interim\manual_imputed_data.xlsx"
df.to_excel(output_path, index=False)

print(f"✅ Data after manual imputation has been saved to: {output_path}")


Missing values after manual imputation: ethnicity                  5
education_level            5
bmi_value                  5
bmi_category               5
employment_status          5
alcohol_consumption        5
smoking_status_detail      5
tumor_stage_roman        135
dpyd_genotype_known        4
dpyd_genotype_type        61
chemo_schema_name          1
chemo_schema_end_date      2
chemo_cycles_n             2
comorbidita              156
data                     156
adr_description          144
adr_ctcae_grade          145
adr_outcome              160
adr_chemo_action         158
adr_source_project       144
adr_macro_category       144
dtype: int64


  df.to_excel(output_path, index=False)


✅ Data after manual imputation has been saved to: C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\interim\manual_imputed_data.xlsx


In [38]:
import pandas as pd
import numpy as np

# Define a dictionary for domain-specific imputation
domain_imputation = {
    'tumor_type': 'Unknown',  # Tumor type missing, use placeholder 'Unknown'
    'adr_adr': 'No ADR',  # Missing ADR, impute with 'No ADR'
    'adr_onset_date': '01/01/2020',  # Missing ADR onset date, use placeholder date
    'adr_ctcae_grade': 0,  # Missing ADR grade, impute with 0 or a default value
    'adr_outcome': 'Not Available',  # Missing ADR outcome, impute with 'Not Available'
    'adr_chemo_correlation': 'Unknown',  # Missing ADR chemo correlation, impute with 'Unknown'
    'adr_chemo_action': 'None',  # Missing ADR chemo action, impute with 'None'
    'adr_source_project': 'Not Available',  # Missing ADR source, impute with 'Not Available'
    'adr_description_clean': 'No ADR',  # Missing ADR description, impute with 'No ADR'
    'adr_macro_category': 'No ADR',  # Missing ADR macro category, impute with 'No ADR'
    'comorbidita': 'No Comorbidity',  # Missing comorbidity, impute with 'No Comorbidity'
    'comorbilita_cat': 'None',  # Missing comorbidity category, impute with 'None'
    'altro': 'No Additional Info',  # Missing 'altro' info, impute with 'No Additional Info'
    'data': 'No Data',  # Missing 'data', impute with 'No Data'
    'treatment_line_n': 0,  # Missing treatment line count, impute with 0
    'chemo_schema_name': 'Not Available',  # Missing chemotherapy schema name, impute with 'Not Available'
    'chemo_schema_start_date': '01/01/2020',  # Missing chemotherapy start date, impute with placeholder
    'chemo_schema_end_date': '01/01/2020',  # Missing chemotherapy end date, impute with placeholder
    'chemo_cycles_n': 0,  # Missing chemotherapy cycles count, impute with 0
    'active_principle': 'None',  # Missing active principle, impute with 'None'
    'dose_reduced': 'No',  # Missing dose reduction info, impute with 'No'
    'active_principles_n': 0  # Missing number of active principles, impute with 0
}

# Loop through the domain-imputation dictionary to apply imputation
for col, value in domain_imputation.items():
    if col in df.columns:
        df[col] = df[col].fillna(value)
        print(f"Imputed {col} with {value}")

# Check the result of the imputation
missing_after_imputation = df.isna().sum()
print(f"Missing values after contextual imputation: {missing_after_imputation[missing_after_imputation > 0]}")

# Export the cleaned DataFrame with imputed values to Excel
output_path = r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\interim\contextual_imputed_data.xlsx"
df.to_excel(output_path, index=False)

print(f"✅ Data after contextual imputation has been saved to: {output_path}")


Imputed tumor_type with Unknown
Imputed adr_ctcae_grade with 0
Imputed adr_outcome with Not Available
Imputed adr_chemo_action with None
Imputed adr_source_project with Not Available
Imputed adr_macro_category with No ADR
Imputed comorbidita with No Comorbidity
Imputed data with No Data
Imputed chemo_schema_name with Not Available
Imputed chemo_schema_start_date with 01/01/2020
Imputed chemo_schema_end_date with 01/01/2020
Imputed chemo_cycles_n with 0
Imputed active_principle with None
Imputed dose_reduced with No
Imputed active_principles_n with 0
Missing values after contextual imputation: ethnicity                  5
education_level            5
bmi_value                  5
bmi_category               5
employment_status          5
alcohol_consumption        5
smoking_status_detail      5
tumor_stage_roman        135
dpyd_genotype_known        4
dpyd_genotype_type        61
adr_description          144
dtype: int64


  df.to_excel(output_path, index=False)


✅ Data after contextual imputation has been saved to: C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\interim\contextual_imputed_data.xlsx


In [39]:
import pandas as pd
import numpy as np

# Define a dictionary for further imputation for specific columns
additional_imputation = {
    'observation_end_reason': 'Not Available',  # Missing observation end reason, impute with 'Not Available'
    'tumor_stage_roman': 'Not Available',  # Missing tumor stage roman, impute with 'Not Available'
    'dpyd_genotype_type': 'Not Tested',  # Missing dpyd genotype type, impute with 'Not Tested'
}

# Loop through the additional imputation dictionary and apply imputation
for col, value in additional_imputation.items():
    if col in df.columns:
        df[col] = df[col].fillna(value)
        print(f"Imputed {col} with {value}")

# Check the result of the imputation
missing_after_imputation = df.isna().sum()
print(f"Missing values after additional contextual imputation: {missing_after_imputation[missing_after_imputation > 0]}")

# Export the cleaned DataFrame with imputed values to Excel
output_path = r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\interim\additional_contextual_imputed_data.xlsx"
df.to_excel(output_path, index=False)

print(f"✅ Data after additional contextual imputation has been saved to: {output_path}")


Imputed observation_end_reason with Not Available
Imputed tumor_stage_roman with Not Available
Imputed dpyd_genotype_type with Not Tested
Missing values after additional contextual imputation: ethnicity                  5
education_level            5
bmi_value                  5
bmi_category               5
employment_status          5
alcohol_consumption        5
smoking_status_detail      5
dpyd_genotype_known        4
adr_description          144
dtype: int64


  df.to_excel(output_path, index=False)


✅ Data after additional contextual imputation has been saved to: C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\interim\additional_contextual_imputed_data.xlsx


In [40]:
import pandas as pd
import numpy as np

# Create a summary of the variables and imputation methods
imputation_summary = []

# For each column, summarize the imputation done
for col in df.columns:
    # Count missing values before imputation
    missing_count = df[col].isna().sum()
    
    if missing_count > 0:  # Only consider columns with missing values
        # Check the type of imputation that was done
        if col in additional_imputation:  # Specific manual imputations
            imputation_type = "Placeholder"
            imputation_value = additional_imputation[col]
        elif df[col].dtype == 'object':  # Categorical variables imputed with mode
            imputation_type = "Mode"
            imputation_value = df[col].mode()[0]
        elif df[col].dtype in ['int64', 'float64']:  # Numeric variables imputed with median
            imputation_type = "Median"
            imputation_value = df[col].median()
        else:
            imputation_type = "Unknown"
            imputation_value = "N/A"
        
        # Append to the summary list
        imputation_summary.append({
            'Variable': col,
            'Missing Count': missing_count,
            'Imputation Type': imputation_type,
            'Imputation Value': imputation_value
        })

# Convert to DataFrame
imputation_summary_df = pd.DataFrame(imputation_summary)

# Save to Excel
imputation_summary_file = r"C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\raw\imputation_summary.xlsx"
with pd.ExcelWriter(imputation_summary_file, engine="xlsxwriter") as writer:
    imputation_summary_df.to_excel(writer, sheet_name="Imputation Summary", index=False)

print(f"✅ Imputation summary has been saved to: {imputation_summary_file}")


✅ Imputation summary has been saved to: C:\Users\HP\OneDrive\Desktop\VERO_code\Phase_1\data\raw\imputation_summary.xlsx
