In [1]:
import pandas as pd
import numpy as np

In [2]:
DATA_DIR = 'equity-post-HCT-survival-predictions-1'


#train_data = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/train.csv')
#test_data  = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/test.csv')

train_data = pd.read_csv(DATA_DIR +'/train.csv')
test_data  = pd.read_csv(DATA_DIR +'/test.csv')

df = train_data

### Impute Missing Data

In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import IterativeImputer, KNNImputer

from sklearn.experimental import enable_iterative_imputer


# Part 1: Data Imputation
def process_missing_data_enhanced(df):
    """
    Enhanced missing data handling with survival-aware features
    """
    df_processed = df.copy()

    # 1. Create missingness indicators for HLA variables first
    hla_vars = [col for col in df_processed.columns if 'hla' in col.lower()]
    for col in hla_vars:
        df_processed[f'{col}_missing'] = df_processed[col].isna().astype(int)

    # Calculate HLA testing completeness score
    df_processed['hla_testing_completeness'] = df_processed[hla_vars].notna().sum(axis=1) / len(hla_vars)

    # 2. Modified Unknown Category handling
    unknown_vars = [
        'tce_match', 'mrd_hct', 'cyto_score_detail', 'tce_div_match',
        'tce_imm_match', 'cyto_score', 'cmv_status', 'dri_score',
        'conditioning_intensity', 'rituximab', 'in_vivo_tcd',
        'gvhd_proph', 'sex_match', 'donor_related', 'vent_hist'
    ]

    for var in unknown_vars:
        if var in df_processed.columns:
            df_processed[var] = df_processed[var].fillna('Unknown')
            if var in ['dri_score', 'conditioning_intensity', 'mrd_hct']:
                df_processed[f'{var}_time_effect'] = df_processed[var].map(
                    lambda x: 1 if x == 'Unknown' else 2
                ) * np.exp(-df_processed['efs_time']/12)

    # 3. Enhanced MICE for HLA variables
    if hla_vars:
        mice_imputer = IterativeImputer(max_iter=10, random_state=42)
        hla_dtypes = {col: df_processed[col].dtype for col in hla_vars}

        hla_data = df_processed[hla_vars + ['efs_time']].copy()

        # Handle categorical variables
        le_dict = {}
        for col in hla_vars:
            if hla_data[col].dtype == 'object':
                le = LabelEncoder()
                non_null_mask = hla_data[col].notna()
                hla_data.loc[non_null_mask, col] = le.fit_transform(
                    hla_data.loc[non_null_mask, col]
                )
                le_dict[col] = le

        hla_imputed = mice_imputer.fit_transform(hla_data)

        for idx, col in enumerate(hla_vars):
            if col in le_dict:
                df_processed[col] = le_dict[col].inverse_transform(
                    hla_imputed[:, idx].round().astype(int)
                )
            else:
                df_processed[col] = hla_imputed[:, idx]

    # 4. Enhanced KNN for clinical variables with disease stratification
    clinical_vars = [
        'cardiac', 'arrhythmia', 'pulm_severe', 'pulm_moderate',
        'hepatic_mild', 'hepatic_severe', 'renal_issue', 'diabetes',
        'psych_disturb', 'peptic_ulcer', 'rheum_issue', 'obesity',
        'prior_tumor', 'melphalan_dose'
    ]

    for disease in df_processed['prim_disease_hct'].unique():
        mask = df_processed['prim_disease_hct'] == disease
        if mask.sum() > 0:
            disease_data = df_processed.loc[mask, clinical_vars].copy()

            for col in clinical_vars:
                if disease_data[col].dtype == 'object':
                    le = LabelEncoder()
                    non_null_mask = disease_data[col].notna()
                    disease_data.loc[non_null_mask, col] = le.fit_transform(
                        disease_data.loc[non_null_mask, col]
                    )

            knn_imputer = KNNImputer(n_neighbors=min(5, mask.sum()-1))
            imputed_values = knn_imputer.fit_transform(disease_data)

            df_processed.loc[mask, clinical_vars] = imputed_values

    # 5. Median/Mode for remaining variables
    remaining_vars = [
        'karnofsky_score', 'donor_age', 'ethnicity', 'comorbidity_score'
    ]

    for var in remaining_vars:
        if var in df_processed.columns:
            if df_processed[var].dtype in ['int64', 'float64']:
                df_processed[var] = df_processed[var].fillna(df_processed[var].median())
            else:
                df_processed[var] = df_processed[var].fillna(df_processed[var].mode()[0])

    return df_processed



### Create Enhanced Feature Set

In [7]:
def create_enhanced_features(df):
    """
    Create enhanced feature set including time-aware features
    """
    df_features = df.copy()

    # 1. Treatment-Patient Interactions
    # Convert categorical to string before concatenation
    karnofsky_bins = [0, 60, 75, 85, 100]
    karnofsky_cut = pd.cut(df_features['karnofsky_score'],
                           bins=karnofsky_bins,
                           labels=['very_low', 'low', 'high', 'very_high'],
                           include_lowest=True)
    df_features['conditioning_karnofsky'] = (
            df_features['conditioning_intensity'].astype(str) + '_' +
            karnofsky_cut.astype(str)
    )

    comorbidity_bins = [0, 2, 4, float('inf')]
    comorbidity_cut = pd.cut(df_features['comorbidity_score'],
                             bins=comorbidity_bins,
                             labels=['low', 'medium', 'high'],
                             include_lowest=True)
    df_features['dri_comorbidity'] = (
            df_features['dri_score'].astype(str) + '_' +
            comorbidity_cut.astype(str)
    )

    age_bins = [0, 20, 40, 60, float('inf')]
    age_cut = pd.cut(df_features['age_at_hct'],
                     bins=age_bins,
                     labels=['young', 'mid', 'senior', 'elderly'],
                     include_lowest=True)
    df_features['age_conditioning'] = (
            age_cut.astype(str) + '_' +
            df_features['conditioning_intensity'].astype(str)
    )

    # 2. HLA Matching Features
    high_res_cols = ['hla_match_drb1_high', 'hla_match_dqb1_high', 'hla_match_c_high',
                     'hla_match_a_high', 'hla_match_b_high']
    low_res_cols = ['hla_match_drb1_low', 'hla_match_dqb1_low', 'hla_match_c_low',
                    'hla_match_a_low', 'hla_match_b_low']

    df_features['hla_high_res_composite'] = df_features[high_res_cols].mean(axis=1)
    df_features['hla_low_res_composite'] = df_features[low_res_cols].mean(axis=1)

    # 3. Time-Aware Features
    df_features['early_hla_risk'] = df_features['hla_high_res_composite'] * np.exp(-df_features['efs_time']/12)
    df_features['late_hla_risk'] = df_features['hla_high_res_composite'] * (1 - np.exp(-df_features['efs_time']/12))

    # Conditioning intensity time effects
    conditioning_map = {
        'RIC': lambda x: np.exp(-x/6),
        'MAC': lambda x: 1 - np.exp(-x/6),
        'NMA': lambda x: np.exp(-x/3),
        'Unknown': lambda x: np.exp(-x/12)
    }

    df_features['conditioning_time_risk'] = df_features.apply(
        lambda row: conditioning_map.get(row['conditioning_intensity'],
                                         lambda x: np.exp(-x/12))(row['efs_time']),
        axis=1
    )

    df_features['age_time_risk'] = df_features['age_at_hct'] * np.exp(-df_features['efs_time']/24)

    df_features['early_gvhd_risk'] = (df_features['efs_time'] <= 3).astype(int)
    df_features['late_gvhd_risk'] = (df_features['efs_time'] > 3).astype(int)

    # 4. Composite Risk Scores
    def standardize(x):
        return (x - x.mean()) / x.std()

    df_features['patient_risk_score'] = (
            standardize(df_features['age_at_hct']) +
            standardize(df_features['comorbidity_score']) -
            standardize(df_features['karnofsky_score'])
    )

    risk_bins = [-float('inf'), -1, 1, float('inf')]
    risk_cut = pd.cut(df_features['patient_risk_score'],
                      bins=risk_bins,
                      labels=['low', 'medium', 'high'])
    df_features['risk_treatment_match'] = (
            risk_cut.astype(str) + '_' +
            df_features['conditioning_intensity'].astype(str)
    )

    # 5. Era-based features
    year_bins = df_features['year_hct'].quantile([0, 0.2, 0.4, 0.6, 0.8, 1.0]).values
    df_features['transplant_era'] = pd.cut(df_features['year_hct'],
                                           bins=year_bins,
                                           labels=['very_early', 'early', 'mid', 'late', 'recent'],
                                           include_lowest=True)

    df_features['era_conditioning'] = (
            df_features['transplant_era'].astype(str) + '_' +
            df_features['conditioning_intensity'].astype(str)
    )

    hla_bins = [-float('inf'), -1, 0, float('inf')]
    hla_cut = pd.cut(standardize(df_features['hla_high_res_composite']),
                     bins=hla_bins,
                     labels=['low', 'medium', 'high'])
    df_features['era_hla_quality'] = (
            df_features['transplant_era'].astype(str) + '_' +
            hla_cut.astype(str)
    )

    # 6. GVHD Prophylaxis Grouping
    gvhd_mapping = {
        'FK+ MMF +- others': 'FK_based',
        'FK+ MTX +- others(not MMF)': 'FK_based',
        'FKalone': 'FK_based',
        'FK+- others(not MMF,MTX)': 'FK_based',
        'CSA + MMF +- others(not FK)': 'CSA_based',
        'CSA + MTX +- others(not MMF,FK)': 'CSA_based',
        'CSA alone': 'CSA_based',
        'CSA +- others(not FK,MMF,MTX)': 'CSA_based',
        'TDEPLETION +- other': 'T_depletion',
        'TDEPLETION alone': 'T_depletion',
        'Cyclophosphamide alone': 'Cy_based',
        'Cyclophosphamide +- others': 'Cy_based',
        'CDselect alone': 'CD_selection',
        'CDselect +- other': 'CD_selection',
        'No GvHD Prophylaxis': 'none',
        'Other GVHD Prophylaxis': 'other'
    }
    df_features['gvhd_main_strategy'] = df_features['gvhd_proph'].map(gvhd_mapping)

    return df_features

In [8]:
# Usage:
# Step 1: Impute missing data
df_imputed = process_missing_data_enhanced(df)

df_imputed.to_csv('imputed_filename.csv', index=False)



In [9]:
# Step 2: Create features
final_df = create_enhanced_features(df_imputed)

# Usage:
# Step 1: Impute missing data

final_df.to_csv('imputed_enhanced_filename.csv', index=False)

In [10]:
final_df

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,conditioning_time_risk,age_time_risk,early_gvhd_risk,late_gvhd_risk,patient_risk_score,risk_treatment_match,transplant_era,era_conditioning,era_hla_quality,gvhd_main_strategy
0,0,N/A - non-malignant indication,0.0,Unknown,0.0,1.197730e+17,1.046154e+26,No TBI,0.0,6.0,...,2.931469e-02,1.702223,0,1,-2.760223,low_Unknown,early,early_Unknown,early_medium,FK_based
1,1,Intermediate,0.0,Intermediate,0.0,2.000000e+00,8.000000e+00,"TBI +- Other, >cGy",0.0,6.0,...,5.409824e-01,35.973978,0,1,0.351557,medium_MAC,very_early,very_early_MAC,very_early_medium,other
2,2,N/A - non-malignant indication,0.0,Unknown,0.0,2.000000e+00,8.000000e+00,No TBI,0.0,6.0,...,1.921620e-01,14.903021,0,1,-1.622741,low_Unknown,recent,recent_Unknown,recent_medium,Cy_based
3,3,High,0.0,Intermediate,0.0,2.000000e+00,8.000000e+00,No TBI,0.0,6.0,...,1.000000e+00,0.607952,0,1,-1.185433,low_MAC,very_early,very_early_MAC,very_early_medium,FK_based
4,4,High,0.0,Unknown,0.0,2.000000e+00,8.000000e+00,No TBI,0.0,6.0,...,9.330516e-01,15.127808,0,1,-1.318961,low_MAC,late,late_MAC,late_medium,T_depletion
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28795,28795,Intermediate - TED AML case <missing cytogenetics,0.4,Favorable,0.0,2.000000e+00,8.000000e+00,No TBI,0.0,6.0,...,9.551979e-01,23.526178,0,1,-0.812294,medium_MAC,late,late_MAC,late_medium,FK_based
28796,28796,High,0.0,Poor,2.0,1.000000e+00,4.000000e+00,No TBI,0.0,5.0,...,4.424918e-01,14.741939,0,1,-0.860402,medium_RIC,mid,mid_RIC,mid_medium,Cy_based
28797,28797,TBD cytogenetics,0.4,Poor,0.0,2.000000e+00,8.000000e+00,No TBI,0.0,6.0,...,9.789215e-01,19.434477,0,1,1.706908,high_MAC,late,late_MAC,late_medium,FK_based
28798,28798,N/A - non-malignant indication,0.0,Poor,0.0,1.000000e+00,4.000000e+00,No TBI,0.0,3.0,...,2.638861e-08,0.004967,0,1,-2.723188,low_NMA,late,late_NMA,late_medium,Cy_based
