In [1]:
import pandas as pd
import numpy as np
from colorama import Fore, Back, Style


In [54]:
DATA_DIR = 'equity-post-HCT-survival-predictions-1'


#train_data = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/train.csv')
#test_data  = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/test.csv')

train = pd.read_csv(DATA_DIR +'/train.csv')
test  = pd.read_csv(DATA_DIR +'/test.csv')

df = train

In [29]:
def inspect_original_hla_values(df):
    hla_vars = [col for col in df.columns if 'hla' in col.lower()]
    print("\nOriginal HLA Values:")
    for col in hla_vars:
        print(f"\n{col}")
        print("Unique values:", sorted(df[col].dropna().unique()))
        print("Value counts:\n", df[col].value_counts().head())
        print("Missing:", df[col].isna().sum())

In [31]:
inspect_original_hla_values(df)


Original HLA Values:

hla_match_c_high
Unique values: [0.0, 1.0, 2.0]
Value counts:
 hla_match_c_high
2.0    18565
1.0     5536
0.0       79
Name: count, dtype: int64
Missing: 4620

hla_high_res_8
Unique values: [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
Value counts:
 hla_high_res_8
8.0    13568
4.0     3820
7.0     2385
5.0     1648
6.0     1520
Name: count, dtype: int64
Missing: 5829

hla_low_res_6
Unique values: [2.0, 3.0, 4.0, 5.0, 6.0]
Value counts:
 hla_low_res_6
6.0    15690
3.0     4955
5.0     2808
4.0     2055
2.0       22
Name: count, dtype: int64
Missing: 3270

hla_high_res_6
Unique values: [0.0, 2.0, 3.0, 4.0, 5.0, 6.0]
Value counts:
 hla_high_res_6
6.0    14022
3.0     4596
5.0     2726
4.0     2128
2.0       43
Name: count, dtype: int64
Missing: 5284

hla_high_res_10
Unique values: [3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
Value counts:
 hla_high_res_10
10.0    12232
5.0      3161
9.0      2369
6.0      1355
8.0      1314
Name: count, dtype: int64
Missing: 7163

hla_match_dqb

In [30]:
def debug_hla_processing(df, hla_data, hla_imputed, hla_vars):
    print("\nData Type Changes:")
    for idx, col in enumerate(hla_vars):
        print(f"\n{col}")
        print(f"Original dtype: {df[col].dtype}")
        print(f"Pre-MICE dtype: {hla_data[col].dtype}")
        print(f"Post-MICE values range: {hla_imputed[:, idx].min()} to {hla_imputed[:, idx].max()}")

In [44]:
def find_column_with_value(df, value='No'):
    for col in df.columns:
        if df[col].astype(str).eq(value).any():
            print(f"Found '{value}' in column: {col}")
            print("Value counts:\n", df[col].value_counts())
            print("Dtype:", df[col].dtype)
            print()
find_column_with_value(df)

Found 'No' in column: psych_disturb
Value counts:
 psych_disturb
No          23005
Yes          3587
Not done      146
Name: count, dtype: int64
Dtype: object

Found 'No' in column: diabetes
Value counts:
 diabetes
No          22201
Yes          4339
Not done      141
Name: count, dtype: int64
Dtype: object

Found 'No' in column: arrhythmia
Value counts:
 arrhythmia
No          25203
Yes          1277
Not done      118
Name: count, dtype: int64
Dtype: object

Found 'No' in column: vent_hist
Value counts:
 vent_hist
No     27721
Yes      820
Name: count, dtype: int64
Dtype: object

Found 'No' in column: renal_issue
Value counts:
 renal_issue
No          26548
Yes           200
Not done      137
Name: count, dtype: int64
Dtype: object

Found 'No' in column: pulm_severe
Value counts:
 pulm_severe
No          24779
Yes          1706
Not done      180
Name: count, dtype: int64
Dtype: object

Found 'No' in column: rituximab
Value counts:
 rituximab
No     26033
Yes      619
Name: count, dtyp

In [64]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from lifelines import KaplanMeierFitter

class DataPipeline:
    def __init__(self):
        # Existing initializations
        
        self.year_bins = None
        self.karnofsky_bins = [0, 60, 75, 85, 100]
        self.comorbidity_bins = [0, 2, 4, float('inf')]
        self.age_bins = [0, 20, 40, 60, float('inf')]
        self.risk_bins = [-float('inf'), -1, 1, float('inf')]
        self.hla_bins = [-float('inf'), -1, 0, float('inf')]
        self.scaler = StandardScaler()
        self.label_encoders = {}
        self.mice_imputer = None
        self.knn_imputers = {}
        self.fill_values = {}

        # Add KM fitter
        self.kmf = KaplanMeierFitter()

        self.ordinal_encoders = {}
        self.hla_imputers = {}
        self.clinical_encoders = {}
        self.disease_imputers = {}

        self.categorical_mappings = {
            'dri_score': ['N/A - non-malignant indication', 'Intermediate', 'High', 'Low',
                          'N/A - disease not classifiable', 'N/A - pediatric', 'TBD cytogenetics',
                          'Intermediate - TED AML case <missing cytogenetics>', 'Unknown',
                          'High - TED AML case <missing cytogenetics>', 'Very high',
                          'Missing disease status'],
            'cyto_score': ['Unknown', 'Intermediate', 'Poor', 'Other', 'Favorable', 'TBD',
                           'Normal', 'Not tested'],
            'tbi_status': ['No TBI', 'TBI +- Other, >cGy', 'TBI + Cy +- Other',
                           'TBI +- Other, <=cGy', 'TBI +- Other, unknown dose',
                           'TBI +- Other, -cGy, fractionated', 'TBI +- Other, -cGy, single',
                           'TBI +- Other, -cGy, unknown dose'],
            'graft_type': ['Bone marrow', 'Peripheral blood'],
            'vent_hist': ['No', 'Yes', 'Unknown'],
            'prim_disease_hct': ['IEA', 'AML', 'HIS', 'ALL', 'MPN', 'IIS', 'Solid tumor',
                                 'Other leukemia', 'PCD', 'IPA', 'IMD', 'MDS', 'NHL', 'SAA',
                                 'AI', 'CML', 'Other acute leukemia', 'HD'],
            'cmv_status': ['+/+', '-/+', '-/-', 'Unknown', '+/-'],
            'tce_imm_match': ['Unknown', 'P/P', 'G/B', 'H/B', 'G/G', 'P/H', 'P/B', 'H/H', 'P/G'],
            'rituximab': ['No', 'Unknown', 'Yes'],
            'prod_type': ['BM', 'PB'],
            'cyto_score_detail': ['Unknown', 'Intermediate', 'TBD', 'Poor', 'Favorable', 'Not tested'],
            'conditioning_intensity': ['Unknown', 'MAC', 'RIC', 'NMA', 'TBD', 'No drugs reported',
                                       'N/A, F(pre-TED) not submitted'],
            'ethnicity': ['Not Hispanic or Latino', 'Hispanic or Latino', 'Non-resident of the U.S.'],
            'mrd_hct': ['Unknown', 'Positive', 'Negative'],
            'in_vivo_tcd': ['Yes', 'No', 'Unknown'],
            'tce_match': ['Unknown', 'Permissive', 'HvG non-permissive', 'Fully matched',
                          'GvH non-permissive'],
            'gvhd_proph': ['FKalone', 'Other GVHD Prophylaxis', 'Cyclophosphamide alone',
                           'FK+ MMF +- others', 'TDEPLETION +- other',
                           'CSA + MMF +- others(not FK)', 'CSA + MTX +- others(not MMF,FK)',
                           'FK+ MTX +- others(not MMF)', 'Cyclophosphamide +- others',
                           'CSA alone', 'Unknown', 'TDEPLETION alone', 'No GvHD Prophylaxis',
                           'CDselect alone', 'CDselect +- other', 'Parent Q = yes, but no agent',
                           'FK+- others(not MMF,MTX)', 'CSA +- others(not FK,MMF,MTX)'],
            'sex_match': ['M-F', 'F-F', 'F-M', 'M-M', 'Unknown'],
            'race_group': ['More than one race', 'Asian', 'White',
                           'American Indian or Alaska Native',
                           'Native Hawaiian or other Pacific Islander',
                           'Black or African-American'],
            'tce_div_match': ['Unknown', 'Permissive mismatched', 'GvH non-permissive',
                              'HvG non-permissive', 'Bi-directional non-permissive'],
            'donor_related': ['Unrelated', 'Related', 'Multiple donor (non-UCB)', 'Unknown'],
            'conditioning_karnofsky': ['Unknown_very_high', 'MAC_very_high', 'RIC_very_high',
                                       'MAC_low', 'MAC_high', 'RIC_high', 'NMA_low',
                                       'NMA_very_high', 'RIC_very_low', 'NMA_high',
                                       'Unknown_low', 'Unknown_high', 'RIC_low', 'TBD_low',
                                       'Unknown_very_low', 'TBD_very_high',
                                       'No drugs reported_very_high', 'MAC_very_low',
                                       'NMA_very_low', 'N/A, F(pre-TED) not submitted_very_high',
                                       'No drugs reported_low', 'TBD_high', 'TBD_very_low',
                                       'N/A, F(pre-TED) not submitted_high',
                                       'N/A, F(pre-TED) not submitted_low',
                                       'No drugs reported_very_low',
                                       'N/A, F(pre-TED) not submitted_very_low',
                                       'No drugs reported_high'],
            'transplant_era': ['very_early', 'early', 'mid', 'late', 'recent'],
            'era_conditioning': ['early_Unknown', 'very_early_MAC', 'recent_Unknown',
                                 'late_MAC', 'early_MAC', 'early_RIC', 'late_Unknown',
                                 'mid_MAC', 'very_early_Unknown', 'very_early_RIC',
                                 'late_NMA', 'very_early_NMA', 'mid_NMA', 'mid_RIC',
                                 'mid_Unknown', 'late_RIC', 'recent_RIC', 'early_TBD',
                                 'recent_NMA', 'recent_TBD', 'early_No drugs reported',
                                 'early_NMA', 'recent_MAC', 'very_early_TBD',
                                 'very_early_N/A, F(pre-TED) not submitted',
                                 'very_early_No drugs reported',
                                 'late_N/A, F(pre-TED) not submitted', 'late_TBD',
                                 'mid_No drugs reported',
                                 'mid_N/A, F(pre-TED) not submitted', 'late_No drugs reported',
                                 'mid_TBD', 'early_N/A, F(pre-TED) not submitted',
                                 'recent_No drugs reported',
                                 'recent_N/A, F(pre-TED) not submitted'],
            'gvhd_main_strategy': ['FK_based', 'other', 'Cy_based', 'T_depletion',
                                   'CSA_based', None, 'none', 'CD_selection']
        }

        # Initialize label encoders
        self.initialize_label_encoders()


        # Create and fit label encoders for each categorical variable
        for var, categories in self.categorical_mappings.items():
            encoder = LabelEncoder()
            # Add 'Unknown' if not present
            if 'Unknown' not in categories:
                categories = list(categories) + ['Unknown']
            encoder.fit(categories)
            self.label_encoders[var] = encoder

    def initialize_label_encoders(self):
        # Use the instance variable self.categorical_mappings
        self.label_encoders = {}
        for col, values in self.categorical_mappings.items():
            le = LabelEncoder()
            values = [str(v) if v is None else v for v in values]
            le.fit(values)
            self.label_encoders[col] = le

    def transform_survival_probability(self, df, time_col='efs_time', event_col='efs', training=False):
        """
        Transform using survival probability estimates
        """
        if training:
            # Fit KM model on training data
            self.kmf.fit(df[time_col], df[event_col])

        # Transform both training and test data
        y = self.kmf.survival_function_at_times(df[time_col]).values

        # Adjust probabilities for censored cases only in training
        if training:
            y[df[event_col] == 0] -= 0.2

        return y


# why are trainnig handled differently to validation?
    # why are all categorical handled before the others...
# why are there suddenly extra columns??
    def process_missing_data_enhanced(self, df, training):
        """
        Enhanced missing data handling treating HLA variables as ordinal categories.
        
        Args:
            df: Input DataFrame
            training: Boolean indicating if this is training data
            
        Returns:
            DataFrame with processed and imputed values
        """
        df_processed = df.copy()

        # 1. First identify all variables
        hla_vars = [col for col in df_processed.columns if 'hla' in col.lower()]
        categorical_vars = [
            'tce_match', 'mrd_hct', 'cyto_score_detail', 'tce_div_match',
            'tce_imm_match', 'cyto_score', 'cmv_status', 'dri_score',
            'conditioning_intensity', 'rituximab', 'in_vivo_tcd',
            'gvhd_proph', 'sex_match', 'donor_related', 'vent_hist'
        ]
        clinical_vars = [
            'cardiac', 'arrhythmia', 'pulm_severe', 'pulm_moderate',
            'hepatic_mild', 'hepatic_severe', 'renal_issue', 'diabetes',
            'psych_disturb', 'peptic_ulcer', 'rheum_issue', 'obesity',
            'prior_tumor', 'melphalan_dose'
        ]

        # 2. Create missingness indicators for HLA variables
        for col in hla_vars:
            df_processed[f'{col}_missing'] = df_processed[col].isna().astype(int)

        df_processed['hla_testing_completeness'] = df_processed[hla_vars].notna().sum(axis=1) / len(hla_vars)

        # 3. Handle HLA variables as ordinal categories
        for col in hla_vars:
            if col in df_processed.columns:
                if training:
                    # Get valid values for this column
                    valid_values = sorted(df_processed[col].dropna().unique())

                    # Create and fit imputer first
                    self.hla_imputers[col] = SimpleImputer(strategy='most_frequent')
                    self.hla_imputers[col].fit(df_processed[[col]])

                    # Create ordinal encoder that handles unknown values
                    self.ordinal_encoders[col] = OrdinalEncoder(
                        categories=[valid_values],
                        dtype=int,
                        handle_unknown='use_encoded_value',
                        unknown_value=-1
                    )

                    # First impute the values
                    df_processed[col] = self.hla_imputers[col].transform(df_processed[[col]])

                    # Then fit the encoder on the imputed values
                    self.ordinal_encoders[col].fit(df_processed[col].values.reshape(-1, 1))
                else:
                    # For inference, first impute then encode
                    df_processed[col] = self.hla_imputers[col].transform(df_processed[[col]])

                # Finally encode all values
                df_processed[col] = self.ordinal_encoders[col].transform(
                    df_processed[col].values.reshape(-1, 1)
                ).ravel()

        # 4. Handle categorical variables
        for var in categorical_vars:
            if var in df_processed.columns:
                if training:
                    # Store original categories and add 'Unknown' for future unseen values
                    unique_vals = set(df_processed[var].dropna().unique())
                    unique_vals.add('Unknown')
                    self.unique_categories = getattr(self, 'unique_categories', {})
                    self.unique_categories[var] = unique_vals

                    # Fill NA with 'Unknown'
                    df_processed[var] = df_processed[var].fillna('Unknown')
                    self.label_encoders[var] = LabelEncoder()
                    df_processed[var] = self.label_encoders[var].fit_transform(df_processed[var])
                else:
                    if var in self.label_encoders:
                        # Replace unseen categories with 'Unknown'
                        df_processed[var] = df_processed[var].fillna('Unknown')
                        df_processed[var] = df_processed[var].map(
                            lambda x: 'Unknown' if x not in self.unique_categories[var] else x
                        )
                        df_processed[var] = self.label_encoders[var].transform(df_processed[var])

        # 5. Handle clinical variables by disease group
        remaining_clinical = [col for col in df_processed.columns
                              if col not in hla_vars + categorical_vars + clinical_vars +
                              ['prim_disease_hct'] +
                              [col + '_missing' for col in hla_vars] +
                              ['hla_testing_completeness']]

        # First encode clinical variables
        for col in clinical_vars + remaining_clinical:
            if df_processed[col].dtype == 'object' or df_processed[col].apply(lambda x: isinstance(x, str)).any():
                if training:
                    self.clinical_encoders[col] = LabelEncoder()
                    non_null_mask = df_processed[col].notna()
                    df_processed.loc[non_null_mask, col] = self.clinical_encoders[col].fit_transform(
                        df_processed.loc[non_null_mask, col]
                    )
                else:
                    if col in self.clinical_encoders:
                        non_null_mask = df_processed[col].notna()
                        df_processed.loc[non_null_mask, col] = self.clinical_encoders[col].transform(
                            df_processed.loc[non_null_mask, col]
                        )

        # Now proceed with KNN imputation by disease group
        for disease in df_processed['prim_disease_hct'].unique():
            mask = df_processed['prim_disease_hct'] == disease
            if mask.sum() > 0:
                clinical_data = df_processed.loc[mask, clinical_vars + remaining_clinical].copy()

                if training:
                    n_neighbors = min(5, mask.sum() - 1)
                    self.disease_imputers[disease] = KNNImputer(n_neighbors=n_neighbors)
                    imputed_values = self.disease_imputers[disease].fit_transform(clinical_data)
                else:
                    if disease in self.disease_imputers:
                        imputed_values = self.disease_imputers[disease].transform(clinical_data)
                    else:
                        # Fallback for unseen diseases
                        for col in clinical_vars + remaining_clinical:
                            clinical_data[col] = clinical_data[col].fillna(clinical_data[col].median())
                        imputed_values = clinical_data.values

                df_processed.loc[mask, clinical_vars + remaining_clinical] = imputed_values

        # Convert encoded clinical variables back to original format
        for col in clinical_vars + remaining_clinical:
            if col in self.clinical_encoders:
                df_processed[col] = self.clinical_encoders[col].inverse_transform(
                    df_processed[col].astype(int)
                )

        return df_processed 
    
    
    def create_enhanced_features(self, df, training=False):
        """
        Create enhanced feature set without using time information
        """
        df_features = df.copy()
    
        # 1. Treatment-Patient Interactions
        karnofsky_bins = [0, 60, 75, 85, 100]
        karnofsky_cut = pd.cut(df_features['karnofsky_score'],
                               bins=karnofsky_bins,
                               labels=['very_low', 'low', 'high', 'very_high'],
                               include_lowest=True)
        df_features['conditioning_karnofsky'] = (
                df_features['conditioning_intensity'].astype(str) + '_' +
                karnofsky_cut.astype(str)
        )
    
        comorbidity_bins = [0, 2, 4, float('inf')]
        comorbidity_cut = pd.cut(df_features['comorbidity_score'],
                                 bins=comorbidity_bins,
                                 labels=['low', 'medium', 'high'],
                                 include_lowest=True)
        df_features['dri_comorbidity'] = (
                df_features['dri_score'].astype(str) + '_' +
                comorbidity_cut.astype(str)
        )
    
        age_bins = [0, 20, 40, 60, float('inf')]
        age_cut = pd.cut(df_features['age_at_hct'],
                         bins=age_bins,
                         labels=['young', 'mid', 'senior', 'elderly'],
                         include_lowest=True)
        df_features['age_conditioning'] = (
                age_cut.astype(str) + '_' +
                df_features['conditioning_intensity'].astype(str)
        )
    
        # 2. HLA Matching Features
        high_res_cols = ['hla_match_drb1_high', 'hla_match_dqb1_high', 'hla_match_c_high',
                         'hla_match_a_high', 'hla_match_b_high']
        low_res_cols = ['hla_match_drb1_low', 'hla_match_dqb1_low', 'hla_match_c_low',
                        'hla_match_a_low', 'hla_match_b_low']
    
        df_features['hla_high_res_composite'] = df_features[high_res_cols].mean(axis=1)
        df_features['hla_low_res_composite'] = df_features[low_res_cols].mean(axis=1)
    
        # 3. Composite Risk Scores
        def standardize(x):
            return (x - x.mean()) / x.std()
    
        df_features['patient_risk_score'] = (
                standardize(df_features['age_at_hct']) +
                standardize(df_features['comorbidity_score']) -
                standardize(df_features['karnofsky_score'])
        )
    
        risk_bins = [-float('inf'), -1, 1, float('inf')]
        risk_cut = pd.cut(df_features['patient_risk_score'],
                          bins=risk_bins,
                          labels=['low', 'medium', 'high'])
        df_features['risk_treatment_match'] = (
                risk_cut.astype(str) + '_' +
                df_features['conditioning_intensity'].astype(str)
        )
    
        # 4. Era-based features
        year_bins = df_features['year_hct'].quantile([0, 0.2, 0.4, 0.6, 0.8, 1.0]).values
        df_features['transplant_era'] = pd.cut(df_features['year_hct'],
                                               bins=year_bins,
                                               labels=['very_early', 'early', 'mid', 'late', 'recent'],
                                               include_lowest=True)
    
        df_features['era_conditioning'] = (
                df_features['transplant_era'].astype(str) + '_' +
                df_features['conditioning_intensity'].astype(str)
        )
    
        hla_bins = [-float('inf'), -1, 0, float('inf')]
        hla_cut = pd.cut(standardize(df_features['hla_high_res_composite']),
                         bins=hla_bins,
                         labels=['low', 'medium', 'high'])
        df_features['era_hla_quality'] = (
                df_features['transplant_era'].astype(str) + '_' +
                hla_cut.astype(str)
        )
    
        # 5. GVHD Prophylaxis Grouping
        gvhd_mapping = {
            'FK+ MMF +- others': 'FK_based',
            'FK+ MTX +- others(not MMF)': 'FK_based',
            'FKalone': 'FK_based',
            'FK+- others(not MMF,MTX)': 'FK_based',
            'CSA + MMF +- others(not FK)': 'CSA_based',
            'CSA + MTX +- others(not MMF,FK)': 'CSA_based',
            'CSA alone': 'CSA_based',
            'CSA +- others(not FK,MMF,MTX)': 'CSA_based',
            'TDEPLETION +- other': 'T_depletion',
            'TDEPLETION alone': 'T_depletion',
            'Cyclophosphamide alone': 'Cy_based',
            'Cyclophosphamide +- others': 'Cy_based',
            'CDselect alone': 'CD_selection',
            'CDselect +- other': 'CD_selection',
            'No GvHD Prophylaxis': 'none',
            'Other GVHD Prophylaxis': 'other'
        }
        df_features['gvhd_main_strategy'] = df_features['gvhd_proph'].map(gvhd_mapping)
    
        # 6. Disease-specific Features
        df_features['disease_conditioning'] = (
                df_features['prim_disease_hct'].astype(str) + '_' +
                df_features['conditioning_intensity'].astype(str)
        )
    
        df_features['disease_era'] = (
                df_features['prim_disease_hct'].astype(str) + '_' +
                df_features['transplant_era'].astype(str)
        )
    
        # 7. Enhanced HLA Features
        df_features['hla_mismatch_pattern'] = (
            (df_features['hla_high_res_composite'] < df_features['hla_low_res_composite']).astype(int)
        )
    
        df_features['hla_era_trend'] = df_features.groupby('transplant_era')['hla_high_res_composite'].transform('mean')
    
        # 8. Comorbidity Interactions
        comorbidity_cols = ['cardiac', 'pulm_severe', 'renal_issue', 'hepatic_severe']
        df_features['comorbidity_pattern'] = df_features[comorbidity_cols].apply(
            lambda x: '_'.join(x.index[x == 'Yes']), axis=1
        )
    
        df_features['age_comorbidity_risk'] = df_features['age_at_hct'] * df_features['comorbidity_score']
    
        return df_features

    def process_data(self, df, training=False):
        """
        Main processing pipeline without fold creation
        """
        df_processed = df.copy()

        # Handle missing data
        df_processed = self.process_missing_data_enhanced(df_processed, training)

        # Create enhanced features
        #df_processed = self.create_enhanced_features(df_processed, training)

        # Transform survival probabilities and create labels
        if 'efs_time' in df_processed.columns and 'efs' in df_processed.columns:
            df_processed['label'] = self.transform_survival_probability(
                df_processed,
                time_col='efs_time',
                event_col='efs',
                training=training
            )

        return df_processed






In [61]:
pipeline = DataPipeline()



# Process each fold's data first
x_train = pipeline.process_missing_data_enhanced(train, training=True)


In [51]:
import pandas as pd
import numpy as np

def validate_processed_values(df):
    """
    Validates processed dataframe for suspicious or invalid values.
    Prints validation results.
    """
    print("\n=== Starting Validation ===")

    numeric_cols = df.select_dtypes(include=np.number).columns
    print(f"\nFound {len(numeric_cols)} numeric columns")

    suspicious_columns = []

    for col in numeric_cols:
        print(f"\n{'-'*40}")
        print(f"Column: {col}")
        print(f"{'-'*40}")

        values = df[col].values

        # Basic stats
        stats = {
            'min': values.min(),
            'max': values.max(),
            'mean': values.mean(),
            'median': np.median(values),
            'std': values.std(),
            'zeros_pct': (values == 0).mean() * 100
        }

        for stat, value in stats.items():
            print(f"{stat:<10}: {value:,.4f}")

        # Check for issues
        if np.any(np.isinf(values)):
            suspicious_columns.append(f"{col} (inf values)")
            print("WARNING: Found infinite values")

        abs_max = np.abs(values).max()
        if abs_max > 1e10:
            suspicious_columns.append(f"{col} (large values: {abs_max})")
            print(f"WARNING: Found very large values: {abs_max}")

        # Check for scientific notation
        if df[col].astype(str).str.contains('e', case=False).any():
            suspicious_columns.append(f"{col} (scientific notation)")
            print("WARNING: Found scientific notation values")

        # Check for very small non-zero values
        non_zero_vals = values[values != 0]
        if len(non_zero_vals) > 0:
            min_non_zero = np.abs(non_zero_vals).min()
            if min_non_zero < 1e-10:
                print(f"WARNING: Found very small non-zero values: {min_non_zero}")

    if suspicious_columns:
        print("\n=== WARNINGS ===")
        print("Found suspicious values in these columns:")
        for col in suspicious_columns:
            print(f"- {col}")
    else:
        print("\n=== NO WARNINGS ===")
        print("All numeric columns appear to have reasonable values")

In [67]:
# After preprocessing
#df_processed = pipeline.process_missing_data_enhanced(df, training=True)

# Run validation
validation_results = validate_processed_values(x_train)





=== Starting Validation ===

Found 61 numeric columns

----------------------------------------
Column: dri_score
----------------------------------------
min       : 0.0000
max       : 11.0000
mean      : 3.8267
median    : 2.0000
std       : 3.1955
zeros_pct : 16.2977

----------------------------------------
Column: cyto_score
----------------------------------------
min       : 0.0000
max       : 7.0000
mean      : 4.1073
median    : 5.0000
std       : 2.6032
zeros_pct : 10.6163

----------------------------------------
Column: hla_match_c_high
----------------------------------------
min       : 0.0000
max       : 2.0000
mean      : 1.8013
median    : 2.0000
std       : 0.4056
zeros_pct : 0.2648

----------------------------------------
Column: hla_high_res_8
----------------------------------------
min       : 0.0000
max       : 6.0000
mean      : 5.0988
median    : 6.0000
std       : 1.4706
zeros_pct : 0.0043

----------------------------------------
Column: hla_low_res_6
-----

In [None]:
# 1. Load data
# 2. Create temporary pipeline to analyze all categories
#temp_pipeline = DataPipeline()
#full_processed = temp_pipeline.process_data(train, training=True)

# Print all categorical columns and their values
#print("\nAll categorical columns and their values:")
#for col in full_processed.select_dtypes(include=['object', 'category']).columns:
#    print(f"\n{col}:")
#    print(full_processed[col].unique())

# Now we know all possible categories that could appear in any fold
# We can modify the DataPipeline class to handle these known categories

In [6]:
from colorama import Fore
from lifelines.utils import concordance_index


class ParticipantVisibleError(Exception):
    pass


def custom_score(solution, submission, row_id_column_name, prediction_label='prediction', print_info=True):

    del solution[row_id_column_name]
    del submission[row_id_column_name]

    event_label = 'efs'
    interval_label = 'efs_time'

    for col in submission.columns:
        if not pd.api.types.is_numeric_dtype(submission[col]):
            raise ParticipantVisibleError(f'Submission column {col} must be a number')
    # Merging solution and submission dfs on ID
    merged_df = pd.concat([solution, submission], axis=1)
    merged_df.reset_index(inplace=True)
    merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
    metric_dict = {}
    for race in sorted(merged_df_race_dict.keys()):
        # Retrieving values from y_test based on index
        indices = sorted(merged_df_race_dict[race])
        merged_df_race = merged_df.iloc[indices]
        # Calculate the concordance index
        c_index_race = concordance_index(
            merged_df_race[interval_label],
            -merged_df_race[prediction_label],
            merged_df_race[event_label])

        metric_dict[race] = c_index_race

    race_c_index = list(metric_dict.values())
    c_score = float(np.mean(race_c_index) - np.std(race_c_index))
    if print_info:
        print(f"{Fore.GREEN}{Style.BRIGHT}# c-index={c_score:.4f}, mean={np.mean(race_c_index):.4f} std={np.std(race_c_index):.4f}{Style.RESET_ALL}")

    return c_score, metric_dict


def display_overall(df):

    race_groups = [
        'American Indian or Alaska Native', 'Asian',
        'Black or African-American', 'More than one race',
        'Native Hawaiian or other Pacific Islander', 'White'
    ]
    df['mean'] = df[race_groups].mean(axis=1)
    df['std'] = np.std(df[race_groups], axis=1)
    df['score'] = df['mean'] - df['std']
    df = df.T
    df['Overall'] = df.mean(axis=1)
    temp = df.drop(index=['std']).values
    display(df
            .iloc[:len(race_groups)]
            .style
            .format(precision=4)
            .background_gradient(axis=None, vmin=temp.min(), vmax=temp.max(), cmap="cool")
            .concat(df.iloc[len(race_groups):].style.format(precision=3))
            )


In [7]:
def CIndexMetric_XGB(y_true, y_pred):
    ds_pred["prediction"] = y_pred
    cindex_score, _ = custom_score(ds_true.copy(), ds_pred.copy(), "ID", print_info=False)
    return -cindex_score

In [65]:
from xgboost import XGBRegressor

# 1. Start with raw data 
train = pd.read_csv(DATA_DIR +'/train.csv')



# 2. Create folds
folds = 5
train['kfold'] = -1

#skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
#groups = train['efs'].astype(str)
#for fold, (train_idx, val_idx) in enumerate(skf.split(X=train, y=groups)):
#    train.loc[val_idx, 'kfold'] = fold

groups = train['race_group']
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(skf.split(X=train, y=groups)):
    train.loc[val_idx, 'kfold'] = fold

# 3. Initialize outputs
oof_xgb = train[['kfold','ID','efs','efs_time','race_group']].copy()
oof_xgb['prediction'] = 0.0
metric_df = []

# 4. Training loop
for fold in range(skf.n_splits):
    pipeline = DataPipeline()

    x_train = train[train.kfold != fold].copy()
    x_valid = train[train.kfold == fold].copy()

    # Process each fold's data first
    x_train = pipeline.process_data(x_train, training=True)
    x_valid = pipeline.process_data(x_valid, training=False)

    # Check all object/categorical columns
    #categorical_cols = train.select_dtypes(include=['object', 'category']).columns
    #print("\nAll categorical columns:")
    #for col in categorical_cols:
    #    print(f"\n{col}:")
    #    print(train[col].unique())

    # Now define FEATURES after new columns are created
    RMV = ["ID","efs","efs_time","label",'y','kfold']
    FEATURES = [c for c in x_train.columns if not c in RMV]

    # Print features on first fold
    if fold == 0:
        feature_importances_xgb = pd.DataFrame()
        feature_importances_xgb['feature'] = FEATURES
        print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")

    # Get labels
    y_train = x_train['label']
    y_valid = x_valid['label']
    y_label = x_valid['efs']

    # Select features
    x_train = x_train[FEATURES]
    x_valid = x_valid[FEATURES]

    # Handle data types
    for c in x_train.columns:
        if x_train[c].dtype == "object":
            x_train[c] = x_train[c].fillna("NAN").astype("category")
            x_valid[c] = x_valid[c].fillna("NAN").astype("category")
        else:
            if x_train[c].dtype=="float64":
                x_train[c] = x_train[c].replace([np.inf, -np.inf], np.nan).astype("float32")
                x_valid[c] = x_valid[c].replace([np.inf, -np.inf], np.nan).astype("float32")
            if x_train[c].dtype=="int64":
                x_train[c] = x_train[c].astype("int32")
                x_valid[c] = x_valid[c].astype("int32")


    # Set up for metrics
    ds_true = oof_xgb.loc[oof_xgb.kfold==fold, ["ID","efs","efs_time","race_group"]].copy().reset_index(drop=True)
    ds_pred = oof_xgb.loc[oof_xgb.kfold==fold, ["ID"]].copy().reset_index(drop=True)

    # Train model
    clf = XGBRegressor(
        max_depth=3,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=10000,
        learning_rate=0.03,
        early_stopping_rounds=100,
        objective='reg:squarederror',
        enable_categorical=True,
        min_child_weight=5,
        eval_metric=CIndexMetric_XGB,
        disable_default_eval_metric=True,
        missing=np.nan
    )

    clf.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500,
    )

    # Get feature importance and predictions
    feature_importances_xgb[f'fold_{fold + 1}'] = feature_importances_xgb['feature'].map(clf.get_booster().get_score())
    preds_valid = clf.predict(x_valid)
    oof_xgb.loc[oof_xgb.kfold==fold, 'prediction'] = preds_valid

    # Save model
    clf.save_model(f"xgb_model_{fold}.bin")

    # Calculate metrics
    y_true = oof_xgb.loc[oof_xgb.kfold==fold, ["ID","efs","efs_time","race_group"]].copy().reset_index(drop=True)
    y_pred = oof_xgb.loc[oof_xgb.kfold==fold, ["ID","prediction"]].copy().reset_index(drop=True)
    m, metric_dict = custom_score(y_true, y_pred, "ID", print_info=True)
    metric_df.append(metric_dict)

There are 92 FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10', 'hla_match_c_high_missing', 'hla_high_re



[0]	validation_0-CIndexMetric_XGB:-0.53488
[367]	validation_0-CIndexMetric_XGB:-0.59674
[32m[1m# c-index=0.6000, mean=0.6074 std=0.0074[0m




[0]	validation_0-CIndexMetric_XGB:-0.54519
[348]	validation_0-CIndexMetric_XGB:-0.62261
[32m[1m# c-index=0.6249, mean=0.6348 std=0.0100[0m




[0]	validation_0-CIndexMetric_XGB:-0.54298
[379]	validation_0-CIndexMetric_XGB:-0.61899
[32m[1m# c-index=0.6202, mean=0.6273 std=0.0071[0m




[0]	validation_0-CIndexMetric_XGB:-0.58618
[410]	validation_0-CIndexMetric_XGB:-0.65051
[32m[1m# c-index=0.6515, mean=0.6579 std=0.0064[0m




In [66]:
feature_importances_xgb['importance'] = feature_importances_xgb.drop('feature', axis=1).mean(axis=1)
feature_importances_xgb = feature_importances_xgb.sort_values('importance', ascending=False).reset_index(drop=True)
feature_importances_xgb.head(20)

Unnamed: 0,feature,fold_1,fold_2,fold_3,fold_4,fold_5,importance
0,disease_era,397.0,387.0,360.0,405.0,456.0,401.0
1,disease_conditioning,354.0,333.0,313.0,368.0,380.0,349.6
2,dri_comorbidity,247.0,257.0,276.0,262.0,290.0,266.4
3,conditioning_karnofsky,131.0,149.0,127.0,152.0,159.0,143.6
4,era_conditioning,116.0,116.0,116.0,113.0,114.0,115.0
5,sex_match,96.0,106.0,99.0,105.0,106.0,102.4
6,age_conditioning,85.0,98.0,92.0,87.0,106.0,93.6
7,year_hct,92.0,86.0,90.0,94.0,87.0,89.8
8,comorbidity_pattern,83.0,90.0,83.0,90.0,74.0,84.0
9,risk_treatment_match,66.0,67.0,52.0,59.0,67.0,62.2


In [195]:
new_data = pd.read_csv(DATA_DIR +'/test.csv')

# Apply the same pipeline
pipeline = DataPipeline()
new_data = pipeline.process_data(new_data, training=True)

# Select FEATURES
RMV = ["ID", "efs", "efs_time", "label", "y", "kfold"]
FEATURES = [c for c in new_data.columns if c not in RMV]
new_data = new_data[FEATURES]

# Handle data types (similar to training loop)
for c in new_data.columns:
    if new_data[c].dtype == "object":
        new_data[c] = new_data[c].fillna("NAN").astype("category")
    else:
        if new_data[c].dtype == "float64":
            new_data[c] = new_data[c].replace([np.inf, -np.inf], np.nan).astype("float32")
        if new_data[c].dtype == "int64":
            new_data[c] = new_data[c].astype("int32")

clf.predict(new_data)

array([0.48765048, 0.61907995, 0.5356562 ], dtype=float32)

In [209]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from scipy.stats import uniform, randint
import optuna
from typing import Dict, List, Tuple
import logging

class XGBoostTuner:
    def __init__(
            self,
            train_data: pd.DataFrame,
            pipeline: DataPipeline,
            fold_column: str = 'kfold',
            n_trials: int = 100,
            random_state: int = 42
    ):
        self.train_data = train_data
        self.pipeline = pipeline
        self.fold_column = fold_column
        self.n_trials = n_trials
        self.random_state = random_state
        self.best_params = None
        self.study = None

        # Set up logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def _handle_nan_values(self, true_data: pd.DataFrame, pred_data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Handle NaN values in the data before calculating metrics.
        """
        # Get valid indices (non-NaN in both true and predicted)
        valid_mask = (
                ~true_data['efs'].isna() &
                ~true_data['efs_time'].isna() &
                ~pred_data['prediction'].isna()
        )

        # Filter both dataframes
        true_data_clean = true_data[valid_mask].copy()
        pred_data_clean = pred_data[valid_mask].copy()

        return true_data_clean, pred_data_clean

    def _objective(self, trial: optuna.Trial) -> float:
        """
        Objective function for Optuna optimization.
        """
        # Define parameter search space
        param = {
            'max_depth': trial.suggest_int('max_depth', 2, 8),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 1000, 20000),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.8),
            'subsample': trial.suggest_float('subsample', 0.6, 0.9),
            'early_stopping_rounds': 100,
            'objective': 'reg:squarederror',
            'enable_categorical': True,
            'eval_metric': CIndexMetric_XGB,
            'disable_default_eval_metric': True,
            'missing': np.nan
        }

        # Cross-validation scores for each fold
        cv_scores = []

        # Perform cross-validation
        for fold in range(self.train_data[self.fold_column].nunique()):
            # Split data
            x_train = self.train_data[self.train_data[self.fold_column] != fold].copy()
            x_valid = self.train_data[self.train_data[self.fold_column] == fold].copy()

            # Store necessary columns for metrics before processing
            valid_metrics = x_valid[['ID', 'efs', 'efs_time', 'race_group']].copy()

            # Process each fold's data using the pipeline
            x_train = self.pipeline.process_data(x_train, training=True)
            x_valid = self.pipeline.process_data(x_valid, training=False)

            # Get labels
            y_train = x_train['label']
            y_valid = x_valid['label']

            # Define features after pipeline processing
            RMV = ["ID", "efs", "efs_time", "label", 'y', 'kfold']
            FEATURES = [c for c in x_train.columns if c not in RMV]

            # Select features
            x_train = x_train[FEATURES]
            x_valid = x_valid[FEATURES]

            # Handle data types
            for c in x_train.columns:
                if x_train[c].dtype == "object":
                    x_train[c] = x_train[c].fillna("NAN").astype("category")
                    x_valid[c] = x_valid[c].fillna("NAN").astype("category")
                else:
                    if x_train[c].dtype == "float64":
                        x_train[c] = x_train[c].replace([np.inf, -np.inf], np.nan).astype("float32")
                        x_valid[c] = x_valid[c].replace([np.inf, -np.inf], np.nan).astype("float32")
                    if x_train[c].dtype == "int64":
                        x_train[c] = x_train[c].astype("int32")
                        x_valid[c] = x_valid[c].astype("int32")

            # Train model
            model = XGBRegressor(**param)
            model.fit(
                x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                verbose=0
            )

            # Get predictions
            preds = model.predict(x_valid)

            # Calculate custom metric using stored metrics data
            y_pred = pd.DataFrame({
                "ID": valid_metrics["ID"].values,
                "prediction": preds
            })

            # Handle NaN values before calculating metrics
            true_clean, pred_clean = self._handle_nan_values(valid_metrics, y_pred)

            if len(true_clean) > 0:  # Only calculate metric if we have valid data
                metric, _ = custom_score(true_clean, pred_clean, "ID", print_info=False)
                cv_scores.append(metric)
            else:
                self.logger.warning(f"No valid data for fold {fold} after handling NaN values")
                cv_scores.append(0.0)  # or some other appropriate default value

        mean_score = np.mean(cv_scores)
        self.logger.info(f"Trial finished with score: {mean_score:.4f}")
        return mean_score

    def optimize(self) -> Dict:
        """
        Run the optimization process.
        """
        self.logger.info("Starting hyperparameter optimization...")

        study = optuna.create_study(
            direction="maximize",
            study_name="xgboost_optimization"
        )

        study.optimize(
            self._objective,
            n_trials=self.n_trials,
            show_progress_bar=True
        )

        self.best_params = study.best_params
        self.study = study

        self.logger.info(f"Best score: {study.best_value:.4f}")
        self.logger.info("Best hyperparameters:")
        for key, value in study.best_params.items():
            self.logger.info(f"    {key}: {value}")

        return self.best_params

    def plot_optimization_history(self) -> None:
        """
        Plot the optimization history using Optuna's visualization tools.
        """
        if self.study is None:
            raise ValueError("No study available. Run optimize() first.")

        try:
            from optuna.visualization import plot_optimization_history
            fig = plot_optimization_history(self.study)
            fig.show()
        except ImportError:
            self.logger.warning("Plotly is required for visualization. Install with: pip install plotly")

In [215]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from scipy.stats import uniform, randint
import optuna
from typing import Dict, List, Tuple
import logging

class XGBoostTuner:
    def __init__(
            self,
            train_data: pd.DataFrame,
            pipeline: DataPipeline,
            fold_column: str = 'kfold',
            n_trials: int = 100,
            random_state: int = 42
    ):
        self.train_data = train_data
        self.pipeline = pipeline
        self.fold_column = fold_column
        self.n_trials = n_trials
        self.random_state = random_state
        self.best_params = None
        self.study = None

        # Set up logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def _handle_nan_values(self, true_data: pd.DataFrame, pred_data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Handle NaN values in the data before calculating metrics."""
        valid_mask = (
                ~true_data['efs'].isna() &
                ~true_data['efs_time'].isna() &
                ~pred_data['prediction'].isna()
        )

        true_data_clean = true_data[valid_mask].copy()
        pred_data_clean = pred_data[valid_mask].copy()

        return true_data_clean, pred_data_clean

    def _objective(self, trial: optuna.Trial) -> float:
        """
        Objective function for Optuna optimization.
        Returns the negative of (mean - std) of race-stratified c-indices
        to match the original metric convention.
        """
        param = {
            'max_depth': trial.suggest_int('max_depth', 2, 4),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 8000, 12000),
            'min_child_weight': trial.suggest_int('min_child_weight', 3, 7),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.6),
            'subsample': trial.suggest_float('subsample', 0.7, 0.9),
            'early_stopping_rounds': 100,
            'objective': 'reg:squarederror',
            'enable_categorical': True,
            'eval_metric': CIndexMetric_XGB,
            'disable_default_eval_metric': True,
            'missing': np.nan
        }

        cv_scores = []

        for fold in range(self.train_data[self.fold_column].nunique()):
            x_train = self.train_data[self.train_data[self.fold_column] != fold].copy()
            x_valid = self.train_data[self.train_data[self.fold_column] == fold].copy()

            valid_metrics = x_valid[['ID', 'efs', 'efs_time', 'race_group']].copy()

            x_train = self.pipeline.process_data(x_train, training=True)
            x_valid = self.pipeline.process_data(x_valid, training=False)

            y_train = x_train['label']
            y_valid = x_valid['label']

            RMV = ["ID", "efs", "efs_time", "label", 'y', 'kfold']
            FEATURES = [c for c in x_train.columns if c not in RMV]

            x_train = x_train[FEATURES]
            x_valid = x_valid[FEATURES]

            for c in x_train.columns:
                if x_train[c].dtype == "object":
                    x_train[c] = x_train[c].fillna("NAN").astype("category")
                    x_valid[c] = x_valid[c].fillna("NAN").astype("category")
                else:
                    if x_train[c].dtype == "float64":
                        x_train[c] = x_train[c].replace([np.inf, -np.inf], np.nan).astype("float32")
                        x_valid[c] = x_valid[c].replace([np.inf, -np.inf], np.nan).astype("float32")
                    if x_train[c].dtype == "int64":
                        x_train[c] = x_train[c].astype("int32")
                        x_valid[c] = x_valid[c].astype("int32")

            model = XGBRegressor(**param)
            model.fit(
                x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                verbose=0
            )

            preds = model.predict(x_valid)

            y_pred = pd.DataFrame({
                "ID": valid_metrics["ID"].values,
                "prediction": preds
            })

            true_clean, pred_clean = self._handle_nan_values(valid_metrics, y_pred)

            if len(true_clean) > 0:
                metric, _ = custom_score(true_clean, pred_clean, "ID", print_info=False)
                cv_scores.append(metric)
            else:
                self.logger.warning(f"No valid data for fold {fold} after handling NaN values")
                cv_scores.append(0.0)

        mean_score = np.mean(cv_scores)
        self.logger.info(f"Trial finished with score: {mean_score:.4f}")
        return mean_score

    def optimize(self) -> Dict:
        """Run the optimization process."""
        self.logger.info("Starting hyperparameter optimization...")

        study = optuna.create_study(
            direction="minimize",  # Because both CIndexMetric_XGB and custom_score return negative values
            study_name="xgboost_optimization"
        )

        study.optimize(
            self._objective,
            n_trials=self.n_trials,
            show_progress_bar=True
        )

        self.best_params = {
            'max_depth': study.best_params['max_depth'],
            'learning_rate': study.best_params['learning_rate'],
            'n_estimators': study.best_params['n_estimators'],
            'min_child_weight': study.best_params['min_child_weight'],
            'colsample_bytree': study.best_params['colsample_bytree'],
            'subsample': study.best_params['subsample'],
            'early_stopping_rounds': 100,
            'objective': 'reg:squarederror',
            'enable_categorical': True,
            'eval_metric': CIndexMetric_XGB,
            'disable_default_eval_metric': True,
            'missing': np.nan
        }

        self.study = study

        self.logger.info(f"Best score: {study.best_value:.4f}")
        self.logger.info("Best hyperparameters:")
        for key, value in self.best_params.items():
            self.logger.info(f"    {key}: {value}")

        return self.best_params

    def plot_optimization_history(self) -> None:
        """Plot the optimization history."""
        if self.study is None:
            raise ValueError("No study available. Run optimize() first.")

        try:
            from optuna.visualization import plot_optimization_history
            fig = plot_optimization_history(self.study)
            fig.show()
        except ImportError:
            self.logger.warning("Plotly is required for visualization. Install with: pip install plotly")

    def get_best_params(self) -> Dict:
        """Return the best parameters found during optimization."""
        if self.best_params is None:
            raise ValueError("No best parameters available. Run optimize() first.")
        return self.best_params

In [None]:
# Initialize pipeline and tuner
pipeline = DataPipeline()
tuner = XGBoostTuner(
    train_data=train,
    pipeline=pipeline,
    n_trials=100
)

# Run optimization
tuner.optimize()

# Get best parameters
best_params = tuner.get_best_params()

# Create validation dataframe for metric calculation
ds_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
ds_pred = train[["ID"]].copy()

# Initialize model with best parameters
clf = XGBRegressor(**best_params)

# Train the model (your original training loop)
clf.fit(
    x_train, y_train,
    eval_set=[(x_valid, y_valid)],
    verbose=500
)

INFO:__main__:Starting hyperparameter optimization...
[I 2024-12-19 20:33:14,656] A new study created in memory with name: xgboost_optimization


  0%|          | 0/100 [00:00<?, ?it/s]

  true_data_clean = true_data[valid_mask].copy()
  pred_data_clean = pred_data[valid_mask].copy()
  true_data_clean = true_data[valid_mask].copy()
  pred_data_clean = pred_data[valid_mask].copy()


In [197]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor

def generate_predictions(
        test_path: str,
        model_paths: list,
        pipeline: DataPipeline,
        submission_path: str = "submission.csv"
) -> None:
    """
    Generate predictions for test data using an ensemble of trained models.
    
    Args:
        test_path: Path to test CSV file
        model_paths: List of paths to saved XGBoost models (one per fold)
        pipeline: Trained DataPipeline instance
        submission_path: Path to save submission file
    """
    # Read test data
    test = pd.read_csv(test_path)
    test_processed = pipeline.process_data(test, training=False)

    # Define features (same as training)
    RMV = ["ID", "efs", "efs_time", "label", 'y', 'kfold']
    FEATURES = [c for c in test_processed.columns if not c in RMV]

    # Handle data types
    for c in FEATURES:
        if test_processed[c].dtype == "object":
            test_processed[c] = test_processed[c].fillna("NAN").astype("category")
        else:
            if test_processed[c].dtype == "float64":
                test_processed[c] = test_processed[c].replace([np.inf, -np.inf], np.nan).astype("float32")
            if test_processed[c].dtype == "int64":
                test_processed[c] = test_processed[c].astype("int32")

    # Get features for prediction
    X_test = test_processed[FEATURES]

    # Make predictions with each model
    predictions = []
    for model_path in model_paths:
        model = XGBRegressor()
        model.load_model(model_path)
        pred = model.predict(X_test)
        predictions.append(pred)

    # Average predictions from all models
    final_predictions = np.mean(predictions, axis=0)

    # Create submission file
    submission = pd.DataFrame({
        'ID': test['ID'],
        'prediction': final_predictions
    })

    # Save submission
    submission.to_csv(submission_path, index=False)
    print(f"Submission saved to {submission_path}")

    # Print some basic statistics
    print("\nPrediction Statistics:")
    print(submission['prediction'].describe())

    return submission

# Example usage:
"""
# After training your models
pipeline = DataPipeline()  # Your trained pipeline

# List of your saved model paths
model_paths = [f"xgb_model_{fold}.bin" for fold in range(5)]

# Generate predictions
submission = generate_predictions(
    test_path='test.csv',
    model_paths=model_paths,
    pipeline=pipeline,
    submission_path='submission.csv'
)
"""

2.1.1
