In [27]:
import pandas as pd
import tensorflow as ttf
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [20]:
# Load the train and test CSVs (paths are relative to the notebook location).
train_data = pd.read_csv("../data/train.csv")
test_data = pd.read_csv("../data/test.csv")

# Hold out 500 training rows. The fixed seed keeps the hold-out (and therefore
# `df`) identical across "Restart Kernel -> Run All" executions.
unseen_data = train_data.sample(500, random_state=42)

# Rows that were not held out; this is the frame used for fitting further down.
df = train_data.drop(unseen_data.index)

In [25]:
# Column groups used by the preprocessing cells.

# Columns excluded from the model input.
columns_to_drop = [
    'ID', 'tce_match', 'tce_div_match',
    'cyto_score_detail', 'mrd_hct', 'efs_time',
]

# Categorical columns that are one-hot encoded.
categorical_columns_ohe = [
    'tbi_status', 'graft_type', 'prim_disease_hct', 'prod_type',
    'conditioning_intensity', 'ethnicity', 'race_group', 'donor_related',
]

# Columns that are label-encoded (one encoder per column).
categorical_columns_le = [
    'hla_match_c_high', 'hla_high_res_8', 'hla_high_res_6', 'hla_high_res_10',
    'hla_match_dqb1_high', 'hla_match_drb1_high', 'hla_match_c_low', 'hla_match_a_high',
    'hla_match_a_low', 'hla_match_b_high', 'hla_match_b_low', 'hla_match_dqb1_low',
    'hla_match_drb1_low', 'hla_low_res_6', 'hla_low_res_8', 'hla_low_res_10',
]

# Numeric columns that are imputed and scaled.
numerical_columns = ['age_at_hct', 'donor_age', 'comorbidity_score', 'karnofsky_score']

# Kept as a one-element list, matching the original definition.
target_column = ['efs']

In [30]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def create_pipeline():
    """Build the (unfitted) preprocessing pipeline and the per-column label encoders.

    The pipeline wraps a single ``ColumnTransformer`` step named ``'preprocessor'``
    with three entries, in this order:

    * ``'ohe'``: most-frequent imputation followed by dense one-hot encoding,
    * ``'num'``: mean imputation followed by standard scaling,
    * ``'drop_columns'``: columns removed from the output.

    Every column not listed in one of those entries is passed through unchanged
    (``remainder='passthrough'``).

    Returns:
        tuple: ``(pipeline, le_transformers)`` where ``pipeline`` is the sklearn
        ``Pipeline`` and ``le_transformers`` maps each label-encoded column name
        to its own fresh ``LabelEncoder``. The encoders are not part of the
        pipeline; the caller applies them separately.
    """
    columns_to_drop = [
        'ID',
        'tce_match',
        'tce_div_match',
        'cyto_score_detail',
        'mrd_hct',
        'efs_time',
    ]

    categorical_columns_ohe = [
        'tbi_status',
        'graft_type',
        'prim_disease_hct',
        'prod_type',
        'conditioning_intensity',
        'ethnicity',
        'race_group',
        'donor_related',
    ]

    categorical_columns_le = [
        'hla_match_c_high',
        'hla_high_res_8',
        'hla_high_res_6',
        'hla_high_res_10',
        'hla_match_dqb1_high',
        'hla_match_drb1_high',
        'hla_match_c_low',
        'hla_match_a_high',
        'hla_match_a_low',
        'hla_match_b_high',
        'hla_match_b_low',
        'hla_match_dqb1_low',
        'hla_match_drb1_low',
        'hla_low_res_6',
        'hla_low_res_8',
        'hla_low_res_10',
    ]

    numerical_columns = [
        'age_at_hct',
        'donor_age',
        'comorbidity_score',
        'karnofsky_score',
    ]

    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
    # 1.4. Prefer the new keyword and fall back for older installations.
    try:
        onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')

    ohe_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', onehot),
    ])

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])

    # LabelEncoder works on a single 1-D column, so one independent encoder is
    # kept per column and handed back to the caller.
    le_transformers = {col: LabelEncoder() for col in categorical_columns_le}

    preprocessor = ColumnTransformer(
        transformers=[
            ('ohe', ohe_transformer, categorical_columns_ohe),
            ('num', numerical_transformer, numerical_columns),
            ('drop_columns', 'drop', columns_to_drop),
        ],
        remainder='passthrough'
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
    ])

    return pipeline, le_transformers

def preprocess_with_pipeline(pipeline, le_transformers, df, fit=True):
    """Label-encode selected columns, run the pipeline and return a labelled DataFrame.

    Args:
        pipeline: Pipeline whose ``'preprocessor'`` step is a ``ColumnTransformer``
            with entries named ``'ohe'`` and ``'num'`` (as built by
            ``create_pipeline``).
        le_transformers: Mapping of column name -> label encoder. Missing values
            in those columns are replaced by the string ``"Unknown"`` and the
            column is cast to ``str`` before encoding.
        df: Input frame. It is not modified; encoding happens on a copy.
        fit: When True the encoders and the pipeline are fitted on ``df``;
            when False the already fitted objects are only applied.

    Returns:
        pandas.DataFrame: The transformed data with a fresh ``RangeIndex``.
        Columns are ordered as the ``ColumnTransformer`` emits them: one-hot
        columns, then the numeric columns, then the passed-through remainder in
        original frame order.
    """
    # Work on a copy: encoding the caller's frame in place makes the calling cell
    # non-idempotent (a re-run would encode already-encoded values).
    df = df.copy()

    for col, le in le_transformers.items():
        df[col] = df[col].fillna("Unknown").astype(str)
        if fit:
            df[col] = le.fit_transform(df[col])
        else:
            # sklearn's LabelEncoder.transform raises on labels it has not seen during fit.
            df[col] = le.transform(df[col])

    processed_data = pipeline.fit_transform(df) if fit else pipeline.transform(df)

    preprocessor = pipeline.named_steps['preprocessor']

    # Read the column groups from the pipeline's own configuration instead of
    # from notebook-level globals, so labels always match what was fitted.
    configured = {name: list(cols) for name, _, cols in preprocessor.transformers}
    listed_columns = {col for cols in configured.values() for col in cols}

    ohe_columns = list(
        preprocessor.named_transformers_['ohe']['onehot'].get_feature_names_out()
    )
    num_columns = configured['num']
    # The remainder is emitted last, in original column order.
    remainder_columns = [col for col in df.columns if col not in listed_columns]

    # Label order must mirror the transformer's output order: ohe -> num -> remainder.
    # Labelling the numeric block by its position in the input frame would
    # attach the scaled values to the wrong column names.
    final_columns = ohe_columns + num_columns + remainder_columns
    processed_df = pd.DataFrame(processed_data, columns=final_columns)

    return processed_df

In [31]:
# Build the unfitted preprocessing objects, then fit them on the training frame.
pipeline, le_transformers = create_pipeline()

train_preprocessed = preprocess_with_pipeline(
    pipeline=pipeline,
    le_transformers=le_transformers,
    df=df,
    fit=True,
)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

In [36]:
# Summarise column names, non-null counts and dtypes of the preprocessed frame.
# NOTE(review): the one-hot columns are reported as `object` dtype in the output;
# presumably un-encoded string columns pass through the remainder and force the
# whole array to object - confirm before feeding this to a model.
train_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28300 entries, 0 to 28299
Data columns (total 94 columns):
 #   Column                                        Non-Null Count  Dtype 
---  ------                                        --------------  ----- 
 0   x0_No TBI                                     28300 non-null  object
 1   x0_TBI + Cy +- Other                          28300 non-null  object
 2   x0_TBI +- Other, -cGy, fractionated           28300 non-null  object
 3   x0_TBI +- Other, -cGy, single                 28300 non-null  object
 4   x0_TBI +- Other, -cGy, unknown dose           28300 non-null  object
 5   x0_TBI +- Other, <=cGy                        28300 non-null  object
 6   x0_TBI +- Other, >cGy                         28300 non-null  object
 7   x0_TBI +- Other, unknown dose                 28300 non-null  object
 8   x1_Bone marrow                                28300 non-null  object
 9   x1_Peripheral blood                           28300 non-null  object
 10