In [17]:
import os

import mlxtend
import numpy as np
import pandas as pd
import sklearn
import xgboost

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn import feature_selection
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, make_scorer, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier

In [18]:
# Record the interpreter version used for this run (reproducibility aid).
from platform import python_version
print(python_version())

3.10.12


In [19]:
# Record the versions of the main libraries this notebook imports.
print("pd version:", pd.__version__)
print("sklearn version: ", sklearn.__version__)
print("mlxtend version: ", mlxtend.__version__)
print("xgboost version: ", xgboost.__version__)

pd version: 2.0.3
sklearn version:  1.2.2
mlxtend version:  0.23.0
xgboost version:  2.0.2


In [20]:
class DataLoader():
    """Read the train/test CSV files and expose the selected features and model parameters.

    The directory given as ``data_path`` must contain ``train_X.csv``,
    ``train_y.csv`` and ``test_X.csv``.  ``test_X.csv`` must have a
    ``patient_id`` column, which is kept aside for the submission file.

    ``best_features`` and ``best_parms`` are hard-coded here; the ``clf__`` /
    ``sfs__`` key prefixes suggest they were copied from an earlier
    pipeline search -- TODO confirm their provenance.
    """

    def __init__(self, data_path):
        """Remember the data directory; no file is read until ``load_data()``.

        Args:
            data_path: directory holding the CSV files, with or without a
                trailing path separator.
        """
        self.data_path = data_path

        # Filled in by load_data().
        self.train_data = None
        self.train_label = None
        self.test_data = None
        self.test_patient_id = None

        # Defined up front so the getters work even before the CSVs are read.
        self.best_features = ['apache_4a_hospital_death_prob', 'd1_spo2_min', 'd1_resprate_min', 'ventilated_apache',
                              'intubated_apache', 'd1_mbp_min', 'd1_sysbp_min', 'aids', 'gcs_motor_apache', 'hospital_id',
                              'elective_surgery', 'd1_heartrate_min', 'd1_glucose_min', 'd1_temp_min', 'gcs_verbal_apache']
        self.best_parms = {
            'clf__colsample_bytree': 0.8,
            'clf__gamma': 0.1,
            'clf__learning_rate': 0.001,
            'clf__max_depth': 3,
            'clf__min_child_weight': 1,
            'clf__n_estimators': 12000,
            'clf__random_state': 42,
            'clf__scale_pos_weight': 3,
            'clf__subsample': 0.8,
            'sfs__k_features': 15,
        }

    def load_data(self):
        """Read the three CSV files and keep the test ``patient_id`` column aside."""
        # os.path.join instead of string concatenation: a data_path without a
        # trailing separator would otherwise point at a non-existent file.
        self.train_data = pd.read_csv(os.path.join(self.data_path, 'train_X.csv'))
        self.train_label = pd.read_csv(os.path.join(self.data_path, 'train_y.csv'))
        self.test_data = pd.read_csv(os.path.join(self.data_path, 'test_X.csv'))
        self.test_patient_id = self.test_data['patient_id']

        # for debug mode
        # NOTE: ``__debug__`` is False only when Python is started with -O, so
        # this reduced-data branch is skipped in a normal notebook kernel.
        if not __debug__:
            print("=====================================")
            print("=        Entering Debug Mode        =")
            print("=====================================")
            # Only the training side is truncated; the test set stays complete.
            self.train_data = self.train_data.iloc[:100, :]
            self.train_label = self.train_label.iloc[:100, :]

    def get_train_data(self):
        """Return the training features (``None`` until ``load_data()`` has run)."""
        return self.train_data

    def get_train_label(self):
        """Return the training labels (``None`` until ``load_data()`` has run)."""
        return self.train_label

    def get_test_data(self):
        """Return the test features (``None`` until ``load_data()`` has run)."""
        return self.test_data

    def get_test_patient_id(self):
        """Return the ``patient_id`` column of the test set (``None`` until loaded)."""
        return self.test_patient_id

    def get_best_parms(self):
        """Return the hard-coded hyper-parameter dictionary."""
        return self.best_parms

    def get_best_features(self):
        """Return the hard-coded list of selected feature names."""
        return self.best_features

    def process(self):
        """Load everything and return ``(train_data, train_label, test_data, test_patient_id)``."""
        self.load_data()
        print("Data Loading Completed")
        return self.get_train_data(), self.get_train_label(), self.get_test_data(), self.get_test_patient_id()

In [21]:
class Preprocessing():
    """Turn the raw train/test frames into fully numeric, imputed and scaled frames.

    ``process()`` runs the steps in this order: drop identifier columns,
    encode the categorical columns, impute missing values, min-max scale,
    then print a summary of both frames.
    """

    # Identifier columns removed before modelling.
    USELESS_FEATURES = ['encounter_id', 'patient_id']

    # Columns treated as categorical and replaced by an encoding.
    CATEGORICAL_FEATURES = ['hospital_id'
                            , 'elective_surgery'
                            , 'ethnicity'
                            , 'gender'
                            , 'icu_admit_source'
                            , 'icu_id'
                            , 'icu_stay_type'
                            , 'icu_type'
                            , 'apache_post_operative'
                            , 'arf_apache'
                            , 'gcs_unable_apache'
                            , 'intubated_apache'
                            , 'ventilated_apache'
                            , 'aids'
                            , 'cirrhosis'
                            , 'diabetes_mellitus'
                            , 'hepatic_failure'
                            , 'immunosuppression'
                            , 'leukemia'
                            , 'lymphoma'
                            , 'solid_tumor_with_metastasis'
                            , 'apache_3j_bodysystem'
                            , 'apache_2_bodysystem']

    def __init__(self, train_data, train_label, test_data):
        """Store the frames to transform.

        Args:
            train_data: training features, including the identifier columns.
            train_label: training labels; must contain a ``has_died`` column.
            test_data: test features with the same columns as ``train_data``.
        """
        self.train_data = train_data
        self.train_label = train_label
        self.test_data = test_data

    def drop_useless(self):
        """Remove the identifier columns from both frames."""
        self.train_data = self.train_data.drop(self.USELESS_FEATURES, axis=1)
        self.test_data = self.test_data.drop(self.USELESS_FEATURES, axis=1)

    def categorical_processing(self):
        """Encode every categorical column as the mean of ``has_died`` per category.

        The printed banner calls this "Label Encoding"; what the code does is
        replace each category by the training-set mean of ``has_died`` for
        that category.  A test-set category absent from the training data
        maps to NaN and is left for the imputation step.

        Alternatives that were tried and dropped: removing non-binary
        categoricals, one-hot encoding, frequency encoding and death-count
        encoding.
        """
        # replace male to 0 and female to 1
        # Assigned back instead of chained ``inplace=True`` on a column
        # selection, which does not update the frame under pandas Copy-on-Write.
        gender_codes = {'M': 0, 'F': 1}
        self.train_data['gender'] = self.train_data['gender'].replace(gender_codes)
        self.test_data['gender'] = self.test_data['gender'].replace(gender_codes)

        print("=====================================")
        print("=            Label Encoding         =")
        print("=====================================")
        # Features and labels are joined column-wise (aligned on the index).
        labeled_train_data = pd.concat([self.train_data, self.train_label], axis=1)
        for feature in self.CATEGORICAL_FEATURES:
            mean = labeled_train_data.groupby(feature)['has_died'].mean()
            self.train_data[feature] = self.train_data[feature].map(mean)
            self.test_data[feature] = self.test_data[feature].map(mean)

    @staticmethod
    def _print_details(title, frame, head_rows):
        """Print schema, summary statistics, null counts and the first rows of ``frame``."""
        print("=====================================")
        print(title)
        # DataFrame.info() prints by itself and returns None, so it must not
        # be wrapped in print() (that would emit a stray "None" line).
        frame.info()
        print(frame.describe())
        print(frame.isnull().sum())
        print(frame.head(head_rows))
        print("=====================================")

    def data_details(self):
        """Print an overview of the train frame, then of the test frame."""
        self._print_details("train data information:", self.train_data, 5)
        print()
        self._print_details("test data information:", self.test_data, 3)

    def normalize(self):
        """Min-max scale every column of both frames.

        The scaler is fitted on train and test stacked together, so the
        per-column minimum and maximum come from both sets.  The returned
        frames get a fresh 0..n-1 index.
        """
        print("=====================================")
        print("=          normalization            =")
        print("=       using min max scaler        =")
        print("=====================================")
        # min max scaler for each feature
        scaler = MinMaxScaler()
        # fit the scalar with merged data
        scaler.fit(pd.concat([self.train_data, self.test_data]))
        train_norm = scaler.transform(self.train_data)
        test_norm = scaler.transform(self.test_data)

        self.train_data = pd.DataFrame(train_norm, columns=self.train_data.columns)
        self.test_data = pd.DataFrame(test_norm, columns=self.test_data.columns)

    def get_train_data(self):
        """Return the training frame in its current processing state."""
        return self.train_data

    def get_test_data(self):
        """Return the test frame in its current processing state."""
        return self.test_data

    def process(self):
        """Run the full preprocessing chain and print a summary of the result."""
        self.drop_useless()
        self.categorical_processing()

        # impute the missing values (third argument is k, the number of neighbours)
        my_imputer = Imputer(self.train_data, self.test_data, 1)
        my_imputer.process()
        self.train_data, self.test_data = my_imputer.get_train_data(), my_imputer.get_test_data()

        self.normalize()
        print("Preprocessing Completed")
        self.data_details()

In [22]:
class Imputer():
    """Fill missing values in the train and test frames with a KNN imputer.

    Train and test are stacked, imputed together and split again by row
    position, so neighbours are searched across both sets.  Every column
    must already be numeric when ``impute()`` runs.
    """

    def __init__(self, train_data, test_data, k = 5):
        """Store the frames and the neighbour count.

        Args:
            train_data: training features.
            test_data: test features with the same columns as ``train_data``.
            k: value passed to ``KNNImputer(n_neighbors=...)``.
        """
        self.train_data = train_data
        self.test_data = test_data
        self.merged_df = None
        self.k = k

    def merge_data(self):
        """Stack train on top of test into ``self.merged_df``."""
        # merge train and test data
        self.merged_df = pd.concat([self.train_data, self.test_data])

    def impute(self):
        """Impute the merged frame and split it back into train and test.

        Merges the data first when ``merge_data()`` has not been called yet.
        Both resulting frames carry a fresh index starting at 0.
        """
        if self.merged_df is None:
            self.merge_data()

        # Taken before self.train_data is reassigned so the split boundary
        # does not depend on the order of the assignments below.
        n_train = len(self.train_data)

        print("=====================================")
        print("=          Imputation               =")
        print("=       using KNN Imputer           =")
        print("=====================================")
        imputer = KNNImputer(n_neighbors=self.k)
        imputed_data = imputer.fit_transform(self.merged_df)
        # fit_transform returns a bare array; restore the column names.
        self.merged_df = pd.DataFrame(imputed_data, columns=self.merged_df.columns)

        # split train and test data
        # Copies rather than views of merged_df; the test index is reset so it
        # does not start at n_train.
        self.train_data = self.merged_df.iloc[:n_train, :].copy()
        self.test_data = self.merged_df.iloc[n_train:, :].reset_index(drop=True)

    def get_train_data(self):
        """Return the training frame (imputed once ``impute()`` has run)."""
        return self.train_data

    def get_test_data(self):
        """Return the test frame (imputed once ``impute()`` has run)."""
        return self.test_data

    def process(self):
        """Merge, impute and split, printing progress messages."""
        self.merge_data()
        print("Imputation Started")
        self.impute()
        print("Imputation Completed")

In [23]:
class Model():
    """Cross-validate, train and apply an XGBoost classifier with fixed hyper-parameters.

    Expected call order: ``cleansing()``, optionally ``kfold()``, then
    ``train()``, ``predict()`` and ``write_to_csv()``.
    """

    def __init__(self, train_data, train_label, test_data, test_patient_id, best_parms, best_features):
        """Store the data and the model configuration.

        Args:
            train_data: preprocessed training features.
            train_label: training target, passed unchanged to ``fit`` --
                assumed to hold only the target column, TODO confirm.
            test_data: preprocessed test features.
            test_patient_id: patient identifiers written next to the predictions.
            best_parms: dict with the ``clf__*`` keys read by ``_build_classifier()``.
            best_features: column names to keep in ``cleansing()``.
        """
        self.train_data = train_data
        self.train_label = train_label
        self.test_data = test_data
        self.test_patient_id = test_patient_id
        self.best_params = best_parms
        self.best_features = best_features

        self.best_clf = None          # set by train()
        self.predicted_label = None   # set by predict()

    def _build_classifier(self):
        """Return an unfitted XGBClassifier configured from ``self.best_params``."""
        return XGBClassifier(
            n_estimators=self.best_params['clf__n_estimators'],
            max_depth=self.best_params['clf__max_depth'],
            min_child_weight=self.best_params['clf__min_child_weight'],
            learning_rate=self.best_params['clf__learning_rate'],
            subsample=self.best_params['clf__subsample'],
            colsample_bytree=self.best_params['clf__colsample_bytree'],
            gamma=self.best_params['clf__gamma'],
            scale_pos_weight=self.best_params['clf__scale_pos_weight'],
            random_state=self.best_params['clf__random_state'],
        )

    def cleansing(self):
        """Keep only the columns named in ``best_features`` in both frames."""
        # drop the features that are not selected by best features
        self.train_data = self.train_data[self.best_features]
        self.test_data = self.test_data[self.best_features]

    def kfold(self):
        """Run 5-fold cross-validation and print macro-F1 and AUROC per fold and on average."""
        kf = KFold(n_splits=5, shuffle=True, random_state=42)

        f1_scores = []
        auroc_scores = []

        for train_index, test_index in kf.split(self.train_data):
            X_train, X_test = self.train_data.iloc[train_index], self.train_data.iloc[test_index]
            y_train, y_test = self.train_label.iloc[train_index], self.train_label.iloc[test_index]

            clf = self._build_classifier()
            clf.fit(X_train, y_train)

            # F1 needs hard class labels ...
            y_pred = clf.predict(X_test)
            # ... while AUROC is a ranking metric and must be computed from
            # the positive-class probability; feeding it 0/1 predictions
            # collapses the ROC curve to a single operating point.
            y_score = clf.predict_proba(X_test)[:, 1]

            f1 = f1_score(y_test, y_pred, average='macro')
            f1_scores.append(f1)
            auroc = roc_auc_score(y_test, y_score)
            auroc_scores.append(auroc)
            print("F1 Score: ", f1)
            print("AUROC: ", auroc)

        print("Average Macro F1 Score: ", np.mean(f1_scores))
        print("Average AUROC: ", np.mean(auroc_scores))

    def train(self):
        """Fit the final classifier on the whole training set and keep it in ``self.best_clf``."""
        # fit the classifier with best parameters
        print("=====================================")
        print("=         Training Details          =")
        print("selected features: ", self.best_features)
        print("selected parameters: ", self.best_params)
        print("=====================================")
        self.best_clf = self._build_classifier()
        self.best_clf.fit(self.train_data, self.train_label)

    def predict(self):
        """Predict labels for the test set into ``self.predicted_label``.

        The model fitted by ``train()`` is reused as is; it is trained here
        only when ``train()`` has not been called yet.
        """
        if self.best_clf is None:
            self.train()

        # predict the test data
        self.predicted_label = self.best_clf.predict(self.test_data)

    def write_to_csv(self, path):
        """Print a summary of the predictions and write them to ``path``.

        The file has two columns, ``patient_id`` and ``pred``, and no index column.
        """
        # merge the patient id and predicted label
        df = pd.DataFrame({'patient_id': self.test_patient_id, 'pred': self.predicted_label})
        # details of the dataframe
        print("=====================================")
        print("=         Predict Details           =")
        # DataFrame.info() prints by itself and returns None; wrapping it in
        # print() would add a stray "None" line.
        df.info()
        print(df.describe())
        print(df.head())
        print("has_died: ", df['pred'].sum())
        print("=====================================")
        df.to_csv(path, index=False)

In [24]:
# End-to-end run: load -> preprocess -> validate -> train -> predict -> export.
# The statements below depend on one another and must run in this order.

# load the data (Kaggle input directory; the path is passed with a trailing '/')
my_data_loader = DataLoader('/kaggle/input/2023-data-mining-lab-assignment-2' + '/')
train_data, train_label, test_data, test_patient_id = my_data_loader.process()

# preprocessing & imputation
# train_data / test_data are rebound to the processed frames from here on.
my_preprocessing = Preprocessing(train_data, train_label, test_data)
my_preprocessing.process()
train_data, test_data = my_preprocessing.get_train_data(), my_preprocessing.get_test_data()
best_parms, best_features = my_data_loader.get_best_parms(), my_data_loader.get_best_features()

print(best_parms, best_features)

# training
my_model = Model(train_data, train_label, test_data, test_patient_id, best_parms, best_features)
# keep only the selected feature columns before any fitting
my_model.cleansing()
# 5-fold cross-validation, reported on stdout only
print("validation start")
my_model.kfold()
print("validation end")
# final fit on the full training set
print("Training Started")
my_model.train()
print("Training Completed")
print("Prediction Started")
my_model.predict()
print("Prediction Completed")
# write the predictions to the Kaggle working directory
my_model.write_to_csv('/kaggle/working/rep_result.csv')
print("Output Completed")

Data Loading Completed
=            Label Encoding         =
Imputation Started
=          Imputation               =
=       using KNN Imputer           =
Imputation Completed
=          normalization            =
=       using min max scaler        =
Preprocessing Completed
train data information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44939 entries, 0 to 44938
Data columns (total 81 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   hospital_id                    44939 non-null  float64
 1   age                            44939 non-null  float64
 2   bmi                            44939 non-null  float64
 3   elective_surgery               44939 non-null  float64
 4   ethnicity                      44939 non-null  float64
 5   gender                         44939 non-null  float64
 6   height                         44939 non-null  float64
 7   icu_admit_source               44939 non-null