<a href="https://colab.research.google.com/github/Yutong-Lu/Datathon-4/blob/main/YutongLu_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as  sns
import plotly.express as px
import numpy as np
import warnings
import matplotlib.pyplot as plt
from sklearn.naive_bayes import CategoricalNB, GaussianNB, MultinomialNB
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import RocCurveDisplay, roc_curve, accuracy_score, confusion_matrix, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from category_encoders import OneHotEncoder, TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier

# Install a global 'ignore' filter with no category/message/module restriction,
# so every warning raised after this cell runs is suppressed.
# NOTE(review): this also hides deprecation and convergence warnings from the
# modelling libraries; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Read the datathon CSV (expected in the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='datathon4.csv')

# Preview the first five rows
data.head(n=5)

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,66154,25312,118,0,68.0,22.73,0,Caucasian,M,180.3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,114252,59342,81,0,77.0,27.42,0,Caucasian,F,160.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,119783,50777,118,0,25.0,31.95,0,Caucasian,F,172.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,79267,46918,118,0,81.0,22.64,1,Caucasian,F,165.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,92056,34377,33,0,19.0,,0,Caucasian,M,188.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma


In [None]:
# Report the (rows, columns) dimensions of the raw table
print(f"{data.shape}")

(91713, 186)


In [None]:
# Tally the missing entries (NaN or None) per column of the raw table
missing_value_counts = data.isna().sum(axis=0)

# Display the per-column counts
missing_value_counts

encounter_id                      0
patient_id                        0
hospital_id                       0
hospital_death                    0
age                            4228
                               ... 
leukemia                        715
lymphoma                        715
solid_tumor_with_metastasis     715
apache_3j_bodysystem           1662
apache_2_bodysystem            1662
Length: 186, dtype: int64

In [None]:
# Count the rows whose readmission_status equals 1. A result of 0 means no row
# is flagged as a readmission.
# (Original author's note: the column is also redundant with icu_admit_type.)
data['readmission_status'].eq(1).sum()

0

In [None]:
# Number of rows where 'apache_4a_hospital_death_prob' is missing.
# Uses the vectorised Series.sum() instead of the builtin sum(), which would
# iterate over the boolean Series one element at a time in Python; this also
# matches how missing values are counted for the whole frame earlier on.
data['apache_4a_hospital_death_prob'].isnull().sum()

7947

In [None]:
# Manual feature selection: build the modelling frame by removing the columns
# listed below from the raw table (presumably identifiers, body-system labels
# and pre-computed APACHE death probabilities -- confirm against the data
# dictionary). 'readmission_status' has no rows equal to 1, per the earlier check.
df = data.drop(
    columns=[
        'encounter_id',
        'patient_id',
        'hospital_id',
        'icu_id',
        'apache_3j_bodysystem',
        'apache_2_bodysystem',
        'readmission_status',
        'apache_4a_icu_death_prob',
        'apache_4a_hospital_death_prob',
    ]
)

In [None]:
# Outcome column used as the modelling target
target_column = 'hospital_death'

# Columns treated as categorical features
categorical_columns = [
    'ethnicity',
    'gender',
    'hospital_admit_source',
    'icu_admit_source',
    'icu_stay_type',
    'icu_type',
]

# Every remaining column of df (neither the target nor categorical) is treated
# as numerical; the original column order of df is preserved
numerical_columns = [
    name
    for name in df.columns
    if name != target_column and name not in categorical_columns
]

# Show the distinct values (including NaN) of each categorical column
for column in categorical_columns:
    levels = df[column].unique()
    print(levels)

['Caucasian' nan 'Hispanic' 'African American' 'Asian' 'Native American'
 'Other/Unknown']
['M' 'F' nan]
['Floor' 'Emergency Department' 'Operating Room' nan 'Direct Admit'
 'Other Hospital' 'Other ICU' 'ICU to SDU' 'Recovery Room'
 'Chest Pain Center' 'Step-Down Unit (SDU)' 'Acute Care/Floor' 'PACU'
 'Observation' 'ICU' 'Other']
['Floor' 'Accident & Emergency' 'Operating Room / Recovery'
 'Other Hospital' 'Other ICU' nan]
['admit' 'readmit' 'transfer']
['CTICU' 'Med-Surg ICU' 'CCU-CTICU' 'Neuro ICU' 'MICU' 'SICU'
 'Cardiac ICU' 'CSICU']


In [None]:
# Hold-out split: a seeded random 80% of the rows becomes the training set ...
train = df.sample(frac=0.8, random_state=10)

# ... and every row whose index label was not sampled forms the 20% test set
test = df.loc[~df.index.isin(train.index)]

In [None]:
# Imputation of missing numerical values on the training split

# Work on a copy so the raw training split stays available unchanged
train_imputed = train.copy()

# IterativeImputer models each column with missing values as a function of the
# other columns, round-robin, for up to max_iter rounds. Regressing every
# numerical column on all of the others was slow enough that the recorded run
# of this cell was interrupted by hand, so n_nearest_features caps how many
# other columns are used as predictors for each imputed column.
# NOTE(review): 10 is a cost/quality trade-off, not a tuned value.
imputer = IterativeImputer(max_iter=10, n_nearest_features=10, random_state=42)

# Fit on the training rows only and overwrite the numerical columns with their
# imputed values; categorical columns are left as they are
train_imputed[numerical_columns] = imputer.fit_transform(train_imputed[numerical_columns])

KeyboardInterrupt: 

In [None]:
# Initializing the ColumnTransformer
# One-hot encoding is applied to all categorical columns except 'ethnicity'
# Target encoding is applied specifically to the 'ethnicity' column
ct = ColumnTransformer([
    ('one_hot_encoder', OneHotEncoder(), [c for c in categorical_columns if c != 'ethnicity']),
    ('target_encoder', TargetEncoder(), ['ethnicity'])
], remainder='passthrough')  # Any other columns not specified will be passed through without any transformation

# Initializing the histogram-based Gradient Boosting Classifier with specified parameters.
# NOTE: despite its name, `random_forest` holds a HistGradientBoostingClassifier;
# the name is kept so that any other cell referring to it keeps working.
# random_state is fixed because, with the default early_stopping='auto', training
# sets larger than 10,000 rows (as here) hold out a random validation split, which
# would otherwise make the fitted model differ from run to run.
random_forest = HistGradientBoostingClassifier(max_iter=100, learning_rate=1.0, max_depth=1,
                                               random_state=42)

# Creating a Pipeline:
# First, the data goes through the specified column transformations (ct)
# Then, the transformed data is used to train or predict using the Gradient Boosting model
# The step names are referenced by the 'hist_boost__*' grid-search parameter keys.
model = Pipeline([
    ('pre_process', ct),        # Pre-processing step: Applying column transformations
    ('hist_boost', random_forest) # Training/prediction step: Using Gradient Boosting
])

In [None]:
# Training the Gradient Boosting pipeline on the training dataset
model = model.fit(train.drop('hospital_death', axis=1),
                  train['hospital_death'])

# Predicting on the training dataset and reporting the accuracy.
# A notebook cell only displays its last expression, so the training accuracy
# has to be printed explicitly or it is computed and silently thrown away.
Y_pred_train = model.predict(train.drop('hospital_death', axis=1))
print('Train accuracy:', accuracy_score(train['hospital_death'], Y_pred_train))

# Predicting on the test dataset and computing the accuracy
# (kept as the cell's last expression, so it is still the displayed output)
Y_pred = model.predict(test.drop('hospital_death', axis=1))
accuracy_score(test['hospital_death'], Y_pred)

0.913645532355667

In [None]:
# Candidate values for the boosting step of the pipeline; the 'hist_boost__'
# prefix routes each parameter to the pipeline step of that name
param_dist = dict(
    hist_boost__max_iter=[100, 200],
    hist_boost__max_depth=[1, 3, 5],
    hist_boost__min_samples_leaf=[25, 50],
    hist_boost__learning_rate=[0.1, 0.2],
)

# 10-fold stratified cross-validation keeps the target proportions similar in every fold
skf = StratifiedKFold(n_splits=10)

# Exhaustive search over every combination in the grid
# (note: despite the variable names, this is a GridSearchCV, not a randomized search)
random_search = GridSearchCV(estimator=model, param_grid=param_dist, cv=skf)

# Run the search on the training split only
random_search.fit(
    X=train.drop(columns='hospital_death'),
    y=train['hospital_death'],
)

# Tabulate the cross-validation results and show the top-ranked configuration(s)
results = pd.DataFrame(random_search.cv_results_)
results.loc[results['rank_test_score'] == 1]

In [None]:
# Updating the model's parameters with the best ones found from GridSearchCV
model = model.set_params(**random_search.best_params_)

# Retraining the model with the best parameters on the training dataset
model = model.fit(train.drop('hospital_death', axis=1), train['hospital_death'])

# Predicting on the training dataset and reporting the accuracy.
# Printed explicitly: only the last expression of a cell is displayed, so
# without the print the training accuracy would be computed and discarded.
Y_pred_train = model.predict(train.drop('hospital_death', axis=1))
print('Train accuracy:', accuracy_score(train['hospital_death'], Y_pred_train))

# Predicting on the test dataset and computing the accuracy
# (kept as the cell's last expression, so it is still the displayed output)
Y_pred = model.predict(test.drop('hospital_death', axis=1))
accuracy_score(test['hospital_death'], Y_pred)