In [804]:
import sklearn
import time
import pandas as pd   
from matplotlib import pyplot
from numpy import mean
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB, CategoricalNB
import numpy as np



In [805]:

def missing_value_counts(dataframe):
    missing_counts = dataframe.isnull().sum()
    missing_counts_df = pd.DataFrame({'Column': missing_counts.index, 'Missing_Values_Count': missing_counts.values})
    return missing_counts_df

In [806]:
def custom_impute(df):
    # Separate columns into numerical and categorical
    numerical_columns = df.select_dtypes(include=['number']).columns
    categorical_columns = df.select_dtypes(exclude=['number']).columns

    # Step 1: Impute Numerical Columns with Mean
    num_imputer = SimpleImputer(strategy='mean')
    df[numerical_columns] = num_imputer.fit_transform(df[numerical_columns])

    # Step 2: Impute Categorical Columns with Mode
    cat_imputer = SimpleImputer(strategy='most_frequent')
    df[categorical_columns] = cat_imputer.fit_transform(df[categorical_columns])

    return df

In [807]:
def min_max_scale_dataframe(df):
    numerical_columns = df.select_dtypes(include=['number']).columns
    scaler = MinMaxScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    return df

In [808]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def convert_numerical_to_categorical(df, num_bins=5):
    numerical_cols = df.select_dtypes(include=['number']).columns
    df_copy = df.copy()

    # Exclude the "hospital_death" column from numerical_cols
    numerical_cols = [col for col in numerical_cols if col != "hospital_death"]

    for col in numerical_cols:
        bin_labels = [f"{col}_bin_{i}" for i in range(num_bins)]
        df_copy[col] = pd.cut(df_copy[col], bins=num_bins, labels=bin_labels)

    categorical_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')
    new_categorical_cols = df_copy.select_dtypes(exclude=['number']).columns

    combined_categorical = pd.get_dummies(df_copy[new_categorical_cols], drop_first=True)
    df_copy.drop(numerical_cols, axis=1, inplace=True)

    df_copy = pd.concat([df_copy, combined_categorical], axis=1)

    return df_copy


In [809]:
df = pd.read_csv('train.csv')
imputed_df = custom_impute(df)
imputed_df



Unnamed: 0,RecordID,hospital_id,icu_id,ethnicity,gender,icu_admit_source,icu_stay_type,icu_type,apache_3j_bodysystem,apache_2_bodysystem,...,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,d1_glucose_max,d1_potassium_max,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,immunosuppression,solid_tumor_with_metastasis,hospital_death
0,1.0,126.0,1931.0,Caucasian,M,Floor,transfer,Med-Surg ICU,Metabolic,Metabolic,...,86.0,93.0,86.0,174.740055,4.235642,0.010000,0.000000,0.0,0.0,0.0
1,2.0,112.0,1544.0,African American,M,Accident & Emergency,admit,Med-Surg ICU,Cardiovascular,Cardiovascular,...,95.0,95.0,95.0,333.000000,3.700000,0.099918,0.054392,0.0,0.0,0.0
2,3.0,153.0,1517.0,Caucasian,M,Floor,admit,MICU,Respiratory,Respiratory,...,162.0,174.0,162.0,160.000000,4.200000,0.380000,0.150000,0.0,0.0,0.0
3,4.0,109.0,1811.0,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Neurological,Neurologic,...,140.0,163.0,140.0,258.000000,3.800000,0.120000,0.060000,0.0,0.0,0.0
4,5.0,287.0,1845.0,Caucasian,F,Accident & Emergency,admit,CSICU,Cardiovascular,Cardiovascular,...,119.0,119.0,119.0,110.000000,3.900000,0.150000,0.070000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49996.0,110.0,1689.0,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Musculoskeletal/Skin,Undefined Diagnoses,...,98.0,98.0,98.0,98.000000,3.900000,0.020000,0.010000,0.0,0.0,0.0
49996,49997.0,155.0,1719.0,Caucasian,F,Floor,admit,Med-Surg ICU,Neurological,Neurologic,...,145.0,145.0,145.0,145.000000,3.800000,0.020000,0.000000,0.0,0.0,0.0
49997,49998.0,93.0,1912.0,Caucasian,F,Accident & Emergency,transfer,Med-Surg ICU,Sepsis,Cardiovascular,...,85.0,115.0,85.0,117.000000,4.235642,0.090000,0.050000,1.0,0.0,0.0
49998,49999.0,230.0,1720.0,Caucasian,M,Operating Room / Recovery,admit,Cardiac ICU,Cardiovascular,Cardiovascular,...,137.0,137.0,137.0,119.000000,4.000000,0.000000,0.000000,0.0,0.0,0.0


In [810]:
min_max_scale_dataframe(imputed_df)
imputed_df

Unnamed: 0,RecordID,hospital_id,icu_id,ethnicity,gender,icu_admit_source,icu_stay_type,icu_type,apache_3j_bodysystem,apache_2_bodysystem,...,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,d1_glucose_max,d1_potassium_max,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,immunosuppression,solid_tumor_with_metastasis,hospital_death
0,0.00000,0.163366,0.962130,Caucasian,M,Floor,transfer,Med-Surg ICU,Metabolic,Metabolic,...,0.234043,0.121622,0.232394,0.189108,0.341820,0.507538,0.507614,0.0,0.0,0.0
1,0.00002,0.094059,0.504142,African American,M,Accident & Emergency,admit,Med-Surg ICU,Cardiovascular,Cardiovascular,...,0.297872,0.135135,0.295775,0.483271,0.214286,0.552723,0.535224,0.0,0.0,0.0
2,0.00004,0.297030,0.472189,Caucasian,M,Floor,admit,MICU,Respiratory,Respiratory,...,0.773050,0.668919,0.767606,0.161710,0.333333,0.693467,0.583756,0.0,0.0,0.0
3,0.00006,0.079208,0.820118,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Neurological,Neurologic,...,0.617021,0.594595,0.612676,0.343866,0.238095,0.562814,0.538071,0.0,0.0,0.0
4,0.00008,0.960396,0.860355,Caucasian,F,Accident & Emergency,admit,CSICU,Cardiovascular,Cardiovascular,...,0.468085,0.297297,0.464789,0.068773,0.261905,0.577889,0.543147,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.99992,0.084158,0.675740,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Musculoskeletal/Skin,Undefined Diagnoses,...,0.319149,0.155405,0.316901,0.046468,0.261905,0.512563,0.512690,0.0,0.0,0.0
49996,0.99994,0.306931,0.711243,Caucasian,F,Floor,admit,Med-Surg ICU,Neurological,Neurologic,...,0.652482,0.472973,0.647887,0.133829,0.238095,0.512563,0.507614,0.0,0.0,0.0
49997,0.99996,0.000000,0.939645,Caucasian,F,Accident & Emergency,transfer,Med-Surg ICU,Sepsis,Cardiovascular,...,0.226950,0.270270,0.225352,0.081784,0.341820,0.547739,0.532995,1.0,0.0,0.0
49998,0.99998,0.678218,0.712426,Caucasian,M,Operating Room / Recovery,admit,Cardiac ICU,Cardiovascular,Cardiovascular,...,0.595745,0.418919,0.591549,0.085502,0.285714,0.502513,0.507614,0.0,0.0,0.0


In [811]:
missing_value_counts(imputed_df)

Unnamed: 0,Column,Missing_Values_Count
0,RecordID,0
1,hospital_id,0
2,icu_id,0
3,ethnicity,0
4,gender,0
5,icu_admit_source,0
6,icu_stay_type,0
7,icu_type,0
8,apache_3j_bodysystem,0
9,apache_2_bodysystem,0


In [812]:
cat_df = convert_numerical_to_categorical(imputed_df)

In [813]:
cat_df

Unnamed: 0,ethnicity,gender,icu_admit_source,icu_stay_type,icu_type,apache_3j_bodysystem,apache_2_bodysystem,hospital_death,RecordID_RecordID_bin_1,RecordID_RecordID_bin_2,...,apache_4a_icu_death_prob_apache_4a_icu_death_prob_bin_3,apache_4a_icu_death_prob_apache_4a_icu_death_prob_bin_4,immunosuppression_immunosuppression_bin_1,immunosuppression_immunosuppression_bin_2,immunosuppression_immunosuppression_bin_3,immunosuppression_immunosuppression_bin_4,solid_tumor_with_metastasis_solid_tumor_with_metastasis_bin_1,solid_tumor_with_metastasis_solid_tumor_with_metastasis_bin_2,solid_tumor_with_metastasis_solid_tumor_with_metastasis_bin_3,solid_tumor_with_metastasis_solid_tumor_with_metastasis_bin_4
0,Caucasian,M,Floor,transfer,Med-Surg ICU,Metabolic,Metabolic,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
1,African American,M,Accident & Emergency,admit,Med-Surg ICU,Cardiovascular,Cardiovascular,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Caucasian,M,Floor,admit,MICU,Respiratory,Respiratory,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Neurological,Neurologic,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Caucasian,F,Accident & Emergency,admit,CSICU,Cardiovascular,Cardiovascular,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Musculoskeletal/Skin,Undefined Diagnoses,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
49996,Caucasian,F,Floor,admit,Med-Surg ICU,Neurological,Neurologic,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
49997,Caucasian,F,Accident & Emergency,transfer,Med-Surg ICU,Sepsis,Cardiovascular,0.0,False,False,...,False,False,False,False,False,True,False,False,False,False
49998,Caucasian,M,Operating Room / Recovery,admit,Cardiac ICU,Cardiovascular,Cardiovascular,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False


In [814]:
df_onehot = pd.get_dummies(cat_df)
df_onehot

Unnamed: 0,hospital_death,RecordID_RecordID_bin_1,RecordID_RecordID_bin_2,RecordID_RecordID_bin_3,RecordID_RecordID_bin_4,hospital_id_hospital_id_bin_1,hospital_id_hospital_id_bin_2,hospital_id_hospital_id_bin_3,hospital_id_hospital_id_bin_4,icu_id_icu_id_bin_1,...,apache_2_bodysystem_Cardiovascular,apache_2_bodysystem_Gastrointestinal,apache_2_bodysystem_Haematologic,apache_2_bodysystem_Metabolic,apache_2_bodysystem_Neurologic,apache_2_bodysystem_Renal/Genitourinary,apache_2_bodysystem_Respiratory,apache_2_bodysystem_Trauma,apache_2_bodysystem_Undefined Diagnoses,apache_2_bodysystem_Undefined diagnoses
0,0.0,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1,0.0,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2,0.0,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,0.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
4,0.0,False,False,False,False,False,False,False,True,False,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.0,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
49996,0.0,False,False,False,True,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
49997,0.0,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
49998,0.0,False,False,False,True,False,False,True,False,False,...,True,False,False,False,False,False,False,False,False,False


In [815]:
df_onehot.columns
pd.DataFrame(df_onehot.columns)

Unnamed: 0,0
0,hospital_death
1,RecordID_RecordID_bin_1
2,RecordID_RecordID_bin_2
3,RecordID_RecordID_bin_3
4,RecordID_RecordID_bin_4
...,...
279,apache_2_bodysystem_Renal/Genitourinary
280,apache_2_bodysystem_Respiratory
281,apache_2_bodysystem_Trauma
282,apache_2_bodysystem_Undefined Diagnoses


In [816]:
X = df_onehot.loc[:, df_onehot.columns != "hospital_death"]
y = df_onehot[["hospital_death"]]

X.shape

(50000, 283)

In [817]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

In [818]:
model = DecisionTreeClassifier(max_depth=5, min_samples_leaf=3, min_samples_split=3)

model.fit(X, y) 

feature_importances = model.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

selected_features = feature_importance_df['Feature'][:500]

X = X[selected_features]

In [819]:
X.shape

(50000, 359)

In [820]:

def roc_auc_cv(model, X, y):
    cv = RepeatedKFold(n_splits=10, n_repeats=2, random_state=1)
    predicted_probabilities = cross_val_score(model, X, y, scoring="roc_auc", cv=cv, n_jobs=-1)

    return mean(predicted_probabilities)


In [821]:
def find_best_parameters(X, y):
    
    param_grid = {
        'var_smoothing': [1e-13, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 
                          0.09, 0.095, 0.097, 0.099, 0.1, 0.101, 0.103, 0.105, 0.11]
    }
    

    nb = GaussianNB()

    grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, cv=10, scoring='roc_auc')

    grid_search.fit(X, y)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    return best_params, best_score

# find_best_parameters(X, y)


In [822]:
# find_best_parameters(X, y)

In [823]:
X.shape


(50000, 359)

In [824]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import roc_auc_score
import numpy as np

def roc_auc_cv_cnb(model, X, y):
    cv = RepeatedKFold(n_splits=10, n_repeats=2, random_state=1)
    y_probabilities = cross_val_predict(model, X, y, cv=cv, method='predict_proba', n_jobs=-1)

    # Assuming binary classification, you can select the positive class
    # probability (usually class 1)
    y_scores = y_probabilities[:, 1]

    # Calculate ROC AUC score
    roc_auc = roc_auc_score(y, y_scores)

    return roc_auc


In [825]:
# model = GaussianNB(var_smoothing=1e-09)
model = CategoricalNB()

In [826]:
model.fit(X, y)

  y = column_or_1d(y, warn=True)


In [827]:
X_test = X_test[selected_features]
X_test.shape

(10000, 359)

In [828]:
md_probs = model.predict_proba(X_test)
md_probs = md_probs[:,1]
md_auc = roc_auc_score(y_test, md_probs)
md_auc

0.8473534072545446

In [None]:
def generate_predictions_for_model(model, test_file, output_file):
    
    df_test = pd.read_csv(test_file)
    record_ids = df_test["RecordID"]
    
    df_test_imputed = custom_impute(df_test)
    cat_df = convert_numerical_to_categorical(df_test_imputed)
    df_test_onehot = pd.get_dummies(cat_df)
    
    # df_test_onehot = pd.get_dummies(df_test_imputed)
    # min_max_scale_dataframe(df_test_onehot)
    
    X_test = df_test_onehot.loc[:, df_test_onehot.columns != "hospital_death"]
    # X_test = X_test[selected_features]
    # Generate predictions using the model
    probs = model.predict_proba(X_test)
    probs = probs[:, 1]
    
    # Create a DataFrame for the results
    result = pd.DataFrame({'RecordID': record_ids, 'hospital_death': probs})
    
    # Save the results to a CSV file
    result.to_csv(output_file, index=False, header=["RecordID", "hospital_death"])
    
generate_predictions_for_model(model, "test.csv", "results30.csv")
