In [1]:
import sklearn
import time
import pandas as pd   
from matplotlib import pyplot
from numpy import mean
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB, CategoricalNB
import numpy as np



In [2]:

def missing_value_counts(dataframe):
    missing_counts = dataframe.isnull().sum()
    missing_counts_df = pd.DataFrame({'Column': missing_counts.index, 'Missing_Values_Count': missing_counts.values})
    return missing_counts_df

In [7]:
def num_impute(df):
    
    df = df.copy()
    # Separate columns into numerical and categorical
    numerical_columns = df.select_dtypes(include=['number']).columns
    categorical_columns = df.select_dtypes(exclude=['number']).columns

    # Step 1: Impute Numerical Columns with Mean
    num_imputer = SimpleImputer(strategy='mean')
    num_imputer = KNNImputer(n_neighbors=150)
    df[numerical_columns] = num_imputer.fit_transform(df[numerical_columns])

    return df

def cat_impute(df):
    
    df = df.copy()
    categorical_columns = df.select_dtypes(exclude=['number']).columns
    num_imputer = SimpleImputer(strategy='most_frequent')
    df[categorical_columns] = num_imputer.fit_transform(df[categorical_columns])

    return df

In [4]:
def robust_scale(df):
    numerical_columns = df.select_dtypes(include=['number']).columns
    scaler = RobustScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    return df

In [5]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def convert_numerical_to_categorical(df, num_bins=9):
    numerical_cols = df.select_dtypes(include=['number']).columns
    df_copy = df.copy()

    # Exclude the "hospital_death" column from numerical_cols
    numerical_cols = [col for col in numerical_cols if col != "hospital_death"]

    for col in numerical_cols:
        bin_labels = [f"{col}_bin_{i}" for i in range(num_bins)]
        df_copy[col] = pd.cut(df_copy[col], bins=num_bins, labels=bin_labels)

    categorical_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')
    new_categorical_cols = df_copy.select_dtypes(exclude=['number']).columns

    combined_categorical = pd.get_dummies(df_copy[new_categorical_cols], drop_first=True)
    df_copy.drop(numerical_cols, axis=1, inplace=True)

    df_copy = pd.concat([df_copy, combined_categorical], axis=1)

    return df_copy



In [9]:
df = pd.read_csv('train.csv')
robust_scale(df)

Unnamed: 0,RecordID,hospital_id,icu_id,ethnicity,gender,icu_admit_source,icu_stay_type,icu_type,apache_3j_bodysystem,apache_2_bodysystem,...,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,d1_glucose_max,d1_potassium_max,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,immunosuppression,solid_tumor_with_metastasis,hospital_death
0,-1.00000,-0.666667,1.241270,Caucasian,M,Floor,transfer,Med-Surg ICU,Metabolic,Metabolic,...,-0.805556,-1.000000,-0.805556,,,-0.333333,-0.333333,0.0,0.0,0.0
1,-0.99996,-0.789474,0.012698,African American,M,Accident & Emergency,admit,Med-Surg ICU,Cardiovascular,Cardiovascular,...,-0.555556,-0.945946,-0.555556,2.164706,-0.500,,,0.0,0.0,0.0
2,-0.99992,-0.429825,-0.073016,Caucasian,M,Floor,admit,MICU,Respiratory,Respiratory,...,1.305556,1.189189,1.305556,0.129412,0.125,2.750000,2.166667,0.0,0.0,0.0
3,-0.99988,-0.815789,0.860317,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Neurological,Neurologic,...,0.694444,0.891892,0.694444,1.282353,-0.375,0.583333,0.666667,0.0,0.0,0.0
4,-0.99984,0.745614,0.968254,Caucasian,F,Accident & Emergency,admit,CSICU,Cardiovascular,Cardiovascular,...,0.111111,-0.297297,0.111111,-0.458824,-0.250,0.833333,0.833333,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.99984,-0.807018,0.473016,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Musculoskeletal/Skin,Undefined Diagnoses,...,-0.472222,-0.864865,-0.472222,-0.600000,-0.250,-0.250000,-0.166667,0.0,0.0,0.0
49996,0.99988,-0.412281,0.568254,Caucasian,F,Floor,admit,Med-Surg ICU,Neurological,Neurologic,...,0.833333,0.405405,0.833333,-0.047059,-0.375,-0.250000,-0.333333,0.0,0.0,0.0
49997,0.99992,-0.956140,1.180952,Caucasian,F,Accident & Emergency,transfer,Med-Surg ICU,Sepsis,Cardiovascular,...,-0.833333,-0.405405,-0.833333,-0.376471,,0.333333,0.500000,1.0,0.0,0.0
49998,0.99996,0.245614,0.571429,Caucasian,M,Operating Room / Recovery,admit,Cardiac ICU,Cardiovascular,Cardiovascular,...,0.611111,0.189189,0.611111,-0.352941,-0.125,-0.416667,-0.333333,0.0,0.0,0.0


In [10]:
imputed_df = num_impute(df)
imputed_df


Unnamed: 0,RecordID,hospital_id,icu_id,ethnicity,gender,icu_admit_source,icu_stay_type,icu_type,apache_3j_bodysystem,apache_2_bodysystem,...,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,d1_glucose_max,d1_potassium_max,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,immunosuppression,solid_tumor_with_metastasis,hospital_death
0,-1.00000,-0.666667,1.241270,Caucasian,M,Floor,transfer,Med-Surg ICU,Metabolic,Metabolic,...,-0.805556,-1.000000,-0.805556,0.120549,0.100833,-0.333333,-0.333333,0.0,0.0,0.0
1,-0.99996,-0.789474,0.012698,African American,M,Accident & Emergency,admit,Med-Surg ICU,Cardiovascular,Cardiovascular,...,-0.555556,-0.945946,-0.555556,2.164706,-0.500000,-0.099444,-0.170000,0.0,0.0,0.0
2,-0.99992,-0.429825,-0.073016,Caucasian,M,Floor,admit,MICU,Respiratory,Respiratory,...,1.305556,1.189189,1.305556,0.129412,0.125000,2.750000,2.166667,0.0,0.0,0.0
3,-0.99988,-0.815789,0.860317,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Neurological,Neurologic,...,0.694444,0.891892,0.694444,1.282353,-0.375000,0.583333,0.666667,0.0,0.0,0.0
4,-0.99984,0.745614,0.968254,Caucasian,F,Accident & Emergency,admit,CSICU,Cardiovascular,Cardiovascular,...,0.111111,-0.297297,0.111111,-0.458824,-0.250000,0.833333,0.833333,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.99984,-0.807018,0.473016,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Musculoskeletal/Skin,Undefined Diagnoses,...,-0.472222,-0.864865,-0.472222,-0.600000,-0.250000,-0.250000,-0.166667,0.0,0.0,0.0
49996,0.99988,-0.412281,0.568254,Caucasian,F,Floor,admit,Med-Surg ICU,Neurological,Neurologic,...,0.833333,0.405405,0.833333,-0.047059,-0.375000,-0.250000,-0.333333,0.0,0.0,0.0
49997,0.99992,-0.956140,1.180952,Caucasian,F,Accident & Emergency,transfer,Med-Surg ICU,Sepsis,Cardiovascular,...,-0.833333,-0.405405,-0.833333,-0.376471,0.049167,0.333333,0.500000,1.0,0.0,0.0
49998,0.99996,0.245614,0.571429,Caucasian,M,Operating Room / Recovery,admit,Cardiac ICU,Cardiovascular,Cardiovascular,...,0.611111,0.189189,0.611111,-0.352941,-0.125000,-0.416667,-0.333333,0.0,0.0,0.0


In [11]:
cat_imputed = cat_impute(imputed_df)

In [12]:
dropped_df = cat_imputed.copy().dropna(axis=0)
dropped_df.shape

(50000, 58)

In [61]:
cat_df = convert_numerical_to_categorical(imputed_df)
cat_df

Unnamed: 0,ethnicity,gender,icu_admit_source,icu_stay_type,icu_type,apache_3j_bodysystem,apache_2_bodysystem,hospital_death,RecordID_RecordID_bin_1,RecordID_RecordID_bin_2,...,immunosuppression_immunosuppression_bin_7,immunosuppression_immunosuppression_bin_8,solid_tumor_with_metastasis_solid_tumor_with_metastasis_bin_1,solid_tumor_with_metastasis_solid_tumor_with_metastasis_bin_2,solid_tumor_with_metastasis_solid_tumor_with_metastasis_bin_3,solid_tumor_with_metastasis_solid_tumor_with_metastasis_bin_4,solid_tumor_with_metastasis_solid_tumor_with_metastasis_bin_5,solid_tumor_with_metastasis_solid_tumor_with_metastasis_bin_6,solid_tumor_with_metastasis_solid_tumor_with_metastasis_bin_7,solid_tumor_with_metastasis_solid_tumor_with_metastasis_bin_8
0,Caucasian,M,Floor,transfer,Med-Surg ICU,Metabolic,Metabolic,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
1,African American,M,Accident & Emergency,admit,Med-Surg ICU,Cardiovascular,Cardiovascular,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Caucasian,M,Floor,admit,MICU,Respiratory,Respiratory,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Neurological,Neurologic,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Caucasian,F,Accident & Emergency,admit,CSICU,Cardiovascular,Cardiovascular,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Musculoskeletal/Skin,Undefined Diagnoses,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
49996,Caucasian,F,Floor,admit,Med-Surg ICU,Neurological,Neurologic,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
49997,Caucasian,F,Accident & Emergency,transfer,Med-Surg ICU,Sepsis,Cardiovascular,0.0,False,False,...,False,True,False,False,False,False,False,False,False,False
49998,Caucasian,M,Operating Room / Recovery,admit,Cardiac ICU,Cardiovascular,Cardiovascular,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False


In [14]:
df_onehot = pd.get_dummies(cat_df)
df_onehot

Unnamed: 0,hospital_death,RecordID_RecordID_bin_1,RecordID_RecordID_bin_2,RecordID_RecordID_bin_3,RecordID_RecordID_bin_4,RecordID_RecordID_bin_5,RecordID_RecordID_bin_6,RecordID_RecordID_bin_7,RecordID_RecordID_bin_8,hospital_id_hospital_id_bin_1,...,apache_2_bodysystem_Cardiovascular,apache_2_bodysystem_Gastrointestinal,apache_2_bodysystem_Haematologic,apache_2_bodysystem_Metabolic,apache_2_bodysystem_Neurologic,apache_2_bodysystem_Renal/Genitourinary,apache_2_bodysystem_Respiratory,apache_2_bodysystem_Trauma,apache_2_bodysystem_Undefined Diagnoses,apache_2_bodysystem_Undefined diagnoses
0,0.0,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
1,0.0,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2,0.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,0.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
4,0.0,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.0,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
49996,0.0,False,False,False,False,False,False,False,True,False,...,False,False,False,False,True,False,False,False,False,False
49997,0.0,False,False,False,False,False,False,False,True,False,...,True,False,False,False,False,False,False,False,False,False
49998,0.0,False,False,False,False,False,False,False,True,False,...,True,False,False,False,False,False,False,False,False,False


In [15]:
df_onehot.columns
pd.DataFrame(df_onehot.columns)

Unnamed: 0,0
0,hospital_death
1,RecordID_RecordID_bin_1
2,RecordID_RecordID_bin_2
3,RecordID_RecordID_bin_3
4,RecordID_RecordID_bin_4
...,...
479,apache_2_bodysystem_Renal/Genitourinary
480,apache_2_bodysystem_Respiratory
481,apache_2_bodysystem_Trauma
482,apache_2_bodysystem_Undefined Diagnoses


In [16]:
X = df_onehot.loc[:, df_onehot.columns != "hospital_death"]
y = df_onehot[["hospital_death"]]

X.shape

(50000, 483)

In [17]:
model = DecisionTreeClassifier(max_depth=6, min_samples_leaf=3, min_samples_split=3)

# Fit the model to your data
model.fit(X, y)  # Use y if you have a target variable, otherwise omit it

# Get feature importances
feature_importances = model.feature_importances_

# Create a DataFrame to store feature names and their importance scores
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Sort features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Select the top 20 features
selected_features = feature_importance_df['Feature'][:]

# Create a new DataFrame with only the selected features
X_selected = X[selected_features]
# X = X_selected


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

In [50]:
X_train.dtypes
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [51]:
def roc_auc_cv(model, X, y):
    cv = RepeatedKFold(n_splits=10, n_repeats=2, random_state=1)
    predicted_probabilities = cross_val_score(model, X, y, scoring="roc_auc", cv=cv, n_jobs=-1)

    return mean(predicted_probabilities)


In [45]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import roc_auc_score
import numpy as np

def roc_auc_cv_cnb(model, X, y):
    cv = RepeatedKFold(n_splits=10, n_repeats=2, random_state=1)
    y_probabilities = cross_val_predict(model, X, y, cv=cv, method='predict_proba', n_jobs=-1)

    # Assuming binary classification, you can select the positive class
    # probability (usually class 1)
    y_scores = y_probabilities[:, 1]

    # Calculate ROC AUC score
    roc_auc = roc_auc_score(y, y_scores)

    return roc_auc

In [26]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import GridSearchCV

def find_best_parameters_categorical(X, y):
    param_grid = {
        'alpha': [1e-13, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2]
    }

    categorical_nb = CategoricalNB()

    grid_search = GridSearchCV(estimator=categorical_nb, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=3)

    grid_search.fit(X, y)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    return best_params, best_score

# Usage
best_params_categorical, best_score_categorical = find_best_parameters_categorical(X, y)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/imlenv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 459, in _score
    y_pred = method_caller(clf, "decision_function", X, pos_label=pos_label)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/imlenv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/imlenv/lib/python3.10/site-packages/sklearn/utils/_response.py", line 181, in _get_response_values
    prediction_method = _check_response_method(estimator, response_method)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/imlenv/lib/python3.10/site-packages/sklearn/utils/validation.py", line 1939, in _check_response_method
    raise AttributeError(
AttributeError: CategoricalNB has none of the following attributes: d

In [27]:
best_params_categorical, best_score_categorical

({'alpha': 1e-13}, nan)

In [33]:
def roc_auc_cv(model, X, y):
    cv = RepeatedKFold(n_splits=10, n_repeats=2, random_state=1)
    predicted_probabilities = cross_val_score(model, X, y, scoring="roc_auc", cv=cv, n_jobs=-1)

    return mean(predicted_probabilities)

In [54]:
# model = GaussianNB(var_smoothing=1e-09)
model = CategoricalNB()
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [63]:
md_probs = model.predict_proba(X_test)

IndexError: index 1 is out of bounds for axis 1 with size 1

In [55]:
roc_auc_cv(model, X, y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/imlenv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 459, in _score
    y_pred = method_caller(clf, "decision_function", X, pos_label=pos_label)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/imlenv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/imlenv/lib/python3.10/site-packages/sklearn/utils/_response.py", line 181, in _get_response_values
    prediction_method = _check_response_method(estimator, response_method)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/imlenv/lib/python3.10/site-packag

nan

In [36]:
model.fit(X, y)
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [37]:
# X_test = X_test[selected_features]
X_test.shape

(10000, 483)

In [41]:
md_probs = model.predict_proba(X_test)
md_probs = md_probs[0]
md_auc = roc_auc_score(y_test, md_probs)
md_auc

IndexError: index 1 is out of bounds for axis 1 with size 1

In [None]:
def generate_predictions_for_model(model, test_file, output_file):
    
    df_test = pd.read_csv(test_file)
    record_ids = df_test["RecordID"]
    
    df_test_imputed = num_impute(df_test)
    cat_df = convert_numerical_to_categorical(df_test_imputed)
    df_test_onehot = pd.get_dummies(cat_df)
    
    # df_test_onehot = pd.get_dummies(df_test_imputed)
    # min_max_scale_dataframe(df_test_onehot)
    
    X_test = df_test_onehot.loc[:, df_test_onehot.columns != "hospital_death"]
    # X_test = X_test[selected_features]
    # Generate predictions using the model
    probs = model.predict_proba(X_test)
    probs = probs[:, 1]
    
    # Create a DataFrame for the results
    result = pd.DataFrame({'RecordID': record_ids, 'hospital_death': probs})
    
    # Save the results to a CSV file
    result.to_csv(output_file, index=False, header=["RecordID", "hospital_death"])
    
generate_predictions_for_model(model, "test.csv", "results33.csv")
