In [1]:
import sklearn
import time
import pandas as pd   
from matplotlib import pyplot
from numpy import mean
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB, CategoricalNB
import numpy as np



In [3]:

def missing_value_counts(dataframe):
    missing_counts = dataframe.isnull().sum()
    missing_counts_df = pd.DataFrame({'Column': missing_counts.index, 'Missing_Values_Count': missing_counts.values})
    return missing_counts_df

In [4]:
def robust_scale(df):
    numerical_columns = df.select_dtypes(include=['number']).columns
    scaler = RobustScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    return df

In [13]:
def knn_impute_numerical_columns(df, n_neighbors=5):
    df_imputed = df.copy()
    numerical_cols = df_imputed.select_dtypes(include=['number']).columns
    imputer = KNNImputer(n_neighbors=n_neighbors)
    df_imputed[numerical_cols] = imputer.fit_transform(df_imputed[numerical_cols])
    return df_imputed


In [6]:
df = pd.read_csv('train.csv')

In [8]:
robust_scale(df)

Unnamed: 0,RecordID,hospital_id,icu_id,ethnicity,gender,icu_admit_source,icu_stay_type,icu_type,apache_3j_bodysystem,apache_2_bodysystem,...,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,d1_glucose_max,d1_potassium_max,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,immunosuppression,solid_tumor_with_metastasis,hospital_death
0,-1.00000,-0.666667,1.241270,Caucasian,M,Floor,transfer,Med-Surg ICU,Metabolic,Metabolic,...,-0.805556,-1.000000,-0.805556,,,-0.333333,-0.333333,0.0,0.0,0.0
1,-0.99996,-0.789474,0.012698,African American,M,Accident & Emergency,admit,Med-Surg ICU,Cardiovascular,Cardiovascular,...,-0.555556,-0.945946,-0.555556,2.164706,-0.500,,,0.0,0.0,0.0
2,-0.99992,-0.429825,-0.073016,Caucasian,M,Floor,admit,MICU,Respiratory,Respiratory,...,1.305556,1.189189,1.305556,0.129412,0.125,2.750000,2.166667,0.0,0.0,0.0
3,-0.99988,-0.815789,0.860317,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Neurological,Neurologic,...,0.694444,0.891892,0.694444,1.282353,-0.375,0.583333,0.666667,0.0,0.0,0.0
4,-0.99984,0.745614,0.968254,Caucasian,F,Accident & Emergency,admit,CSICU,Cardiovascular,Cardiovascular,...,0.111111,-0.297297,0.111111,-0.458824,-0.250,0.833333,0.833333,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.99984,-0.807018,0.473016,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Musculoskeletal/Skin,Undefined Diagnoses,...,-0.472222,-0.864865,-0.472222,-0.600000,-0.250,-0.250000,-0.166667,0.0,0.0,0.0
49996,0.99988,-0.412281,0.568254,Caucasian,F,Floor,admit,Med-Surg ICU,Neurological,Neurologic,...,0.833333,0.405405,0.833333,-0.047059,-0.375,-0.250000,-0.333333,0.0,0.0,0.0
49997,0.99992,-0.956140,1.180952,Caucasian,F,Accident & Emergency,transfer,Med-Surg ICU,Sepsis,Cardiovascular,...,-0.833333,-0.405405,-0.833333,-0.376471,,0.333333,0.500000,1.0,0.0,0.0
49998,0.99996,0.245614,0.571429,Caucasian,M,Operating Room / Recovery,admit,Cardiac ICU,Cardiovascular,Cardiovascular,...,0.611111,0.189189,0.611111,-0.352941,-0.125,-0.416667,-0.333333,0.0,0.0,0.0


In [34]:
df_imputed = knn_impute_numerical_columns(df, n_neighbors=10)

In [35]:
missing_value_counts(df_imputed)

Unnamed: 0,Column,Missing_Values_Count
0,RecordID,0
1,hospital_id,0
2,icu_id,0
3,ethnicity,576
4,gender,4
5,icu_admit_source,47
6,icu_stay_type,0
7,icu_type,0
8,apache_3j_bodysystem,276
9,apache_2_bodysystem,276


In [None]:
missing_value_counts(df_imputed)

In [36]:
df_dropped = df_imputed.copy().dropna(axis=0)
df_dropped.shape
missing_value_counts(df_dropped)

Unnamed: 0,Column,Missing_Values_Count
0,RecordID,0
1,hospital_id,0
2,icu_id,0
3,ethnicity,0
4,gender,0
5,icu_admit_source,0
6,icu_stay_type,0
7,icu_type,0
8,apache_3j_bodysystem,0
9,apache_2_bodysystem,0


In [37]:
df_dropped.shape

(49103, 58)

In [40]:
df_onehot = pd.get_dummies(df_dropped)
df_onehot
df_onehot.shape

(49103, 96)

In [41]:
X = df_onehot.loc[:, df_onehot.columns != "hospital_death"]
y = df_onehot[["hospital_death"]]

X.shape

(49103, 95)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)


In [44]:
def roc_auc_cv(model, X, y):
    cv = RepeatedKFold(n_splits=10, n_repeats=2, random_state=1)
    predicted_probabilities = cross_val_score(model, X, y, scoring="roc_auc", cv=cv, n_jobs=-1)

    return mean(predicted_probabilities)

In [68]:
def calculate_roc_auc(model, X_test, y_test):
    md_probs = model.predict_proba(X_test)
    md_probs = md_probs[:, 1]
    md_auc = roc_auc_score(y_test, md_probs)
    return md_auc

# roc_auc = calculate_roc_auc(model, X_test, y_test)
# print(roc_auc)


In [80]:
model  = KNeighborsClassifier(n_neighbors=300)
model.fit(X_train, y_train)
calculate_roc_auc(model, X_test, y_test)
# roc_auc_cv(model, X, y)


  return self._fit(X, y)


0.8592231095803132

In [75]:
model.fit(X, y)


  return self._fit(X, y)


In [86]:
knn_model = KNeighborsClassifier()

param_grid = {
    'n_neighbors': range(300, 321),  # Number of neighbors to consider
    # 'weights': ['uniform', 'distance'],  # Weighting scheme ('uniform' or 'distance')
}

# Create the Grid Search object
grid_search = GridSearchCV(knn_model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

# Fit the Grid Search to your data
grid_search.fit(X, y)

# Print the best hyperparameters and corresponding ROC AUC score
print("Best Parameters:", grid_search.best_params_)
print("Best ROC AUC Score:", grid_search.best_score_)


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


In [64]:
def find_best_n_neighbors(model, X, y):
    best_n_neighbors = 0
    best_roc_auc = 0

    for n_neighbors in range(200, 321):
        # roc_auc = calculate_roc_auc(model, X_test, y_test)
        roc_auc = roc_auc_cv(model, X, y)
        
        if roc_auc > best_roc_auc:
            best_roc_auc = roc_auc
            best_n_neighbors = n_neighbors

    return best_n_neighbors, best_roc_auc

# Example usage:
best_n, best_roc_auc = find_best_n_neighbors(model, X, y)
print(f"Best n_neighbors: {best_n}")
print(f"Best ROC AUC CV score: {best_roc_auc}")


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


Best n_neighbors: 250
Best ROC AUC CV score: 0.8624237385991635


In [76]:
def generate_predictions_for_model(model, test_file, output_file):
    
    df_test = pd.read_csv(test_file)
    record_ids = df_test["RecordID"]
    
    df_scaled = robust_scale(df_test)
    df_test_imputed = knn_impute_numerical_columns(df_scaled, n_neighbors=10)
    # df_test_imputed = df_test_imputed.dropna(axis=0)
    
    df_test_onehot = pd.get_dummies(df_test_imputed)
    
    X_test = df_test_onehot.loc[:, df_test_onehot.columns != "hospital_death"]
    
    probs = model.predict_proba(X_test)
    probs = probs[:, 1]
    
    # Create a DataFrame for the results
    result = pd.DataFrame({'RecordID': record_ids, 'hospital_death': probs})
    
    # Save the results to a CSV file
    result.to_csv(output_file, index=False, header=["RecordID", "hospital_death"])


In [77]:
generate_predictions_for_model(model, "test.csv", "results34.csv")
