In [4]:
import sklearn
import time
import pandas as pd   
from matplotlib import pyplot
from numpy import mean
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2  # Choose an appropriate scoring function for your data

import numpy as np


from imblearn.over_sampling import SMOTE

In [260]:

def missing_value_counts(dataframe):
    missing_counts = dataframe.isnull().sum()
    missing_counts_df = pd.DataFrame({'Column': missing_counts.index, 'Missing_Values_Count': missing_counts.values})
    return missing_counts_df

In [261]:
def robust_scale(df):
    numerical_columns = df.select_dtypes(include=['number']).columns
    scaler = RobustScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    return df

In [273]:
def knn_impute_numerical_columns(df, n_neighbors=100):
    df_imputed = df.copy()
    df_imputed = pd.get_dummies(drop_first=True, data=df_imputed)
    
    categorical_columns = df_imputed.select_dtypes(exclude=['number']).columns
    numerical_cols = df_imputed.select_dtypes(include=['number']).columns
    
    imputer = KNNImputer(n_neighbors=n_neighbors)
    # imputer = SimpleImputer(strategy='mean') 
    df_imputed[numerical_cols] = imputer.fit_transform(df_imputed[numerical_cols])
    
    # Label encode categorical columns
    # label_encoders = {}
    # for col in categorical_columns:
    #     label_encoders[col] = LabelEncoder()
    #     df_imputed[col] = label_encoders[col].fit_transform(df_imputed[col])

    # imputer = SimpleImputer(strategy='constant', fill_value='Unknown')

    
    # df_imputed[categorical_columns] = imputer.fit_transform(df_imputed[categorical_columns])
    # cat_imputer = SimpleImputer(strategy='most_frequent')
    # df_imputed[categorical_columns] = cat_imputer.fit_transform(df_imputed[categorical_columns])

    return df_imputed


In [274]:
df = pd.read_csv('train.csv')

In [275]:
robust_scale(df)

Unnamed: 0,RecordID,hospital_id,icu_id,ethnicity,gender,icu_admit_source,icu_stay_type,icu_type,apache_3j_bodysystem,apache_2_bodysystem,...,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,d1_glucose_max,d1_potassium_max,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,immunosuppression,solid_tumor_with_metastasis,hospital_death
0,-1.00000,-0.666667,1.241270,Caucasian,M,Floor,transfer,Med-Surg ICU,Metabolic,Metabolic,...,-0.805556,-1.000000,-0.805556,,,-0.333333,-0.333333,0.0,0.0,0.0
1,-0.99996,-0.789474,0.012698,African American,M,Accident & Emergency,admit,Med-Surg ICU,Cardiovascular,Cardiovascular,...,-0.555556,-0.945946,-0.555556,2.164706,-0.500,,,0.0,0.0,0.0
2,-0.99992,-0.429825,-0.073016,Caucasian,M,Floor,admit,MICU,Respiratory,Respiratory,...,1.305556,1.189189,1.305556,0.129412,0.125,2.750000,2.166667,0.0,0.0,0.0
3,-0.99988,-0.815789,0.860317,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Neurological,Neurologic,...,0.694444,0.891892,0.694444,1.282353,-0.375,0.583333,0.666667,0.0,0.0,0.0
4,-0.99984,0.745614,0.968254,Caucasian,F,Accident & Emergency,admit,CSICU,Cardiovascular,Cardiovascular,...,0.111111,-0.297297,0.111111,-0.458824,-0.250,0.833333,0.833333,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.99984,-0.807018,0.473016,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Musculoskeletal/Skin,Undefined Diagnoses,...,-0.472222,-0.864865,-0.472222,-0.600000,-0.250,-0.250000,-0.166667,0.0,0.0,0.0
49996,0.99988,-0.412281,0.568254,Caucasian,F,Floor,admit,Med-Surg ICU,Neurological,Neurologic,...,0.833333,0.405405,0.833333,-0.047059,-0.375,-0.250000,-0.333333,0.0,0.0,0.0
49997,0.99992,-0.956140,1.180952,Caucasian,F,Accident & Emergency,transfer,Med-Surg ICU,Sepsis,Cardiovascular,...,-0.833333,-0.405405,-0.833333,-0.376471,,0.333333,0.500000,1.0,0.0,0.0
49998,0.99996,0.245614,0.571429,Caucasian,M,Operating Room / Recovery,admit,Cardiac ICU,Cardiovascular,Cardiovascular,...,0.611111,0.189189,0.611111,-0.352941,-0.125,-0.416667,-0.333333,0.0,0.0,0.0


In [276]:
df_imputed = knn_impute_numerical_columns(df, n_neighbors=200)

In [277]:
# missing_value_counts(df_imputed)
df_imputed_copy = df_imputed.copy()


In [283]:
# missing_value_counts(df_imputed)

In [278]:
df_dropped = df_imputed.copy().dropna(axis=0)
df_dropped.shape
# missing_value_counts(df_dropped)

(50000, 89)

In [279]:
df_dropped.shape

(50000, 89)

In [280]:
df_onehot = pd.get_dummies(df_dropped)
df_onehot
df_onehot.shape

(50000, 89)

In [285]:
X = df_onehot.loc[:, df_onehot.columns != "hospital_death"]
y = df_onehot[["hospital_death"]]

X.shape

(50000, 88)

In [298]:
from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest classifier
estimator = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=0, n_jobs=3)
# Fit the model to your data
estimator.fit(X, y)

# Get feature importances
feature_importances = estimator.feature_importances_


feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

feature_num = 10
selected_features = feature_importance_df['Feature'][:feature_num]
selected_features

# k = 10
# # Select top-k features based on importances
# top_k_indices = feature_importances.argsort()[-k:][::-1]

# X_new = X.iloc[:, top_k_indices]
# X_new.columns

  return fit_method(estimator, *args, **kwargs)


46    apache_4a_hospital_death_prob
47         apache_4a_icu_death_prob
9                   gcs_eyes_apache
10                 gcs_motor_apache
17                ventilated_apache
25                     d1_sysbp_min
26         d1_sysbp_noninvasive_min
24                      d1_spo2_min
7               apache_3j_diagnosis
16                      temp_apache
Name: Feature, dtype: object

In [295]:
import matplotlib.pyplot as plt

# Your existing code
model = DecisionTreeClassifier(max_depth=5, min_samples_leaf=3, min_samples_split=3)
model.fit(X, y)
feature_importances = model.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

feature_num = 20
selected_features = feature_importance_df['Feature'][:10]

# Visualization
# plt.figure(figsize=(12, 6))
# plt.barh(selected_features[::-1], feature_importance_df['Importance'][:feature_num][::-1])
# plt.xlabel('Feature Importance')
# plt.title('Top 20 Feature Importances')
# plt.gca().invert_yaxis()  # Reverse the order for better visualization
# plt.show()

selected_features


46    apache_4a_hospital_death_prob
24                      d1_spo2_min
22           d1_mbp_noninvasive_min
25                     d1_sysbp_min
26         d1_sysbp_noninvasive_min
27                      d1_temp_min
23                  d1_resprate_max
47         apache_4a_icu_death_prob
2                            icu_id
21                       d1_mbp_min
Name: Feature, dtype: object

In [299]:
X = df_onehot[selected_features]
X.shape
X
cols = X.columns
cols

Index(['apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob',
       'gcs_eyes_apache', 'gcs_motor_apache', 'ventilated_apache',
       'd1_sysbp_min', 'd1_sysbp_noninvasive_min', 'd1_spo2_min',
       'apache_3j_diagnosis', 'temp_apache'],
      dtype='object')

In [206]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [86]:

# Initialize the SMOTE oversampling technique
oversampler = SMOTE(sampling_strategy='auto', random_state=42)  # You can adjust 'sampling_strategy' as needed

# Apply SMOTE to oversample the minority class
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

# Create a new DataFrame with the resampled data
df_resampled = pd.DataFrame(data=X_resampled, columns=X_train.columns)
df_resampled['hospital_death'] = y_resampled  # Add the target column back


In [125]:
df_resampled

Unnamed: 0,RecordID,hospital_id,icu_id,age,elective_surgery,pre_icu_los_days,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,gcs_eyes_apache,...,apache_2_bodysystem_Gastrointestinal,apache_2_bodysystem_Haematologic,apache_2_bodysystem_Metabolic,apache_2_bodysystem_Neurologic,apache_2_bodysystem_Renal/Genitourinary,apache_2_bodysystem_Respiratory,apache_2_bodysystem_Trauma,apache_2_bodysystem_Undefined Diagnoses,apache_2_bodysystem_Undefined diagnoses,hospital_death
0,-0.466429,0.649123,0.580952,0.217391,1.0,0.496000,0.462766,1.789948,1.0,0.000000,...,False,False,False,False,False,True,False,False,False,0.0
1,0.446309,-0.666667,1.241270,0.043478,0.0,-0.002000,-0.085106,-0.389984,0.0,0.000000,...,False,False,False,False,False,True,False,False,False,0.0
2,0.483190,-0.105263,0.031746,-0.478261,0.0,-0.164000,0.962766,-0.391804,0.0,0.000000,...,False,False,False,False,False,True,False,False,False,1.0
3,-0.516790,-0.429825,-0.085714,-0.086957,0.0,-0.184000,-0.053191,-0.599976,0.0,0.000000,...,False,False,False,False,False,False,False,False,False,0.0
4,-0.805156,-0.649123,0.574603,-0.347826,1.0,0.906000,0.968085,2.001960,1.0,0.000000,...,True,False,False,False,False,False,False,False,False,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71635,-0.949008,0.010438,-0.497600,0.047355,0.0,-0.361465,-0.047872,0.188013,0.0,0.000000,...,False,False,False,False,False,False,False,False,False,1.0
71636,-0.718244,0.261063,1.121006,-0.019007,0.0,-0.353473,0.903178,-0.186394,0.0,-0.063887,...,True,False,False,False,False,False,False,False,False,1.0
71637,-0.644587,0.065542,0.595365,-0.229715,0.0,-0.245568,-0.044836,-0.267529,0.0,-2.165550,...,False,False,False,False,False,False,False,False,False,1.0
71638,-0.272233,-0.808472,0.581653,0.557796,0.0,-0.117768,0.968967,0.003724,0.0,-3.000000,...,True,False,False,False,True,False,False,False,False,1.0


In [160]:
# X = df_onehot.loc[:, df_onehot.columns != "hospital_death"]
# y = df_onehot[["hospital_death"]]
# X.shape

(50000, 88)

In [300]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [301]:
X.shape

(50000, 10)

In [302]:
def roc_auc_cv(model, X, y):
    cv = RepeatedKFold(n_splits=10, n_repeats=2, random_state=1)
    predicted_probabilities = cross_val_score(model, X, y, scoring="roc_auc", cv=cv, n_jobs=-1)

    return mean(predicted_probabilities)

In [303]:
def calculate_roc_auc(model, X_test, y_test):
    md_probs = model.predict_proba(X_test)
    md_probs = md_probs[:, 1]
    md_auc = roc_auc_score(y_test, md_probs)
    return md_auc

# roc_auc = calculate_roc_auc(model, X_test, y_test)
# print(roc_auc)


In [307]:
model  = KNeighborsClassifier(n_neighbors=300)
# model.fit(X_train, y_train)
# calculate_roc_auc(model, X_test, y_test)
roc_auc_cv(model, X, y)


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


0.8603654177998201

In [187]:
model  = KNeighborsClassifier(n_neighbors=320)


In [71]:
X_train.shape
y.shape

(89646, 1)

In [308]:
model  = KNeighborsClassifier(n_neighbors=480)
model.fit(X_train, y_train)
calculate_roc_auc(model, X_test, y_test)


  return self._fit(X, y)


0.8603074336096499

In [256]:
model.fit(X, y)


  return self._fit(X, y)


In [99]:
calculate_roc_auc(model, X_test, y_test)

0.8655057116250574

In [259]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV

df = pd.read_csv('train.csv')
y = df['hospital_death']
X = df.drop('hospital_death', axis=1)

# Define numerical and categorical columns
numerical_columns = X.select_dtypes(include=np.number).columns
categorical_columns = X.select_dtypes(include='object').columns

# Create transformers for preprocessing
numerical_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(drop='first', sparse=False))
])

# Use ColumnTransformer to apply transformations to respective columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

# Create a Decision Tree classifier
model = KNeighborsClassifier()

# Create a feature selector using SelectKBest with F-statistic
feature_selector = SelectKBest(score_func=f_classif)

# Create a pipeline that includes preprocessing, feature selection, and the Decision Tree classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selector', feature_selector),
    ('classifier', model)
])

# Define a grid of hyperparameters to search
param_grid = {
    'feature_selector__k': range(7, 12)  # Number of features to select with Decision Tree
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=2)

# Fit the Grid Search to your data
grid_search.fit(X, y)

# Print the best hyperparameters and corresponding accuracy score
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)




Best Parameters: {'feature_selector__k': 10}
Best Accuracy Score: 0.91662




In [107]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif

# Define your KNN model
knn_model = KNeighborsClassifier()

# Define a range of values for the number of neighbors to consider
param_grid = {
    'knn__n_neighbors': range(300, 350, 400),  # Number of neighbors to consider
    'feature_selection__k': [10, 15, 20]  # Number of features to select
}

# Create a feature selector
feature_selector = SelectKBest(score_func=f_classif)

# Create the Grid Search object with a pipeline that includes feature selection
pipeline = Pipeline([
    ('feature_selection', feature_selector),
    ('knn', knn_model)
])

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

# Fit the Grid Search to your data
grid_search.fit(X, y)

# Print the best hyperparameters and corresponding ROC AUC score
print("Best Parameters:", grid_search.best_params_)
print("Best ROC AUC Score:", grid_search.best_score_)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)


Best Parameters: {'feature_selection__k': 10, 'knn__n_neighbors': 300}
Best ROC AUC Score: 0.8599244214732769


  y = column_or_1d(y, warn=True)
  return self._fit(X, y)


In [44]:
knn_model = KNeighborsClassifier()

param_grid = {
    'n_neighbors': range(295, 307),  # Number of neighbors to consider
    
}

# Create the Grid Search object
grid_search = GridSearchCV(knn_model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

# Fit the Grid Search to your data
grid_search.fit(X, y)

# Print the best hyperparameters and corresponding ROC AUC score
print("Best Parameters:", grid_search.best_params_)
print("Best ROC AUC Score:", grid_search.best_score_)


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


Best Parameters: {'n_neighbors': 298}
Best ROC AUC Score: 0.8624999588923373


  return self._fit(X, y)


In [158]:
def find_best_n_neighbors(model, X, y):
    best_n_neighbors = 0
    best_roc_auc = 0

    for n_neighbors in range(270,285 ):
        # roc_auc = calculate_roc_auc(model, X_test, y_test)
        roc_auc = calculate_roc_auc(model, X, y)
        
        if roc_auc > best_roc_auc:
            best_roc_auc = roc_auc
            best_n_neighbors = n_neighbors

    return best_n_neighbors, best_roc_auc

# Example usage:
best_n, best_roc_auc = find_best_n_neighbors(model, X, y)
print(f"Best n_neighbors: {best_n}")
print(f"Best ROC AUC CV score: {best_roc_auc}")


Best n_neighbors: 285
Best ROC AUC CV score: 0.8691283285069423


In [319]:
X

Unnamed: 0,RecordID,hospital_id,icu_id,age,elective_surgery,pre_icu_los_days,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,gcs_eyes_apache,...,apache_2_bodysystem_Cardiovascular,apache_2_bodysystem_Gastrointestinal,apache_2_bodysystem_Haematologic,apache_2_bodysystem_Metabolic,apache_2_bodysystem_Neurologic,apache_2_bodysystem_Renal/Genitourinary,apache_2_bodysystem_Respiratory,apache_2_bodysystem_Trauma,apache_2_bodysystem_Undefined Diagnoses,apache_2_bodysystem_Undefined diagnoses
0,-1.00000,-0.666667,1.241270,-1.608696,0.0,-0.366,0.000000,0.592016,0.0,-1.0,...,False,False,False,True,False,False,False,False,False,False
1,-0.99996,-0.789474,0.012698,0.173913,0.0,-0.284,0.957447,-0.595816,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
2,-0.99992,-0.429825,-0.073016,0.826087,0.0,31.780,-0.085106,-0.389984,0.0,-1.0,...,False,False,False,False,False,False,True,False,False,False
3,-0.99988,-0.815789,0.860317,-0.260870,0.0,-0.132,0.952128,-0.008000,0.0,0.0,...,False,False,False,False,True,False,False,False,False,False
4,-0.99984,0.745614,0.968254,0.869565,0.0,-0.302,-0.026596,-0.601976,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.99984,-0.807018,0.473016,-1.000000,0.0,-0.394,0.989362,1.389944,0.0,-1.0,...,False,False,False,False,False,False,False,False,True,False
49996,0.99988,-0.412281,0.568254,-0.173913,0.0,13.500,-0.010638,0.000000,0.0,0.0,...,False,False,False,False,True,False,False,False,False,False
49997,0.99992,-0.956140,1.180952,-0.652174,0.0,-0.392,-0.047872,0.188072,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
49998,0.99996,0.245614,0.571429,-0.478261,1.0,0.816,0.425532,1.607936,1.0,0.0,...,True,False,False,False,False,False,False,False,False,False


In [178]:
model = KNeighborsClassifier(n_neighbors=299)
model.fit(X, y)

  return self._fit(X, y)


In [179]:
X

Unnamed: 0,apache_4a_hospital_death_prob,d1_sysbp_noninvasive_min,d1_spo2_min,d1_mbp_noninvasive_min,d1_temp_min,apache_4a_icu_death_prob,d1_resprate_max,h1_resprate_min,d1_sysbp_min,h1_sysbp_max,h1_sysbp_noninvasive_max,heart_rate_apache,d1_mbp_min,icu_id,apache_3j_diagnosis
0,-0.333333,-0.923077,0.333333,-0.45,-0.392157,-0.333333,5.1,0.500000,-0.923077,-1.027027,-1.000000,0.181818,-0.45,1.241270,0.592016
1,0.010000,0.000000,0.500000,0.15,-1.764706,0.045000,0.2,-0.166667,0.000000,-0.972973,-0.945946,-0.515152,0.15,0.012698,-0.595816
2,2.750000,1.461538,-1.166667,0.65,-1.764706,2.166667,0.0,0.000000,1.461538,1.162162,1.189189,0.121212,0.65,-0.073016,-0.389984
3,0.583333,1.730769,-0.166667,2.00,-0.588235,0.666667,1.9,0.000000,1.730769,0.864865,0.891892,-1.545455,2.00,0.860317,-0.008000
4,0.833333,-0.192308,-0.666667,-0.05,0.588235,0.833333,0.6,1.833333,-0.192308,-0.324324,-0.297297,1.848485,-0.05,0.968254,-0.601976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,-0.250000,-0.423077,-0.333333,-0.40,0.588235,-0.166667,0.0,0.166667,-0.423077,-0.891892,-0.864865,-0.303030,-0.40,0.473016,1.389944
49996,-0.250000,1.384615,0.833333,1.40,0.000000,-0.333333,-0.5,-0.666667,1.384615,0.378378,0.405405,-0.424242,1.40,0.568254,0.000000
49997,0.333333,-0.384615,-0.166667,-0.15,-0.392157,0.500000,0.7,-0.833333,-0.384615,-0.432432,-0.405405,0.848485,-0.15,1.180952,0.188072
49998,-0.416667,0.653846,0.666667,0.90,2.156863,-0.333333,-0.7,-0.500000,0.653846,0.162162,0.189189,0.060606,0.90,0.571429,1.607936


In [257]:
def generate_predictions_for_model(model, test_file, output_file):
    
    df_test = pd.read_csv(test_file)
    record_ids = df_test["RecordID"]
    
    df_scaled = robust_scale(df_test)
    df_test_imputed = knn_impute_numerical_columns(df_scaled, n_neighbors=100)
    # df_test_imputed = df_test_imputed.dropna(axis=0)
    
    df_test_onehot = pd.get_dummies(df_test_imputed)
    
    X_test = df_test_onehot[selected_features] 
    
    # X_test = df_test_onehot.loc[:, df_test_onehot.columns != "hospital_death"]
    
    probs = model.predict_proba(X_test)
    probs = probs[:, 1]
    
    # Create a DataFrame for the results
    result = pd.DataFrame({'RecordID': record_ids, 'hospital_death': probs})
    
    # Save the results to a CSV file
    result.to_csv(output_file, index=False, header=["RecordID", "hospital_death"])


In [258]:
generate_predictions_for_model(model, "test.csv", "results47.csv")
