In [48]:
import sklearn
sklearn.__version__

import pandas as pd   
from matplotlib import pyplot
from numpy import mean
import time


from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder



def missing_value_counts(dataframe):
    # Calculate the count of missing values in each column
    missing_counts = dataframe.isnull().sum()

    # Create a DataFrame to display the results
    missing_counts_df = pd.DataFrame({'Column': missing_counts.index, 'Missing_Values_Count': missing_counts.values})

    return missing_counts_df


In [49]:
import pandas as pd
from sklearn.impute import SimpleImputer

df = pd.read_csv("train.csv")
df.dtypes

RecordID                           int64
hospital_id                        int64
icu_id                             int64
ethnicity                         object
gender                            object
icu_admit_source                  object
icu_stay_type                     object
icu_type                          object
apache_3j_bodysystem              object
apache_2_bodysystem               object
age                              float64
elective_surgery                   int64
pre_icu_los_days                 float64
apache_2_diagnosis               float64
apache_3j_diagnosis              float64
apache_post_operative              int64
gcs_eyes_apache                  float64
gcs_motor_apache                 float64
gcs_unable_apache                float64
gcs_verbal_apache                float64
heart_rate_apache                float64
intubated_apache                   int64
resprate_apache                  float64
temp_apache                      float64
ventilated_apach

In [50]:
numerical_columns = df.select_dtypes(include=['number']).columns

numerical_imputer = SimpleImputer(strategy='mean')

df[numerical_columns] = numerical_imputer.fit_transform(df[numerical_columns])

scaler = MinMaxScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])



numerical_columns = df.select_dtypes(include=['number']).columns
categorical_columns = df.select_dtypes(exclude=['number']).columns

label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Step 3: Initialize the KNNImputer
knn_imputer = KNNImputer(n_neighbors=5)  # You can adjust the number of neighbors as needed

for cat_col in categorical_columns:
    cols_for_imputation = numerical_columns.tolist() + [cat_col]
    imputation_data = df[cols_for_imputation]
    
    # Identify rows with missing values in the current categorical column
    missing_rows = imputation_data[imputation_data[cat_col].isnull()]
    
    # Perform KNN imputation
    
    imputed_values = knn_imputer.fit_transform(imputation_data)
    
    # Update the DataFrame with the imputed values
    df[cols_for_imputation] = imputed_values
    
# Step 5: Inverse Transform Categorical Columns Back to Original Values
for col in categorical_columns:
    le = label_encoders[col]
    df[col] = le.inverse_transform(df[col].astype(int))

In [51]:
df_onehot = pd.get_dummies(df)
df_onehot.dtypes


RecordID                                   float64
hospital_id                                float64
icu_id                                     float64
age                                        float64
elective_surgery                           float64
                                            ...   
apache_2_bodysystem_Renal/Genitourinary       bool
apache_2_bodysystem_Respiratory               bool
apache_2_bodysystem_Trauma                    bool
apache_2_bodysystem_Undefined Diagnoses       bool
apache_2_bodysystem_Undefined diagnoses       bool
Length: 96, dtype: object

In [52]:
df_copy = df_onehot.copy()
X = df_onehot.loc[:, df_onehot.columns != "hospital_death"]
y = df_onehot[["hospital_death"]]

# Initialize a RandomForestClassifier (or any other tree-based model)
model = DecisionTreeClassifier()

# Fit the model to your data
model.fit(X, y)  # Use y if you have a target variable, otherwise omit it

# Get feature importances
feature_importances = model.feature_importances_

# Create a DataFrame to store feature names and their importance scores
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Sort features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Select the top 20 features
selected_features = feature_importance_df['Feature'][:20]

# Create a new DataFrame with only the selected features
X_selected = X[selected_features]


In [53]:
X_selected

Unnamed: 0,apache_4a_hospital_death_prob,d1_spo2_min,RecordID,pre_icu_los_days,age,h1_resprate_min,icu_id,heart_rate_apache,d1_glucose_max,d1_potassium_max,d1_sysbp_min,temp_apache,d1_resprate_max,apache_4a_icu_death_prob,hospital_id,d1_heartrate_max,resprate_apache,apache_3j_diagnosis,h1_heartrate_min,d1_temp_min
0,0.507538,0.95,0.00000,0.003004,0.164384,0.147287,0.962130,0.540541,0.189108,0.341820,0.252101,0.539474,0.807692,0.507614,0.163366,0.436975,0.839286,0.319394,0.407407,0.729318
1,0.552723,0.96,0.00002,0.003340,0.726027,0.116279,0.504142,0.385135,0.483271,0.214286,0.453782,0.447368,0.179487,0.535224,0.094059,0.294118,0.482143,0.049545,0.342593,0.610895
2,0.693467,0.86,0.00004,0.134922,0.931507,0.124031,0.472189,0.527027,0.161710,0.333333,0.773109,0.526316,0.153846,0.583756,0.297030,0.420168,0.392857,0.096306,0.518519,0.610895
3,0.562814,0.92,0.00006,0.003964,0.589041,0.124031,0.820118,0.155405,0.343866,0.238095,0.831933,0.552632,0.397436,0.538071,0.079208,0.159664,0.964286,0.183084,0.305556,0.712401
4,0.577889,0.89,0.00008,0.003267,0.945205,0.209302,0.860355,0.912162,0.068773,0.261905,0.411765,0.605263,0.230769,0.543147,0.960396,0.882353,0.589286,0.048146,1.000000,0.813906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.512563,0.91,0.99992,0.002889,0.356164,0.131783,0.675740,0.432432,0.046468,0.261905,0.361345,0.631579,0.153846,0.512690,0.084158,0.260504,0.017857,0.500666,0.407407,0.813906
49996,0.512563,0.98,0.99994,0.059906,0.616438,0.093023,0.711243,0.405405,0.133829,0.238095,0.756303,0.565789,0.089744,0.507614,0.306931,0.201681,0.142857,0.184901,0.425926,0.763153
49997,0.547739,0.92,0.99996,0.002897,0.465753,0.085271,0.939645,0.689189,0.081784,0.341820,0.369748,0.539474,0.243590,0.532995,0.000000,0.605042,0.517857,0.227627,0.805556,0.729318
49998,0.502513,0.97,0.99998,0.007854,0.520548,0.100775,0.712426,0.513514,0.085502,0.285714,0.596639,0.568963,0.064103,0.507614,0.678218,0.361345,0.160714,0.550188,0.407407,0.949247


In [54]:

dt_classifier = DecisionTreeClassifier()

# Define the hyperparameters and their possible values to search
param_grid = {
    'criterion': ['gini'],
    'max_depth': [5, 6, 7, 8],
    'min_samples_split': [3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_leaf': [2, 3, 4, 5, 6, 7]
}

# Create a GridSearchCV object with cross-validation
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=5, scoring='roc_auc')

# Fit the GridSearchCV to your data
grid_search.fit(X, y)

# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_

# Get the best estimator (model) with the optimal hyperparameters
best_dt_model = grid_search.best_estimator_

In [55]:
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.1, random_state=5)
best_dt_model.fit(trainX,trainy)

md_probs = best_dt_model.predict_proba(testX)
md_probs = md_probs[:,1]
md_auc = roc_auc_score(testy, md_probs)
md_auc

0.8479151188234695

In [56]:
best_dt_model.fit(X,y)

In [57]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

def preprocess_data(df, k_neighbors=5):
    # Step 1: Identify Numerical and Categorical Columns
    numerical_columns = df.select_dtypes(include=['number']).columns
    categorical_columns = df.select_dtypes(exclude=['number']).columns

    # Step 2: Apply Mean Imputation to Numerical Columns
    numerical_imputer = SimpleImputer(strategy='mean')
    df[numerical_columns] = numerical_imputer.fit_transform(df[numerical_columns])

    # Step 3: Min-Max Scaling of Numerical Columns
    scaler = MinMaxScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    # Step 4: Encode Categorical Columns to Numerical
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

    # Step 5: Initialize the KNNImputer
    knn_imputer = KNNImputer(n_neighbors=k_neighbors)

    for cat_col in categorical_columns:
        cols_for_imputation = numerical_columns.tolist() + [cat_col]
        imputation_data = df[cols_for_imputation]

        # Identify rows with missing values in the current categorical column
        missing_rows = imputation_data[imputation_data[cat_col].isnull()]

        # Perform KNN imputation
        imputed_values = knn_imputer.fit_transform(imputation_data)

        # Update the DataFrame with the imputed values
        df[cols_for_imputation] = imputed_values

    # Step 6: Inverse Transform Categorical Columns Back to Original Values
    for col in categorical_columns:
        le = label_encoders[col]
        df[col] = le.inverse_transform(df[col].astype(int))

    return df

# Usage example:
# df_processed = preprocess_data(df)


In [58]:
def generate_predictions_for_model(model, test_file, output_file):
    # Load the test data
    df_test = pd.read_csv(test_file)
    record_ids = df_test["RecordID"]
    
    preprocess_data(df_test)
    
    df_test_onehot = pd.get_dummies(df_test)
    X_test = df_test_onehot.loc[:, df_test_onehot.columns != "hospital_death"]
    
    
    # Generate predictions using the model
    probs = model.predict_proba(X_test)
    probs = probs[:, 1]
    
    # Create a DataFrame for the results
    result = pd.DataFrame({'RecordID': record_ids, 'hospital_death': probs})
    
    # Save the results to a CSV file
    result.to_csv(output_file, index=False, header=["RecordID", "hospital_death"])
    
generate_predictions_for_model(best_dt_model, "test.csv", "results9.csv")
