In [553]:
import sklearn
sklearn.__version__

import pandas as pd   
from matplotlib import pyplot
from numpy import mean
import time


from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder



def missing_value_counts(dataframe):
    """Summarise how many missing values each column of ``dataframe`` holds.

    Args:
        dataframe: The pandas DataFrame to inspect.

    Returns:
        A new DataFrame with one row per input column and two columns:
        'Column' (the column label) and 'Missing_Values_Count' (number of
        null entries in that column).
    """
    # Column-wise tally of null cells; the resulting Series is indexed by column label.
    per_column_nulls = dataframe.isna().sum()

    return pd.DataFrame(
        {
            'Column': per_column_nulls.index,
            'Missing_Values_Count': per_column_nulls.values,
        }
    )

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

def preprocess_data(df, k_neighbors=5):
    """Impute and standardise a mixed-type DataFrame in place.

    Numerical columns are mean-imputed and then standardised with
    ``StandardScaler``. Every categorical (non-numeric) column that has
    missing entries is label-encoded on its observed values only, filled by a
    ``KNNImputer`` fitted on the numerical columns plus that one column, and
    mapped back to the original labels.

    Args:
        df: DataFrame to process. It is modified in place and also returned.
        k_neighbors: Number of neighbours used by the ``KNNImputer``.

    Returns:
        The same ``df`` object, with numerical columns imputed and
        standardised and categorical columns imputed.
    """
    # Local import: the notebook only does `from numpy import mean` at the top.
    import numpy as np

    # Step 1: Identify Numerical and Categorical Columns
    numerical_columns = df.select_dtypes(include=['number']).columns
    categorical_columns = df.select_dtypes(exclude=['number']).columns

    # Step 2: Apply Mean Imputation to Numerical Columns
    numerical_imputer = SimpleImputer(strategy='mean')
    df[numerical_columns] = numerical_imputer.fit_transform(df[numerical_columns])

    # Step 3: Standardise Numerical Columns (zero mean, unit variance)
    scaler = StandardScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    # Step 4: KNN-impute each categorical column separately
    knn_imputer = KNNImputer(n_neighbors=k_neighbors)
    numerical_names = numerical_columns.tolist()

    for cat_col in categorical_columns:
        observed = df[cat_col].notna().to_numpy()
        if observed.all() or not observed.any():
            # Nothing to fill, or no observed labels to learn from.
            continue

        # Encode only the observed labels and keep the gaps as NaN. Encoding
        # the whole column would turn "missing" into a regular class, leaving
        # the KNN imputer nothing to fill.
        le = LabelEncoder()
        codes = np.full(len(df), np.nan)
        codes[observed] = le.fit_transform(df[cat_col].to_numpy()[observed])

        imputation_data = df[numerical_names].copy()
        imputation_data[cat_col] = codes

        # The categorical column is the last column of the imputer output.
        imputed_values = knn_imputer.fit_transform(imputation_data)

        # KNN averages neighbour codes, so snap to the nearest valid class
        # index before mapping back to labels.
        imputed_codes = np.clip(
            np.rint(imputed_values[:, -1]), 0, len(le.classes_) - 1
        ).astype(int)
        df[cat_col] = le.inverse_transform(imputed_codes)

    return df



In [554]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

def custom_impute(df, n_neighbors=5):
    """Return an encoded, imputed and min-max-scaled copy of ``df``.

    The input frame is left untouched, so the calling cell can be re-run
    safely. Processing steps:

    1. Label-encode every non-numeric column.
    2. Fill missing numerical values with the column mean.
    3. Run a ``KNNImputer`` over numerical + encoded categorical columns.
    4. Cast the categorical codes to strings (e.g. ``'2.0'``) so that
       ``pd.get_dummies`` treats them as categories.
    5. Min-max scale the numerical columns, fitting the scaler on this frame.

    Args:
        df: Source DataFrame containing numerical and/or categorical columns.
        n_neighbors: Number of neighbours used by the ``KNNImputer``.

    Returns:
        A new DataFrame with a fresh default integer index whose columns are
        the numerical columns followed by the categorical columns.
    """
    # Work on a private copy: the steps below assign into the frame, and
    # mutating the caller's DataFrame would make the calling cell give
    # different results the second time it is run.
    df = df.copy()

    # Separate columns into numerical and categorical
    numerical_columns = df.select_dtypes(include=['number']).columns
    categorical_columns = df.select_dtypes(exclude=['number']).columns
    print(categorical_columns)

    # Step 1: Encode Categorical Columns to Numerical
    # NOTE(review): the encoder is fitted on columns that may still contain
    # NaN; with recent scikit-learn releases missing values appear to get
    # their own class code, in which case the KNN step below has nothing left
    # to impute in these columns -- confirm this is the intended behaviour.
    for col in categorical_columns:
        df[col] = LabelEncoder().fit_transform(df[col])

    # Step 2: Impute Numerical Columns with Mean
    df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

    # Step 3: KNN imputation over numerical + encoded categorical columns
    knn_imputer = KNNImputer(n_neighbors=n_neighbors)
    imputation_data = pd.concat([df[numerical_columns], df[categorical_columns]], axis=1)
    imputed_values = knn_imputer.fit_transform(imputation_data)

    # Step 4: Rebuild a DataFrame (numerical columns first, then categorical)
    imputed_df = pd.DataFrame(imputed_values, columns=imputation_data.columns)

    # Categorical codes become strings so one-hot encoding picks them up.
    imputed_df[categorical_columns] = imputed_df[categorical_columns].astype(str)

    # Step 5: Min-max scale the numerical columns
    scaler = MinMaxScaler()
    imputed_df[numerical_columns] = scaler.fit_transform(imputed_df[numerical_columns])

    return imputed_df

# Example usage:
# Replace 'your_dataframe' with your DataFrame.
# df_imputed = custom_impute(your_dataframe)


In [555]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Read train.csv (path is relative to the notebook's working directory) into df.
df = pd.read_csv("train.csv")

In [556]:
# Encode, impute and scale the loaded frame; custom_impute also prints the
# categorical column labels it detected.
imputed_df = custom_impute(df)



# categorical_columns = df.select_dtypes(exclude=['number']).columns
# categorical_columns

Index(['ethnicity', 'gender', 'icu_admit_source', 'icu_stay_type', 'icu_type',
       'apache_3j_bodysystem', 'apache_2_bodysystem'],
      dtype='object')


In [557]:
# Preview the first rows of the imputed frame.
imputed_df.head()

Unnamed: 0,RecordID,hospital_id,icu_id,age,elective_surgery,pre_icu_los_days,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,gcs_eyes_apache,...,immunosuppression,solid_tumor_with_metastasis,hospital_death,ethnicity,gender,icu_admit_source,icu_stay_type,icu_type,apache_3j_bodysystem,apache_2_bodysystem
0,0.0,0.163366,0.96213,0.164384,0.0,0.003004,0.101449,0.319394,0.0,0.666667,...,0.0,0.0,0.0,2.0,1.0,1.0,2.0,5.0,5.0,3.0
1,2e-05,0.094059,0.504142,0.726027,0.0,0.00334,0.971014,0.049545,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,0.0,0.0
2,4e-05,0.29703,0.472189,0.931507,0.0,0.134922,0.024155,0.096306,0.0,0.666667,...,0.0,0.0,0.0,2.0,1.0,1.0,0.0,4.0,8.0,6.0
3,6e-05,0.079208,0.820118,0.589041,0.0,0.003964,0.966184,0.183084,0.0,1.0,...,0.0,0.0,0.0,2.0,1.0,0.0,0.0,5.0,7.0,4.0
4,8e-05,0.960396,0.860355,0.945205,0.0,0.003267,0.077295,0.048146,0.0,1.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0


In [558]:
# Per-column count of remaining missing values after imputation.
missing_value_counts(imputed_df)

Unnamed: 0,Column,Missing_Values_Count
0,RecordID,0
1,hospital_id,0
2,icu_id,0
3,age,0
4,elective_surgery,0
5,pre_icu_los_days,0
6,apache_2_diagnosis,0
7,apache_3j_diagnosis,0
8,apache_post_operative,0
9,gcs_eyes_apache,0


In [559]:
# df = preprocess_data(df)
# print(imputed_df.dtypes)
# One-hot encode the string-typed (categorical) columns of the imputed frame;
# numeric columns pass through unchanged.
df_onehot = pd.get_dummies(imputed_df)
# Show the resulting dtypes to check that dummy columns were created.
df_onehot.dtypes


RecordID                   float64
hospital_id                float64
icu_id                     float64
age                        float64
elective_surgery           float64
                            ...   
apache_2_bodysystem_5.0       bool
apache_2_bodysystem_6.0       bool
apache_2_bodysystem_7.0       bool
apache_2_bodysystem_8.0       bool
apache_2_bodysystem_9.0       bool
Length: 101, dtype: object

In [560]:
# df_copy = df_onehot.copy()
# Split the one-hot frame into predictors (every column except the target)
# and the target column.
# NOTE(review): X still contains the scaled RecordID column, which looks like
# a row identifier rather than a real predictor -- confirm it should be kept.
X = df_onehot.loc[:, df_onehot.columns != "hospital_death"]
y = df_onehot[["hospital_death"]]

# Shallow decision tree used only to rank features by importance.
# random_state is fixed because the tree permutes features at every split, so
# ties between equally good splits would otherwise make the ranking (and hence
# the selected features) change from run to run.
model = DecisionTreeClassifier(max_depth=5, min_samples_leaf=3, min_samples_split=3, random_state=0)

# Fit the ranking tree on the full data set
model.fit(X, y)

# Get feature importances
feature_importances = model.feature_importances_

# Create a DataFrame to store feature names and their importance scores
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Sort features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Select the top 10 features
selected_features = feature_importance_df['Feature'][:10]

# Create a new DataFrame with only the selected features
X_selected = X[selected_features]


In [561]:
# Display the frame restricted to the selected features.
X_selected

Unnamed: 0,apache_4a_hospital_death_prob,d1_spo2_min,d1_sysbp_min,d1_sysbp_noninvasive_min,d1_mbp_min,d1_mbp_noninvasive_min,apache_4a_icu_death_prob,d1_temp_min,h1_resprate_min,h1_sysbp_noninvasive_max
0,0.507538,0.95,0.252101,0.251912,0.366667,0.366667,0.507614,0.729318,0.147287,0.121622
1,0.552723,0.96,0.453782,0.453644,0.500000,0.500000,0.535224,0.610895,0.116279,0.135135
2,0.693467,0.86,0.773109,0.773052,0.611111,0.611111,0.583756,0.610895,0.124031,0.668919
3,0.562814,0.92,0.831933,0.831890,0.911111,0.911111,0.538071,0.712401,0.124031,0.594595
4,0.577889,0.89,0.411765,0.411616,0.455556,0.455556,0.543147,0.813906,0.209302,0.297297
...,...,...,...,...,...,...,...,...,...,...
49995,0.512563,0.91,0.361345,0.361183,0.377778,0.377778,0.512690,0.813906,0.131783,0.155405
49996,0.512563,0.98,0.756303,0.756241,0.777778,0.777778,0.507614,0.763153,0.093023,0.472973
49997,0.547739,0.92,0.369748,0.369589,0.433333,0.433333,0.532995,0.729318,0.085271,0.270270
49998,0.502513,0.97,0.596639,0.596537,0.666667,0.666667,0.507614,0.949247,0.100775,0.418919


In [None]:

# dt_classifier = DecisionTreeClassifier()

# # Define the hyperparameters and their possible values to search
# param_grid = {
#     'criterion': ['gini'],
#     'max_depth': [6],
#     'min_samples_split': [4,],
#     'min_samples_leaf': [ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 ,33]
# }

# # Create a GridSearchCV object with cross-validation
# grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=5, scoring='roc_auc')

# # Fit the GridSearchCV to your data
# grid_search.fit(X, y)

# # Get the best hyperparameters from the grid search
# best_params = grid_search.best_params_

# # Get the best estimator (model) with the optimal hyperparameters
# best_dt_model = grid_search.best_estimator_

In [None]:
# best_dt_model

In [None]:
# Manual search over decision-tree hyper-parameters, scored by ROC AUC on a
# single 10% hold-out split.
# The running best score is kept in `best_auc` rather than `max`, so the
# builtin max() is not shadowed for the rest of the kernel session.
best_auc = -1
ans = None  # (depth, leaf, split) of the best candidate seen so far

# The split uses a fixed random_state, so it is identical for every candidate;
# compute it once instead of once per iteration.
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.1, random_state=5)

for depth in range(8, 9, 1):
    for leaf in range(70, 91, 1):
        for split in range(800, 801, 5):
            best_dt_model = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=leaf, min_samples_split=split)
            best_dt_model.fit(trainX, trainy)

            # Probability of the positive class on the hold-out rows.
            md_probs = best_dt_model.predict_proba(testX)[:, 1]
            md_auc = roc_auc_score(testy, md_probs)

            if md_auc > best_auc:
                best_auc = md_auc
                ans = depth, leaf, split
                print(ans, best_auc)

# Note: best_dt_model now holds the LAST candidate tried, not necessarily the
# best one; `ans` holds the best parameter triple.
print("BEST:", ans, best_auc)

In [562]:

def fitting_models_CV2(model):
    """Cross-validate ``model`` and print its mean ROC AUC and the elapsed time.

    Uses the notebook-level ``X`` and ``y`` with a 10-fold, 2-repeat
    ``RepeatedKFold``. No ``random_state`` is set on the splitter, so the
    folds are drawn afresh on every call.

    Args:
        model: Estimator passed to ``cross_val_score``.

    Returns:
        None. Prints ``" <mean score> - <seconds>"``, both with 4 decimals.
    """
    folds = RepeatedKFold(n_splits=10, n_repeats=2)

    t0 = time.perf_counter()
    auc_per_fold = cross_val_score(model, X, y, scoring="roc_auc", cv=folds, n_jobs=-1)
    t1 = time.perf_counter()

    mean_auc_text = format(mean(auc_per_fold), '.4f')
    elapsed_text = format((t1 - t0), '.4f')
    print(" {} - {}".format(mean_auc_text, elapsed_text))
    
    
    # dt = DecisionTreeClassifier()
    # cv = RepeatedKFold(n_splits=10, n_repeats=1)#, random_state=1)
    # grid_search = GridSearchCV(dt, parameters, cv=cv, n_jobs=-1, scoring='roc_auc',verbose=2)#, refit=False)
    # grid_search.fit(X_selected, y)
    # print(grid_search.best_estimator_)
    # print(grid_search.best_score_)
    # print(grid_search.best_params_)


In [571]:

from sklearn.naive_bayes import CategoricalNB, GaussianNB, MultinomialNB


# Candidate tree evaluated two ways: a single 70/30 hold-out split on the
# selected features, then repeated k-fold cross-validation.
best_dt_model = DecisionTreeClassifier(max_depth=8, min_samples_leaf=900, min_samples_split=500)
# best_dt_model = MultinomialNB()

trainX, testX, trainy, testy = train_test_split(X_selected, y, test_size=0.3, random_state=5)
best_dt_model.fit(trainX,trainy)

# Probability of the positive class on the hold-out rows.
md_probs = best_dt_model.predict_proba(testX)[:, 1]
md_auc = roc_auc_score(testy, md_probs)
# A bare `md_auc` in the middle of a cell displays nothing, so print it explicitly.
print("Hold-out ROC AUC: {:.4f}".format(md_auc))

# NOTE(review): fitting_models_CV2 cross-validates on the notebook-level X
# (all features), not on X_selected used for the hold-out fit above -- confirm
# that this difference is intended.
fitting_models_CV2(best_dt_model)

 0.8491 - 2.1912


In [564]:
# Final model: re-create the tree with the chosen hyper-parameters and fit it
# on all rows of the selected-feature frame (no hold-out).
best_dt_model = DecisionTreeClassifier(max_depth=8, min_samples_leaf=251, min_samples_split=5)

# As the last expression of the cell, fit() also displays the fitted estimator.
best_dt_model.fit(X_selected,y)

In [None]:
# Display the names of the features kept by the importance ranking.
selected_features

In [508]:
def generate_predictions_for_model(model, test_file, output_file):
    """Score a test CSV with ``model`` and write the predictions to a CSV file.

    The test file is pushed through ``custom_impute`` and one-hot encoding,
    then reduced to the notebook-level ``selected_features``.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        test_file: Path of the CSV to score; must contain a 'RecordID' column.
        output_file: Path of the CSV to write, with columns 'RecordID' and
            'hospital_death' (predicted probability of the positive class).

    Raises:
        KeyError: If a selected feature is missing from the processed test frame.
    """
    # Load the test data
    df_test = pd.read_csv(test_file)

    # Copy the ids up front so they cannot be affected by whatever
    # preprocessing is applied to df_test afterwards.
    record_ids = df_test["RecordID"].copy()

    # NOTE(review): custom_impute fits its MinMaxScaler on the frame it is
    # given, so the test features are scaled with the test set's own min/max
    # rather than the training set's -- confirm this is acceptable.
    df_test_imputed = custom_impute(df_test)

    # selected_features is drawn from the training predictors, which already
    # exclude 'hospital_death', so no further target filtering is needed.
    X_test = pd.get_dummies(df_test_imputed)[selected_features]

    # Generate predictions using the model (probability of the positive class)
    probs = model.predict_proba(X_test)[:, 1]

    # Create a DataFrame for the results
    result = pd.DataFrame({'RecordID': record_ids, 'hospital_death': probs})

    # Save the results to a CSV file
    result.to_csv(output_file, index=False, header=["RecordID", "hospital_death"])

generate_predictions_for_model(best_dt_model, "test.csv", "results23.csv")


Index(['ethnicity', 'gender', 'icu_admit_source', 'icu_stay_type', 'icu_type',
       'apache_3j_bodysystem', 'apache_2_bodysystem'],
      dtype='object')


In [None]:

# Read the test file and keep its record ids for the submission frame.
df_test = pd.read_csv('test.csv')
record_ids = df_test["RecordID"]

# NOTE(review): knn_impute is not defined anywhere in the visible notebook;
# custom_impute looks like its successor -- confirm, otherwise this cell
# raises NameError on a fresh kernel.
df_test_imputed = knn_impute(df_test)


In [None]:
# Inspect the column dtypes of the imputed test frame.
df_test_imputed.dtypes

In [None]:

# Positional column indices 50..56 (7 columns; the upper bound of range() is exclusive)
column_indices_to_convert = range(50, 57)

# Cast those columns to strings so that get_dummies one-hot encodes them.
df_test_imputed.iloc[:, column_indices_to_convert] = df_test_imputed.iloc[:, column_indices_to_convert].astype(str)

df_test_onehot = pd.get_dummies(df_test_imputed)
X_test = df_test_onehot.loc[:, df_test_onehot.columns != "hospital_death"]
X_test = X_test[selected_features]


# Generate predictions using the model.
# Use best_dt_model, which was fitted on the selected features; `model` is the
# feature-ranking tree fitted on every column of X and would reject a frame
# reduced to selected_features.
probs = best_dt_model.predict_proba(X_test)
probs = probs[:, 1]

# Create a DataFrame for the results
result = pd.DataFrame({'RecordID': record_ids, 'hospital_death': probs})

# The results are saved to a CSV file in the next cell

In [None]:
# Write the prediction frame to results14.csv without the index column.
result.to_csv('results14.csv', index=False, header=["RecordID", "hospital_death"])
