In [102]:
import sklearn
sklearn.__version__

import pandas as pd   
from matplotlib import pyplot
from numpy import mean
import time


from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder



def missing_value_counts(dataframe):
    """Tabulate the number of null entries in each column of ``dataframe``.

    Returns a new DataFrame with one row per input column and two columns:
    ``Column`` (the column label) and ``Missing_Values_Count`` (how many
    entries ``isnull`` reports as missing). The result carries a default
    integer index; the input is not modified.
    """
    null_totals = dataframe.isna().sum()
    return pd.DataFrame(
        {
            'Column': null_totals.index,
            'Missing_Values_Count': null_totals.to_numpy(),
        }
    )

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

def preprocess_data(df, k_neighbors=5):
    """Impute and standardise ``df`` in place and return it.

    Numerical columns are mean-imputed and then standardised with
    ``StandardScaler``. Each non-numerical column that has missing entries is
    filled by ``KNNImputer`` run over the scaled numerical columns plus the
    integer label codes of that one column; the column is handed back with its
    original labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preprocess. It is modified in place.
    k_neighbors : int, default 5
        ``n_neighbors`` passed to ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after imputation and scaling.

    Notes
    -----
    Missing values are kept as NaN while label-encoding. Encoding the whole
    column would either raise or turn NaN into a category of its own, and the
    KNN step would then have nothing left to impute.
    Columns with no observed value at all are left untouched.
    """
    numerical_columns = df.select_dtypes(include=['number']).columns
    categorical_columns = df.select_dtypes(exclude=['number']).columns

    if len(numerical_columns) > 0:
        # Mean imputation first, so the scaler never sees NaN.
        numerical_imputer = SimpleImputer(strategy='mean')
        df[numerical_columns] = numerical_imputer.fit_transform(df[numerical_columns])

        # Standardisation (zero mean, unit variance) - not min-max scaling.
        scaler = StandardScaler()
        df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    # Encode only the observed labels; missing entries stay NaN for the imputer.
    label_encoders = {}
    for col in categorical_columns:
        observed = df[col].notna().to_numpy()
        if not observed.any():
            continue  # nothing to learn labels from
        le = LabelEncoder()
        codes = pd.Series(float('nan'), index=df.index, dtype='float64')
        codes.iloc[observed] = le.fit_transform(df[col][observed])
        df[col] = codes
        label_encoders[col] = le

    # Impute one categorical column at a time against the numerical columns.
    # NOTE(review): KNNImputer averages the neighbours' codes, which treats the
    # sorted label order as a numeric scale; a neighbour majority vote would be
    # more principled for nominal categories.
    knn_imputer = KNNImputer(n_neighbors=k_neighbors)
    for cat_col in label_encoders:
        if not df[cat_col].isna().any():
            continue
        cols_for_imputation = numerical_columns.tolist() + [cat_col]
        imputed_values = knn_imputer.fit_transform(df[cols_for_imputation])
        # The categorical column is last in cols_for_imputation.
        df[cat_col] = imputed_values[:, -1]

    # Map codes back to labels. Imputed codes can be fractional, so round to
    # the nearest valid code instead of truncating.
    for col, le in label_encoders.items():
        codes = df[col].round().clip(0, len(le.classes_) - 1).astype(int)
        df[col] = le.inverse_transform(codes)

    return df



In [103]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

def custom_impute(df):
    """Mode/mean-impute ``df`` in place, scale numerics, and return it.

    Non-numerical columns are label-encoded, their missing entries are filled
    with the most frequent code, and the column is stored as the string form
    of the integer code (``"0"``, ``"1"``, ...). Numerical columns are
    mean-imputed and then min-max scaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preprocess. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Notes
    -----
    Only observed labels are passed to ``LabelEncoder``. Encoding a column
    that still contains NaN makes NaN a category of its own, which leaves
    nothing for the mode imputation to fill.
    A column with no observed value at all becomes the single code ``"0"``.
    """
    numerical_columns = df.select_dtypes(include=['number']).columns
    categorical_columns = df.select_dtypes(exclude=['number']).columns

    for col in categorical_columns:
        observed = df[col].notna().to_numpy()
        codes = pd.Series(float('nan'), index=df.index, dtype='float64')
        if observed.any():
            codes.iloc[observed] = LabelEncoder().fit_transform(df[col][observed])
            # mode() is sorted, so ties resolve to the smallest code.
            codes = codes.fillna(codes.mode().iloc[0])
        else:
            codes = codes.fillna(0.0)
        # Codes are kept as strings so the column stays non-numerical.
        df[col] = codes.astype(int).astype(str)

    if len(numerical_columns) > 0:
        df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

        scaler = MinMaxScaler()
        df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    return df

# Example usage:
# Replace 'your_dataframe' with your DataFrame.
# df_imputed = custom_impute(your_dataframe)


In [213]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

def median_impute(df, flag=True):
    """Optionally drop incomplete rows, then one-hot encode ``df``.

    Despite its name, this function performs no median imputation: it either
    discards every row containing a missing value (``flag`` true) or keeps
    all rows with their missing values (``flag`` false), and then applies
    ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    flag : bool, default True
        When true, rows with any missing value are dropped before encoding.
        The default lets the one-argument call ``median_impute(df)`` work.

    Returns
    -------
    pandas.DataFrame
        One-hot encoded frame; missing values remain when ``flag`` is false.
    """
    if flag:
        df = df.dropna(how='any', axis=0)
    return pd.get_dummies(df)

# Example usage:
# Replace 'your_dataframe' with your DataFrame.
# df_imputed = custom_impute(your_dataframe)


In [105]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Load the raw training data; `df` is the frame later cells pass to median_impute.
df = pd.read_csv("train.csv")

# Work on a copy so the raw frame in `df` stays untouched.
df_onehot = df.copy()

# One-hot encode the non-numerical columns of the copy.
df_onehot = pd.get_dummies(df_onehot)

# Remember the column labels: fit_transform below is not handed them back.
column_names = df_onehot.columns

# Fill every missing value with its column median.
# From here on `df_onehot` holds the imputer's output, not the encoded frame.
imputer = SimpleImputer(strategy='median')
df_onehot = imputer.fit_transform(df_onehot)

# Rebuild a DataFrame from the imputed values under the saved column labels.
result_df = pd.DataFrame(df_onehot, columns=column_names)
result_df

# result_df holds the median-imputed, one-hot encoded data.
# NOTE(review): no later cell in this notebook reads result_df - confirm whether it is still needed.


Unnamed: 0,RecordID,hospital_id,icu_id,age,elective_surgery,pre_icu_los_days,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,gcs_eyes_apache,...,apache_2_bodysystem_Cardiovascular,apache_2_bodysystem_Gastrointestinal,apache_2_bodysystem_Haematologic,apache_2_bodysystem_Metabolic,apache_2_bodysystem_Neurologic,apache_2_bodysystem_Renal/Genitourinary,apache_2_bodysystem_Respiratory,apache_2_bodysystem_Trauma,apache_2_bodysystem_Undefined Diagnoses,apache_2_bodysystem_Undefined diagnoses
0,1.0,126.0,1931.0,28.0,0.0,0.009722,122.0,703.03,0.0,3.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,112.0,1544.0,69.0,0.0,0.038194,302.0,109.09,0.0,4.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,153.0,1517.0,84.0,0.0,11.171528,106.0,212.01,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,4.0,109.0,1811.0,59.0,0.0,0.090972,301.0,403.01,0.0,4.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,5.0,287.0,1845.0,85.0,0.0,0.031944,117.0,106.01,0.0,4.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49996.0,110.0,1689.0,42.0,0.0,0.000000,308.0,1102.01,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
49996,49997.0,155.0,1719.0,61.0,0.0,4.824306,120.0,407.01,0.0,4.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
49997,49998.0,93.0,1912.0,50.0,0.0,0.000694,113.0,501.05,0.0,4.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,49999.0,230.0,1720.0,54.0,1.0,0.420139,202.0,1211.01,1.0,4.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [157]:
# Drop every row that has a missing value, then one-hot encode the rest.
# The flag is passed explicitly: True selects the row-dropping behaviour.
imputed_df = median_impute(df, True)
imputed_df['hospital_death']  # fails early with KeyError if the target column is absent
imputed_df


# categorical_columns = df.select_dtypes(exclude=['number']).columns
# categorical_columns

Unnamed: 0,RecordID,hospital_id,icu_id,age,elective_surgery,pre_icu_los_days,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,gcs_eyes_apache,...,apache_2_bodysystem_Cardiovascular,apache_2_bodysystem_Gastrointestinal,apache_2_bodysystem_Haematologic,apache_2_bodysystem_Metabolic,apache_2_bodysystem_Neurologic,apache_2_bodysystem_Renal/Genitourinary,apache_2_bodysystem_Respiratory,apache_2_bodysystem_Trauma,apache_2_bodysystem_Undefined Diagnoses,apache_2_bodysystem_Undefined diagnoses
2,3,153,1517,84.0,0,11.171528,106.0,212.01,0,3.0,...,False,False,False,False,False,False,True,False,False,False
3,4,109,1811,59.0,0,0.090972,301.0,403.01,0,4.0,...,False,False,False,False,True,False,False,False,False,False
4,5,287,1845,85.0,0,0.031944,117.0,106.01,0,4.0,...,True,False,False,False,False,False,False,False,False,False
5,6,293,1463,54.0,0,0.222917,113.0,501.06,0,3.0,...,True,False,False,False,False,False,False,False,False,False
6,7,180,1169,39.0,1,2.111806,218.0,1505.01,1,4.0,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49993,49994,93,1912,66.0,1,0.265972,303.0,1301.02,1,4.0,...,False,False,False,False,False,False,True,False,False,False
49994,49995,267,1647,29.0,0,0.293750,302.0,109.01,0,4.0,...,True,False,False,False,False,False,False,False,False,False
49995,49996,110,1689,42.0,0,0.000000,308.0,1102.01,0,3.0,...,False,False,False,False,False,False,False,False,True,False
49996,49997,155,1719,61.0,0,4.824306,120.0,407.01,0,4.0,...,False,False,False,False,True,False,False,False,False,False


In [209]:
imputed_df

Unnamed: 0,RecordID,hospital_id,icu_id,age,elective_surgery,pre_icu_los_days,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,gcs_eyes_apache,...,apache_2_bodysystem_Cardiovascular,apache_2_bodysystem_Gastrointestinal,apache_2_bodysystem_Haematologic,apache_2_bodysystem_Metabolic,apache_2_bodysystem_Neurologic,apache_2_bodysystem_Renal/Genitourinary,apache_2_bodysystem_Respiratory,apache_2_bodysystem_Trauma,apache_2_bodysystem_Undefined Diagnoses,apache_2_bodysystem_Undefined diagnoses
2,3,153,1517,84.0,0,11.171528,106.0,212.01,0,3.0,...,False,False,False,False,False,False,True,False,False,False
3,4,109,1811,59.0,0,0.090972,301.0,403.01,0,4.0,...,False,False,False,False,True,False,False,False,False,False
4,5,287,1845,85.0,0,0.031944,117.0,106.01,0,4.0,...,True,False,False,False,False,False,False,False,False,False
5,6,293,1463,54.0,0,0.222917,113.0,501.06,0,3.0,...,True,False,False,False,False,False,False,False,False,False
6,7,180,1169,39.0,1,2.111806,218.0,1505.01,1,4.0,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49993,49994,93,1912,66.0,1,0.265972,303.0,1301.02,1,4.0,...,False,False,False,False,False,False,True,False,False,False
49994,49995,267,1647,29.0,0,0.293750,302.0,109.01,0,4.0,...,True,False,False,False,False,False,False,False,False,False
49995,49996,110,1689,42.0,0,0.000000,308.0,1102.01,0,3.0,...,False,False,False,False,False,False,False,False,True,False
49996,49997,155,1719,61.0,0,4.824306,120.0,407.01,0,4.0,...,False,False,False,False,True,False,False,False,False,False


In [158]:
imputed_df.head()

Unnamed: 0,RecordID,hospital_id,icu_id,age,elective_surgery,pre_icu_los_days,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,gcs_eyes_apache,...,apache_2_bodysystem_Cardiovascular,apache_2_bodysystem_Gastrointestinal,apache_2_bodysystem_Haematologic,apache_2_bodysystem_Metabolic,apache_2_bodysystem_Neurologic,apache_2_bodysystem_Renal/Genitourinary,apache_2_bodysystem_Respiratory,apache_2_bodysystem_Trauma,apache_2_bodysystem_Undefined Diagnoses,apache_2_bodysystem_Undefined diagnoses
2,3,153,1517,84.0,0,11.171528,106.0,212.01,0,3.0,...,False,False,False,False,False,False,True,False,False,False
3,4,109,1811,59.0,0,0.090972,301.0,403.01,0,4.0,...,False,False,False,False,True,False,False,False,False,False
4,5,287,1845,85.0,0,0.031944,117.0,106.01,0,4.0,...,True,False,False,False,False,False,False,False,False,False
5,6,293,1463,54.0,0,0.222917,113.0,501.06,0,3.0,...,True,False,False,False,False,False,False,False,False,False
6,7,180,1169,39.0,1,2.111806,218.0,1505.01,1,4.0,...,False,False,False,False,True,False,False,False,False,False


In [159]:
missing_value_counts(imputed_df)

Unnamed: 0,Column,Missing_Values_Count
0,RecordID,0
1,hospital_id,0
2,icu_id,0
3,age,0
4,elective_surgery,0
...,...,...
91,apache_2_bodysystem_Renal/Genitourinary,0
92,apache_2_bodysystem_Respiratory,0
93,apache_2_bodysystem_Trauma,0
94,apache_2_bodysystem_Undefined Diagnoses,0


In [160]:
# df = preprocess_data(df)
# print(imputed_df.dtypes)
# imputed_df is already the result of pd.get_dummies inside median_impute.
# NOTE(review): this second encoding is presumably a no-op kept only to bind
# the name df_onehot - confirm.
df_onehot = pd.get_dummies(imputed_df)
df_onehot.dtypes


RecordID                                     int64
hospital_id                                  int64
icu_id                                       int64
age                                        float64
elective_surgery                             int64
                                            ...   
apache_2_bodysystem_Renal/Genitourinary       bool
apache_2_bodysystem_Respiratory               bool
apache_2_bodysystem_Trauma                    bool
apache_2_bodysystem_Undefined Diagnoses       bool
apache_2_bodysystem_Undefined diagnoses       bool
Length: 96, dtype: object

In [161]:
# df_copy = df_onehot.copy()
# Features: every column except the target.
# NOTE(review): X still includes the RecordID column, which looks like a row
# identifier rather than a predictor - confirm it should be a feature.
X = df_onehot.loc[:, df_onehot.columns != "hospital_death"]
# Target, kept as a one-column DataFrame (double brackets) rather than a Series.
y = df_onehot[["hospital_death"]]

In [210]:
X

Unnamed: 0,RecordID,hospital_id,icu_id,age,elective_surgery,pre_icu_los_days,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,gcs_eyes_apache,...,apache_2_bodysystem_Cardiovascular,apache_2_bodysystem_Gastrointestinal,apache_2_bodysystem_Haematologic,apache_2_bodysystem_Metabolic,apache_2_bodysystem_Neurologic,apache_2_bodysystem_Renal/Genitourinary,apache_2_bodysystem_Respiratory,apache_2_bodysystem_Trauma,apache_2_bodysystem_Undefined Diagnoses,apache_2_bodysystem_Undefined diagnoses
2,3,153,1517,84.0,0,11.171528,106.0,212.01,0,3.0,...,False,False,False,False,False,False,True,False,False,False
3,4,109,1811,59.0,0,0.090972,301.0,403.01,0,4.0,...,False,False,False,False,True,False,False,False,False,False
4,5,287,1845,85.0,0,0.031944,117.0,106.01,0,4.0,...,True,False,False,False,False,False,False,False,False,False
5,6,293,1463,54.0,0,0.222917,113.0,501.06,0,3.0,...,True,False,False,False,False,False,False,False,False,False
6,7,180,1169,39.0,1,2.111806,218.0,1505.01,1,4.0,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49993,49994,93,1912,66.0,1,0.265972,303.0,1301.02,1,4.0,...,False,False,False,False,False,False,True,False,False,False
49994,49995,267,1647,29.0,0,0.293750,302.0,109.01,0,4.0,...,True,False,False,False,False,False,False,False,False,False
49995,49996,110,1689,42.0,0,0.000000,308.0,1102.01,0,3.0,...,False,False,False,False,False,False,False,False,True,False
49996,49997,155,1719,61.0,0,4.824306,120.0,407.01,0,4.0,...,False,False,False,False,True,False,False,False,False,False


In [131]:


# Initialize a shallow decision tree used only to rank the features
model = DecisionTreeClassifier(max_depth=5, min_samples_leaf=3, min_samples_split=3)

# Fit the tree on all columns of X
model.fit(X, y)  # Use y if you have a target variable, otherwise omit it

# Get feature importances (one score per column of X)
feature_importances = model.feature_importances_

# Create a DataFrame to store feature names and their importance scores
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Sort features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Select the top 20 features
selected_features = feature_importance_df['Feature'][:20]

# Create a new DataFrame with only the selected features.
# Note that `model` itself stays fitted on the full X, not on X_selected.
X_selected = X[selected_features]


In [126]:
from sklearn.decomposition import PCA
pca = PCA(n_components=30)  # Reduce to 30 principal components
# NOTE(review): X is passed to PCA without scaling, so large-valued columns
# will presumably dominate the leading components - confirm this is intended.
X_new = pca.fit_transform(X)


In [127]:
# The first expression builds a DataFrame and discards it; only the
# assignment on the next line has an effect.
pd.DataFrame(X_new)
# Wrap the PCA output in a DataFrame (columns are the component positions).
X_new = pd.DataFrame(X_new)
X_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,24999.527976,157.887735,-384.908615,8.521745,-94.967996,-57.487797,76.180550,2.456428,-7.738923,-23.812389,...,4.747729,-14.627328,-6.760888,4.279475,6.450322,0.155182,-0.125862,0.463130,0.010137,-0.509702
1,24998.513600,-416.642929,8.482580,95.082245,179.007728,-107.393583,79.263887,-35.450162,-45.317660,-4.574555,...,2.194274,-5.669542,-1.418183,-1.219523,0.990576,0.059978,0.231840,-1.014907,-0.144902,-0.912153
2,24997.501246,-330.348999,35.688687,1.181446,-23.617405,110.426264,44.086544,11.734995,11.075760,43.261096,...,-6.169253,-2.486883,3.518053,3.460331,-1.645463,0.001553,0.038106,-5.590530,1.330701,10.186855
3,24996.523250,-127.925660,-259.047153,14.539260,178.782001,60.941905,86.098674,-47.801723,-35.315164,12.736748,...,4.381439,-3.436134,-4.005850,2.214806,-3.305088,-0.026503,-0.010538,1.499096,0.025195,-0.334132
4,24995.537093,-437.477908,-293.260217,-33.830169,-67.094026,-12.854542,-84.275993,119.438310,-18.013632,20.307145,...,-2.630069,0.618008,-0.067631,-1.931462,2.990652,-0.039385,0.038955,1.110791,-0.168379,-1.520121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,-24995.505228,570.472929,-149.411090,-97.992283,31.470118,-76.267061,89.306273,-4.164297,-20.576095,-19.127586,...,-4.704477,2.873696,0.818508,-1.584241,0.642879,-0.009891,-0.118464,1.767003,-0.028010,-1.073351
49996,-24996.485625,-137.441054,-172.854236,-17.437247,-33.113352,88.009609,41.535256,-27.801917,-40.596774,12.863301,...,0.689837,-6.597579,-7.155204,1.101833,0.779296,-0.046285,0.017381,-0.350614,0.702004,4.579014
49997,-24997.471685,-45.757063,-367.381625,-17.091917,-94.629322,-26.868690,108.071685,59.320740,-16.502850,-4.662540,...,3.194707,-8.614838,2.427598,1.403678,-4.468487,-0.014352,-0.064442,1.285516,-0.041306,-0.896053
49998,-24998.502772,670.443986,-181.901336,-48.051116,-31.811182,66.600006,-29.853720,9.428442,-22.564866,-7.205130,...,-9.244343,-0.527508,0.860700,-0.219488,0.664668,-0.091318,-0.097970,-1.121165,0.157927,-0.445778


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector

# Rebuild the feature matrix and target from the one-hot encoded frame.
X = df_onehot.loc[:, df_onehot.columns != "hospital_death"]
y = df_onehot[["hospital_death"]]

# Estimator scored by the selector: a depth-limited decision tree.
clf = DecisionTreeClassifier(max_depth=6, min_samples_leaf=3, min_samples_split=3)

# Forward selection of 10 features, each candidate scored with 5-fold CV.
sfs = SequentialFeatureSelector(clf, n_features_to_select=10, direction='forward', cv=5)
sfs.fit(X, y)


In [None]:
# Integer positions (within X's columns) of the features the selector kept.
selected_feature_indices = sfs.get_support(indices=True)

In [None]:
# Translate the selected positions into column labels.
selected_feature_names = X.columns[selected_feature_indices]
selected_feature_names


In [None]:
# The bare name on the next line is not the cell's last expression, so it displays nothing.
selected_feature_names
# Restrict X to the selector's features. This rebinds X_selected, which the
# feature-importance cell also assigns.
X_selected = X[selected_feature_names]

In [None]:
X_selected

In [None]:

# dt_classifier = DecisionTreeClassifier()

# # Define the hyperparameters and their possible values to search
# param_grid = {
#     'criterion': ['gini'],
#     'max_depth': [6],
#     'min_samples_split': [4,],
#     'min_samples_leaf': [ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 ,33]
# }

# # Create a GridSearchCV object with cross-validation
# grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=5, scoring='roc_auc')

# # Fit the GridSearchCV to your data
# grid_search.fit(X, y)

# # Get the best hyperparameters from the grid search
# best_params = grid_search.best_params_

# # Get the best estimator (model) with the optimal hyperparameters
# best_dt_model = grid_search.best_estimator_

In [None]:
# best_dt_model

In [None]:
# Manual grid search over decision-tree hyper-parameters, scored by ROC AUC on
# one fixed 10% hold-out split. The running best is kept in `best_auc` /
# `best_params` so the builtin `max` is not shadowed for the rest of the kernel.
best_auc = -1
best_params = None

# The split does not depend on the hyper-parameters, so it is built once.
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.1, random_state=5)

for depth in range(8, 9, 1):
    for leaf in range(70, 91, 1):
        for split in range(800, 801, 5):
            # After the loops this name holds the LAST candidate, not the best one.
            best_dt_model = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=leaf, min_samples_split=split)
            best_dt_model.fit(trainX, trainy)

            # Probability of the positive class for each hold-out row.
            md_probs = best_dt_model.predict_proba(testX)[:, 1]
            md_auc = roc_auc_score(testy, md_probs)

            if md_auc > best_auc:
                best_auc = md_auc
                best_params = depth, leaf, split
                print(best_params, best_auc)

print("BEST:", best_params, best_auc)

In [163]:

def fitting_models_CV2(model, features=None, target=None, random_state=None):
    """Cross-validate ``model`` with repeated 10-fold CV and report mean ROC AUC.

    Prints ``" <mean AUC> - <seconds>"`` (both to four decimals) and returns
    the mean AUC.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_val_score``.
    features, target : optional
        Data to evaluate on. When omitted, the notebook-level ``X`` and ``y``
        are used, which is the original behaviour.
    random_state : int or None, default None
        Seed for ``RepeatedKFold``. ``None`` keeps the original unseeded
        (non-reproducible) fold assignment; pass an int to make runs
        repeatable.

    Returns
    -------
    float
        Mean ROC AUC over the 20 folds (10 splits x 2 repeats).
    """
    features = X if features is None else features
    target = y if target is None else target
    cv = RepeatedKFold(n_splits=10, n_repeats=2, random_state=random_state)

    start = time.perf_counter()
    scores = cross_val_score(model, features, target, scoring="roc_auc", cv=cv, n_jobs=-1)
    end = time.perf_counter()

    mean_auc = mean(scores)
    score = format(mean_auc, '.4f')
    duration = format((end - start), '.4f')
    print(" {} - {}".format(score, duration))
    return mean_auc
    
    
    # dt = DecisionTreeClassifier()
    # cv = RepeatedKFold(n_splits=10, n_repeats=1)#, random_state=1)
    # grid_search = GridSearchCV(dt, parameters, cv=cv, n_jobs=-1, scoring='roc_auc',verbose=2)#, refit=False)
    # grid_search.fit(X_selected, y)
    # print(grid_search.best_estimator_)
    # print(grid_search.best_score_)
    # print(grid_search.best_params_)


In [133]:
X_selected

Unnamed: 0,apache_4a_hospital_death_prob,d1_spo2_min,d1_sysbp_min,d1_sysbp_noninvasive_min,d1_mbp_min,d1_mbp_noninvasive_min,apache_4a_icu_death_prob,d1_temp_min,h1_resprate_min,h1_sysbp_max,gcs_motor_apache,d1_resprate_max,resprate_apache,icu_id,h1_diasbp_noninvasive_min,temp_apache,apache_2_bodysystem_Metabolic,icu_type_Med-Surg ICU,icu_type_MICU,icu_type_Cardiac ICU
0,0.01,95.0,71.0,71.0,55.0,55.0,0.00,36.2,19.0,93.0,6.0,77.0,51.0,1931.0,51.0,36.2,1.0,1.0,0.0,0.0
1,0.05,96.0,95.0,95.0,67.0,67.0,0.02,35.5,15.0,95.0,6.0,28.0,31.0,1544.0,58.0,35.5,0.0,1.0,0.0,0.0
2,0.38,86.0,133.0,133.0,77.0,77.0,0.15,35.5,16.0,174.0,5.0,26.0,26.0,1517.0,64.0,36.1,0.0,0.0,1.0,0.0
3,0.12,92.0,140.0,140.0,104.0,104.0,0.06,36.1,16.0,163.0,6.0,45.0,58.0,1811.0,76.0,36.3,0.0,1.0,0.0,0.0
4,0.15,89.0,90.0,90.0,63.0,63.0,0.07,36.7,27.0,119.0,6.0,32.0,37.0,1845.0,66.0,36.7,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.02,91.0,84.0,84.0,56.0,56.0,0.01,36.7,17.0,98.0,6.0,26.0,5.0,1689.0,62.0,36.9,0.0,1.0,0.0,0.0
49996,0.02,98.0,131.0,131.0,92.0,92.0,0.00,36.4,12.0,145.0,6.0,21.0,12.0,1719.0,80.0,36.4,0.0,1.0,0.0,0.0
49997,0.09,92.0,85.0,85.0,61.0,61.0,0.05,36.2,11.0,115.0,6.0,33.0,33.0,1912.0,64.0,36.2,0.0,1.0,0.0,0.0
49998,0.00,97.0,112.0,112.0,82.0,82.0,0.00,37.5,13.0,137.0,6.0,19.0,13.0,1720.0,75.0,36.5,0.0,0.0,0.0,1.0


In [None]:
# from imblearn.over_sampling import RandomOverSampler

# # Create an instance of RandomOverSampler
# ros = RandomOverSampler(random_state=42)

# # Fit and apply the oversampler to your data
# X_resampled, y_resampled = ros.fit_resample(X, y)


In [171]:
# Work on a separately named copy so the raw training frame bound to `df`
# (read from train.csv) is not replaced by the one-hot encoded data; earlier
# cells that take `df` as input can then still be re-run.
train_onehot = df_onehot.copy()

# Count the number of samples in each class
class_counts = train_onehot['hospital_death'].value_counts()

# Find the minority class
minority_class = class_counts.idxmin()

# Get the data of the minority class
minority_data = train_onehot[train_onehot['hospital_death'] == minority_class]

# Append the minority rows once more: this doubles that class, it does not
# necessarily balance the two classes.
oversampled_df = pd.concat([train_onehot, minority_data])

# Shuffle the oversampled DataFrame (optional but recommended)
oversampled_df = oversampled_df.sample(frac=1, random_state=42)
# NOTE(review): no later cell in this notebook builds X / y from
# oversampled_df - confirm whether the oversampled data is meant to be used.

In [172]:
oversampled_df

Unnamed: 0,RecordID,hospital_id,icu_id,age,elective_surgery,pre_icu_los_days,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,gcs_eyes_apache,...,apache_2_bodysystem_Cardiovascular,apache_2_bodysystem_Gastrointestinal,apache_2_bodysystem_Haematologic,apache_2_bodysystem_Metabolic,apache_2_bodysystem_Neurologic,apache_2_bodysystem_Renal/Genitourinary,apache_2_bodysystem_Respiratory,apache_2_bodysystem_Trauma,apache_2_bodysystem_Undefined Diagnoses,apache_2_bodysystem_Undefined diagnoses
7772,7773,170,1373,57.0,0,0.041667,122.0,703.03,0,3.0,...,False,False,False,True,False,False,False,False,False,False
19712,19713,161,1487,64.0,0,13.763194,301.0,410.01,0,1.0,...,False,False,False,False,True,False,False,False,False,False
32747,32748,161,1481,83.0,0,0.000000,114.0,102.01,0,1.0,...,True,False,False,False,False,False,False,False,False,False
12399,12400,109,1802,55.0,0,0.004167,122.0,703.03,0,1.0,...,False,False,False,True,False,False,False,False,False,False
26247,26248,252,1449,58.0,1,0.487500,213.0,1405.07,1,4.0,...,False,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8544,8545,223,1535,76.0,0,4.061806,110.0,104.01,0,4.0,...,True,False,False,False,False,False,False,False,False,False
15442,15443,118,1644,60.0,0,0.281250,102.0,206.01,0,4.0,...,False,False,False,False,False,False,True,False,False,False
26475,26476,287,1867,76.0,1,0.159028,217.0,1502.02,1,1.0,...,False,False,False,False,True,False,False,False,False,False
1179,1180,207,1343,70.0,0,0.018750,306.0,801.03,0,4.0,...,False,False,True,False,False,False,False,False,False,False


In [211]:
X

Unnamed: 0,RecordID,hospital_id,icu_id,age,elective_surgery,pre_icu_los_days,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,gcs_eyes_apache,...,apache_2_bodysystem_Cardiovascular,apache_2_bodysystem_Gastrointestinal,apache_2_bodysystem_Haematologic,apache_2_bodysystem_Metabolic,apache_2_bodysystem_Neurologic,apache_2_bodysystem_Renal/Genitourinary,apache_2_bodysystem_Respiratory,apache_2_bodysystem_Trauma,apache_2_bodysystem_Undefined Diagnoses,apache_2_bodysystem_Undefined diagnoses
2,3,153,1517,84.0,0,11.171528,106.0,212.01,0,3.0,...,False,False,False,False,False,False,True,False,False,False
3,4,109,1811,59.0,0,0.090972,301.0,403.01,0,4.0,...,False,False,False,False,True,False,False,False,False,False
4,5,287,1845,85.0,0,0.031944,117.0,106.01,0,4.0,...,True,False,False,False,False,False,False,False,False,False
5,6,293,1463,54.0,0,0.222917,113.0,501.06,0,3.0,...,True,False,False,False,False,False,False,False,False,False
6,7,180,1169,39.0,1,2.111806,218.0,1505.01,1,4.0,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49993,49994,93,1912,66.0,1,0.265972,303.0,1301.02,1,4.0,...,False,False,False,False,False,False,True,False,False,False
49994,49995,267,1647,29.0,0,0.293750,302.0,109.01,0,4.0,...,True,False,False,False,False,False,False,False,False,False
49995,49996,110,1689,42.0,0,0.000000,308.0,1102.01,0,3.0,...,False,False,False,False,False,False,False,False,True,False
49996,49997,155,1719,61.0,0,4.824306,120.0,407.01,0,4.0,...,False,False,False,False,True,False,False,False,False,False


In [207]:


from sklearn.naive_bayes import CategoricalNB, GaussianNB, MultinomialNB


# Candidate final model; the commented lines are alternative estimators.
best_dt_model = DecisionTreeClassifier(max_depth=7, min_samples_leaf=10, min_samples_split=300, criterion='gini')
# best_dt_model = MultinomialNB()
# best_dt_model = GaussianNB()
# best_dt_model = CategoricalNB()
# best_dt_model = KNeighborsClassifier(n_neighbors=500)

# Single 80/20 hold-out split.
# NOTE(review): random_state=70 is the seed the random-state search cell
# reports as giving the highest hold-out AUC, so this hold-out score is likely
# optimistic; the cross-validated figure printed below is lower.
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.2, random_state=70)
best_dt_model.fit(trainX,trainy)

# Hold-out ROC AUC from the positive-class probabilities.
md_probs = best_dt_model.predict_proba(testX)
md_probs = md_probs[:,1]
md_auc = roc_auc_score(testy, md_probs)
# The bare name on the next line is mid-cell, so it displays nothing.
md_auc
# Repeated 10-fold CV on the full X / y; prints " <mean AUC> - <seconds>".
fitting_models_CV2(best_dt_model)
md_auc

 0.8564 - 1.9069


0.8761151871593688

In [189]:
from sklearn.naive_bayes import CategoricalNB, GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np

# Initialize variables to keep track of the highest AUC score and its corresponding random_state
best_auc = -1
best_random_state = None

# Loop over random_state values from 1 to 90 (range(1, 91))
# NOTE(review): picking the split seed that maximises hold-out AUC measures
# how lucky a split is, not how good the model is - confirm this is intended.
for random_state in range(1, 91):
    # Create and fit the Decision Tree model
    # (min_samples_split is 10 here, unlike the 300 used in the final-model cell)
    best_dt_model = DecisionTreeClassifier(max_depth=7, min_samples_leaf=10, min_samples_split=10, criterion='gini')
    trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.2, random_state=random_state)
    best_dt_model.fit(trainX, trainy)

    # Calculate ROC AUC score for the current random_state
    md_probs = best_dt_model.predict_proba(testX)
    md_probs = md_probs[:, 1]
    md_auc = roc_auc_score(testy, md_probs)
    
    # Check if the current AUC score is higher than the best so far
    if md_auc > best_auc:
        best_auc = md_auc
        best_random_state = random_state

# Output the best AUC score and its corresponding random_state
# After the loop, trainX / testX / trainy / testy and best_dt_model belong to
# the LAST seed tried (90), not to best_random_state.
print(f"Best ROC AUC: {best_auc} (Random State: {best_random_state})")


Best ROC AUC: 0.8741931134199331 (Random State: 70)


In [191]:
X.shape

(36496, 95)

In [208]:
# Model that is later passed to generate_predictions_for_model.
best_dt_model = DecisionTreeClassifier(max_depth=7, min_samples_leaf=10, min_samples_split=300)

# NOTE(review): trainX / trainy are whatever split the most recently executed
# cell left behind, so the fitted model depends on execution order - confirm
# which split is intended.
best_dt_model.fit(trainX,trainy)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

def explore_data(df):
    """Plot and summarise every numerical column of ``df`` for outlier review.

    For each numerical column this draws a box plot and, when the frame has
    more than one numerical column, a scatter plot against the first other
    numerical column. It then prints absolute z-score statistics, the
    quartiles / IQR, and the rows lying outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to explore. It is NOT modified: the per-column z-scores and IQR
        are kept in local variables rather than written back as
        ``<col>_ZScore`` / ``<col>_IQR`` columns, so a feature matrix passed
        in keeps its original columns.

    Returns
    -------
    None
    """
    numerical_columns = df.select_dtypes(include='number').columns

    for col in numerical_columns:
        plt.figure(figsize=(12, 4))

        # Box Plot
        plt.subplot(1, 2, 1)
        sns.boxplot(x=df[col])
        plt.title(f'Box Plot of {col}')

        # Scatter Plot against the first other numerical column, if any
        if len(numerical_columns) > 1:
            partner = numerical_columns.drop(col).values[0]
            plt.subplot(1, 2, 2)
            sns.scatterplot(data=df, x=col, y=partner)
            plt.title(f'Scatter Plot: {col} vs {partner}')

        plt.show()

        # Absolute z-scores; missing values are skipped when estimating the
        # mean and standard deviation so one NaN does not blank the column.
        z_scores = np.abs(stats.zscore(df[col], nan_policy='omit'))

        # Interquartile range
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        # Display Z-score and IQR statistics (NaN-aware reductions)
        print(f'{col} - Z-Score Statistics:')
        print(f"   Mean Z-Score: {np.nanmean(z_scores)}")
        print(f"   Max Z-Score: {np.nanmax(z_scores)}")
        print(f"   Min Z-Score: {np.nanmin(z_scores)}")
        print(f'{col} - IQR Statistics:')
        print(f"   Q1: {Q1}")
        print(f"   Q3: {Q3}")
        print(f"   IQR: {IQR}")
        print(f"   Potential Outliers (IQR method):")
        outlier_mask = (df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))
        # Same two-column table as before, built on a temporary frame.
        print(df.loc[outlier_mask, [col]].assign(**{f'{col}_IQR': IQR}))
        print("\n" + "=" * 50 + "\n")

# Example usage:
# Replace 'your_dataframe' with your DataFrame.
# explore_data(your_dataframe)


In [None]:
explore_data(X_selected)

In [None]:
selected_features

In [214]:
def generate_predictions_for_model(model, test_file, output_file):
    """Score a test CSV with ``model`` and write the predictions to a CSV.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``predict_proba``. If it exposes ``feature_names_in_``,
        the test features are aligned to exactly those columns, in that
        order.
    test_file : str or path
        CSV containing a ``RecordID`` column plus the feature columns.
    output_file : str or path
        Destination CSV with the columns ``RecordID`` and ``hospital_death``
        (probability of the second class returned by ``predict_proba``).

    Returns
    -------
    None
    """
    # Load the test data
    df_test = pd.read_csv(test_file)
    record_ids = df_test["RecordID"]

    # flag=False keeps every row, so each RecordID receives a prediction.
    df_test_imputed = median_impute(df_test, False)

    df_test_onehot = pd.get_dummies(df_test_imputed)
    # df_test_onehot = df_test_onehot[selected_feature_names]
    X_test = df_test_onehot.loc[:, df_test_onehot.columns != "hospital_death"]
    # X_test = pca.transform(X_test)

    # One-hot encoding the test file on its own can yield a different set or
    # order of dummy columns than the training data. Align to the columns the
    # model was fitted on: absent dummies become 0, unseen ones are dropped.
    expected_columns = getattr(model, "feature_names_in_", None)
    if expected_columns is not None:
        X_test = X_test.reindex(columns=list(expected_columns), fill_value=0)

    # NOTE(review): missing values are still present in X_test at this point;
    # confirm the estimator in use accepts NaN at prediction time.
    probs = model.predict_proba(X_test)
    probs = probs[:, 1]

    # Create a DataFrame for the results
    result = pd.DataFrame({'RecordID': record_ids, 'hospital_death': probs})

    # Save the results to a CSV file
    result.to_csv(output_file, index=False, header=["RecordID", "hospital_death"])
    
generate_predictions_for_model(best_dt_model, "test.csv", "results31.csv")


In [None]:

# Load the test set and keep its identifiers for the submission file.
df_test = pd.read_csv('test.csv')
record_ids = df_test["RecordID"]

# NOTE(review): `knn_impute` is not defined anywhere in the visible notebook,
# so this line raises NameError on a fresh kernel - confirm which
# preprocessing function was intended (preprocess_data? custom_impute?).
df_test_imputed = knn_impute(df_test)


In [None]:
df_test_imputed.dtypes

In [None]:

column_indices_to_convert = range(50, 57)  # 0-based column positions 50 to 56 (7 columns; the stop value 57 is excluded)

# Cast those columns to strings so get_dummies treats them as categorical.
df_test_imputed.iloc[:, column_indices_to_convert] = df_test_imputed.iloc[:, column_indices_to_convert].astype(str)

df_test_onehot = pd.get_dummies(df_test_imputed)
X_test = df_test_onehot.loc[:, df_test_onehot.columns != "hospital_death"]
# Keep only the top-20 features chosen in the feature-importance cell.
X_test = X_test[selected_features]


# Generate predictions using the model
# NOTE(review): the only visible fit of `model` uses all columns of X, while
# X_test is narrowed to selected_features here - confirm `model` was refit on
# X_selected, otherwise the feature sets do not match.
probs = model.predict_proba(X_test)
probs = probs[:, 1]

# Create a DataFrame for the results
result = pd.DataFrame({'RecordID': record_ids, 'hospital_death': probs})

# The results are written to a CSV file by the following cell.

In [None]:
result.to_csv('results14.csv', index=False, header=["RecordID", "hospital_death"])
