In [652]:
import sklearn
sklearn.__version__

import pandas as pd   
from matplotlib import pyplot
from numpy import mean
import time


from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder



def missing_value_counts(dataframe):
    # Calculate the count of missing values in each column
    missing_counts = dataframe.isnull().sum()

    # Create a DataFrame to display the results
    missing_counts_df = pd.DataFrame({'Column': missing_counts.index, 'Missing_Values_Count': missing_counts.values})

    return missing_counts_df

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

def preprocess_data(df, k_neighbors=5):
    # Step 1: Identify Numerical and Categorical Columns
    numerical_columns = df.select_dtypes(include=['number']).columns
    categorical_columns = df.select_dtypes(exclude=['number']).columns

    # Step 2: Apply Mean Imputation to Numerical Columns
    numerical_imputer = SimpleImputer(strategy='mean')
    df[numerical_columns] = numerical_imputer.fit_transform(df[numerical_columns])

    # Step 3: Min-Max Scaling of Numerical Columns
    scaler = StandardScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    # Step 4: Encode Categorical Columns to Numerical
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

    # Step 5: Initialize the KNNImputer
    knn_imputer = KNNImputer(n_neighbors=k_neighbors)

    for cat_col in categorical_columns:
        cols_for_imputation = numerical_columns.tolist() + [cat_col]
        imputation_data = df[cols_for_imputation]

        # Identify rows with missing values in the current categorical column
        missing_rows = imputation_data[imputation_data[cat_col].isnull()]

        # Perform KNN imputation
        imputed_values = knn_imputer.fit_transform(imputation_data)

        # Update the DataFrame with the imputed values
        df[cols_for_imputation] = imputed_values

    # Step 6: Inverse Transform Categorical Columns Back to Original Values
    for col in categorical_columns:
        le = label_encoders[col]
        df[col] = le.inverse_transform(df[col].astype(int))

    return df



In [653]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

def custom_impute(df):
    # Separate columns into numerical and categorical
    numerical_columns = df.select_dtypes(include=['number']).columns
    categorical_columns = df.select_dtypes(exclude=['number']).columns
    print(categorical_columns)

    # Step 1: Encode Categorical Columns to Numerical
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

    df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

    # imputer = SimpleImputer(strategy='mean')
    # df[numerical_columns] = imputer.fit_transform(df[numerical_columns])

    # Step 3: Impute Categorical Columns with Mode
    imputer = SimpleImputer(strategy='most_frequent')
    df[categorical_columns] = imputer.fit_transform(df[categorical_columns])

    
    df[categorical_columns] = df[categorical_columns].astype(str)
    
    scaler = MinMaxScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    # # Step 7: Inverse Transform Categorical Columns Back to Original Values
    # for col in categorical_columns:
    #     le = label_encoders[col]
    #     imputed_df[col] = le.inverse_transform(imputed_df[col].astype(int))

    return df

# Example usage:
# Replace 'your_dataframe' with your DataFrame.
# df_imputed = custom_impute(your_dataframe)


In [654]:
import pandas as pd
from sklearn.impute import SimpleImputer

df = pd.read_csv("train.csv")

In [655]:
imputed_df = custom_impute(df)



# categorical_columns = df.select_dtypes(exclude=['number']).columns
# categorical_columns

Index(['ethnicity', 'gender', 'icu_admit_source', 'icu_stay_type', 'icu_type',
       'apache_3j_bodysystem', 'apache_2_bodysystem'],
      dtype='object')


In [656]:
imputed_df.head()

Unnamed: 0,RecordID,hospital_id,icu_id,ethnicity,gender,icu_admit_source,icu_stay_type,icu_type,apache_3j_bodysystem,apache_2_bodysystem,...,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,d1_glucose_max,d1_potassium_max,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,immunosuppression,solid_tumor_with_metastasis,hospital_death
0,0.0,0.163366,0.96213,2,1,1,2,5,5,3,...,0.234043,0.121622,0.232394,0.189108,0.34182,0.507538,0.507614,0.0,0.0,0.0
1,2e-05,0.094059,0.504142,0,1,0,0,5,0,0,...,0.297872,0.135135,0.295775,0.483271,0.214286,0.552723,0.535224,0.0,0.0,0.0
2,4e-05,0.29703,0.472189,2,1,1,0,4,8,6,...,0.77305,0.668919,0.767606,0.16171,0.333333,0.693467,0.583756,0.0,0.0,0.0
3,6e-05,0.079208,0.820118,2,1,0,0,5,7,4,...,0.617021,0.594595,0.612676,0.343866,0.238095,0.562814,0.538071,0.0,0.0,0.0
4,8e-05,0.960396,0.860355,2,0,0,0,1,0,0,...,0.468085,0.297297,0.464789,0.068773,0.261905,0.577889,0.543147,0.0,0.0,0.0


In [657]:
missing_value_counts(imputed_df)

Unnamed: 0,Column,Missing_Values_Count
0,RecordID,0
1,hospital_id,0
2,icu_id,0
3,ethnicity,0
4,gender,0
5,icu_admit_source,0
6,icu_stay_type,0
7,icu_type,0
8,apache_3j_bodysystem,0
9,apache_2_bodysystem,0


In [658]:
# df = preprocess_data(df)
# print(imputed_df.dtypes)
df_onehot = pd.get_dummies(imputed_df)
df_onehot.dtypes


RecordID                 float64
hospital_id              float64
icu_id                   float64
age                      float64
elective_surgery         float64
                          ...   
apache_2_bodysystem_5       bool
apache_2_bodysystem_6       bool
apache_2_bodysystem_7       bool
apache_2_bodysystem_8       bool
apache_2_bodysystem_9       bool
Length: 101, dtype: object

In [578]:
# df_copy = df_onehot.copy()
X = df_onehot.loc[:, df_onehot.columns != "hospital_death"]
y = df_onehot[["hospital_death"]]

# Initialize a RandomForestClassifier (or any other tree-based model)
model = DecisionTreeClassifier(max_depth=5, min_samples_leaf=3, min_samples_split=3)

# Fit the model to your data
model.fit(X, y)  # Use y if you have a target variable, otherwise omit it

# Get feature importances
feature_importances = model.feature_importances_

# Create a DataFrame to store feature names and their importance scores
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Sort features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Select the top 20 features
selected_features = feature_importance_df['Feature'][:10]

# Create a new DataFrame with only the selected features
X_selected = X[selected_features]


In [659]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector

X = df_onehot.loc[:, df_onehot.columns != "hospital_death"]
y = df_onehot[["hospital_death"]]

# Define the classifier (replace with your choice of classifier)
clf = DecisionTreeClassifier(max_depth=6, min_samples_leaf=3, min_samples_split=3)

# Initialize the SequentialFeatureSelector
sfs = SequentialFeatureSelector(clf, n_features_to_select=10, direction='forward', cv=5)
sfs.fit(X, y)


In [660]:
selected_feature_indices = sfs.get_support(indices=True)

In [661]:
selected_feature_names = X.columns[selected_feature_indices]
selected_feature_names


Index(['apache_post_operative', 'gcs_unable_apache', 'd1_spo2_min',
       'apache_4a_icu_death_prob', 'ethnicity_0', 'ethnicity_3', 'icu_type_1',
       'icu_type_4', 'apache_3j_bodysystem_5', 'apache_2_bodysystem_0'],
      dtype='object')

In [662]:
selected_feature_names
X_selected = X[selected_feature_names]

In [663]:
X_selected

Unnamed: 0,apache_post_operative,gcs_unable_apache,d1_spo2_min,apache_4a_icu_death_prob,ethnicity_0,ethnicity_3,icu_type_1,icu_type_4,apache_3j_bodysystem_5,apache_2_bodysystem_0
0,0.0,0.0,0.95,0.507614,False,False,False,False,True,False
1,0.0,0.0,0.96,0.535224,True,False,False,False,False,True
2,0.0,0.0,0.86,0.583756,False,False,False,True,False,False
3,0.0,0.0,0.92,0.538071,False,False,False,False,False,False
4,0.0,0.0,0.89,0.543147,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...
49995,0.0,0.0,0.91,0.512690,False,False,False,False,False,False
49996,0.0,0.0,0.98,0.507614,False,False,False,False,False,False
49997,0.0,0.0,0.92,0.532995,False,False,False,False,False,True
49998,1.0,0.0,0.97,0.507614,False,False,False,False,False,True


In [None]:

# dt_classifier = DecisionTreeClassifier()

# # Define the hyperparameters and their possible values to search
# param_grid = {
#     'criterion': ['gini'],
#     'max_depth': [6],
#     'min_samples_split': [4,],
#     'min_samples_leaf': [ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 ,33]
# }

# # Create a GridSearchCV object with cross-validation
# grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=5, scoring='roc_auc')

# # Fit the GridSearchCV to your data
# grid_search.fit(X, y)

# # Get the best hyperparameters from the grid search
# best_params = grid_search.best_params_

# # Get the best estimator (model) with the optimal hyperparameters
# best_dt_model = grid_search.best_estimator_

In [None]:
# best_dt_model

In [None]:
max = -1
ans = 200

for depth in range(8, 9, 1):

    for leaf in range(70, 91, 1):

        for split in range(800, 801, 5):
            
            
            best_dt_model = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=leaf, min_samples_split=split)
            trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.1, random_state=5)
            best_dt_model.fit(trainX,trainy)

            md_probs = best_dt_model.predict_proba(testX)
            md_probs = md_probs[:,1]
            md_auc = roc_auc_score(testy, md_probs)
            md_auc
        
    
            if md_auc > max:
                max = md_auc
                ans = depth, leaf, split
                print(ans, max)
            # print(min_sam)
        

print("BEST:", ans, max) 

In [579]:

def fitting_models_CV2(model):
    cv = RepeatedKFold(n_splits=10, n_repeats=2)#, random_state=1)
    
    start = time.perf_counter()
    #scores = cross_val_score(clf, X, y, scoring="accuracy", cv=cv) 
    scores = cross_val_score(model, X, y, scoring="roc_auc", cv=cv, n_jobs=-1) 
    end = time.perf_counter()        
    score = format(mean(scores), '.4f')
    duration = format((end-start),'.4f')
    print(" {} - {}".format(score,duration))
    
    
    # dt = DecisionTreeClassifier()
    # cv = RepeatedKFold(n_splits=10, n_repeats=1)#, random_state=1)
    # grid_search = GridSearchCV(dt, parameters, cv=cv, n_jobs=-1, scoring='roc_auc',verbose=2)#, refit=False)
    # grid_search.fit(X_selected, y)
    # print(grid_search.best_estimator_)
    # print(grid_search.best_score_)
    # print(grid_search.best_params_)


In [664]:
X_selected

Unnamed: 0,apache_post_operative,gcs_unable_apache,d1_spo2_min,apache_4a_icu_death_prob,ethnicity_0,ethnicity_3,icu_type_1,icu_type_4,apache_3j_bodysystem_5,apache_2_bodysystem_0
0,0.0,0.0,0.95,0.507614,False,False,False,False,True,False
1,0.0,0.0,0.96,0.535224,True,False,False,False,False,True
2,0.0,0.0,0.86,0.583756,False,False,False,True,False,False
3,0.0,0.0,0.92,0.538071,False,False,False,False,False,False
4,0.0,0.0,0.89,0.543147,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...
49995,0.0,0.0,0.91,0.512690,False,False,False,False,False,False
49996,0.0,0.0,0.98,0.507614,False,False,False,False,False,False
49997,0.0,0.0,0.92,0.532995,False,False,False,False,False,True
49998,1.0,0.0,0.97,0.507614,False,False,False,False,False,True


In [724]:

from sklearn.naive_bayes import CategoricalNB, GaussianNB, MultinomialNB


best_dt_model = DecisionTreeClassifier(max_depth=7, min_samples_leaf=251, min_samples_split=3, criterion='gini')
# best_dt_model = MultinomialNB()

trainX, testX, trainy, testy = train_test_split(X_selected, y, test_size=0.3, random_state=5)
best_dt_model.fit(trainX,trainy)

md_probs = best_dt_model.predict_proba(testX)
md_probs = md_probs[:,1]
md_auc = roc_auc_score(testy, md_probs)
md_auc
fitting_models_CV2(best_dt_model)

 0.8546 - 2.2434


In [725]:
best_dt_model = DecisionTreeClassifier(max_depth=7, min_samples_leaf=251, min_samples_split=3)

best_dt_model.fit(X_selected,y)

In [600]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

def explore_data(df):
    # Create box plots and scatter plots for numerical columns
    numerical_columns = df.select_dtypes(include='number').columns

    for col in numerical_columns:
        plt.figure(figsize=(12, 4))

        # Box Plot
        plt.subplot(1, 2, 1)
        sns.boxplot(x=df[col])
        plt.title(f'Box Plot of {col}')
        
        # Scatter Plot (if there's another numerical column)
        if len(numerical_columns) > 1:
            plt.subplot(1, 2, 2)
            sns.scatterplot(data=df, x=col, y=numerical_columns.drop(col).values[0])
            plt.title(f'Scatter Plot: {col} vs {numerical_columns.drop(col).values[0]}')
        
        plt.show()

        # Calculate Z-scores
        z_scores = np.abs(stats.zscore(df[col]))
        df[f'{col}_ZScore'] = z_scores

        # Calculate IQR
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        df[f'{col}_IQR'] = IQR

        # Display Z-score and IQR statistics
        print(f'{col} - Z-Score Statistics:')
        print(f"   Mean Z-Score: {z_scores.mean()}")
        print(f"   Max Z-Score: {z_scores.max()}")
        print(f"   Min Z-Score: {z_scores.min()}")
        print(f'{col} - IQR Statistics:')
        print(f"   Q1: {Q1}")
        print(f"   Q3: {Q3}")
        print(f"   IQR: {IQR}")
        print(f"   Potential Outliers (IQR method):")
        print(df[(df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))][[col, f'{col}_IQR']])
        print("\n" + "=" * 50 + "\n")

# Example usage:
# Replace 'your_dataframe' with your DataFrame.
# explore_data(your_dataframe)


In [None]:
explore_data(X_selected)

In [None]:
selected_features

In [726]:
def generate_predictions_for_model(model, test_file, output_file):
    # Load the test data
    df_test = pd.read_csv(test_file)
    record_ids = df_test["RecordID"]
    
    df_test_imputed = custom_impute(df_test)
    
    df_test_onehot = pd.get_dummies(df_test_imputed)
    df_test_onehot = df_test_onehot[selected_feature_names]
    X_test = df_test_onehot.loc[:, df_test_onehot.columns != "hospital_death"]
    
    
    # Generate predictions using the model
    probs = model.predict_proba(X_test)
    probs = probs[:, 1]
    
    # Create a DataFrame for the results
    result = pd.DataFrame({'RecordID': record_ids, 'hospital_death': probs})
    
    # Save the results to a CSV file
    result.to_csv(output_file, index=False, header=["RecordID", "hospital_death"])
    
generate_predictions_for_model(best_dt_model, "test.csv", "results23.csv")


Index(['ethnicity', 'gender', 'icu_admit_source', 'icu_stay_type', 'icu_type',
       'apache_3j_bodysystem', 'apache_2_bodysystem'],
      dtype='object')


In [None]:

df_test = pd.read_csv('test.csv')
record_ids = df_test["RecordID"]

df_test_imputed = knn_impute(df_test)


In [None]:
df_test_imputed.dtypes

In [None]:

column_indices_to_convert = range(50, 57)  # Indices 51 to 58 (inclusive)

df_test_imputed.iloc[:, column_indices_to_convert] = df_test_imputed.iloc[:, column_indices_to_convert].astype(str)

df_test_onehot = pd.get_dummies(df_test_imputed)
X_test = df_test_onehot.loc[:, df_test_onehot.columns != "hospital_death"]
X_test = X_test[selected_features]


# Generate predictions using the model
probs = model.predict_proba(X_test)
probs = probs[:, 1]

# Create a DataFrame for the results
result = pd.DataFrame({'RecordID': record_ids, 'hospital_death': probs})

# Save the results to a CSV file

In [None]:
result.to_csv('results14.csv', index=False, header=["RecordID", "hospital_death"])
