In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Data imports
# Load the cleaned dataset; the first CSV column is used as the DataFrame index.
# The path is relative, so it resolves against the notebook's working directory.
data = pd.read_csv('Data/cleaned_data.csv', index_col=0)

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE

# Model imports
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Metrics imports
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
def test_model(model, X_train, y_train, X_test, y_test, just_score=False):
    """Fit `model` on the training data and evaluate it on the test data.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`.
    X_train, y_train : data the estimator is fitted on.
    X_test, y_test : data the predictions are scored against.
    just_score : when True, skip the printed report and return the F1 score.

    Returns
    -------
    `(fitted, f1)` when `just_score` is True, otherwise just `fitted` after
    printing accuracy, precision, recall, F1, the confusion matrix and the
    classification report (all with the metric functions' default settings).

    Note: `fitted` is whatever `model.fit` returns. For scikit-learn estimators
    that is the estimator itself, so the result is the same object as `model`
    and a later call with the same `model` overwrites it.
    """
    fitted = model.fit(X_train, y_train)
    predictions = fitted.predict(X_test)

    if not just_score:
        # Each metric is computed and printed in turn, in this order
        report = (
            ('Accuracy: ', accuracy_score),
            ('Precision: ', precision_score),
            ('Recall: ', recall_score),
            ('F1: ', f1_score),
            ('Confusion matrix: \n', confusion_matrix),
            ('Classification report: \n', classification_report),
        )
        for label, metric in report:
            print(label, metric(y_test, predictions))
        return fitted

    return fitted, f1_score(y_test, predictions)

Models !!

In [4]:
# Numeric columns (these are the ones rescaled later in the notebook)
numerical_features = [
    'invisits',
    'non_lab_procedures',
    'num_diag',
    'num_meds',
    'num_tests',
    'length_stay',
    'sum_visits',
]

# Categorical columns (these are the ones one-hot encoded later in the notebook)
categorical_features = [
    'admission_source',
    'diag_3',
    'admission_type',
    'glucose_test_result',
    'diag_2',
    'diag_1',
    'race',
    'disposition',
    'a1c_test_result',
]

In [5]:
# Targets, free-text descriptions and identifiers: dropped from both feature matrices
non_feature_columns = ['readmitted_binary', 'readmitted_multiclass', 'diag_1_description',
                       'diag_2_description', 'diag_3_description', 'patient_id']

# Rows with a label form the modelling set; rows without one form the final test set
is_labelled = data['readmitted_binary'].notna()
y = data.loc[is_labelled, 'readmitted_binary']
X = data.loc[is_labelled].drop(columns=non_feature_columns)
X_final_test = data.loc[~is_labelled].drop(columns=non_feature_columns)

# 'age' is treated as ordinal, so it is excluded from one-hot encoding.
# A comprehension (rather than a set difference) keeps the column order stable
# between kernel restarts, which keeps the encoded column order stable too.
categorical_features = [feature for feature in categorical_features if feature != 'age']

# Cast the categorical features to strings so the encoder sees a single dtype per column
as_strings = {feature: str for feature in categorical_features}
X = X.astype(as_strings)
X_final_test = X_final_test.astype(as_strings)

# Dense one-hot encoder; categories unseen during fit encode as all zeros.
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and later removed,
# so try the new keyword first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# NOTE(review): the encoder and the scaler below are fitted on ALL labelled rows,
# i.e. before any train/test split made later in the notebook.
encoded = encoder.fit_transform(X[categorical_features])
encoded_final_test = encoder.transform(X_final_test[categorical_features])

# Wrap the encoded arrays in DataFrames that share the original row index
encoded_columns = encoder.get_feature_names_out(categorical_features)
encoded_df = pd.DataFrame(encoded, columns=encoded_columns, index=X.index)
encoded_df_final_test = pd.DataFrame(encoded_final_test, columns=encoded_columns, index=X_final_test.index)

# Swap the raw categorical columns for their one-hot encoded counterparts
X = pd.concat([X.drop(columns=categorical_features), encoded_df], axis=1)
X_final_test = pd.concat([X_final_test.drop(columns=categorical_features), encoded_df_final_test], axis=1)

# Scale the numerical features to [0, 1]; the scaler is fitted on the labelled rows only
to_scale = list(numerical_features)

scaler = MinMaxScaler()
X[to_scale] = scaler.fit_transform(X[to_scale])
X_final_test[to_scale] = scaler.transform(X_final_test[to_scale])

# Both matrices must expose the same columns in the same order for later predictions
assert list(X.columns) == list(X_final_test.columns)



In [14]:
# Undersample the data so the classes are balanced before ranking features
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Recursive feature elimination driven by a random forest, keeping 10 features.
# The forest is seeded so the resulting ranking is reproducible between runs.
rfe = RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=10, verbose=1)

# Fit the eliminator to the balanced data
rfe.fit(X_resampled, y_resampled)

# Print the features and their ranking (high = dropped early on)
for feature, rank in zip(X.columns, rfe.ranking_):
    print(f'{feature} - Rank: {rank}')

Fitting estimator with 114 features.
Fitting estimator with 113 features.
Fitting estimator with 112 features.
Fitting estimator with 111 features.
Fitting estimator with 110 features.
Fitting estimator with 109 features.
Fitting estimator with 108 features.
Fitting estimator with 107 features.
Fitting estimator with 106 features.
Fitting estimator with 105 features.
Fitting estimator with 104 features.
Fitting estimator with 103 features.
Fitting estimator with 102 features.
Fitting estimator with 101 features.
Fitting estimator with 100 features.
Fitting estimator with 99 features.
Fitting estimator with 98 features.
Fitting estimator with 97 features.
Fitting estimator with 96 features.
Fitting estimator with 95 features.
Fitting estimator with 94 features.
Fitting estimator with 93 features.
Fitting estimator with 92 features.
Fitting estimator with 91 features.
Fitting estimator with 90 features.
Fitting estimator with 89 features.
Fitting estimator with 88 features.
Fitting estim

In [16]:
# Pair every feature name with its RFE rank and order the table by rank, lowest first
rankings = (
    pd.DataFrame({'Features': X.columns, 'Ranking': rfe.ranking_})
    .sort_values(by='Ranking')
)
rankings

In [7]:
# Repeated hold-out evaluation: for every rank threshold, train the MLP on 100
# random 80/20 splits and collect the F1 score obtained on each test part.
n_repeats = 100
score_columns = {}

# Initialize the model (seeded so that a re-run reproduces the same scores)
model = MLPClassifier(hidden_layer_sizes=(4,4,4), activation='logistic', max_iter=500, random_state=42)

for num_features in range(20, 50, 2):
    # Keep every feature whose RFE rank is below `num_features`.
    # NOTE(review): this is a rank threshold, not a count. RFE gives rank 1 to all
    # retained features, so the number of selected columns can differ from
    # `num_features`; confirm the '<N> features' labels mean what is intended.
    selected_features = rankings[rankings['Ranking'] < num_features]['Features'].values

    # The feature subset does not depend on the repetition, so build it once
    X_rfe = X[selected_features]

    split_scores = []
    for i in range(n_repeats):
        # Seeding the split with the repetition index makes runs reproducible and
        # evaluates every threshold on the same 100 train/test partitions
        X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.2, shuffle=True, random_state=i)

        # Undersample the training part only, so the test part keeps its class balance
        rus = RandomUnderSampler(random_state=42)
        X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

        # Fit the model and keep only its F1 score
        fit, score = test_model(model, X_train_resampled, y_train_resampled, X_test, y_test, just_score=True)
        split_scores.append(score)

    score_columns[f'{num_features} features'] = split_scores
    print(f'{num_features} features done')

# One column per threshold, one row per repetition; built once instead of
# growing a DataFrame with concat inside the loop
scores = pd.DataFrame(score_columns)

NameError: name 'model' is not defined

In [35]:
# Start below any reachable F1 so the first fit is always recorded and
# `best_model` can never be left as None
high_score = float('-inf')
best_model = None

# Features whose RFE rank is below 20 (a rank threshold, not a feature count)
selected_features = rankings[rankings['Ranking'] < 20]['Features'].values

# The feature subset is the same for every repetition, so build it once
X_rfe = X[selected_features]

for i in range(100):

    # Split the data into training and testing sets (seeded per repetition for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.1, shuffle=True, random_state=i)

    # Undersample the training part only
    rus = RandomUnderSampler(random_state=42)
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

    # Build a fresh estimator every repetition. Re-fitting one shared instance would
    # overwrite the weights that `best_model` points to, so `best_model` would always
    # end up being the LAST fit rather than the best one.
    model = MLPClassifier(hidden_layer_sizes=(60,60,60,60), activation='relu', max_iter=500, random_state=i)

    # Fit the model
    fit, score = test_model(model, X_train_resampled, y_train_resampled, X_test, y_test, just_score=True)

    # NOTE(review): the winner is picked on its own test split, so `high_score`
    # is an optimistic estimate of how the model will do on unseen data.
    if score > high_score:
        high_score = score
        best_model = fit
        print(f'New high score: {high_score}')

New high score: 0.25113545284531125
New high score: 0.25314009661835746




New high score: 0.2599192207175101




New high score: 0.2819466248037677
New high score: 0.28290121430915655
New high score: 0.28894055131185653


In [34]:
# Use the best fitted model to predict the unlabelled (final test) rows
y_pred = best_model.predict(X_final_test[selected_features])

# Keep the raw predictions in a frame indexed like the final test set
predictions = pd.DataFrame(y_pred, index=X_final_test.index, columns=['readmitted_binary'])

# Write the predictions as 'Yes' (predicted 1) / 'No' (anything else), with a header row
readable = pd.Series(
    np.where(predictions['readmitted_binary'] == 1, 'Yes', 'No'),
    index=predictions.index,
    name='readmitted_binary',
)
readable.to_csv('predictions.csv', header=True)