In [24]:
import numpy as np
import pandas as pd

from typing import List, Tuple
import matplotlib.pyplot as plt
import math

from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from ucimlrepo import fetch_ucirepo
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings('ignore')

# Download the UCI repository dataset with id 296 and split it into
# predictors (X) and label (y).
diabetes_data = fetch_ucirepo(id=296)

X = diabetes_data.data.features
# Flatten the target frame into a 1-D array.
y = diabetes_data.data.targets.values.ravel()

# Single seed: seeds NumPy's global RNG here and is reused as `random_state`.
seed = 1234
np.random.seed(seed)

# Preprocessing: columns are routed purely by dtype.
# NOTE(review): integer-coded categorical columns (if any) would land in
# `num_cols` and be scaled as continuous values -- confirm this is intended.
obj_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include=['number']).columns

# Numeric branch: median imputation followed by standardisation.
num_transform = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Object branch: most-frequent imputation followed by one-hot encoding;
# categories not seen during fit are ignored at transform time.
obj_transform = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', num_transform, num_cols),
    ('cat', obj_transform, obj_cols),
])

# Hold out 25% of the rows for evaluation.
# NOTE(review): the split is not stratified on `y`; consider `stratify=y`
# if class proportions should match between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=seed
)

In [23]:
# Summary statistics for the numeric columns of the full table.
# `df` was never assigned anywhere in the notebook, so this cell only worked
# with leftover kernel state and raised NameError on a fresh run. Bind it
# explicitly to the fetched dataset's combined frame (IDs, features, targets).
df = diabetes_data.data.original
df.describe()

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,165201600.0,54330400.0,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,102640300.0,38696360.0,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,84961190.0,23413220.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152389000.0,45505140.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230270900.0,87545950.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


In [25]:
# Baseline: preprocessing + multinomial-capable logistic regression, fitted on
# the training split and scored on the held-out split.
# NOTE(review): the same `preprocessor` object is shared by every model
# pipeline in this notebook, so each fit refits it in place.
logistic_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=seed, max_iter=2000)),
]).fit(X_train, y_train)

y_pred_logistic = logistic_pipeline.predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(
    "Logistic Regression",
    classification_report(y_test, y_pred_logistic),
    sep="\n",
)

Logistic Regression
              precision    recall  f1-score   support

         <30       0.29      0.02      0.04      2852
         >30       0.50      0.33      0.40      8884
          NO       0.60      0.85      0.71     13706

    accuracy                           0.58     25442
   macro avg       0.46      0.40      0.38     25442
weighted avg       0.53      0.58      0.52     25442



In [26]:
# Random forest with library-default hyperparameters (only the seed is fixed),
# trained and evaluated on the same split as the other models.
random_forest_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=seed)),
]).fit(X_train, y_train)

y_pred_rf = random_forest_pipeline.predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(
    "Random Forest",
    classification_report(y_test, y_pred_rf),
    sep="\n",
)

Random Forest
              precision    recall  f1-score   support

         <30       0.53      0.01      0.02      2852
         >30       0.51      0.32      0.40      8884
          NO       0.60      0.87      0.71     13706

    accuracy                           0.58     25442
   macro avg       0.55      0.40      0.38     25442
weighted avg       0.56      0.58      0.53     25442



In [36]:
# k-nearest-neighbours classifier voting over the 100 closest training rows
# in the preprocessed feature space.
# NOTE(review): the source of k=100 is not shown in this notebook -- record
# how it was chosen.
knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=100)),
]).fit(X_train, y_train)

y_pred_knn = knn_pipeline.predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(
    "K-Nearest Neighbors",
    classification_report(y_test, y_pred_knn),
    sep="\n",
)

K-Nearest Neighbors
              precision    recall  f1-score   support

         <30       0.41      0.01      0.02      2852
         >30       0.51      0.26      0.34      8884
          NO       0.59      0.90      0.71     13706

    accuracy                           0.58     25442
   macro avg       0.50      0.39      0.36     25442
weighted avg       0.54      0.58      0.51     25442



In [32]:
# MLP settings: one hidden layer of 500 units, plain SGD, at most 20 epochs.
# NOTE(review): the search that produced these "best" values is not shown in
# this notebook. Warnings are silenced globally at the top, so a convergence
# warning caused by the 20-iteration cap would not be displayed.
best_hyperparams = dict(
    hidden_layer_sizes=[500],
    solver='sgd',
    learning_rate_init=0.01,
    max_iter=20,
)

nn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    (
        'classifier',
        MLPClassifier(
            shuffle=True,
            random_state=seed,
            verbose=False,
            **best_hyperparams,
        ),
    ),
]).fit(X_train, y_train)

y_pred_nn = nn_pipeline.predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(
    "Neural Networks",
    classification_report(y_test, y_pred_nn),
    sep="\n",
)

Neural Networks
              precision    recall  f1-score   support

         <30       0.37      0.03      0.05      2852
         >30       0.51      0.33      0.40      8884
          NO       0.61      0.87      0.71     13706

    accuracy                           0.58     25442
   macro avg       0.49      0.41      0.39     25442
weighted avg       0.55      0.58      0.53     25442

