In [1]:
!pip install ucimlrepo



In [2]:
from ucimlrepo import fetch_ucirepo

# Download UCI dataset id 296 through the ucimlrepo client.
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296)

# Pull the feature table and the target table (pandas dataframes) out of the fetched object.
X, y = (
    diabetes_130_us_hospitals_for_years_1999_2008.data.features,
    diabetes_130_us_hospitals_for_years_1999_2008.data.targets,
)

# Print the dataset-level metadata first, then the per-variable information.
print(diabetes_130_us_hospitals_for_years_1999_2008.metadata)
print(diabetes_130_us_hospitals_for_years_1999_2008.variables)


{'uci_id': 296, 'name': 'Diabetes 130-US Hospitals for Years 1999-2008', 'repository_url': 'https://archive.ics.uci.edu/dataset/296/diabetes+130-us+hospitals+for+years+1999-2008', 'data_url': 'https://archive.ics.uci.edu/static/public/296/data.csv', 'abstract': 'The dataset represents ten years (1999-2008) of clinical care at 130 US hospitals and integrated delivery networks. Each row concerns hospital records of patients diagnosed with diabetes, who underwent laboratory, medications, and stayed up to 14 days. The goal is to determine the early readmission of the patient within 30 days of discharge.\nThe problem is important for the following reasons. Despite high-quality evidence showing improved clinical outcomes for diabetic patients who receive various preventive and therapeutic interventions, many patients do not receive them. This can be partially attributed to arbitrary diabetes management in hospital environments, which fail to attend to glycemic control. Failure to provide pro

  df = pd.read_csv(data_url)


In [3]:
# Display the feature DataFrame `X` obtained from the fetched dataset
# (the notebook renders the value of the cell's last expression).
X

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
0,Caucasian,Female,[0-10),,6,25,1,1,,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,No
1,Caucasian,Female,[10-20),,1,1,7,3,,,...,No,No,Up,No,No,No,No,No,Ch,Yes
2,AfricanAmerican,Female,[20-30),,1,1,7,2,,,...,No,No,No,No,No,No,No,No,No,Yes
3,Caucasian,Male,[30-40),,1,1,7,2,,,...,No,No,Up,No,No,No,No,No,Ch,Yes
4,Caucasian,Male,[40-50),,1,1,7,1,,,...,No,No,Steady,No,No,No,No,No,Ch,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,AfricanAmerican,Male,[70-80),,1,3,7,3,MC,,...,No,No,Down,No,No,No,No,No,Ch,Yes
101762,AfricanAmerican,Female,[80-90),,1,4,5,5,MC,,...,No,No,Steady,No,No,No,No,No,No,Yes
101763,Caucasian,Male,[70-80),,1,1,7,1,MC,,...,No,No,Down,No,No,No,No,No,Ch,Yes
101764,Caucasian,Female,[80-90),,2,3,7,10,MC,Surgery-General,...,No,No,Up,No,No,No,No,No,Ch,Yes


In [4]:
# Display the target DataFrame `y` obtained from the fetched dataset
# (the notebook renders the value of the cell's last expression).
y

Unnamed: 0,readmitted
0,NO
1,>30
2,NO
3,NO
4,NO
...,...
101761,>30
101762,NO
101763,NO
101764,NO


In [5]:
# Import relevant modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
# Missing-value handling
# Every NaN in X is replaced by the most frequent value of its column.
# NOTE(review): the imputer is fitted on all of X, i.e. before the train/test
# split performed further down — confirm this is acceptable for the evaluation.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
X_imputed = imputer.fit(X).transform(X)


In [7]:
# One-hot encode the imputed feature matrix with scikit-learn's OneHotEncoder.
# drop='first' is passed so one category per column is dropped.
# NOTE(review): the encoder receives every column of X_imputed, including
# integer-valued ones such as time_in_hospital — confirm that is intended.
encoder = OneHotEncoder(drop='first')
X_encoded = encoder.fit(X_imputed).transform(X_imputed)

In [8]:
# Hold out 20% of the rows as the test set (80/20 split, fixed seed).
# The targets are flattened to a 1-D array first.
y = np.asarray(y).ravel()

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Carve a validation set out of the training portion.
# 25% of the 80% training portion = 20% of all rows (0.25 x 0.8 = 0.2).
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=42,
)

In [40]:
# Search space for the logistic-regression grid search.
# Candidate values for the `C` argument.
C_values = [
    0.001, 0.01, 0.1,
    1, 10, 100,
]
# Candidate penalties (only L2 is searched).
penalty_values = ['l2']
# Candidate solvers; all three (liblinear, saga, sag) were found to work.
solver_values = ['liblinear', 'saga', 'sag']
# Candidate `max_iter` values: 1500, 2000, 2500.
iterations = list(range(1500, 3000, 500))

In [47]:
# Grid search over C / penalty / solver / max_iter, scored on the validation split.
# The combination with the highest validation accuracy is kept; because the
# comparison below is strict, ties keep the earliest combination tried.
best_accuracy = 0
best_params = {}

for C in C_values:
    for penalty in penalty_values:
        for solver in solver_values:
            # Named `max_iter`, not `iter`: the loop variable stays in the notebook
            # namespace after the loop and would otherwise shadow the builtin iter().
            for max_iter in iterations:
                # Create logistic regression model with the current hyperparameters.
                # random_state pins the data shuffling used by the 'sag', 'saga' and
                # 'liblinear' solvers so that re-running the search picks the same winner.
                log_reg = LogisticRegression(
                    max_iter=max_iter,
                    C=C,
                    penalty=penalty,
                    solver=solver,
                    random_state=42,
                )

                # Train the model on the training data
                log_reg.fit(X_train, np.ravel(y_train))

                # Make predictions on the validation set
                y_pred_val = log_reg.predict(X_val)

                # Calculate accuracy
                accuracy = accuracy_score(y_val, y_pred_val)

                # Check if this combination of hyperparameters gives better accuracy
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    # max_iter is intentionally not stored here: the refit cell passes
                    # max_iter explicitly next to **best_params, and a duplicate
                    # keyword would raise a TypeError.
                    best_params = {'C': C, 'penalty': penalty, 'solver': solver}



In [57]:
# Build the final logistic regression from the hyperparameters chosen by the search.
# max_iter is fixed at 1500 here; best_params supplies C, penalty and solver.
# random_state pins the data shuffling used by the 'sag', 'saga' and 'liblinear'
# solvers so the refit (and the accuracies reported from it) is repeatable.
best_log_reg = LogisticRegression(max_iter=1500, random_state=42, **best_params)

# Fit on X_train, i.e. the training rows that remain after the validation split
# was carved off (the validation rows are NOT included in this fit).
best_log_reg.fit(X_train, np.ravel(y_train))

In [58]:
# Report the accuracy of the refitted model on the training, validation and test splits.

# Predictions for each split
y_train_pred = best_log_reg.predict(X_train)
y_val_pred = best_log_reg.predict(X_val)
y_pred_test = best_log_reg.predict(X_test)

# Accuracy for each split
train_accuracy = accuracy_score(y_train, y_train_pred)
validation_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_pred_test)

# Summary: selected hyperparameters first, then train / validation / test accuracy
print("Best Hyperparameters:", best_params)
print('Training Accuracy:', train_accuracy)
print("Validation Set Accuracy:", validation_accuracy)
print("Test Accuracy:", test_accuracy)


Best Hyperparameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'saga'}
Training Accuracy: 0.6028103965017442
Validation Set Accuracy: 0.5870387657839139
Test Accuracy: 0.584946447872654
