In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [14]:
df = pd.read_csv("./hospitaldata.csv")
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,examide,citoglipton,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),Unknown,_Other,Referral,1,Other,...,No,No,No,No,No,No,No,No,No,No
1,149190,55629189,Caucasian,Female,[10-20),Unknown,Home,Emergency,3,Other,...,No,No,Up,No,No,No,No,Yes,Yes,After30Days
2,64410,86047875,AfricanAmerican,Female,[20-30),Unknown,Home,Emergency,2,Other,...,No,No,No,No,No,No,No,No,Yes,No
3,500364,82442376,Caucasian,Male,[30-40),Unknown,Home,Emergency,2,Other,...,No,No,Up,No,No,No,No,Yes,Yes,No
4,16680,42519267,Caucasian,Male,[40-50),Unknown,Home,Emergency,1,Other,...,No,No,Steady,No,No,No,No,Yes,Yes,No


## Data Cleaning
### Removing columns which are not useful or doesnt help much for our model.

In [15]:
# encounter_id
# patient_nbr
# weight column as it has 98000+ rows of unknown value in it where replacing with central tendency value is not correct which will be baised
# number_outpatient: number of out patient visits to the patient i think this is no where related to admitted or readmitted
# number_emergency: Number of emergency visits of the patient in the year preceding the encounter i think this is no where related to admitted or readmitted
# number_inpatient : i think this is no where related to admitted or readmitted
# diag_1, diag_2,diag_3 these 3 columns are speaking about price and some has some different values which is no where related to readmission status
# max_glu_serum: this column has max None values approx 96000+  values which are more than 85% 

In [16]:
# Listing the columns to drop
columns_to_drop = ['encounter_id', 'patient_nbr', 'weight', 'number_outpatient','number_emergency', 
                   'number_inpatient', 'diag_1','diag_2','diag_3', 'examide', 'max_glu_serum','citoglipton']
# Droping the columns
df = df.drop(columns=columns_to_drop)
df.head(2)

Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,...,troglitazone,tolazamide,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),_Other,Referral,1,Other,41,0,1,...,No,No,No,No,No,No,No,No,No,No
1,Caucasian,Female,[10-20),Home,Emergency,3,Other,59,0,18,...,No,No,Up,No,No,No,No,Yes,Yes,After30Days


## Changing dependent variable value

In [17]:
df['readmitted'] = df['readmitted'].replace({'After30Days': 'Yes', 'Within30Days': 'Yes'})

## Label Encoding

In [18]:
# Intializing the LabelEncoder
label_encoder = LabelEncoder()

# List of columns to apply label encoding
columns_to_encode = [
    'race', 'gender', 'age', 'discharge_disposition_id', 'admission_source_id',
    'time_in_hospital', 'medical_specialty', 'num_lab_procedures', 'num_procedures',
    'num_medications', 'number_diagnoses', 'metformin', 'repaglinide', 'nateglinide',
    'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
    'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol',
    'troglitazone', 'tolazamide', 'insulin',
    'glyburide_metformin', 'glipizide_metformin', 'glimepiride_pioglitazone',
    'metformin_pioglitazone', 'change', 'diabetesMed', 'A1Cresult',
    'readmitted'
]

# Apply label encoding to each column
for column in columns_to_encode:
    df[column] = label_encoder.fit_transform(df[column])

### Checking if any null values in categeorical data

In [19]:
print('ColName                    NullCount')
print('======================================')
df[columns_to_encode].isnull().sum()

ColName                    NullCount


race                        0
gender                      0
age                         0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
medical_specialty           0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_diagnoses            0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamide                 0
pioglitazone                0
rosiglitazone               0
acarbose                    0
miglitol                    0
troglitazone                0
tolazamide                  0
insulin                     0
glyburide_metformin         0
glipizide_metformin         0
glimepiride_pioglitazone    0
metformin_pioglitazone      0
change                      0
diabetesMed                 0
A1Cresult 

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101763 entries, 0 to 101762
Data columns (total 35 columns):
 #   Column                    Non-Null Count   Dtype
---  ------                    --------------   -----
 0   race                      101763 non-null  int64
 1   gender                    101763 non-null  int64
 2   age                       101763 non-null  int64
 3   discharge_disposition_id  101763 non-null  int64
 4   admission_source_id       101763 non-null  int64
 5   time_in_hospital          101763 non-null  int64
 6   medical_specialty         101763 non-null  int64
 7   num_lab_procedures        101763 non-null  int64
 8   num_procedures            101763 non-null  int64
 9   num_medications           101763 non-null  int64
 10  number_diagnoses          101763 non-null  int64
 11  A1Cresult                 101763 non-null  int64
 12  metformin                 101763 non-null  int64
 13  repaglinide               101763 non-null  int64
 14  nateglinide         

In [21]:
X = df.drop('readmitted', axis=1)
y = df['readmitted']

scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

print(len(X_scaled))
print(len(y))

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

101763
101763


In [22]:
model = LogisticRegression()
model.fit(X_train, y_train)


y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy is {accuracy}")

precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")

print(f"precision score is: {precision}")
print(f"recall score is: {recall}")

f1 = f1_score(y_test, y_pred, average="macro")
print(f"f1 score is: {f1}")


Accuracy is 0.5684758753971634
precision score is: 0.5634572580923147
recall score is: 0.5562677225627348
f1 score is: 0.5489302460462088


In [23]:
X_scaled

Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,...,miglitol,troglitazone,tolazamide,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_pioglitazone,change,diabetesMed
0,0.6,0.0,0.000000,1.0,1.0,0.000000,0.0,0.341880,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.6,0.0,0.111111,0.0,0.5,0.153846,0.0,0.495726,0.000000,0.229730,...,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,1.0,1.0
2,0.2,0.0,0.222222,0.0,0.5,0.076923,0.0,0.085470,0.833333,0.162162,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,1.0
3,0.6,1.0,0.333333,0.0,0.5,0.076923,0.0,0.367521,0.166667,0.202703,...,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,1.0,1.0
4,0.6,1.0,0.444444,0.0,0.5,0.000000,0.0,0.427350,0.000000,0.094595,...,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101758,0.2,1.0,0.777778,1.0,0.5,0.153846,0.0,0.427350,0.000000,0.202703,...,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,1.0,1.0
101759,0.2,0.0,0.888889,1.0,0.0,0.307692,0.0,0.273504,0.500000,0.229730,...,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,1.0
101760,0.6,1.0,0.777778,0.0,0.5,0.000000,0.0,0.444444,0.000000,0.108108,...,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,1.0,1.0
101761,0.6,0.0,0.888889,1.0,0.5,0.692308,0.0,0.376068,0.333333,0.270270,...,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,1.0,1.0


In [24]:
# Define the parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

# Create a GridSearchCV object
grid = GridSearchCV(LogisticRegression(), param_grid, verbose=3, cv=5)

# Fit the model
grid.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", grid.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ..........C=0.01, solver=liblinear;, score=0.566 total time=   0.1s
[CV 2/5] END ..........C=0.01, solver=liblinear;, score=0.561 total time=   0.1s
[CV 3/5] END ..........C=0.01, solver=liblinear;, score=0.566 total time=   0.1s
[CV 4/5] END ..........C=0.01, solver=liblinear;, score=0.566 total time=   0.1s
[CV 5/5] END ..........C=0.01, solver=liblinear;, score=0.563 total time=   0.1s
[CV 1/5] END ...............C=0.01, solver=saga;, score=0.567 total time=   0.4s
[CV 2/5] END ...............C=0.01, solver=saga;, score=0.561 total time=   0.4s
[CV 3/5] END ...............C=0.01, solver=saga;, score=0.568 total time=   0.4s
[CV 4/5] END ...............C=0.01, solver=saga;, score=0.568 total time=   0.4s
[CV 5/5] END ...............C=0.01, solver=saga;, score=0.562 total time=   0.4s
[CV 1/5] END ...........C=0.1, solver=liblinear;, score=0.569 total time=   0.2s
[CV 2/5] END ...........C=0.1, solver=liblinear;



[CV 1/5] END .................C=10, solver=saga;, score=0.570 total time=   1.4s




[CV 2/5] END .................C=10, solver=saga;, score=0.558 total time=   1.7s




[CV 3/5] END .................C=10, solver=saga;, score=0.567 total time=   1.4s




[CV 4/5] END .................C=10, solver=saga;, score=0.568 total time=   1.5s




[CV 5/5] END .................C=10, solver=saga;, score=0.561 total time=   1.4s
[CV 1/5] END ...........C=100, solver=liblinear;, score=0.570 total time=   0.4s
[CV 2/5] END ...........C=100, solver=liblinear;, score=0.558 total time=   0.5s
[CV 3/5] END ...........C=100, solver=liblinear;, score=0.567 total time=   0.3s
[CV 4/5] END ...........C=100, solver=liblinear;, score=0.568 total time=   0.4s
[CV 5/5] END ...........C=100, solver=liblinear;, score=0.561 total time=   0.4s




[CV 1/5] END ................C=100, solver=saga;, score=0.570 total time=   1.4s




[CV 2/5] END ................C=100, solver=saga;, score=0.558 total time=   1.4s




[CV 3/5] END ................C=100, solver=saga;, score=0.567 total time=   1.4s




[CV 4/5] END ................C=100, solver=saga;, score=0.568 total time=   1.5s
[CV 5/5] END ................C=100, solver=saga;, score=0.561 total time=   1.4s
Best parameters found:  {'C': 0.1, 'solver': 'liblinear'}




In [25]:
# Make predictions with the best model
best_model = grid.best_estimator_
y_pred_best = best_model.predict(X_test)

# Evaluate the best model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_best))

print("\nClassification Report:")
print(classification_report(y_test, y_pred_best))

print("Accuracy Score:", accuracy_score(y_test, y_pred_best))

Confusion Matrix:
[[11912  4464]
 [ 8732  5421]]

Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.73      0.64     16376
           1       0.55      0.38      0.45     14153

    accuracy                           0.57     30529
   macro avg       0.56      0.56      0.55     30529
weighted avg       0.56      0.57      0.55     30529

Accuracy Score: 0.567755249107406
