In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler

In [73]:
# Read the hospital encounters CSV that sits next to the notebook and preview it.
csv_path = "./hospitaldata.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,examide,citoglipton,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),Unknown,_Other,Referral,1,Other,...,No,No,No,No,No,No,No,No,No,No
1,149190,55629189,Caucasian,Female,[10-20),Unknown,Home,Emergency,3,Other,...,No,No,Up,No,No,No,No,Yes,Yes,After30Days
2,64410,86047875,AfricanAmerican,Female,[20-30),Unknown,Home,Emergency,2,Other,...,No,No,No,No,No,No,No,No,Yes,No
3,500364,82442376,Caucasian,Male,[30-40),Unknown,Home,Emergency,2,Other,...,No,No,Up,No,No,No,No,Yes,Yes,No
4,16680,42519267,Caucasian,Male,[40-50),Unknown,Home,Emergency,1,Other,...,No,No,Steady,No,No,No,No,Yes,Yes,No


## Data Cleaning
### Removing columns that are not useful or don't help our model much.

In [74]:
# encounter_id
# patient_nbr
# weight: 98,000+ rows hold an unknown value; replacing that many with a central-tendency value would bias the column
# number_outpatient: number of outpatient visits of the patient; I think this is unrelated to admission or readmission
# number_emergency: number of emergency visits of the patient in the year preceding the encounter; I think this is unrelated to admission or readmission
# number_inpatient: I think this is unrelated to admission or readmission
# diag_1, diag_2, diag_3: these three columns appear to describe price, and some rows hold other kinds of values; they are unrelated to readmission status
# max_glu_serum: roughly 96,000+ values (more than 85%) are None (NOTE: this column is not in columns_to_drop below and is still label-encoded later)

In [75]:
# Identifier columns and features that are excluded from modelling.
columns_to_drop = [
    'encounter_id',
    'patient_nbr',
    'weight',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'diag_1',
    'diag_2',
    'diag_3',
]
# Remove them along the column axis and preview the slimmer frame.
df = df.drop(columns_to_drop, axis=1)
df.head(2)

Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,...,examide,citoglipton,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),_Other,Referral,1,Other,41,0,1,...,No,No,No,No,No,No,No,No,No,No
1,Caucasian,Female,[10-20),Home,Emergency,3,Other,59,0,18,...,No,No,Up,No,No,No,No,Yes,Yes,After30Days


## Changing dependent variable value

In [76]:
# Collapse both readmission labels into a single 'Yes' class; other values are left as they are.
readmission_labels = ['After30Days', 'Within30Days']
df['readmitted'] = df['readmitted'].replace(readmission_labels, 'Yes')


## Label Encoding

In [77]:
# Columns to integer-code, in the same order the original cell processed them.
# NOTE(review): the already-numeric count columns (time_in_hospital,
# num_lab_procedures, num_procedures, num_medications, number_diagnoses) are
# passed through LabelEncoder too, exactly as before; that replaces each value
# with its position among the sorted observed values. Confirm this is intended.
columns_to_encode = [
    'race', 'gender', 'age',
    'discharge_disposition_id', 'admission_source_id',
    'time_in_hospital', 'medical_specialty',
    'num_lab_procedures', 'num_procedures', 'num_medications', 'number_diagnoses',
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
    'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
    'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
    'examide', 'citoglipton', 'insulin',
    'glyburide_metformin', 'glipizide_metformin',
    'glimepiride_pioglitazone', 'metformin_pioglitazone',
    'change', 'diabetesMed',
    'max_glu_serum', 'A1Cresult',
    'readmitted',
]

# Fit a separate LabelEncoder per column and keep each one. Reusing a single
# encoder (as the cell did previously) overwrites its fitted classes on every
# call, so the integer codes of earlier columns could not be mapped back.
# With the dict, label_encoders[col].inverse_transform(...) recovers the
# original values for any column.
label_encoders = {}
for column in columns_to_encode:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Keep the original name available: as before, `label_encoder` ends up being
# the encoder that was fitted on the target column.
label_encoder = label_encoders['readmitted']

In [78]:
# Print the column names, non-null counts and dtypes of the encoded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101763 entries, 0 to 101762
Data columns (total 38 columns):
 #   Column                    Non-Null Count   Dtype
---  ------                    --------------   -----
 0   race                      101763 non-null  int64
 1   gender                    101763 non-null  int64
 2   age                       101763 non-null  int64
 3   discharge_disposition_id  101763 non-null  int64
 4   admission_source_id       101763 non-null  int64
 5   time_in_hospital          101763 non-null  int64
 6   medical_specialty         101763 non-null  int64
 7   num_lab_procedures        101763 non-null  int64
 8   num_procedures            101763 non-null  int64
 9   num_medications           101763 non-null  int64
 10  number_diagnoses          101763 non-null  int64
 11  max_glu_serum             101763 non-null  int64
 12  A1Cresult                 101763 non-null  int64
 13  metformin                 101763 non-null  int64
 14  repaglinide         

In [79]:
# Separate the predictors from the encoded target.
X = df.drop('readmitted', axis=1)
y = df['readmitted']

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# held-out rows influence the per-column min/max used for preprocessing
# (train/test leakage). The row assignment is the same as before because
# test_size and random_state are unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the min/max on the training rows only, then apply that same mapping
# to the test rows. Test values outside the training range may therefore fall
# slightly outside [0, 1], which is expected.
scaler = MinMaxScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Full feature frame scaled with the training-set statistics; kept under the
# original name because a later cell displays it.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

# Sanity check: features and target must have the same number of rows.
print(len(X_scaled))
print(len(y))

101763
101763


In [80]:
# Fit a logistic-regression classifier with default settings on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict the held-out split and compute all metrics up front.
# Precision, recall and F1 are macro-averaged (unweighted mean over the classes).
y_pred = model.predict(X_test)
macro = {"average": "macro"}

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **macro)
recall = recall_score(y_test, y_pred, **macro)
f1 = f1_score(y_test, y_pred, **macro)

# Report the scores in the same order as before.
print(f"Accuracy is {accuracy}")
print(f"precision score is: {precision}")
print(f"recall score is: {recall}")
print(f"f1 score is: {f1}")


Accuracy is 0.5690327229847031
precision score is: 0.5640584935412976
recall score is: 0.5569594204808495
f1 score is: 0.5498637000082125


In [81]:
# Display the min-max scaled feature frame (pandas truncates the rendering for a frame this large).
X_scaled

Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,...,tolazamide,examide,citoglipton,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_pioglitazone,change,diabetesMed
0,0.6,0.0,0.000000,1.0,1.0,0.000000,0.0,0.341880,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.6,0.0,0.111111,0.0,0.5,0.153846,0.0,0.495726,0.000000,0.229730,...,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,1.0,1.0
2,0.2,0.0,0.222222,0.0,0.5,0.076923,0.0,0.085470,0.833333,0.162162,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,1.0
3,0.6,1.0,0.333333,0.0,0.5,0.076923,0.0,0.367521,0.166667,0.202703,...,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,1.0,1.0
4,0.6,1.0,0.444444,0.0,0.5,0.000000,0.0,0.427350,0.000000,0.094595,...,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101758,0.2,1.0,0.777778,1.0,0.5,0.153846,0.0,0.427350,0.000000,0.202703,...,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,1.0,1.0
101759,0.2,0.0,0.888889,1.0,0.0,0.307692,0.0,0.273504,0.500000,0.229730,...,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,1.0
101760,0.6,1.0,0.777778,0.0,0.5,0.000000,0.0,0.444444,0.000000,0.108108,...,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,1.0,1.0
101761,0.6,0.0,0.888889,1.0,0.5,0.692308,0.0,0.376068,0.333333,0.270270,...,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,1.0,1.0
