## Part 1: Building up a basic predictive model

### Model Building 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
data = pd.read_csv('cleaned_data.csv')
data.head()

Unnamed: 0,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,...,number_diagnoses,metformin,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
0,Female,[70-80),1,22,7,7,MC,Orthopedics-Reconstructive,58,2,...,9,No,No,No,Up,No,Steady,Ch,Yes,0
1,Female,[60-70),2,1,1,3,MC,Nephrology,59,3,...,6,No,No,No,No,No,Steady,No,Yes,0
2,Female,[90-100),1,1,7,4,MC,Emergency/Trauma,56,1,...,6,No,No,No,No,No,No,No,Yes,0
3,Male,[70-80),1,2,7,10,MC,InternalMedicine,68,1,...,6,Steady,No,No,No,No,Steady,Ch,Yes,0
4,Female,[70-80),1,3,6,12,UN,InternalMedicine,77,5,...,5,No,No,No,No,No,Steady,No,Yes,0


### Logestic Regression Model 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [3]:
# Encode categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

In [5]:
# Feature Selection
X = data.drop(columns=['readmitted'])
y = data['readmitted']
model = LogisticRegression(max_iter=10000)  # Increase max_iter
rfe = RFE(model, n_features_to_select=10)  # Select top 10 features
fit = rfe.fit(X, y)
selected_features = X.columns[fit.support_]
print(selected_features)

Index(['age', 'number_outpatient', 'number_emergency', 'number_inpatient',
       'number_diagnoses', 'metformin', 'glipizide', 'rosiglitazone', 'change',
       'diabetesMed'],
      dtype='object')


In [6]:
# Train-test split
# Splitting the data into training and test sets for model evaluation
X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size=0.2, random_state=42)

In [7]:
model.fit(X_train, y_train)

In [8]:
# Evaluate the model using cross-validation
# Using 5-fold cross-validation to evaluate the model's performance considering the imbalanced nature of the data
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc')

In [9]:
# Print cross-validation scores
print("Cross-validation AUC-ROC scores:", cv_scores)
print("Mean AUC-ROC score:", cv_scores.mean())

Cross-validation AUC-ROC scores: [0.62259447 0.61712605 0.61355169 0.62860033 0.61746029]
Mean AUC-ROC score: 0.6198665654559461


In [11]:
# Evaluate the model on test set
# Making predictions on the test set and generating a classification report
from sklearn.metrics import classification_report, roc_auc_score
y_pred = model.predict(X_test)
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred))

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      4658
           1       0.00      0.00      0.00       506

    accuracy                           0.90      5164
   macro avg       0.45      0.50      0.47      5164
weighted avg       0.81      0.90      0.86      5164



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
