## Part 1: Building up a basic predictive model

### Model Building 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
data = pd.read_csv('cleaned_data.csv')
data.head()

Unnamed: 0,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,...,number_diagnoses,metformin,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
0,Female,[70-80),1,22,7,7,MC,Orthopedics-Reconstructive,58,2,...,9,No,No,No,Up,No,Steady,Ch,Yes,0
1,Female,[60-70),2,1,1,3,MC,Nephrology,59,3,...,6,No,No,No,No,No,Steady,No,Yes,0
2,Female,[90-100),1,1,7,4,MC,Emergency/Trauma,56,1,...,6,No,No,No,No,No,No,No,Yes,0
3,Male,[70-80),1,2,7,10,MC,InternalMedicine,68,1,...,6,Steady,No,No,No,No,Steady,Ch,Yes,0
4,Female,[70-80),1,3,6,12,UN,InternalMedicine,77,5,...,5,No,No,No,No,No,Steady,No,Yes,0


### Logestic Regression Model 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [3]:
# Encode categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

In [4]:
# Feature Selection
X = data.drop(columns=['readmitted'])
y = data['readmitted']
model = LogisticRegression(max_iter=10000)  # Increase max_iter
rfe = RFE(model, n_features_to_select=10)  # Select top 10 features
fit = rfe.fit(X, y)
selected_features = X.columns[fit.support_]
print(selected_features)

Index(['age', 'number_outpatient', 'number_emergency', 'number_inpatient',
       'number_diagnoses', 'metformin', 'glipizide', 'rosiglitazone', 'change',
       'diabetesMed'],
      dtype='object')


In [5]:
# Train-test split
# Splitting the data into training and test sets for model evaluation
X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size=0.2, random_state=42)

In [6]:
model.fit(X_train, y_train)

In [7]:
# Evaluate the model using cross-validation
# Using 5-fold cross-validation to evaluate the model's performance considering the imbalanced nature of the data
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc')

In [8]:
# Print cross-validation scores
print("Cross-validation AUC-ROC scores:", cv_scores)
print("Mean AUC-ROC score:", cv_scores.mean())

Cross-validation AUC-ROC scores: [0.62259447 0.61712605 0.61355169 0.62860033 0.61746029]
Mean AUC-ROC score: 0.6198665654559461


In [9]:
# Evaluate the model on test set
# Making predictions on the test set and generating a classification report
from sklearn.metrics import classification_report, roc_auc_score
y_pred = model.predict(X_test)
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred))

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      4658
           1       0.00      0.00      0.00       506

    accuracy                           0.90      5164
   macro avg       0.45      0.50      0.47      5164
weighted avg       0.81      0.90      0.86      5164



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
# Calculate class imbalance ratio
class_counts = data['readmitted'].value_counts()
imbalance_ratio = class_counts[1] / class_counts[0]
print("Class Imbalance Ratio:", imbalance_ratio)

Class Imbalance Ratio: 0.10907294441103188


#### OverSampling

In [11]:
from imblearn.over_sampling import RandomOverSampler

# Balance the data using random oversampling
random_oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = random_oversampler.fit_resample(X_train, y_train)

In [12]:
# Train the model
model.fit(X_train_resampled, y_train_resampled)

In [13]:
# Evaluate the model using cross-validation
cv_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='roc_auc')

In [14]:
# Print cross-validation scores
print("Cross-validation AUC-ROC scores:", cv_scores)
print("Mean AUC-ROC score:", cv_scores.mean())

Cross-validation AUC-ROC scores: [0.62019118 0.62619781 0.62208015 0.62753238 0.6190137 ]
Mean AUC-ROC score: 0.6230030466876105


In [15]:
# Evaluate the model on the test set
y_pred = model.predict(X_test)
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred))

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.93      0.66      0.77      4658
           1       0.15      0.54      0.23       506

    accuracy                           0.65      5164
   macro avg       0.54      0.60      0.50      5164
weighted avg       0.85      0.65      0.72      5164



In [16]:
# Evaluate the AUC-ROC score on test set
auc_roc = roc_auc_score(y_test, y_pred)
print("AUC-ROC score on Test Set:", auc_roc)

AUC-ROC score on Test Set: 0.5974565412558953


#### Tuning 

In [17]:
# Step 1: Tune Logistic Regression Model
from sklearn.model_selection import GridSearchCV
parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 
              'penalty': ['l1', 'l2'], 
              'solver': ['liblinear', 'saga']} 

log_reg = LogisticRegression(max_iter=10000)
grid_search = GridSearchCV(log_reg, parameters, cv=5, scoring='roc_auc')
grid_search.fit(X_train_resampled, y_train_resampled)

In [18]:
# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}


In [19]:
# Step 2: Evaluate the Logistic Regression Model
best_log_reg = grid_search.best_estimator_

In [20]:
# Predictions on the test set
y_pred = best_log_reg.predict(X_test)

In [21]:
# Classification Report
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred))

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.93      0.66      0.77      4658
           1       0.14      0.53      0.23       506

    accuracy                           0.65      5164
   macro avg       0.54      0.59      0.50      5164
weighted avg       0.85      0.65      0.72      5164



In [22]:
# AUC-ROC Score
auc_roc = roc_auc_score(y_test, y_pred)
print("AUC-ROC score on Test Set:", auc_roc)

AUC-ROC score on Test Set: 0.594233305104737


#### Feature Engineering and Selection

In [23]:
from sklearn.feature_selection import SelectKBest, chi2
# Perform feature selection using chi-squared test
selector = SelectKBest(score_func=chi2, k=10)  # Select top 10 features
X_selected = selector.fit_transform(X, y)

In [24]:
# Get the selected feature indices
selected_feature_indices = selector.get_support(indices=True)

In [25]:
# Get the selected feature names
selected_feature_names = X.columns[selected_feature_indices]
print("Selected Features:", selected_feature_names)

Selected Features: Index(['age', 'discharge_disposition_id', 'time_in_hospital',
       'num_lab_procedures', 'num_medications', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'change', 'diabetesMed'],
      dtype='object')


In [26]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

In [27]:
# Train logistic regression model on the selected features
log_reg = LogisticRegression(max_iter=10000)
log_reg.fit(X_train, y_train)

In [28]:
# Predictions on the test set
y_pred = log_reg.predict(X_test)

In [29]:
# Classification Report
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred))

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      4658
           1       0.00      0.00      0.00       506

    accuracy                           0.90      5164
   macro avg       0.45      0.50      0.47      5164
weighted avg       0.81      0.90      0.86      5164



In [30]:
# AUC-ROC Score
auc_roc = roc_auc_score(y_test, y_pred)
print("AUC-ROC score on Test Set:", auc_roc)

AUC-ROC score on Test Set: 0.4998926577930442


#### Undersampling

In [31]:
from imblearn.under_sampling import RandomUnderSampler
# Undersample the majority class
undersampler = RandomUnderSampler(random_state=42)
X_undersampled, y_undersampled = undersampler.fit_resample(X, y)

In [32]:
# Split the undersampled data into training and test sets
X_train_undersampled, X_test_undersampled, y_train_undersampled, y_test_undersampled = train_test_split(
    X_undersampled, y_undersampled, test_size=0.2, random_state=42
)

In [33]:
# Train logistic regression model on the undersampled data
log_reg_undersampled = LogisticRegression(max_iter=1000)
log_reg_undersampled.fit(X_train_undersampled, y_train_undersampled)

In [34]:
# Predictions on the test set
y_pred_undersampled = log_reg_undersampled.predict(X_test_undersampled)

In [35]:
# Classification Report
print("Classification Report on Test Set after Undersampling:")
print(classification_report(y_test_undersampled, y_pred_undersampled))

Classification Report on Test Set after Undersampling:
              precision    recall  f1-score   support

           0       0.59      0.63      0.61       514
           1       0.59      0.55      0.57       502

    accuracy                           0.59      1016
   macro avg       0.59      0.59      0.59      1016
weighted avg       0.59      0.59      0.59      1016



In [36]:
# AUC-ROC Score
auc_roc_undersampled = roc_auc_score(y_test_undersampled, y_pred_undersampled)
print("AUC-ROC score on Test Set after Undersampling:", auc_roc_undersampled)

AUC-ROC score on Test Set after Undersampling: 0.5881299703908103
