## Part 1: Building up a basic predictive model

### Model Building 

In [204]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
data = pd.read_csv('cleaned_copy.csv')
data.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,...,glipizide,glyburide,pioglitazone,rosiglitazone,tolazamide,insulin,glyburide-metformin,change,diabetesMed,readmitted
0,Caucasian,1,8,1,22,7,0.461538,MC,Orthopedics-Reconstructive,58,...,0,0,1,0,No,1,No,1,1,False
1,Caucasian,1,7,2,1,1,0.153846,MC,Nephrology,59,...,0,0,0,0,No,1,No,0,1,False
2,Caucasian,1,10,1,1,7,0.230769,MC,Emergency/Trauma,56,...,0,0,0,0,No,0,No,0,1,False
3,Caucasian,1,8,1,2,7,0.692308,MC,InternalMedicine,68,...,0,0,0,0,No,1,No,1,1,False
4,Caucasian,1,8,1,3,5,0.846154,UN,InternalMedicine,77,...,0,0,0,0,No,1,No,0,1,False


### Logistic Regression Model 

In [205]:
#importing 
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [206]:
# Encode categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

In [207]:
# Define the features you want to select
selected_features = [ 'num_medications','number_diagnoses','number_inpatient', 'time_in_hospital','insulin','change','diabetesMed','metformin']
# Feature Selection
X = data[selected_features]  # Select only the specified features
y = data['readmitted']
model = LogisticRegression(max_iter=10000)  # Increase max_iter
model.fit(X, y)
# Print the selected features
print("Selected Features:")
print(selected_features)

Selected Features:
['num_medications', 'number_diagnoses', 'number_inpatient', 'time_in_hospital', 'insulin', 'change', 'diabetesMed', 'metformin']


In [208]:
# Train-test split
# Splitting the data into training and test sets for model evaluation
X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size=0.2, random_state=42)

In [209]:
model.fit(X_train, y_train)

LogisticRegression(max_iter=10000)

In [210]:
# Evaluate the model using cross-validation
# Using 10-fold cross-validation to evaluate the model's performance considering the imbalanced nature of the data
cv = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc')

In [211]:
# Print cross-validation scores
print("Cross-validation AUC-ROC scores:", cv_scores)
print("Mean AUC-ROC score:", cv_scores.mean())

Cross-validation AUC-ROC scores: [0.60217871 0.61803317 0.61942665 0.5514175  0.64022808 0.6031347
 0.58220997 0.59651781 0.58557872 0.59180688]
Mean AUC-ROC score: 0.5990532185042214


In [212]:
# Evaluate the model on test set
# Making predictions on the test set and generating a classification report
from sklearn.metrics import classification_report, roc_auc_score
y_pred = model.predict(X_test)
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred))

Classification Report on Test Set:
              precision    recall  f1-score   support

       False       0.91      1.00      0.95      3447
        True       0.00      0.00      0.00       333

    accuracy                           0.91      3780
   macro avg       0.46      0.50      0.48      3780
weighted avg       0.83      0.91      0.87      3780




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [213]:
# Calculate class imbalance ratio
class_counts = data['readmitted'].value_counts()
imbalance_ratio = class_counts[1] / class_counts[0]
print("Class Imbalance Ratio:", imbalance_ratio)

Class Imbalance Ratio: 0.10115371168861438


#### Oversampling

In [214]:
from imblearn.over_sampling import RandomOverSampler
# Balance the data using random oversampling
random_oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = random_oversampler.fit_resample(X_train, y_train)

In [215]:
# Train the model
model.fit(X_train_resampled, y_train_resampled)

LogisticRegression(max_iter=10000)

In [216]:
# Evaluate the model using cross-validation
cv_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=10, scoring='roc_auc')

In [217]:
# Print cross-validation scores
print("Cross-validation AUC-ROC scores:", cv_scores)
print("Mean AUC-ROC score:", cv_scores.mean())

Cross-validation AUC-ROC scores: [0.60429093 0.59679444 0.61589081 0.6077024  0.59919581 0.61083582
 0.60064582 0.58843165 0.59976864 0.5941153 ]
Mean AUC-ROC score: 0.6017671604434209


In [218]:
# Evaluate the model on the test set
y_pred = model.predict(X_test)
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred))

Classification Report on Test Set:
              precision    recall  f1-score   support

       False       0.94      0.58      0.72      3447
        True       0.12      0.59      0.20       333

    accuracy                           0.58      3780
   macro avg       0.53      0.59      0.46      3780
weighted avg       0.86      0.58      0.67      3780



In [219]:
# Evaluate the AUC-ROC score on test set
auc_roc = roc_auc_score(y_test, y_pred)
print("AUC-ROC score on Test Set:", auc_roc)

AUC-ROC score on Test Set: 0.5873536722100691


In [220]:
import plotly.graph_objs as go
import plotly.express as px

# Calculate the counts of unique classes after oversampling
class_counts_resampled = y_train_resampled.value_counts()

# Create a bar plot
fig = go.Figure(go.Bar(
    x=class_counts_resampled.index,
    y=class_counts_resampled.values,
    text=class_counts_resampled.values,
    textposition='auto',
    marker_color='skyblue'
))

# Update layout
fig.update_layout(
    title='Distribution of Unique Classes after Oversampling',
    xaxis_title='Class',
    yaxis_title='Count'
)

# Show plot
fig.show()