## Part 1: Building up a basic predictive model

### Model Building 

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
data = pd.read_csv('Updated_data.csv')
data.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,...,A1Cresult,metformin,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
0,Caucasian,Female,[70-80),1,22,7,7,MC,Orthopedics-Reconstructive,58,...,no data,No,No,No,Up,No,Steady,Ch,Yes,0
1,Caucasian,Female,[60-70),2,1,1,3,MC,Nephrology,59,...,no data,No,No,No,No,No,Steady,No,Yes,0
2,Caucasian,Female,[90-100),1,1,7,4,MC,Emergency/Trauma,56,...,no data,No,No,No,No,No,No,No,Yes,0
3,Caucasian,Male,[70-80),1,2,7,10,MC,InternalMedicine,68,...,no data,Steady,No,No,No,No,Steady,Ch,Yes,0
4,Caucasian,Female,[70-80),1,3,6,12,UN,InternalMedicine,77,...,>8,No,No,No,No,No,Steady,No,Yes,0


### Logestic Regression Model 

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [13]:
# Encode categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

In [14]:
# Define the features you want to select
selected_features = ['number_emergency', 'number_inpatient', 'number_diagnoses', 'num_medications', 'time_in_hospital']
# Feature Selection
X = data[selected_features]  # Select only the specified features
y = data['readmitted']
model = LogisticRegression(max_iter=10000)  # Increase max_iter
model.fit(X, y)
# Print the selected features
print("Selected Features:")
print(selected_features)

Selected Features:
['number_emergency', 'number_inpatient', 'number_diagnoses', 'num_medications', 'time_in_hospital']


In [15]:
# Train-test split
# Splitting the data into training and test sets for model evaluation
X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size=0.2, random_state=42)

In [16]:
model.fit(X_train, y_train)

In [17]:
# Evaluate the model using cross-validation
# Using 10-fold cross-validation to evaluate the model's performance considering the imbalanced nature of the data
cv = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc')

In [18]:
# Print cross-validation scores
print("Cross-validation AUC-ROC scores:", cv_scores)
print("Mean AUC-ROC score:", cv_scores.mean())

Cross-validation AUC-ROC scores: [0.60906445 0.64733593 0.62816658 0.68726464 0.65990446 0.62438676
 0.65265686 0.64690819 0.63280136 0.62196542]
Mean AUC-ROC score: 0.6410454651792223


In [19]:
# Evaluate the model on test set
# Making predictions on the test set and generating a classification report
from sklearn.metrics import classification_report, roc_auc_score
y_pred = model.predict(X_test)
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred))

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      4622
           1       0.39      0.02      0.04       563

    accuracy                           0.89      5185
   macro avg       0.64      0.51      0.49      5185
weighted avg       0.84      0.89      0.84      5185



In [20]:
# Calculate class imbalance ratio
class_counts = data['readmitted'].value_counts()
imbalance_ratio = class_counts[1] / class_counts[0]
print("Class Imbalance Ratio:", imbalance_ratio)

Class Imbalance Ratio: 0.11958023838314044


#### Oversampling

In [21]:
from imblearn.over_sampling import RandomOverSampler
# Balance the data using random oversampling
random_oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = random_oversampler.fit_resample(X_train, y_train)

In [22]:
# Train the model
model.fit(X_train_resampled, y_train_resampled)

In [23]:
# Evaluate the model using cross-validation
cv_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=10, scoring='roc_auc')

In [24]:
# Print cross-validation scores
print("Cross-validation AUC-ROC scores:", cv_scores)
print("Mean AUC-ROC score:", cv_scores.mean())

Cross-validation AUC-ROC scores: [0.63696921 0.64666281 0.66306759 0.63628473 0.63783328 0.65357745
 0.64074861 0.63505462 0.65601383 0.64355012]
Mean AUC-ROC score: 0.6449762242170908


In [25]:
# Evaluate the model on the test set
y_pred = model.predict(X_test)
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred))

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.91      0.73      0.81      4622
           1       0.17      0.44      0.24       563

    accuracy                           0.70      5185
   macro avg       0.54      0.59      0.53      5185
weighted avg       0.83      0.70      0.75      5185



In [26]:
# Evaluate the AUC-ROC score on test set
auc_roc = roc_auc_score(y_test, y_pred)
print("AUC-ROC score on Test Set:", auc_roc)

AUC-ROC score on Test Set: 0.5863239599321494


In [27]:
import plotly.graph_objs as go
import plotly.express as px

# Calculate the counts of unique classes after oversampling
class_counts_resampled = y_train_resampled.value_counts()

# Create a bar plot
fig = go.Figure(go.Bar(
    x=class_counts_resampled.index,
    y=class_counts_resampled.values,
    text=class_counts_resampled.values,
    textposition='auto',
    marker_color='skyblue'
))

# Update layout
fig.update_layout(
    title='Distribution of Unique Classes after Oversampling',
    xaxis_title='Class',
    yaxis_title='Count'
)

# Show plot
fig.show()