# **Hospital Readmission Tracker**

**🔹 Installing Required Library (CatBoost)**

In [None]:
# Install CatBoost into the running kernel's environment; it is imported in the next cell.
%pip install catboost



**🔹 Importing Libraries and Dependencies**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, f1_score
from catboost import CatBoostClassifier
import joblib

**Step 1: Load the Dataset**

In [None]:
# --- Step 1: Load dataset ---
# Read the patient CSV into `data`; the following steps add engineered columns to this frame.
data = pd.read_csv(filepath_or_buffer="/content/MIMIC - III Patient Dataset.csv")

**Step 2: Extract and Transform Blood Pressure**

In [None]:
# --- Step 2: Process blood pressure ---
# Split "systolic/diastolic" strings into two numeric columns.
# A missing or malformed reading becomes NaN instead of raising, so the
# median imputer in Step 8 can fill it like any other missing value.
bp_parts = data['blood_pressure'].astype(str).str.extract(r'^\s*(\d+)\s*/\s*(\d+)')
data['systolic'] = pd.to_numeric(bp_parts[0], errors='coerce')
data['diastolic'] = pd.to_numeric(bp_parts[1], errors='coerce')
# Pulse pressure is the systolic/diastolic gap (NaN when either side is NaN).
data['pulse_pressure'] = data['systolic'] - data['diastolic']
# The raw string column is no longer needed; reassign rather than mutate in place.
data = data.drop(columns=["blood_pressure"])

**Step 3: BMI Categories**

In [None]:
# --- Step 3: Keep BMI continuous + add categories ---
def bmi_category(bmi):
    """Map a BMI value to an ordinal weight class.

    Args:
        bmi: A number (or numeric string) accepted by ``float()``.

    Returns:
        0 for bmi < 18.5, 1 for 18.5 <= bmi < 25, 2 for 25 <= bmi < 30,
        3 otherwise.

    The bounds are half-open so that no value falls between two classes
    (previously e.g. 24.95 and 29.95 dropped through to class 3).
    """
    bmi = float(bmi)
    if bmi < 18.5:
        return 0
    if bmi < 25:
        return 1
    if bmi < 30:
        return 2
    # NOTE: NaN compares False against every bound, so it also ends up here.
    return 3
# Derive the ordinal class for every row; the continuous `bmi` column is kept as well.
data['bmi_category'] = data['bmi'].apply(bmi_category)

**Step 4: High Cholesterol Indicator**

In [None]:
# --- Step 4: Cholesterol (binary high risk) ---
# Flag is 1 when cholesterol is strictly above 200, otherwise 0.
cholesterol_values = data['cholesterol'].astype(float)
data['high_cholesterol'] = cholesterol_values.gt(200).astype(int)

**Step 5: Encode Categorical Variables**

In [None]:
# --- Step 5: Encode categorical variables ---
# One-hot encode discharge_destination. drop_first omits the indicator of the
# first level, so that level is represented by all-zero dummy columns.
data = pd.get_dummies(
    data=data,
    columns=['discharge_destination'],
    drop_first=True,
)

In [None]:
# Encode gender, diabetes, hypertension
# Each column is cast to str and mapped to integer codes by its own encoder.
# The fitted encoders are not kept, so inference has to re-fit them.
for column_name in ('gender', 'diabetes', 'hypertension'):
    encoder = LabelEncoder()
    column_as_text = data[column_name].astype(str)
    data[column_name] = encoder.fit_transform(column_as_text)

**Step 6: Feature Engineering**

In [None]:
# --- Step 6: Feature engineering ---
# Binary risk flags derived from fixed cut-offs.
data['high_risk_age'] = data['age'].ge(70).astype(int)
data['polypharmacy'] = data['medication_count'].ge(5).astype(int)
data['long_stay'] = data['length_of_stay'].gt(14).astype(int)

# Both conditions present. Assumes the label encoding in Step 5 assigned
# code 1 to the positive label -- TODO confirm against the raw values.
has_diabetes = data['diabetes'].eq(1)
has_hypertension = data['hypertension'].eq(1)
data['multi_comorbidity'] = (has_diabetes & has_hypertension).astype(int)

# Pairwise interaction terms.
data['age_bmi'] = data['age'].mul(data['bmi'])
data['stay_meds'] = data['length_of_stay'].mul(data['medication_count'])


**Step 7: Define Features and Target**

In [None]:
# --- Step 7: Define features and target ---
# Every column except the target is used as a model input.
target = 'readmitted_30_days'
features = data.columns[data.columns != target].tolist()
X = data.loc[:, features]
y = data.loc[:, target]

**Step 8: Handle Missing Values**

In [None]:
# --- Step 8: Handle missing values ---
# Fill missing entries with each column's median and rebuild a DataFrame that
# carries the feature names.
# NOTE(review): the imputer is fitted on all rows before the train/test split
# below, so test rows contribute to the medians.
imputer = SimpleImputer(strategy='median')
imputer.fit(X)
X = pd.DataFrame(imputer.transform(X), columns=features)

**Step 9: Train-Test Split**

In [None]:
# --- Step 9: Train-test split ---
# 80/20 split, stratified on the target so both parts keep the class ratio.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

**Step 10: Feature Scaling**

In [None]:
# --- Step 10: Scale features (optional for CatBoost, but good practice) ---
# Learn mean/variance on the training split only, then apply to both splits.
# After this cell X_train / X_test are NumPy arrays rather than DataFrames.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

**Step 11: Train CatBoost Model**

In [None]:
# --- Step 11: Train CatBoost model ---
# Early stopping monitors eval_set to decide when to stop, so that set must not
# be the held-out test data (otherwise the test metrics are optimistically
# biased). Carve a validation split out of the training data instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Weight the positive class by the negative/positive ratio of the training labels.
class_counts = y_train.value_counts()
positive_weight = class_counts[0] / class_counts[1]

cat_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.03,
    depth=8,
    eval_metric="F1",
    random_seed=42,
    verbose=False,
    early_stopping_rounds=50,
    class_weights=[1, positive_weight]
)

cat_model.fit(X_fit, y_fit, eval_set=(X_val, y_val))

<catboost.core.CatBoostClassifier at 0x7a80091651f0>

**Steps 12–13: Evaluate Model and Optimize Threshold**

In [None]:
# --- Step 12: Evaluate model ---
# Positive-class probabilities on the test split, cut at 0.5 for hard labels.
y_prob = cat_model.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob >= 0.5, 1, 0)

default_report = [
    ("Accuracy (default 0.5 threshold):", accuracy_score(y_test, y_pred)),
    ("ROC AUC:", roc_auc_score(y_test, y_prob)),
    ("F1-score:", f1_score(y_test, y_pred)),
    ("Classification Report:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
]
for label, value in default_report:
    print(label, value)

# --- Step 13: Optimize threshold ---
# Scan cut-offs from 0.30 in steps of 0.01 and keep the first one that reaches
# the highest accuracy.
# NOTE(review): the threshold is selected on the same test split it is then
# reported on, so the "optimized" numbers are not an unbiased estimate.
thresholds = np.arange(0.3, 0.7, 0.01)
accuracies = [
    accuracy_score(y_test, np.where(y_prob >= cutoff, 1, 0))
    for cutoff in thresholds
]

best_acc, best_th = 0, 0
for th, acc in zip(thresholds, accuracies):
    if acc > best_acc:
        best_acc, best_th = acc, th

print("\nBest Accuracy:", best_acc, "at threshold:", best_th)

# Recalculate metrics with optimized threshold
y_pred_opt = np.where(y_prob >= best_th, 1, 0)
optimized_report = [
    ("F1-score (optimized):", f1_score(y_test, y_pred_opt)),
    ("Classification Report (optimized):\n", classification_report(y_test, y_pred_opt)),
    ("Confusion Matrix (optimized):\n", confusion_matrix(y_test, y_pred_opt)),
]
for label, value in optimized_report:
    print(label, value)

Accuracy (default 0.5 threshold): 0.8231666666666667
ROC AUC: 0.8340513950626357
F1-score: 0.7685932388222465
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.82      0.86      3891
           1       0.71      0.84      0.77      2109

    accuracy                           0.82      6000
   macro avg       0.81      0.83      0.81      6000
weighted avg       0.83      0.82      0.83      6000

Confusion Matrix:
 [[3177  714]
 [ 347 1762]]

Best Accuracy: 0.8233333333333334 at threshold: 0.5500000000000003
F1-score (optimized): 0.7686599738105631
Classification Report (optimized):
               precision    recall  f1-score   support

           0       0.90      0.82      0.86      3891
           1       0.71      0.83      0.77      2109

    accuracy                           0.82      6000
   macro avg       0.81      0.83      0.81      6000
weighted avg       0.83      0.82      0.83      6000

Confusion Matrix (opt

**Step 14: Save Model and Preprocessing Objects**

In [None]:
# --- Step 14: Save model, scaler, and imputer ---
# Persist the three fitted objects to the current working directory so the
# inference cells below can reload them with joblib.load.
# NOTE(review): the fitted label encoders and the feature column order are not
# saved here.
joblib.dump(cat_model, "readmission_catboost_model.pkl")
joblib.dump(scaler, "scaler.pkl")
# As the last expression of the cell, this call's return value is displayed.
joblib.dump(imputer, "imputer.pkl")

['imputer.pkl']

**Inference Demo: Import Libraries**

In [None]:
import pandas as pd
import joblib
import numpy as np
from sklearn.preprocessing import LabelEncoder

**Load Saved Model and Preprocessing Objects**

In [None]:
# Load saved model, scaler, and imputer
# joblib.load unpickles the files, so only load artifacts you produced yourself.
artifact_files = ("readmission_catboost_model.pkl", "scaler.pkl", "imputer.pkl")
cat_model, scaler, imputer = (joblib.load(path) for path in artifact_files)

**Create Sample Patient Data**

In [None]:
# Generate diverse sample patients
# Columns use the same raw names the preprocessing cells below read.
patient_columns = (
    "age", "gender", "bmi", "cholesterol", "blood_pressure",
    "diabetes", "hypertension", "medication_count", "length_of_stay",
    "discharge_destination",
)
patient_rows = [
    # High-risk patient (likely readmitted)
    (78, "Male", 35.1, 255, "160/100", "Yes", "Yes", 10, 20, "Nursing_Facility"),
    # Low-risk patient (likely not readmitted)
    (45, "Female", 22.5, 175, "115/75", "No", "No", 2, 4, "Home"),
    # Medium-risk patient
    (60, "Male", 28.0, 210, "135/85", "Yes", "No", 5, 8, "Home"),
    # Another high-risk patient with different profile
    (85, "Female", 29.8, 190, "140/90", "No", "Yes", 7, 15, "Other"),
]
# One dict per patient, keyed by column name.
sample_patients = pd.DataFrame(
    [dict(zip(patient_columns, row)) for row in patient_rows]
)

**Preprocess Blood Pressure for Sample Data**

In [None]:
# --- Preprocess sample data (mirroring training preprocessing) ---

# Process blood pressure
# Split "systolic/diastolic" strings into two numeric columns. A missing or
# malformed reading becomes NaN instead of raising, so the imputer applied
# further down can fill it.
sample_bp_parts = sample_patients['blood_pressure'].astype(str).str.extract(r'^\s*(\d+)\s*/\s*(\d+)')
sample_patients['systolic'] = pd.to_numeric(sample_bp_parts[0], errors='coerce')
sample_patients['diastolic'] = pd.to_numeric(sample_bp_parts[1], errors='coerce')
sample_patients['pulse_pressure'] = sample_patients['systolic'] - sample_patients['diastolic']
# The raw string column is no longer needed; reassign rather than mutate in place.
sample_patients = sample_patients.drop(columns=["blood_pressure"])

**BMI Categories for Sample Data**

In [None]:
# Keep BMI continuous + add categories
def bmi_category(bmi):
    """Map a BMI value to an ordinal weight class.

    Args:
        bmi: A number (or numeric string) accepted by ``float()``.

    Returns:
        0 for bmi < 18.5, 1 for 18.5 <= bmi < 25, 2 for 25 <= bmi < 30,
        3 otherwise.

    The bounds are half-open so that no value falls between two classes
    (previously e.g. 24.95 and 29.95 dropped through to class 3).
    """
    bmi = float(bmi)
    if bmi < 18.5:
        return 0
    if bmi < 25:
        return 1
    if bmi < 30:
        return 2
    # NOTE: NaN compares False against every bound, so it also ends up here.
    return 3
# Derive the ordinal class for every sample patient; `bmi` itself is kept.
sample_patients['bmi_category'] = sample_patients['bmi'].apply(bmi_category)

**High Cholesterol & Encode Categorical Variables**

In [None]:
# Cholesterol (binary high risk)
sample_patients['high_cholesterol'] = (sample_patients['cholesterol'].astype(float) > 200).astype(int)

# Encode categorical variables
# Do NOT pass drop_first here: it would drop whichever category sorts first in
# THIS batch, which need not be the baseline dropped during training (a batch
# holding a single category would lose its only indicator and be scored as the
# baseline). All indicators are kept; the column-alignment step further down
# reindexes to the training columns, discarding the baseline indicator and
# zero-filling categories that are absent from the batch.
sample_patients = pd.get_dummies(sample_patients, columns=['discharge_destination'])

**Encode Remaining Categorical Variables**

In [None]:
# Load original data to fit label encoders
# The encoders fitted during training were not saved, so equivalent ones are
# rebuilt here from the raw string labels in the training CSV.
original_data = pd.read_csv("/content/MIMIC - III Patient Dataset.csv")

for column_name in ('gender', 'diabetes', 'hypertension'):
    # Fit on the original string labels from the training data
    encoder = LabelEncoder().fit(original_data[column_name].astype(str))
    sample_as_text = sample_patients[column_name].astype(str)
    sample_patients[column_name] = encoder.transform(sample_as_text)

**Feature Engineering for Sample Data**

In [None]:
# Feature engineering
# Same cut-offs and interaction terms as the training pipeline.
sample_patients['high_risk_age'] = sample_patients['age'].ge(70).astype(int)
sample_patients['polypharmacy'] = sample_patients['medication_count'].ge(5).astype(int)
sample_patients['long_stay'] = sample_patients['length_of_stay'].gt(14).astype(int)

sample_has_diabetes = sample_patients['diabetes'].eq(1)
sample_has_hypertension = sample_patients['hypertension'].eq(1)
sample_patients['multi_comorbidity'] = (sample_has_diabetes & sample_has_hypertension).astype(int)

sample_patients['age_bmi'] = sample_patients['age'].mul(sample_patients['bmi'])
sample_patients['stay_meds'] = sample_patients['length_of_stay'].mul(sample_patients['medication_count'])

**Align Sample Data with Training Features**

In [None]:
# Align columns with training data
# Take the column order from the reloaded imputer (scikit-learn records the
# feature names when an estimator is fitted on a DataFrame), so this section
# does not depend on the training frame `X` still being in memory. Fall back to
# `X` only if the attribute is unavailable.
if hasattr(imputer, "feature_names_in_"):
    training_columns = pd.Index(imputer.feature_names_in_)
else:
    training_columns = X.columns
# Columns missing from the samples are added as 0; extra columns are dropped.
sample_patients_aligned = sample_patients.reindex(columns=training_columns, fill_value=0)


**Impute Missing Values in Sample Data**

In [None]:
# Apply imputer
# Uses the medians learned during training; the result is re-wrapped so the
# column names survive the NumPy round-trip.
imputed_values = imputer.transform(sample_patients_aligned)
sample_patients_imputed = pd.DataFrame(imputed_values, columns=training_columns)

**Scale Sample Data Features**

In [None]:
# Scale the features
# Applies the already-fitted scaler and restores the column labels.
scaled_values = scaler.transform(sample_patients_imputed)
sample_patients_scaled = pd.DataFrame(scaled_values, columns=training_columns)

**Make Predictions on Sample Patients**

In [None]:
# Make predictions
# Probability of the positive class (column 1) plus the model's own hard labels.
sample_proba = cat_model.predict_proba(sample_patients_scaled)[:, 1]
sample_predictions = cat_model.predict(sample_patients_scaled)

**Display Prediction Results**

In [None]:
# Print results
# One line per patient, numbered from 1, with the label and its probability.
print("Predictions for sample patients:")
for patient_no, (pred, prob) in enumerate(zip(sample_predictions, sample_proba), start=1):
    verdict = 'Readmit' if pred == 1 else 'No Readmission'
    print(f"Patient {patient_no}: Prediction = {verdict}, Probability = {prob:.2f}")

Predictions for sample patients:
Patient 1: Prediction = Readmit, Probability = 0.86
Patient 2: Prediction = No Readmission, Probability = 0.28
Patient 3: Prediction = No Readmission, Probability = 0.30
Patient 4: Prediction = Readmit, Probability = 0.74


# **Evaluate and Compare Multiple Models**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score

# Define models to compare
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "Decision Tree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "Random Forest": RandomForestClassifier(class_weight="balanced", random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "CatBoost": cat_model  # already trained
}

results = {}

for name, model in models.items():
    # CatBoost is reused as fitted earlier in the notebook; the rest are fitted here.
    if model is not cat_model:
        model.fit(X_train, y_train)

    # Loop-specific names so Step 12's `y_prob` / `y_pred` are not overwritten.
    model_prob = model.predict_proba(X_test)[:, 1]
    model_pred = (model_prob >= 0.5).astype(int)

    results[name] = {
        "Accuracy": accuracy_score(y_test, model_pred),
        "Precision": precision_score(y_test, model_pred, zero_division=0),
        "Recall": recall_score(y_test, model_pred, zero_division=0),
        # zero_division matches precision/recall: score 0 without a warning.
        "F1 Score": f1_score(y_test, model_pred, zero_division=0),
        "ROC AUC": roc_auc_score(y_test, model_prob)
    }

# Convert results into a DataFrame for comparison (one row per model, best F1 first)
results_df = pd.DataFrame(results).T.sort_values(by="F1 Score", ascending=False)
print("Algorithm Comparison:\n")
print(results_df)

# Highlight best performing algorithm
best_algo = results_df["F1 Score"].idxmax()
print(f"\n✅ Best Performing Algorithm: {best_algo} (chosen for prediction)")


Algorithm Comparison:

                     Accuracy  Precision    Recall  F1 Score   ROC AUC
CatBoost             0.823167   0.711632  0.835467  0.768593  0.834051
Gradient Boosting    0.819333   0.715789  0.806069  0.758252  0.832573
Random Forest        0.812000   0.711696  0.781887  0.745142  0.833943
Logistic Regression  0.731000   0.592454  0.752015  0.662766  0.797969
Decision Tree        0.722500   0.610119  0.583215  0.596364  0.690605

✅ Best Performing Algorithm: CatBoost (chosen for prediction)
