### Import dependancies

In [6]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, classification_report

### 1. Load dataset

In [1]:
df = pd.read_csv(r"/home/adoh/Desktop/hospital_readmissions_30k.csv")
df.head(10)

Unnamed: 0,patient_id,age,gender,blood_pressure,cholesterol,bmi,diabetes,hypertension,medication_count,length_of_stay,discharge_destination,readmitted_30_days
0,1,74,Other,130/72,240,31.5,Yes,No,5,1,Nursing_Facility,Yes
1,2,46,Female,120/92,292,36.3,No,No,4,3,Nursing_Facility,No
2,3,89,Other,135/78,153,30.3,No,Yes,1,1,Home,No
3,4,84,Female,123/80,153,31.5,No,Yes,3,10,Home,No
4,5,32,Other,135/84,205,18.4,No,Yes,6,4,Nursing_Facility,No
5,6,75,Female,139/100,194,33.5,No,Yes,0,10,Home,No
6,7,47,Male,139/79,168,27.1,No,Yes,5,8,Home,No
7,8,18,Other,153/81,210,26.3,No,No,5,2,Home,No
8,9,68,Male,111/76,218,30.4,No,Yes,3,8,Home,No
9,10,23,Female,142/72,186,21.3,No,No,3,3,Home,No


In [2]:
df.shape

(30000, 12)

In [3]:
df.isnull

<bound method DataFrame.isnull of        patient_id  age  gender blood_pressure  cholesterol   bmi diabetes  \
0               1   74   Other         130/72          240  31.5      Yes   
1               2   46  Female         120/92          292  36.3       No   
2               3   89   Other         135/78          153  30.3       No   
3               4   84  Female         123/80          153  31.5       No   
4               5   32   Other         135/84          205  18.4       No   
...           ...  ...     ...            ...          ...   ...      ...   
29995       29996   29    Male         112/80          157  31.4       No   
29996       29997   20   Other         146/83          198  30.2       No   
29997       29998   40   Other         131/96          275  23.3      Yes   
29998       29999   61   Other         124/75          232  34.3       No   
29999       30000   78  Female         148/77          240  35.5      Yes   

      hypertension  medication_count  len

### 2. Basic feature engineering

In [7]:
def parse_bp(bp):
    try:
        m = re.match(r"(\d+)[^\d]+(\d+)", str(bp))
        return int(m.group(1)), int(m.group(2))
    except:
        return np.nan, np.nan

df[["systolic", "diastolic"]] = pd.DataFrame(df["blood_pressure"].apply(parse_bp).tolist())


In [8]:
# BMI and age categories
def bmi_cat(b):
    if pd.isna(b): return "unknown"
    b = float(b)
    if b < 18.5: return "underweight"
    if b < 25: return "normal"
    if b < 30: return "overweight"
    return "obese"

df["bmi_cat"] = df["bmi"].apply(bmi_cat)
df["age_group"] = pd.cut(df["age"], bins=[0,18,35,50,65,80,120],
                         labels=["0-18","19-35","36-50","51-65","66-80","81+"])


### 3. Prepare features and target

In [9]:
# 3. Prepare features and target
df["target"] = df["readmitted_30_days"].map({"Yes": 1, "No": 0})
features = [
    "age", "systolic", "diastolic", "cholesterol", "bmi",
    "medication_count", "length_of_stay",
    "gender", "diabetes", "hypertension",
    "discharge_destination", "bmi_cat", "age_group"
]

X = df[features]
y = df["target"]

### 4. Split data

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

### 5. Preprocessing pipeline

In [11]:
# 5. Preprocessing pipeline
num_features = ["age","systolic","diastolic","cholesterol","bmi","medication_count","length_of_stay"]
cat_features = ["gender","diabetes","hypertension","discharge_destination","bmi_cat","age_group"]

num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features)
])

### 6. Model selection — Random Forest

In [14]:
# 6. Model selection — Random Forest with class balancing
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight="balanced_subsample",  # FIX: penalizes majority class dominance
    max_depth=10,
    min_samples_leaf=5,
    n_jobs=-1
)

clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])


### 7. Train model

In [15]:
# 7. Train model
clf.fit(X_train, y_train)

# 8. Evaluate model (probability-based threshold tuning)
y_pred_proba = clf.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba >= 0.3).astype(int)  # lower threshold to catch more positives

cm = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)

print("Confusion Matrix:\n", cm)
print(f"\nPrecision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["No Readmit", "Readmit"], zero_division=0))

Confusion Matrix:
 [[ 100 5165]
 [  17  718]]

Precision: 0.122
Recall: 0.977

Classification Report:
              precision    recall  f1-score   support

  No Readmit       0.85      0.02      0.04      5265
     Readmit       0.12      0.98      0.22       735

    accuracy                           0.14      6000
   macro avg       0.49      0.50      0.13      6000
weighted avg       0.76      0.14      0.06      6000



### 8. Save the model

In [17]:
import joblib
joblib.dump(clf, "readmission_model.pkl")
print("✅ Model saved as readmission_model.pkl")

✅ Model saved as readmission_model.pkl
