In [26]:
import requests
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# plotting (optional but useful)
import matplotlib.pyplot as plt
import seaborn as sns

- Test one feature record: fetch the features of a single patient

In [27]:
# Base URL of the service queried by this notebook; the request cells build
# their endpoint URLs from it.
BASE_URL = "http://localhost:8001"

# Smoke test: fetch the feature record of one known patient.
patient_id = "2d0a99c4-1612-e78d-d540-3fbc42aecd07"
# requests applies no timeout by default, so without one a stalled service
# would block the kernel indefinitely.
res = requests.get(f"{BASE_URL}/features/patient/{patient_id}", timeout=10)
res.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error body
data = res.json()
# Wrap the payload in a list so it becomes a one-row DataFrame.
df = pd.DataFrame([data])
df

Unnamed: 0,id,patient_resource_id,readmission_30d,gender,birth_date,age,state,num_encounters,length_of_stay_days,avg_los,...,vital_signs_available,num_med_requests,num_procedures,polypharmacy,medication_codes,ner_entities,embedding_mean,clinical_complexity_score,created_at,updated_at
0,793,2d0a99c4-1612-e78d-d540-3fbc42aecd07,False,male,2011-06-22,14.5,KS,0,0.0,0.0,...,True,8,6,True,"[{'code': '309097', 'display': 'Cefuroxime 250...","[{'text': '2015-06-19', 'label': 'DATE'}, {'te...","[-0.00046721264758768183, 0.03949631533275048,...",0.5,2025-12-07T16:11:59.291961+01:00,2025-12-07T17:01:08.251247+01:00


- Bulk features: fetch the rows used for training

In [28]:
# Number of rows requested from the bulk endpoint.
num_of_data = 580
# Paging values go through `params` so requests builds and encodes the query
# string (the resulting URL is the same as the hand-written one). The timeout
# keeps a stalled service from hanging the kernel indefinitely.
training_data = requests.get(
    f"{BASE_URL}/features/all/",
    params={"skip": 0, "limit": num_of_data},
    timeout=30,
)
training_data.raise_for_status()  # fail loudly on 4xx/5xx
data = training_data.json()
# The feature records are read from the "rows" key of the payload.
# NOTE(review): the recorded outputs further down show 500 rows although 580
# are requested here -- confirm whether the API caps `limit`.
df_train = pd.DataFrame(data["rows"])
df_train.head()

Unnamed: 0,id,patient_resource_id,readmission_30d,gender,birth_date,age,state,num_encounters,length_of_stay_days,avg_los,...,vital_signs_available,num_med_requests,num_procedures,polypharmacy,medication_codes,ner_entities,embedding_mean,clinical_complexity_score,created_at,updated_at
0,793,2d0a99c4-1612-e78d-d540-3fbc42aecd07,False,male,2011-06-22,14.5,KS,0,0.0,0.0,...,True,8,6,True,"[{'code': '309097', 'display': 'Cefuroxime 250...","[{'text': '2015-06-19', 'label': 'DATE'}, {'te...","[-0.00046721264758768183, 0.03949631533275048,...",0.5,2025-12-07T16:11:59.291961+01:00,2025-12-07T17:01:08.251247+01:00
1,798,3099f27c-e16b-1c88-6163-d0bca6d028db,False,female,1992-06-30,33.4,KS,0,0.0,0.0,...,False,22,20,True,"[{'code': '197454', 'display': 'cephalexin 500...","[{'text': '2011-09-05', 'label': 'DATE'}, {'te...","[-0.0007081706642306277, 0.064067399927548, -0...",0.5,2025-12-07T16:12:30.437778+01:00,2025-12-07T17:01:39.847941+01:00
2,802,31731ce3-65df-74df-d03f-ae76d6ea4ce2,False,male,1955-02-03,70.8,KS,553,0.0,0.079566,...,False,0,396,False,[],"[{'text': '2002-01-12', 'label': 'DATE'}, {'te...","[-0.06977455932547383, -0.005456079248649275, ...",0.3,2025-12-07T16:12:51.559765+01:00,2025-12-07T17:02:02.786699+01:00
3,803,3255205e-4d50-2eef-bc73-1124ecc0805e,False,male,2002-07-03,23.4,KS,0,0.0,0.0,...,False,4,0,False,"[{'code': '1535362', 'display': 'sodium fluori...","[{'text': '2017-07-31', 'label': 'DATE'}, {'te...","[-0.028944958104855485, 0.03349053104304605, -...",0.3,2025-12-07T16:13:13.892878+01:00,2025-12-07T17:02:30.243156+01:00
4,807,344b24ed-3abb-a28d-b000-86310192378c,False,female,1970-06-29,55.4,KS,0,0.0,0.0,...,False,0,5,False,[],"[{'text': '2017-05-10', 'label': 'DATE'}, {'te...","[-0.03065208438783884, 0.021230044595610637, -...",0.3,2025-12-07T16:13:35.361279+01:00,2025-12-07T17:02:53.320966+01:00


In [29]:
# Only the last expression of a cell is rendered automatically, so the
# distribution is passed to display() explicitly instead of being computed
# and silently discarded.
display(df_train["num_encounters"].value_counts().sort_index())
# First few rows that have at least one encounter.
df_train[df_train["num_encounters"] > 0][["num_encounters"]].head()

Unnamed: 0,num_encounters
2,553
5,36
6,19
10,43
12,25


In [30]:
# Only the last expression of a cell is rendered automatically; the first two
# summaries are passed to display() so they are not computed and thrown away.
display(df_train["num_encounters"].value_counts())
# dropna=False also counts rows whose label is missing.
display(df_train["readmission_30d"].value_counts(dropna=False))
df_train[["num_encounters", "length_of_stay_days", "days_since_last_discharge"]].describe()

Unnamed: 0,num_encounters,length_of_stay_days,days_since_last_discharge
count,500.0,500.0,148.0
mean,17.764,0.024,1683.797297
std,56.918782,0.315632,2368.063326
min,0.0,0.0,659.0
25%,0.0,0.0,1003.75
50%,0.0,0.0,1200.5
75%,20.0,0.0,1424.75
max,623.0,5.0,20249.0


- readmission rate

In [31]:
# Summary statistics for the label column and for length of stay.
readmitted = df_train['readmission_30d']
readmission_rate = readmitted.mean()            # share of True values
readmission_count = readmitted.sum()            # number of True values
no_readmission_count = (~readmitted).sum()      # number of False values
avg_stay = df_train['length_of_stay_days'].mean()

# Build the report line by line, then emit it with a single print call.
summary_lines = [
    " Target Variable Analysis:",
    f"   - Readmission rate: {readmission_rate:.2%}",
    f"   - Readmissions: {readmission_count}",
    f"   - No readmissions: {no_readmission_count}",
    f"   - Average length of stay (days): {avg_stay:.2f}",
]
print("\n".join(summary_lines))

 Target Variable Analysis:
   - Readmission rate: 3.80%
   - Readmissions: 19
   - No readmissions: 481
   - Average length of stay (days): 0.02


- Keep/drop: select the columns used for modeling

In [32]:
# Name of the label column.
target = "readmission_30d"

# Columns excluded from the modeling frame, grouped by the reason for dropping them.
_id_and_date_cols = ["id", "patient_resource_id", "birth_date", "created_at", "updated_at"]
_list_valued_cols = ["condition_codes", "medication_codes", "ner_entities"]  # list
_deferred_cols = ["embedding_mean"]  # save for later iteration
_redundant_text_cols = ["primary_condition_display"]  # text, redundant with code

drop_cols = _id_and_date_cols + _list_valued_cols + _deferred_cols + _redundant_text_cols

# drop() returns a new frame, so df_train itself is left untouched.
df_model = df_train.drop(labels=drop_cols, axis="columns")
df_model.dtypes

readmission_30d                 bool
gender                        object
age                          float64
state                         object
num_encounters                 int64
length_of_stay_days          float64
avg_los                      float64
class_code                    object
type_code                     object
is_emergency                    bool
is_inpatient                    bool
days_since_last_discharge    float64
num_conditions                 int64
primary_condition_code        object
has_chronic_conditions          bool
num_observations               int64
obs_abnormal_count             int64
has_abnormal_glucose            bool
has_abnormal_hr                 bool
has_abnormal_temp               bool
has_abnormal_saturation         bool
vital_signs_available           bool
num_med_requests               int64
num_procedures                 int64
polypharmacy                    bool
clinical_complexity_score    float64
dtype: object

- Encode categoricals

In [33]:
# Fixed gender encoding; any value that is not a key here (including a missing
# value) becomes NaN.
GENDER_CODES = {"male": 0, "female": 1}
# Columns that are replaced by integer category codes.
CATEGORICAL_COLS = ["state", "class_code", "type_code", "primary_condition_code"]

# This cell overwrites columns of df_model in place. Without the dtype guards
# a second run would silently corrupt the frame: mapping the already-numeric
# gender column through GENDER_CODES yields all-NaN, and re-deriving category
# codes from previous codes renumbers them. A column that is already numeric
# is therefore treated as encoded and skipped.
if not pd.api.types.is_numeric_dtype(df_model["gender"]):
    df_model["gender"] = df_model["gender"].map(GENDER_CODES).astype("float")

for col in CATEGORICAL_COLS:
    if pd.api.types.is_numeric_dtype(df_model[col]):
        continue  # already encoded by an earlier run of this cell
    # cat.codes assigns one integer per category and -1 to missing values.
    # NOTE(review): the code -> category mapping depends on the values present
    # in this particular download and is not stored anywhere in this notebook;
    # confirm how the serving code is expected to reproduce it.
    df_model[col] = df_model[col].astype("category").cat.codes.astype("float")

- Target & Features

In [34]:
# Discard rows that have no label.
df_model = df_model.dropna(axis="index", subset=[target])

# Split the frame into the feature matrix and the integer label vector.
X = df_model.drop(labels=[target], axis="columns")
y = df_model[target].astype(int)

# handle missing values in features: fill each numeric column with its median
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test split done in the XGBoost cell, and they are not saved with the
# model artifacts -- confirm this is intended.
column_medians = X.median(numeric_only=True)
X = X.fillna(value=column_medians)

class_counts = y.value_counts()
print(f"Features shape: {X.shape}")
print(f"Target distribution:\n{class_counts}")
X.head()

Features shape: (500, 25)
Target distribution:
readmission_30d
0    481
1     19
Name: count, dtype: int64


Unnamed: 0,gender,age,state,num_encounters,length_of_stay_days,avg_los,class_code,type_code,is_emergency,is_inpatient,...,obs_abnormal_count,has_abnormal_glucose,has_abnormal_hr,has_abnormal_temp,has_abnormal_saturation,vital_signs_available,num_med_requests,num_procedures,polypharmacy,clinical_complexity_score
0,0.0,14.5,0.0,0,0.0,0.0,-1.0,-1.0,False,False,...,0,False,False,False,False,True,8,6,True,0.5
1,1.0,33.4,0.0,0,0.0,0.0,-1.0,-1.0,False,False,...,0,False,False,False,False,False,22,20,True,0.5
2,0.0,70.8,0.0,553,0.0,0.079566,0.0,6.0,False,False,...,0,False,False,False,False,False,0,396,False,0.3
3,0.0,23.4,0.0,0,0.0,0.0,-1.0,-1.0,False,False,...,0,False,False,False,False,False,4,0,False,0.3
4,1.0,55.4,0.0,0,0.0,0.0,-1.0,-1.0,False,False,...,0,False,False,False,False,False,0,5,False,0.3


- XGBoost

In [35]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix

# split: 80/20 hold-out; stratify=y keeps the class proportions of y in both
# parts, and the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# handle class imbalance: weight positives by the negative/positive ratio of
# the training split
pos = (y_train == 1).sum()
neg = (y_train == 0).sum()
if pos == 0:
    # neg / pos would otherwise evaluate to inf and be handed to XGBoost.
    raise ValueError(
        "No positive (readmission) samples in the training split; "
        "scale_pos_weight = neg / pos is undefined."
    )
scale_pos_weight = neg / pos

print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")
print(f"scale_pos_weight: {scale_pos_weight:.2f}")

# train XGBoost
model = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=42
)

model.fit(X_train, y_train)

# predict: column 1 of predict_proba is the probability of the positive class;
# hard labels use a 0.5 cut-off
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba >= 0.5).astype(int)

# evaluate on the held-out split
print("\n=== Model Performance ===")
print(f"AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
# zero_division=0 reports the same 0.0 as the default for a class that is never
# predicted, without emitting an UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Training samples: 400, Test samples: 100
scale_pos_weight: 25.67

=== Model Performance ===
AUC: 0.9661
Accuracy: 0.9600

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98        96
           1       0.50      0.75      0.60         4

    accuracy                           0.96       100
   macro avg       0.74      0.86      0.79       100
weighted avg       0.97      0.96      0.96       100


Confusion Matrix:
[[93  3]
 [ 1  3]]


## Save Model & Artifacts

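Save the trained model and its feature metadata (feature column order, evaluation metrics, hyperparameters) for deployment. Note: no scaler is saved, because this notebook does not fit one.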

In [36]:
import joblib
import json
import os
from datetime import datetime

# Create directories (paths are relative to the notebook's working directory)
model_dir = "../trained_models/xgboost"
metadata_dir = "../trained_models/metadata"
os.makedirs(model_dir, exist_ok=True)
os.makedirs(metadata_dir, exist_ok=True)

# 1. Save the trained model
# NOTE(review): a pickled estimator is generally tied to the library versions
# that wrote it -- confirm the serving environment pins the same versions.
model_path = os.path.join(model_dir, "readmission_model.pkl")
joblib.dump(model, model_path)
print(f"✓ Model saved to {model_path}")

# 2. Save feature columns (list of feature names in order)
feature_columns = X.columns.tolist()
feature_columns_path = os.path.join(metadata_dir, "feature_columns.json")
with open(feature_columns_path, "w", encoding="utf-8") as f:
    json.dump(feature_columns, f, indent=2)
print(f"✓ Feature columns saved to {feature_columns_path}")

# 3. Save model metadata
# Build the per-class report once and reuse it for precision/recall/F1.
# The label 1 appears under the string key "1" in the dict form of the report.
report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
positive_class_metrics = report["1"]

metadata = {
    "model_version": "v1.0.0",
    # astimezone() attaches the local UTC offset, so the timestamp is
    # unambiguous when read on another machine.
    "trained_at": datetime.now().astimezone().isoformat(),
    "n_samples_train": len(X_train),
    "n_samples_test": len(X_test),
    "n_features": len(feature_columns),
    "feature_names": feature_columns,
    "metrics": {
        "auc": float(roc_auc_score(y_test, y_pred_proba)),
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "precision": float(positive_class_metrics["precision"]),
        "recall": float(positive_class_metrics["recall"]),
        "f1": float(positive_class_metrics["f1-score"])
    },
    # Every hyperparameter that was set explicitly when the model was built,
    # so the run can be reproduced from the metadata alone.
    "hyperparameters": {
        "objective": model.objective,
        "eval_metric": model.eval_metric,
        "n_estimators": model.n_estimators,
        "max_depth": model.max_depth,
        "learning_rate": model.learning_rate,
        "subsample": model.subsample,
        "colsample_bytree": model.colsample_bytree,
        "scale_pos_weight": float(scale_pos_weight),
        "random_state": model.random_state
    },
    "class_distribution_train": {
        "negative": int(neg),
        "positive": int(pos)
    }
}

metadata_path = os.path.join(metadata_dir, "model_metadata.json")
with open(metadata_path, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)
print(f"✓ Metadata saved to {metadata_path}")

print("\n=== Model Deployment Ready ===")
print(f"Model version: {metadata['model_version']}")
print(f"Features: {metadata['n_features']}")
print(f"AUC: {metadata['metrics']['auc']:.4f}")

✓ Model saved to ../trained_models/xgboost\readmission_model.pkl
✓ Feature columns saved to ../trained_models/metadata\feature_columns.json
✓ Metadata saved to ../trained_models/metadata\model_metadata.json

=== Model Deployment Ready ===
Model version: v1.0.0
Features: 25
AUC: 0.9661
