In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from pathlib import Path
import joblib

# --- Configuration ---
# Locations used throughout the notebook: the dataset is read from DATA_FILE
# and fitted pipelines are written under MODEL_DIR.
OUTPUT_DIR = Path("../output")
DATA_FILE = OUTPUT_DIR / "readmissions_dataset.parquet"
MODEL_DIR = Path("../models")
# parents=True keeps this cell working if MODEL_DIR is later pointed at a nested
# path; exist_ok=True keeps the cell idempotent when it is re-run.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# --- Plotting Style ---
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style first, then set the default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

# 1. Data Loading

In [5]:
# Read the dataset from the Parquet file configured in DATA_FILE.
df = pd.read_parquet(DATA_FILE)

# Report the overall size of the loaded frame.
n_rows, n_cols = df.shape
print(f"Dataset loaded with {n_rows:,} rows and {n_cols} columns.")

# Preview the first rows using the notebook's rich display.
print("\nFirst 5 rows:")
display(df.head())

# Summarise column dtypes and non-null counts.
print("\nData Info:")
df.info()

Dataset loaded with 104,068 rows and 14 columns.

First 5 rows:


Unnamed: 0,encounter_id,patient_id,readmitted_within_30_days,length_of_stay,age_at_admission,gender,race,marital_status,admission_reason,admission_reason_detail,prior_admissions_last_year,num_diagnoses,num_procedures,num_medications
0,f32b1a30-3f4a-2b23-831d-c45357e8d9f0,ff1ffc37-c7f1-c69e-db86-97e58dd06a38,0,2,36,female,White,M,Admission to surgical department (procedure),Sterilization requested (situation),0,1.0,0.0,0.0
1,0fc59893-26d9-4651-ee0d-3b91506cbeb9,ff1ffc37-c7f1-c69e-db86-97e58dd06a38,1,1,65,female,White,M,Patient transfer to intensive care unit (proce...,History of coronary artery bypass grafting (si...,0,0.0,5.0,0.0
2,65a91317-1fa0-cf87-27a1-7b2daccba883,ff1ffc37-c7f1-c69e-db86-97e58dd06a38,0,1,65,female,White,M,Admission to ward (procedure),History of coronary artery bypass grafting (si...,1,0.0,4.0,0.0
3,e2477992-082b-69ca-3152-6fecf4442626,45ccdf82-db5c-3947-2d97-ee18a8a9c4e3,0,12,53,female,White,W,Hospital admission for isolation (procedure),Disease caused by severe acute respiratory syn...,0,4.0,27.0,0.0
4,735f3287-d205-1ec8-9668-fcdac03f306a,a5a065d0-a35b-d8e3-fda6-0e92c0a40926,0,1,63,female,White,M,Admission to intensive care unit (procedure),Chronic congestive heart failure (disorder),0,0.0,5.0,0.0



Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104068 entries, 0 to 104067
Data columns (total 14 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   encounter_id                104068 non-null  object 
 1   patient_id                  104068 non-null  object 
 2   readmitted_within_30_days   104068 non-null  int32  
 3   length_of_stay              104068 non-null  int64  
 4   age_at_admission            104068 non-null  int64  
 5   gender                      104068 non-null  object 
 6   race                        104068 non-null  object 
 7   marital_status              104068 non-null  object 
 8   admission_reason            104068 non-null  object 
 9   admission_reason_detail     104063 non-null  object 
 10  prior_admissions_last_year  104068 non-null  int64  
 11  num_diagnoses               104068 non-null  float64
 12  num_procedures              104068 non-null  float64
 13  nu

# 2. Feature Selection and Data Splitting

In [6]:
# Label column, and the columns that are not used as predictors
TARGET = 'readmitted_within_30_days'
# ID columns and high-cardinality text columns are left out of this initial model
non_feature_columns = (TARGET, 'encounter_id', 'patient_id', 'admission_reason', 'admission_reason_detail')
features = [col for col in df.columns if col not in non_feature_columns]

X = df[features]
y = df[TARGET]

# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed.
# NOTE(review): the data preview shows the same patient_id on several encounters;
# a row-level split can place one patient's encounters in both sets — confirm
# whether a group-aware split (by patient_id) is wanted here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Training set shape: (83254, 9)
Testing set shape: (20814, 9)


# 3. Preprocessing Pipeline

In [7]:
# Partition the feature columns by dtype: numeric vs. everything else
numeric_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(exclude=np.number).columns)

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")

# Numeric columns are standardised; categorical columns are one-hot encoded
# (handle_unknown='ignore' so categories not seen during fit do not raise).
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

Numeric features: ['length_of_stay', 'age_at_admission', 'prior_admissions_last_year', 'num_diagnoses', 'num_procedures', 'num_medications']
Categorical features: ['gender', 'race', 'marital_status']


# 4. Model Training

In [8]:
# Logistic regression classifier.
# class_weight='balanced' is used because the target classes are imbalanced.
model = LogisticRegression(
    class_weight='balanced',
    random_state=42,
    max_iter=1000,
)

# Chain preprocessing and the classifier into a single estimator
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', model),
]
full_pipeline = Pipeline(steps=pipeline_steps)

# Fit on the training split
print("Training the Logistic Regression model...")
full_pipeline.fit(X_train, y_train)
print("Training complete.")

Training the Logistic Regression model...
Training complete.


# 5. Model Evaluation

In [9]:
# Predict hard labels and the probability of class 1 on the held-out test set
y_pred = full_pipeline.predict(X_test)
y_pred_proba = full_pipeline.predict_proba(X_test)[:, 1]

# Evaluate the model
# Use "\n" (not "\\n") so a blank line is printed instead of the literal characters \n.
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))

# ROC AUC is computed from the predicted probabilities, not the hard labels.
print("--- ROC AUC Score ---")
print(f"{roc_auc_score(y_test, y_pred_proba):.4f}")

\n--- Classification Report ---
              precision    recall  f1-score   support

           0       0.94      0.74      0.83     17107
           1       0.39      0.78      0.52      3707

    accuracy                           0.75     20814
   macro avg       0.67      0.76      0.68     20814
weighted avg       0.84      0.75      0.77     20814

--- ROC AUC Score ---
0.8157


# 6. Save the Model

In [10]:
# Persist the fitted pipeline (preprocessor + logistic regression) with joblib
model_filename = MODEL_DIR / 'logistic_regression_pipeline_v1.joblib'
joblib.dump(full_pipeline, model_filename)

# Use "\n" (not "\\n") so a blank line is printed instead of the literal characters \n.
print(f"\nModel pipeline saved to: {model_filename}")

\nModel pipeline saved to: ../models/logistic_regression_pipeline_v1.joblib


# 7. Gradient Boosting Model (XGBoost)

In [11]:
import xgboost as xgb

# --- XGBoost Model Definition ---

# Calculate the scale_pos_weight for handling class imbalance:
# the number of negative (0) samples divided by the number of positive (1)
# samples in the training split. The counts are computed once and reused.
class_counts = y_train.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]

# `use_label_encoder` is intentionally not passed: the installed XGBoost reports
# it as an unused parameter and emits a warning when it is supplied.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=scale_pos_weight,  # Handles imbalance
    eval_metric='logloss',
    random_state=42
)

# Create the full pipeline with the XGBoost model
# NOTE(review): `preprocessor` is the same object already placed in `full_pipeline`;
# fitting this pipeline presumably refits it in place — confirm whether each
# pipeline should get its own copy.
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', xgb_model)])

# Train the model
print("Training the XGBoost model...")
xgb_pipeline.fit(X_train, y_train)
print("Training complete.")

Training the XGBoost model...
Training complete.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


# 8. XGBoost Model Evaluation

In [12]:
# Predict hard labels and the probability of class 1 with the XGBoost pipeline
y_pred_xgb = xgb_pipeline.predict(X_test)
y_pred_proba_xgb = xgb_pipeline.predict_proba(X_test)[:, 1]

# Evaluate the model
# Use "\n" (not "\\n") so a blank line is printed instead of the literal characters \n.
print("\n--- Classification Report (XGBoost) ---")
print(classification_report(y_test, y_pred_xgb))

# ROC AUC is computed from the predicted probabilities, not the hard labels.
print("--- ROC AUC Score (XGBoost) ---")
print(f"{roc_auc_score(y_test, y_pred_proba_xgb):.4f}")

\n--- Classification Report (XGBoost) ---
              precision    recall  f1-score   support

           0       0.98      0.79      0.88     17107
           1       0.49      0.92      0.64      3707

    accuracy                           0.81     20814
   macro avg       0.73      0.85      0.76     20814
weighted avg       0.89      0.81      0.83     20814

--- ROC AUC Score (XGBoost) ---
0.9265


# 9. Save the XGBoost Model

In [13]:
# Persist the fitted pipeline (preprocessor + XGBoost classifier) with joblib
model_filename_xgb = MODEL_DIR / 'xgboost_pipeline_v1.joblib'
joblib.dump(xgb_pipeline, model_filename_xgb)

# Use "\n" (not "\\n") so a blank line is printed instead of the literal characters \n.
print(f"\nModel pipeline saved to: {model_filename_xgb}")

\nModel pipeline saved to: ../models/xgboost_pipeline_v1.joblib
