In [2]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

StatementMeta(, e0d39fb9-ffeb-45d3-8c16-7f26dcb3740d, 4, Finished, Available, Finished)

In [3]:
# Names of the Fabric workspace, lakehouse and Delta table holding the
# training data (the same values load_table uses as its defaults).
table = "hosp_training_data"
write_lakehouse = "Gold_LH"
workspace = "HospAdmission"


def load_table(table: str, ws='HospAdmission', lh='Gold_LH'):
    """Read a lakehouse Delta table through the global ``spark`` session.

    Args:
        table: Name of the table under the lakehouse's ``Tables/`` folder.
        ws: Workspace name placed before ``@onelake`` in the abfss URI.
        lh: Lakehouse name (``.Lakehouse`` is appended to it in the URI).

    Returns:
        Whatever ``spark.read.format('delta').load(...)`` returns for the
        table's abfss URI.

    Note:
        ``spark`` is not defined in this notebook; presumably the notebook
        runtime injects the session -- confirm when running elsewhere.
    """
    delta_reader = spark.read.format('delta')
    return delta_reader.load(
        f"abfss://{ws}@onelake.dfs.fabric.microsoft.com/{lh}.Lakehouse/Tables/{table}"
    )

StatementMeta(, e0d39fb9-ffeb-45d3-8c16-7f26dcb3740d, 5, Finished, Available, Finished)

In [4]:


# -----------------------
# Load the training table and separate features from the label
# -----------------------
df_spark = load_table('hosp_training_data')

# scikit-learn works on in-memory data, so collect the Spark result into pandas.
df = df_spark.toPandas()

target_col = "ReadmittedWithin30Days"
# Columns removed from the feature matrix: the label itself plus two columns
# that are not used as predictors.
non_feature_cols = [target_col, "PatientID", "CostPerStay"]

y = df[target_col]
X = df.drop(columns=non_feature_cols)

# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# -----------------------
# Identify categorical and numeric columns
# -----------------------
# Select every numeric dtype rather than only int64/float64: Spark's toPandas()
# produces int32/float32/int16 for IntegerType/FloatType/ShortType columns, and
# a column that lands in neither list is silently discarded by the
# ColumnTransformer below (its default is remainder="drop").
numeric_cols = X.select_dtypes(include="number").columns.tolist()
# Boolean and pandas-categorical columns are one-hot encoded together with the
# string (object) columns instead of being dropped.
categorical_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

# Surface anything that is still unused (e.g. datetime columns) so that a
# dropped feature is a visible decision rather than a silent one.
_used_cols = set(numeric_cols) | set(categorical_cols)
ignored_cols = [col for col in X.columns if col not in _used_cols]
if ignored_cols:
    print(f"Columns not used as features (unsupported dtype): {ignored_cols}")

# Preprocessing pipeline: standardise numeric features, one-hot encode the
# categorical ones. handle_unknown="ignore" keeps prediction from failing on
# categories that were not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    ]
)

# -----------------------
# Candidate classifiers; each dict key doubles as the MLflow run name
# -----------------------
# NOTE(review): the recorded output for this cell shows accuracy around
# 0.95-0.96 but recall of at most 0.11 for all three models, which suggests a
# strongly imbalanced label -- consider class weighting or threshold tuning.
random_seed = 42

xgb_params = dict(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=random_seed,
)

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=random_seed),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=random_seed),
    "XGBoost": XGBClassifier(**xgb_params),
}

experiment_name = "exp-admissionrisk-prediction"

mlflow.set_experiment(experiment_name)

# Autologging is a global switch, so enable it once here instead of on every
# loop iteration; each fit below still logs into its own run because it
# happens inside that run's `with` block.
mlflow.autolog()  # Automatically logs parameters, metrics, and model

# -----------------------
# Train, evaluate, and log with MLflow
# -----------------------
for model_name, model in models.items():
    # One MLflow run per candidate model, named after its key in `models`.
    with mlflow.start_run(run_name=model_name):
        # Create pipeline. The same `preprocessor` object is shared by every
        # pipeline and is re-fitted on X_train in each iteration.
        pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("classifier", model)
        ])

        # Train
        pipeline.fit(X_train, y_train)

        # Predict hard labels and the score for the second class column
        # (assumes a binary label whose positive class sorts last -- TODO confirm).
        y_pred = pipeline.predict(X_test)
        y_proba = pipeline.predict_proba(X_test)[:, 1]

        # Evaluate metrics on the held-out split
        acc = accuracy_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_proba)
        # zero_division=0 reports 0.0 without an undefined-metric warning when
        # a model predicts no positive cases at all.
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Log the held-out metrics explicitly, in a single call, under
        # `test_`-prefixed names.
        mlflow.log_metrics({
            "test_accuracy": acc,
            "test_roc_auc": roc_auc,
            "test_precision": precision,
            "test_recall": recall,
            "test_f1_score": f1,
        })

        print(f"{model_name} - Accuracy: {acc:.4f}, ROC AUC: {roc_auc:.4f}, "
              f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")


StatementMeta(, e0d39fb9-ffeb-45d3-8c16-7f26dcb3740d, 6, Finished, Available, Finished)

2025/09/04 13:20:49 INFO mlflow.tracking.fluent: Experiment with name 'exp-admissionrisk-prediction' does not exist. Creating a new experiment.
2025/09/04 13:20:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/09/04 13:20:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
                        ...' (569 characters) is truncated to 500 characters to meet the length limit.


LogisticRegression - Accuracy: 0.9584, ROC AUC: 0.9185, Precision: 0.2500, Recall: 0.0556, F1: 0.0909


2025/09/04 13:21:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/09/04 13:21:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
                        ...' (558 characters) is truncated to 500 characters to meet the length limit.


RandomForest - Accuracy: 0.9605, ROC AUC: 0.8942, Precision: 0.3333, Recall: 0.0556, F1: 0.0952


2025/09/04 13:21:54 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/09/04 13:21:54 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
                        ...' (1273 characters) is truncated to 500 characters to meet the length limit.
              colsample_bylevel=None, c...' (754 characters) is truncated to 500 characters to meet the length limit.


XGBoost - Accuracy: 0.9543, ROC AUC: 0.8870, Precision: 0.2500, Recall: 0.1111, F1: 0.1538
