In [4]:
# ============================================================
# NOTEBOOK: 6_automl_xgboost_compare.ipynb
# GOAL:
#   - Run an AutoML Classification job on the SAME enriched dataset
#   - Compare its best model against your tuned manual XGBoost model
#
# PREREQUISITES:
#   - config.json present one level up from this notebook (../config.json)
#   - Workspace, resource group, and subscription correctly set in config.json
#   - Enriched MLTable asset already registered (e.g. "diabetes-clinical-enriched-130us")
#   - Compute cluster "clinical-cluster-cpu" exists
# ============================================================

# -------------------------------
# 0. IMPORTS AND WORKSPACE SETUP
# -------------------------------
from azure.ai.ml import MLClient, Input
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml.constants import AssetTypes

# Authenticate against Azure: try the non-interactive credential chain first,
# and only fall back to an interactive browser login when that chain cannot
# produce a token.
try:
    credential = DefaultAzureCredential()
    # Request a management-plane token immediately so that an unusable
    # credential chain is detected inside this try block.
    credential.get_token("https://management.azure.com/.default")
except Exception as auth_error:
    # Deliberately broad, best-effort fallback. The reason is reported so a
    # misconfigured environment can be diagnosed instead of silently masked.
    print("DefaultAzureCredential failed; falling back to InteractiveBrowserCredential...")
    print(f"  Reason: {type(auth_error).__name__}: {auth_error}")
    credential = InteractiveBrowserCredential()

# Connect using config.json located one level up from notebooks/
ml_client = MLClient.from_config(credential=credential, path="../config.json")
print(f"Connected to workspace: {ml_client.workspace_name}")

# -------------------------------
# 1. PREPARE ENRICHED DATA INPUT
# -------------------------------
# IMPORTANT:
# - Use the SAME enriched dataset you used in your manual pipeline.
# - Update ENRICHED_ASSET_NAME and ENRICHED_VERSION if needed.
#   You can confirm in Azure ML Studio under Data → Assets.

# Registered data asset that feeds AutoML: MLTable name and version.
ENRICHED_ASSET_NAME = "diabetes-clinical-enriched-130us"
ENRICHED_VERSION = "1124_1331"

# Label column to predict; keep it identical to the manual model's target.
TARGET_COL = "readmitted"

# Asset reference in the form azureml:<name>:<version>.
enriched_asset_uri = f"azureml:{ENRICHED_ASSET_NAME}:{ENRICHED_VERSION}"

# Wrap the reference as an MLTable-typed input for the AutoML job.
training_data_input = Input(path=enriched_asset_uri, type=AssetTypes.MLTABLE)

# -------------------------------
# 2. DEFINE AUTOML CLASSIFICATION JOB
# -------------------------------
from azure.ai.ml import automl

# Job-level settings handed to the AutoML classification factory.
classification_settings = {
    "compute": "clinical-cluster-cpu",                  # existing compute target
    "experiment_name": "Clinical_Readmission_AutoML",
    "training_data": training_data_input,               # enriched MLTable input
    "target_column_name": TARGET_COL,                   # label column
    "primary_metric": "AUC_weighted",                   # metric AutoML optimizes
}
automl_job = automl.classification(**classification_settings)
automl_job.display_name = "Clinical_Readmission_AutoML_Enriched"

# Budget for the AutoML search.
search_limits = {
    "timeout_minutes": 60,          # total time budget for the AutoML run
    "max_trials": 20,               # maximum child runs (model candidates)
    "max_concurrent_trials": 3,     # trials allowed to run in parallel
}
automl_job.set_limits(**search_limits)

# Optional: restrict the search to specific algorithms. Left disabled so the
# search stays broad; uncomment to narrow it.
# automl_job.set_training(
#     enable_onnx_compatible_models=True,
#     allowed_training_algorithms=[
#         "LightGBM",
#         "XGBoostClassifier",
#         "RandomForest",
#     ],
# )

# -------------------------------
# 3. SUBMIT THE AUTOML JOB
# -------------------------------
# Send the job definition to the workspace and keep the returned job object.
returned_automl = ml_client.jobs.create_or_update(automl_job)

# Report the submission and where to follow it, one item per output line.
print(
    "AutoML job submitted.",
    "Track it in Azure ML Studio at:",
    returned_automl.studio_url,
    sep="\n",
)

# ------------------------------------------------------------
# NEXT STEPS (MANUAL, NOT CODE):
# ------------------------------------------------------------
# 1. Go to the printed Studio URL.
# 2. Wait for the AutoML run to complete.
# 3. In the "Models" or "Best model summary" tab:
#    - Note the best model type (e.g., LightGBM, XGBoost, etc.).
#    - Record its AUC_weighted and accuracy.
#    - Register the best model under a clear name, e.g.:
#         clinical-readmission-automl-best
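# 4. Compare like for like: AutoML reports metrics on its own validation data,
#    so confirm the manual model was scored on a comparable hold-out set and
#    that both AUC figures use the same averaging.
#    - Manual tuned XGBoost: AUC ≈ 0.6838, accuracy ≈ 0.8884
#    - AutoML best: AUC_weighted and accuracy from the run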
# 5. Decide which one to deploy first (manual vs AutoML).


Found the config file in: ..\config.json
Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


Connected to workspace: AML-Clinical-Readmission
AutoML job submitted.
Track it in Azure ML Studio at:
https://ml.azure.com/runs/plum_celery_7580p0pqdn?wsid=/subscriptions/3aeb63fe-f831-47f0-8175-3732f2efd2a1/resourcegroups/RG-Clinical-Readmission/workspaces/AML-Clinical-Readmission&tid=deb5bf9d-8bb0-4783-8f54-42a424392492


In [2]:
import pandas as pd

# Sanity check: load two local CSV copies of the clinical dataset and report
# whether they match in shape, column order, and full content.
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The recorded run of this cell emitted a warning on
# both read_csv calls (consistent with pandas' mixed-type DtypeWarning).
# Whole-file inference avoids that, and keeps the dtype-sensitive
# DataFrame.equals check below from depending on chunk boundaries.
df1 = pd.read_csv("../data/processed_icd/diabetes_clinical_enriched.csv", low_memory=False)
df2 = pd.read_csv("../data_clinical_upload/diabetes_clinical.csv", low_memory=False)

print(df1.shape, df2.shape)
# Column comparison is order-sensitive: the two name lists must match exactly.
print("Same columns:", list(df1.columns) == list(df2.columns))
# DataFrame.equals requires the same shape, dtypes, and values.
print("Exactly equal:", df1.equals(df2))


  df1 = pd.read_csv("../data/processed_icd/diabetes_clinical_enriched.csv")
  df2 = pd.read_csv("../data_clinical_upload/diabetes_clinical.csv")


(101766, 52) (101766, 52)
Same columns: True
Exactly equal: True
