In [None]:
# =========================================================
# CELL 1 – CONNECT + CONSTANTS
# =========================================================
from azure.ai.ml import MLClient, dsl, Input, Output, command
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.sweep import Choice, Uniform
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

# 1. CONNECT TO WORKSPACE (silent first, then interactive)
def _resolve_credential():
    """Return the credential used for the workspace connection.

    A DefaultAzureCredential is built and probed with a management-scope
    token request. If either step raises, a notice is printed and an
    InteractiveBrowserCredential is returned instead.
    """
    try:
        silent_credential = DefaultAzureCredential()
        # Request a token right away so a failure is caught by this try block.
        silent_credential.get_token("https://management.azure.com/.default")
    except Exception:
        print("DefaultAzureCredential failed; falling back to InteractiveBrowserCredential...")
        return InteractiveBrowserCredential()
    return silent_credential


credential = _resolve_credential()

# The workspace config.json sits one directory above this notebook.
ml_client = MLClient.from_config(
    credential=credential,
    path="../config.json",
)
print(f"Connected to workspace: {ml_client.workspace_name}")

# 2. ENVIRONMENT (registered environment reference, name:version)
ENV_STR = "azureml:clinical-prep-env:2"
print(f"Using environment: {ENV_STR}")

# 3. LOCAL TRAIN/TEST FOLDERS PRODUCED BY prep.py
#    train folder holds train.csv, test folder holds test.csv
TRAIN_LOCAL_PATH = "../data/processed_icd/train"
TEST_LOCAL_PATH = "../data/processed_icd/test"

# Names under which the two folders are registered as data assets.
TRAIN_ASSET_NAME = "diabetes-train-130us"
TEST_ASSET_NAME = "diabetes-test-130us"

Found the config file in: ..\config.json
Class DeploymentTemplateOperations: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Connected to workspace: AML-Clinical-Readmission
Using environment: azureml:clinical-prep-env:2


In [3]:
# =========================================================
# CELL 2 – REGISTER TRAIN / TEST URI_FOLDER ASSETS
# =========================================================

def _register_uri_folder(asset_name, local_path, description):
    """Register a local folder as a URI_FOLDER data asset.

    Builds a Data entity from the arguments, submits it through
    ml_client.data.create_or_update, and returns the entity that call
    hands back (which carries the registered name and version).
    """
    folder_asset = Data(
        name=asset_name,
        description=description,
        type=AssetTypes.URI_FOLDER,
        path=local_path,
    )
    return ml_client.data.create_or_update(folder_asset)


# TRAIN folder -> URI_FOLDER data asset
train_data_asset = _register_uri_folder(
    TRAIN_ASSET_NAME,
    TRAIN_LOCAL_PATH,
    "Train split for clinical readmission (enriched + binary target).",
)
print(f"Train asset: {train_data_asset.name}:{train_data_asset.version}")

# TEST folder -> URI_FOLDER data asset
test_data_asset = _register_uri_folder(
    TEST_ASSET_NAME,
    TEST_LOCAL_PATH,
    "Test split for clinical readmission (enriched + binary target).",
)
print(f"Test asset:  {test_data_asset.name}:{test_data_asset.version}")

# Keep the registered versions; the training job definition pins its
# inputs to exactly these versions.
TRAIN_VER, TEST_VER = train_data_asset.version, test_data_asset.version

print("Using TRAIN_VER =", TRAIN_VER)
print("Using TEST_VER  =", TEST_VER)

Train asset: diabetes-train-130us:3
Test asset:  diabetes-test-130us:3
Using TRAIN_VER = 3
Using TEST_VER  = 3


In [15]:
# =========================================================
# CELL 3 – DEFINE BASE TRAINING JOB (XGBOOST + MLflow)
# =========================================================

# NOTE:
# - The train.py expected here is the version that:
#   * uses 'readmitted_30d_binary' if present
#   * logs 'accuracy_custom' and 'auc_custom' to MLflow
# - target_col is passed explicitly on the command line to avoid ambiguity.
from azure.ai.ml import command  # same import as in Cell 1

# Command line executed on the compute target. Each ${{...}} placeholder is
# bound to the matching entry in the job's inputs/outputs below.
train_cli = (
    "python train.py "
    "--train_data ${{inputs.train_data}} "
    "--test_data ${{inputs.test_data}} "
    "--model_output ${{outputs.model_output}} "
    "--target_col readmitted_30d_binary "
    "--max_depth ${{inputs.max_depth}} "
    "--learning_rate ${{inputs.learning_rate}} "
    "--scale_pos_weight ${{inputs.scale_pos_weight}} "
    "--n_estimators ${{inputs.n_estimators}} "
    "--subsample ${{inputs.subsample}} "
    "--colsample_bytree ${{inputs.colsample_bytree}}"
)

# Data inputs: the registered URI_FOLDER assets, pinned to the versions
# captured at registration time.
data_inputs = {
    "train_data": Input(
        type="uri_folder",
        path=f"azureml:{TRAIN_ASSET_NAME}:{TRAIN_VER}",
    ),
    "test_data": Input(
        type="uri_folder",
        path=f"azureml:{TEST_ASSET_NAME}:{TEST_VER}",
    ),
}

# Baseline hyperparameter values, exposed as job inputs so they can be
# overridden per trial.
# NOTE(review): the subsample / colsample_bytree values look like they were
# carried over from an earlier tuning run — confirm their provenance.
baseline_hyperparams = {
    "max_depth": 6,
    "scale_pos_weight": 1.0,
    "learning_rate": 0.1,
    "n_estimators": 200,
    "subsample": 0.80967,
    "colsample_bytree": 0.94020,
}

base_train_job = command(
    display_name="xgb_manual_cli_baseline_binary",
    description=(
        "Baseline XGBoost training for 30-day readmission (binary target) "
        "using train.py on registered train/test assets."
    ),
    code="../src",  # folder that contains train.py
    command=train_cli,
    inputs={**data_inputs, **baseline_hyperparams},
    outputs={"model_output": Output(type="uri_folder")},
    environment=ENV_STR,
    compute="clinical-cluster-cpu",
)

# (Optional) If you want to run a single baseline job first, uncomment:
# baseline_run = ml_client.jobs.create_or_update(
#     base_train_job,
#     experiment_name="Clinical_Readmission_XGB_Manual_Binary",
# )
# print("Baseline run URL:", baseline_run.studio_url)

In [16]:
# =========================================================
# CELL 4 – DEFINE AND SUBMIT SWEEP JOB
# =========================================================

# We sweep over key XGBoost hyperparameters and optimize the balanced-accuracy
# metric named in PRIMARY_METRIC. The name must match, character for character,
# a metric that train.py logs via MLflow; otherwise the sweep has nothing to
# rank trials by.
# NOTE(review): the Cell 3 notes only mention 'accuracy_custom' and
# 'auc_custom' — confirm train.py also logs 'balanced_accuracy_custom'.
from azure.ai.ml.sweep import BanditPolicy

PRIMARY_METRIC = "balanced_accuracy_custom"

# Define the sweep job. The objective (metric + goal) is fully specified here;
# the returned sweep node carries it as `sweep_job.objective`, so it is not
# assigned again further down.
sweep_job = base_train_job.sweep(
    compute="clinical-cluster-cpu",
    # Random sampling is compatible with early termination. Bayesian sampling
    # is not: drop the BanditPolicy below before switching to "bayesian".
    sampling_algorithm="random",
    primary_metric=PRIMARY_METRIC,
    goal="Maximize",
    # Only these four inputs vary per trial; subsample and colsample_bytree
    # keep the baseline values defined on base_train_job.
    search_space={
        "max_depth": Choice(values=[8, 9, 10]),
        "scale_pos_weight": Uniform(min_value=8.0, max_value=13.0),
        "learning_rate": Uniform(min_value=0.01, max_value=0.05),
        "n_estimators": Choice(values=[200, 400]),
    },
)

# Limit total trials and parallelism
sweep_job.set_limits(
    max_total_trials=30,
    max_concurrent_trials=4,
)

# Early termination policy. An "interval" is one logging of the primary metric
# by a trial, not one completed trial.
# NOTE(review): if train.py logs the primary metric only once, at the end of
# training, no trial reaches the first evaluation point and this policy never
# cancels anything — confirm how often the metric is logged.
bandit_policy = BanditPolicy(
    slack_factor=0.2,        # allowed slack (as a ratio) relative to the best trial so far
    evaluation_interval=2,   # apply the policy every 2 metric-logging intervals
    delay_evaluation=5,      # skip policy evaluation for the first 5 intervals
)
sweep_job.early_termination = bandit_policy

sweep_job.display_name = "xgb_manual_random_sweep_bacc_binary"

# Submit the sweep; the returned job object carries the Studio URL.
returned_sweep = ml_client.jobs.create_or_update(
    sweep_job,
    experiment_name="Clinical_Readmission__Sweep_ClassWeighting",
)

print("Sweep job submitted.")
print("Track it in Azure ML Studio at:")
print(returned_sweep.studio_url)

pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored


Sweep job submitted.
Track it in Azure ML Studio at:
https://ml.azure.com/runs/sleepy_square_tcsrbvqskz?wsid=/subscriptions/3aeb63fe-f831-47f0-8175-3732f2efd2a1/resourcegroups/RG-Clinical-Readmission/workspaces/AML-Clinical-Readmission&tid=deb5bf9d-8bb0-4783-8f54-42a424392492
