In [1]:
# ---------------------------------------------------------
# NOTEBOOK: 4_pipeline_build.ipynb
# GOAL: Build an end-to-end Azure ML pipeline:
#   1) ICD enrichment (raw diabetic_data.csv -> enriched data folder)
#   2) Data prep (split enriched data into train/test folders)
#   3) Train XGBoost model with MLflow autologging
#
# ASSUMPTIONS:
#   - Folder structure:
#         project-root/
#           src/
#             icd_enrich.py
#             prep.py
#             train.py
#             conda.yml
#           notebooks/
#             4_pipeline_build.ipynb
#   - Data asset "diabetic-data-raw-130us:<version>" exists (uri_file, raw diabetic_data.csv);
#     the enriched data is produced inside the pipeline by the ICD enrichment step
#   - Compute cluster "clinical-cluster-cpu" exists
#   - Environment "clinical-prep-env:2" exists and includes:
#         python=3.8, pandas, scikit-learn, xgboost, azureml-mlflow
# ---------------------------------------------------------

from azure.ai.ml import MLClient, dsl, Input, Output, command
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml.constants import AssetTypes

# ---------------------------------------------------------
# 1. CONNECT TO WORKSPACE (silent auth first, then interactive)
# ---------------------------------------------------------
# Authenticate without prompting when possible; otherwise fall back to a browser login.
try:
    credential = DefaultAzureCredential()
    # Request a management-plane token up front so that a broken silent-auth
    # chain fails here rather than on the first workspace call.
    credential.get_token("https://management.azure.com/.default")
except Exception as auth_error:
    # Deliberately broad: any failure of the silent chain should lead to the
    # interactive fallback. The reason is reported (first line only, since the
    # message can span many lines) instead of being silently discarded.
    print("DefaultAzureCredential failed; falling back to InteractiveBrowserCredential...")
    print(
        f"  Reason: {type(auth_error).__name__}: "
        f"{(str(auth_error).splitlines() or ['no details'])[0]}"
    )
    credential = InteractiveBrowserCredential()

# config.json should be one level up from this notebook (../config.json)
ml_client = MLClient.from_config(credential=credential, path="../config.json")
print(f"Connected to workspace: {ml_client.workspace_name}")

from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# ---------------------------------------------------------
# 2. FIXED ENVIRONMENT STRING
# ---------------------------------------------------------
# The registered environment is pinned by name and version; the two parts are
# kept separate so that bumping the version is a one-line change.
ENV_NAME = "clinical-prep-env"
ENV_VERSION = "2"
ENV_STR = ":".join(("azureml", ENV_NAME, ENV_VERSION))
print("Using environment: " + ENV_STR)

# ---------------------------------------------------------
# 3. DEFINE PIPELINE
# ---------------------------------------------------------
# Pipeline definition: three command steps chained enrich -> prep -> train.
#   Input : raw_data (uri_file) - the raw CSV handed to the enrichment step.
#   Output: "trained_model"     - the training step's model_output folder.
# Every step runs a script from ../src inside the environment named by ENV_STR.
# The steps set no compute of their own here; the cell assigns a pipeline-level
# default compute after the pipeline job is instantiated.
@dsl.pipeline(
    description="E2E Clinical Readmission Pipeline (Enrich + Prep + Train)"
)
def clinical_readmission_pipeline(
    raw_data: Input(type="uri_file"),  # Pipeline input: raw CSV data asset
):
    # -------------------------------------------------
    # STEP 1: ICD ENRICHMENT
    #   - Input: raw diabetic_data.csv (uri_file)
    #   - Output: folder with diabetes_clinical_enriched.csv
    #   - command(...) declares the step; the trailing (...) call binds the
    #     pipeline-level raw_data input to it.
    # -------------------------------------------------
    enrich_node = command(
        name="icd_enrich",
        display_name="ICD-9 to ICD-10 Enrichment",
        description="Enrich raw US diabetic data with ICD-10 style grouping and HbA1c risk flag.",
        inputs={
            "raw_data": Input(type="uri_file")
        },
        outputs={
            "enriched_data": Output(type="uri_folder")
        },
        code="../src",  # folder containing icd_enrich.py; relative path (presumably resolved from the notebook's working directory)
        # ${{inputs.*}} / ${{outputs.*}} are placeholders, not Python formatting;
        # they are passed through verbatim as part of the command string.
        command=(
            "python icd_enrich.py "
            "--raw_data ${{inputs.raw_data}} "
            "--output_dir ${{outputs.enriched_data}}"
        ),
        environment=ENV_STR,
    )(
        raw_data=raw_data
    )

    # -------------------------------------------------
    # STEP 2: DATA PREP
    #   - Input: enriched data folder (from enrich_node)
    #   - Output: two uri_folder outputs with train.csv and test.csv
    #   - Wiring enrich_node's output in as "data" is what orders this step
    #     after the enrichment step.
    # -------------------------------------------------
    prep_node = command(
        name="prep_data",
        display_name="Data Prep",
        description="Splits enriched clinical data into train and test CSVs",
        inputs={
            "data": Input(type="uri_folder")
        },
        outputs={
            "train_data": Output(type="uri_folder"),
            "test_data": Output(type="uri_folder"),
        },
        code="../src",  # same source folder as step 1; prep.py is the entry script
        command=(
            "python prep.py "
            "--data ${{inputs.data}} "
            "--train_data ${{outputs.train_data}} "
            "--test_data ${{outputs.test_data}}"
        ),
        environment=ENV_STR,
    )(
        data=enrich_node.outputs.enriched_data
    )

    # -------------------------------------------------
    # STEP 3: TRAINING
    #   - Inputs: the two folders from prep_node (train_data, test_data)
    #   - Output: model folder (uri_folder) with model.json
    #   - train.py uses MLflow autolog + azureml-mlflow
    # -------------------------------------------------
    train_node = command(
        name="train_model",
        display_name="Train XGBoost",
        description="Trains XGBoost model to predict 30-day readmission",
        inputs={
            "train_data": Input(type="uri_folder"),
            "test_data": Input(type="uri_folder"),
        },
        outputs={
            # Folder where train.py saves the trained model (e.g., model.json)
            "model_output": Output(type="uri_folder")
        },
        code="../src",  # same source folder as the other steps; train.py is the entry script
        command=(
            "python train.py "
            "--train_data ${{inputs.train_data}} "
            "--test_data ${{inputs.test_data}} "
            "--model_output ${{outputs.model_output}}"
        ),
        environment=ENV_STR,
    )(
        train_data=prep_node.outputs.train_data,
        test_data=prep_node.outputs.test_data,
    )

    # Pipeline return: expose the trained model as the pipeline's output
    # under the key "trained_model".
    return {"trained_model": train_node.outputs.model_output}


# ---------------------------------------------------------
# 4. INSTANTIATE AND SUBMIT PIPELINE JOB
# ---------------------------------------------------------
# Raw data asset (uri_file) you registered for diabetic_data.csv
# Registered uri_file asset that holds the raw diabetic_data.csv.
raw_data_asset_name = "diabetic-data-raw-130us"
raw_version = "1"

# Bind the asset reference (azureml:<name>:<version>) to the pipeline's only input.
pipeline_job = clinical_readmission_pipeline(
    raw_data=Input(
        path="azureml:{}:{}".format(raw_data_asset_name, raw_version),
        type=AssetTypes.URI_FILE,
    )
)

# Run-level settings: a friendly display name, and the cluster that every step
# uses by default (the same compute used for the standalone jobs).
pipeline_job.display_name = "Clinical_Readmission_E2E_Full_Manual"
pipeline_job.settings.default_compute = "clinical-cluster-cpu"

print("Submitting E2E Clinical Readmission Pipeline...")
returned_job = ml_client.jobs.create_or_update(
    pipeline_job,
    experiment_name="Clinical_Readmission_Pipeline_E2E",
)

# Announcement and URL go on separate lines, exactly as two print calls would emit them.
print(
    "Pipeline submitted. Track it in Azure ML Studio:",
    returned_job.studio_url,
    sep="\n",
)


Found the config file in: ..\config.json
Class DeploymentTemplateOperations: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Connected to workspace: AML-Clinical-Readmission
Using environment: azureml:clinical-prep-env:2
Submitting E2E Clinical Readmission Pipeline...


Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
[32mUploading src (0.02 MBs): 100%|##########| 19618/19618 [00:01<00:00, 17868.75it/s]
[39m

pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.m

Pipeline submitted. Track it in Azure ML Studio:
https://ml.azure.com/runs/silver_ticket_2pnzx6mvf9?wsid=/subscriptions/3aeb63fe-f831-47f0-8175-3732f2efd2a1/resourcegroups/RG-Clinical-Readmission/workspaces/AML-Clinical-Readmission&tid=deb5bf9d-8bb0-4783-8f54-42a424392492
