In [24]:
%%writefile local_src/train.py
import os
import argparse
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def main():
    """Train a gradient-boosting classifier on the credit data and register it.

    The script reads a CSV file, splits it into train and test partitions,
    fits a ``GradientBoostingClassifier``, logs metrics to MLflow, registers
    the fitted model and also saves it to a local directory.

    Command-line arguments:
        --data: path to the input CSV file (required).
        --test_train_ratio: fraction of rows held out for testing
            (default 0.25).
        --n_estimators: ``n_estimators`` passed to the classifier
            (default 100).
        --learning_rate: ``learning_rate`` passed to the classifier
            (default 0.1).
        --registered_model_name: name used for the registered model, the
            MLflow artifact path and the local output directory (required).
        --random_state: optional integer seed forwarded to the train/test
            split and to the classifier. Defaults to None, which keeps the
            previous unseeded behaviour.
    """
    # Name of the target column in the input CSV.
    label_column = "default payment next month"

    # input and output arguments
    parser = argparse.ArgumentParser()
    # --data and --registered_model_name have no usable default: without them
    # the script would fail later with an opaque error, so fail fast instead.
    parser.add_argument("--data", type=str, required=True, help="path to input data")
    parser.add_argument(
        "--test_train_ratio",
        type=float,
        required=False,
        default=0.25,
        help="fraction of the data held out for testing",
    )
    parser.add_argument(
        "--n_estimators",
        required=False,
        default=100,
        type=int,
        help="n_estimators for the classifier",
    )
    parser.add_argument(
        "--learning_rate",
        required=False,
        default=0.1,
        type=float,
        help="learning_rate for the classifier",
    )
    parser.add_argument(
        "--registered_model_name", type=str, required=True, help="model name"
    )
    parser.add_argument(
        "--random_state",
        required=False,
        default=None,
        type=int,
        help="optional seed for the data split and the classifier",
    )
    args = parser.parse_args()

    # Start logging. The context manager guarantees the run is ended even if
    # loading, training or registration raises.
    with mlflow.start_run():
        # enable autologging
        mlflow.sklearn.autolog()

        ###################
        #<prepare the data>
        ###################
        print(" ".join(f"{k}={v}" for k, v in vars(args).items()))

        print("input data:", args.data)

        # header=1: the second line of the file holds the column names;
        # the first column is used as the index.
        credit_df = pd.read_csv(args.data, header=1, index_col=0)

        mlflow.log_metric("num_samples", credit_df.shape[0])
        # One column is the label, so it is not counted as a feature.
        mlflow.log_metric("num_features", credit_df.shape[1] - 1)

        # Split train and test datasets
        train_df, test_df = train_test_split(
            credit_df,
            test_size=args.test_train_ratio,
            random_state=args.random_state,
        )
        ####################
        #</prepare the data>
        ####################

        ##################
        #<train the model>
        ##################
        # Extract the label column (pop also removes it from the features),
        # then convert the remaining feature columns to arrays.
        y_train = train_df.pop(label_column)
        X_train = train_df.values

        y_test = test_df.pop(label_column)
        X_test = test_df.values

        print(f"Training with data of shape {X_train.shape}")

        clf = GradientBoostingClassifier(
            n_estimators=args.n_estimators,
            learning_rate=args.learning_rate,
            random_state=args.random_state,
        )
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        ####################
        # Log classifier accuracy
        ####################
        accuracy = clf.score(X_test, y_test)
        print(
            "Accuracy of GradientBoosting classifier on test set: {:.2f}".format(
                accuracy
            )
        )
        mlflow.log_metric("accuracy", float(accuracy))

        print(classification_report(y_test, y_pred))
        ###################
        #</train the model>
        ###################

        ##########################
        #<save and register model>
        ##########################
        # Registering the model to the workspace
        print("Registering the model via MLFlow")
        mlflow.sklearn.log_model(
            sk_model=clf,
            registered_model_name=args.registered_model_name,
            artifact_path=args.registered_model_name,
        )

        # Saving the model to a file
        mlflow.sklearn.save_model(
            sk_model=clf,
            path=os.path.join(args.registered_model_name, "trained_model"),
        )
        ###########################
        #</save and register model>
        ###########################

# Run the training entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Writing local_src/train.py


In [32]:
%%writefile sweep_job.yaml
# Sweep job specification: each trial runs train.py from the local_src folder
# with a learning_rate value sampled from search_space below.
$schema: https://azuremlschemas.azureedge.net/latest/sweepJob.schema.json
type: sweep

# Per-trial command. Only --data, --learning_rate and --registered_model_name
# are passed; --n_estimators and --test_train_ratio are omitted, so train.py's
# own argparse defaults apply to them.
trial:
  command: >-
      python train.py 
      --data ${{inputs.train_data_csv}} 
      --learning_rate ${{search_space.learning_rate}}
      --registered_model_name ${{inputs.registered_model_name}} 
  code: local_src
  environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest
# NOTE(review): test_data_csv is declared but never referenced by
# trial.command, and it points at the same data asset as train_data_csv.
# Confirm whether it is still needed.
inputs:
  train_data_csv: 
    type: uri_file
    path: azureml:credit_cards@latest
  test_data_csv: 
    type: uri_file
    path: azureml:credit_cards@latest
  registered_model_name: sweeped_credit_default_model
sampling_algorithm: random
# Hyperparameter range bound to ${{search_space.learning_rate}} in the command.
# NOTE(review): an upper bound of 3.0 looks high for a boosting learning rate;
# confirm this range is intended.
search_space:
  learning_rate: 
    type: uniform
    min_value: 0.1
    max_value: 3.0
# Trials are ranked on the "accuracy" metric, which train.py logs with
# mlflow.log_metric('accuracy', ...).
objective:
  goal: maximize
  primary_metric: accuracy
# Trial budget: 4 trials in total, at most 2 running concurrently.
# timeout is presumably expressed in seconds -- TODO confirm against the schema.
limits:
  max_total_trials: 4
  max_concurrent_trials: 2
  timeout: 3600
display_name: sweeped_credit_default_model
experiment_name: sweeped_credit_default_model
description: sweeped_credit_default_model

Overwriting sweep_job.yaml


In [33]:
# Submit the sweep job described in sweep_job.yaml using the Azure CLI.
!az ml job create --file sweep_job.yaml

{
  "compute": "azureml:Serverless",
  "creation_context": {
    "created_at": "2023-11-07T18:58:05.926524+00:00",
    "created_by": "Anton Slutsky",
    "created_by_type": "User"
  },
  "description": "sweeped_credit_default_model",
  "display_name": "sweeped_credit_default_model",
  "experiment_name": "sweeped_credit_default_model",
  "id": "azureml:/subscriptions/781b03e7-6eb7-4506-bab8-cf3a0d89b1d4/resourceGroups/SandboxML/providers/Microsoft.MachineLearningServices/workspaces/quick-start-tutorial/jobs/bubbly_zoo_75j3kby8n2",
  "inputs": {
    "registered_model_name": "sweeped_credit_default_model",
    "test_data_csv": {
      "mode": "ro_mount",
      "path": "azureml://locations/eastus/workspaces/228a5faf-b108-49f1-aca6-2709336183e6/data/credit_cards/versions/3",
      "type": "uri_file"
    },
    "train_data_csv": {
      "mode": "ro_mount",
      "path": "azureml://locations/eastus/workspaces/228a5faf-b108-49f1-aca6-2709336183e6/data/credit_cards/versions/3",
      "type": "u

# Create Component

In [34]:
# NOTE(review): this command is given the sweep *job* file (type: sweep) and
# fails with "Unsupported component type: sweep." as shown in the cell output.
# It presumably needs a separate component specification file instead --
# confirm and replace the -f argument before relying on this cell.
!az ml component create -f sweep_job.yaml

ERROR: Unsupported component type: sweep.
