### Use AutoML to produce a baseline model

In [0]:
import pandas as pd

# Read back the housing dataset that the prior notebook saved as parquet.
housing_parquet_path = '/dbfs/tmp/housing_data.parquet'
df = pd.read_parquet(housing_parquet_path)

In [0]:
import databricks.automl
from datetime import datetime

# Build a timestamped experiment name so the name differs between executions
# of this cell.
run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
automl_experiment_name = f"Housing_AutoML_{run_stamp}"

# Run AutoML regression on the housing frame, predicting median_house_value
# and optimizing for RMSE. The returned summary is kept for later cells.
summary = databricks.automl.regress(
    dataset=df,
    target_col='median_house_value',
    primary_metric='rmse',
    timeout_minutes=5,
    experiment_name=automl_experiment_name,
)

2024/12/03 22:58:45 INFO databricks.automl.client.manager: AutoML will optimize for root mean squared error metric, which is tracked as val_root_mean_squared_error in the MLflow experiment.
2024/12/03 22:58:46 INFO databricks.automl.client.manager: MLflow Experiment ID: 1005077969890497
2024/12/03 22:58:46 INFO databricks.automl.client.manager: MLflow Experiment: https://dbc-7e9bba30-86e9.cloud.databricks.com/?o=773215260201204#mlflow/experiments/1005077969890497
2024/12/03 23:00:09 INFO databricks.automl.client.manager: Data exploration notebook: https://dbc-7e9bba30-86e9.cloud.databricks.com/?o=773215260201204#notebook/1005077969890515
2024/12/03 23:14:20 INFO databricks.automl.client.manager: AutoML experiment completed successfully.


Unnamed: 0,Train,Validation,Test
root_mean_squared_error,39297.397,49028.734,49308.953
mean_squared_error,1544285000.0,2403817000.0,2431373000.0
example_count,12471.0,4008.0,4161.0
r2_score,0.884,0.819,0.818
sum_on_target,2572507000.0,835401900.0,861594700.0
score,0.884,0.819,0.818
mean_absolute_error,26410.323,32481.416,32710.954
mean_on_target,206279.165,208433.607,207064.335
max_error,358163.328,402847.172,377160.82
mean_absolute_percentage_error,0.149,0.177,0.183


In [0]:
from mlflow.tracking import MlflowClient
import mlflow
import mlflow.sklearn


# Locate the best AutoML trial in the MLflow experiment and load its model.

# fetch experiment ID from the AutoML summary
experiment_id = summary.experiment.experiment_id

# initialize MLflow client
client = MlflowClient()

# Ask the tracking server for the single best run. RMSE is a lower-is-better
# metric, so the sort direction is spelled out as ASC instead of relying on
# the default ordering.
runs = client.search_runs(
    experiment_ids=[experiment_id],
    order_by=['metrics.val_root_mean_squared_error ASC'],
    max_results=1,
)

# Fail fast: the next cell loads from `model_uri` unconditionally, so carrying
# on without a run would only resurface there as an unrelated NameError.
if not runs:
    raise RuntimeError(f'no runs found in MLflow experiment {experiment_id}')

best_run = runs[0]
# Print the details of the best run
print(f'Best Run ID: {best_run.info.run_id}')

# Load the best model from the run's `model` artifact path
model_uri = f'runs:/{best_run.info.run_id}/model'
best_model = mlflow.pyfunc.load_model(model_uri)

Best Run ID: bc76004ba5184fd195ea34cc9fdc50e2


Downloading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

In [0]:
# Reload the best run's model through the sklearn flavor (rebinding
# `best_model`), then leave it as the cell's last expression so the notebook
# renders it.
best_model = mlflow.sklearn.load_model(model_uri=model_uri)
best_model