# Q1. Install MLflow

In [None]:
! pip install mlflow

In [6]:
# Print the version reported by the MLflow command-line tool.
! mlflow --version

mlflow, version 1.26.1


# Q2. Download and preprocess the data

In [99]:
# Run the preprocessing script with ../data/ as --raw_data_path and ./output as --dest_path.
# NOTE(review): presumably it reads the raw data and writes the pickled artifacts into ./output — confirm in the script.
%run ../src/preprocess_data.py --raw_data_path ../data/ --dest_path ./output

In [100]:
import os

# List the directory once so the file names and the count come from the same
# snapshot; sort because os.listdir() returns entries in arbitrary order.
output_files = sorted(os.listdir("./output"))
print("Files: ", output_files)
print("Number of files: ", len(output_files))

Files:  ['dv.pkl', 'test.pkl', 'train.pkl', 'valid.pkl']
Number of files:  4


# Q3. Train a model with autolog

In [101]:
# Start the MLflow UI from a terminal (not from this notebook).
# From the directory \mlops-zoomcamp\00-homework\02-experiment-tracking, run:
#   mlflow ui --backend-store-uri sqlite:///data/mlflow/mlflow.db

In [102]:
# Run the training script, passing ./output as --data_path.
%run ../src/train.py --data_path ./output

In [103]:
import os
import sqlite3

# SQLite file queried below to inspect what the runs logged.
MLFLOW_DB_PATH = '../data/mlflow/mlflow.db'

# sqlite3.connect() silently creates a new, empty database when the file does
# not exist, which would make the following query fail with a confusing
# "no such table" error. Fail early with a clear message instead.
if not os.path.isfile(MLFLOW_DB_PATH):
    raise FileNotFoundError(f"MLflow database not found: {MLFLOW_DB_PATH}")

# `con` and `cur` are used by the cells that follow.
con = sqlite3.connect(MLFLOW_DB_PATH)
cur = con.cursor()

In [104]:
# Ask the database for the distinct parameter names directly instead of
# fetching every row with SELECT * and relying on the name being column 0.
# `key` is the parameter-name column of MLflow's `params` table.
# ORDER BY makes the printed list deterministic (a Python set has no
# guaranteed order).
cur.execute('SELECT DISTINCT key FROM params ORDER BY key;')
params = [row[0] for row in cur.fetchall()]
print("Params: ", params)

print("\nNumber of Params:", len(params))

Params:  ['max_samples', 'max_features', 'n_jobs', 'max_depth', 'verbose', 'warm_start', 'random_state', 'bootstrap', 'ccp_alpha', 'min_weight_fraction_leaf', 'criterion', 'min_samples_leaf', 'n_estimators', 'max_leaf_nodes', 'min_impurity_decrease', 'oob_score', 'min_samples_split']

Number of Params: 17


# Q4. Launch the tracking server locally

In [105]:
# Start the MLflow UI from a terminal (not from this notebook).
# From the directory \mlops-zoomcamp\00-homework\02-experiment-tracking, run:
#   mlflow ui --backend-store-uri sqlite:///data/mlflow/mlflow.db --artifacts-destination ./data/mlflow/artifacts

# Q5. Tune the hyperparameters of the model

In [106]:
# Run the hyperparameter-optimisation script, passing ./output as --data_path.
# The recorded run below took about 11 minutes for 50 trials.
%run ../src/hpo.py --data_path ./output

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]




  2%|▏         | 1/50 [00:16<13:25, 16.44s/trial, best loss: 6.658956269343007]




  4%|▍         | 2/50 [00:19<06:58,  8.71s/trial, best loss: 6.658956269343007]




  6%|▌         | 3/50 [00:23<05:05,  6.50s/trial, best loss: 6.658956269343007]




  8%|▊         | 4/50 [00:35<06:42,  8.75s/trial, best loss: 6.651438559376775]




 10%|█         | 5/50 [00:42<06:04,  8.11s/trial, best loss: 6.651438559376775]




 12%|█▏        | 6/50 [01:01<08:31, 11.62s/trial, best loss: 6.651438559376775]




 14%|█▍        | 7/50 [01:18<09:41, 13.51s/trial, best loss: 6.651438559376775]




 16%|█▌        | 8/50 [01:22<07:23, 10.56s/trial, best loss: 6.651438559376775]




 18%|█▊        | 9/50 [01:34<07:23, 10.82s/trial, best loss: 6.651438559376775]




 20%|██        | 10/50 [01:44<07:00, 10.51s/trial, best loss: 6.651438559376775]




 22%|██▏       | 11/50 [01:52<06:26,  9.92s/trial, best loss: 6.642137287429206]




 24%|██▍       | 12/50 [01:59<05:41,  8.99s/trial, best loss: 6.642137287429206]




 26%|██▌       | 13/50 [02:03<04:37,  7.50s/trial, best loss: 6.642137287429206]




 28%|██▊       | 14/50 [02:11<04:34,  7.63s/trial, best loss: 6.642137287429206]




 30%|███       | 15/50 [02:23<05:08,  8.80s/trial, best loss: 6.642137287429206]




 32%|███▏      | 16/50 [02:30<04:45,  8.39s/trial, best loss: 6.642137287429206]




 34%|███▍      | 17/50 [02:41<05:07,  9.33s/trial, best loss: 6.642137287429206]




 36%|███▌      | 18/50 [02:59<06:18, 11.82s/trial, best loss: 6.629728007710133]




 38%|███▊      | 19/50 [03:05<05:14, 10.16s/trial, best loss: 6.629728007710133]




 40%|████      | 20/50 [03:10<04:17,  8.58s/trial, best loss: 6.629728007710133]




 42%|████▏     | 21/50 [03:34<06:24, 13.25s/trial, best loss: 6.629728007710133]




 44%|████▍     | 22/50 [03:53<06:56, 14.88s/trial, best loss: 6.629728007710133]




 46%|████▌     | 23/50 [04:21<08:25, 18.73s/trial, best loss: 6.629728007710133]




 48%|████▊     | 24/50 [04:45<08:47, 20.29s/trial, best loss: 6.629728007710133]




 50%|█████     | 25/50 [05:04<08:20, 20.03s/trial, best loss: 6.629728007710133]




 52%|█████▏    | 26/50 [05:23<07:55, 19.79s/trial, best loss: 6.629728007710133]




 54%|█████▍    | 27/50 [05:34<06:33, 17.10s/trial, best loss: 6.629728007710133]




 56%|█████▌    | 28/50 [05:56<06:46, 18.48s/trial, best loss: 6.629728007710133]




 58%|█████▊    | 29/50 [06:04<05:22, 15.35s/trial, best loss: 6.629728007710133]




 60%|██████    | 30/50 [06:24<05:35, 16.80s/trial, best loss: 6.629728007710133]




 62%|██████▏   | 31/50 [06:28<04:03, 12.82s/trial, best loss: 6.629728007710133]




 64%|██████▍   | 32/50 [06:42<03:56, 13.15s/trial, best loss: 6.629728007710133]




 66%|██████▌   | 33/50 [07:03<04:27, 15.71s/trial, best loss: 6.629728007710133]




 68%|██████▊   | 34/50 [07:22<04:23, 16.49s/trial, best loss: 6.6284257482044735]




 70%|███████   | 35/50 [07:32<03:39, 14.65s/trial, best loss: 6.6284257482044735]




 72%|███████▏  | 36/50 [07:43<03:11, 13.68s/trial, best loss: 6.6284257482044735]




 74%|███████▍  | 37/50 [07:46<02:15, 10.43s/trial, best loss: 6.6284257482044735]




 76%|███████▌  | 38/50 [08:00<02:17, 11.44s/trial, best loss: 6.6284257482044735]




 78%|███████▊  | 39/50 [08:18<02:26, 13.31s/trial, best loss: 6.6284257482044735]




 80%|████████  | 40/50 [08:35<02:26, 14.63s/trial, best loss: 6.6284257482044735]




 82%|████████▏ | 41/50 [08:41<01:47, 11.93s/trial, best loss: 6.6284257482044735]




 84%|████████▍ | 42/50 [08:49<01:26, 10.86s/trial, best loss: 6.6284257482044735]




 86%|████████▌ | 43/50 [09:01<01:18, 11.18s/trial, best loss: 6.6284257482044735]




 88%|████████▊ | 44/50 [09:20<01:21, 13.55s/trial, best loss: 6.6284257482044735]




 90%|█████████ | 45/50 [09:35<01:09, 13.84s/trial, best loss: 6.6284257482044735]




 92%|█████████▏| 46/50 [09:42<00:47, 11.94s/trial, best loss: 6.6284257482044735]




 94%|█████████▍| 47/50 [10:04<00:44, 14.83s/trial, best loss: 6.6284257482044735]




 96%|█████████▌| 48/50 [10:16<00:27, 13.97s/trial, best loss: 6.6284257482044735]




 98%|█████████▊| 49/50 [10:35<00:15, 15.60s/trial, best loss: 6.6284257482044735]




100%|██████████| 50/50 [10:58<00:00, 13.18s/trial, best loss: 6.6284257482044735]


# Q6. Promote the best model to the model registry

In [110]:
# Run the model-registration script, passing ./output as --data_path.
# The recorded output below shows it registering 'random-forest-regressor' version 1.
%run ../src/register_model.py --data_path ./output

Successfully registered model 'random-forest-regressor'.
2022/05/29 00:29:41 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: random-forest-regressor, version 1


<Run: data=<RunData: metrics={'mean_squared_error-2_X_test': 6.549081827157288,
 'mean_squared_error_X_valid': 6.629851022038263,
 'test_rmse': 6.549081827157288,
 'training_mae': 3.7770678522029084,
 'training_mse': 30.58215535939286,
 'training_r2_score': 0.7712712967168107,
 'training_rmse': 5.530113503301073,
 'training_score': 0.7712712967168107,
 'valid_rmse': 6.629851022038263}, params={'bootstrap': 'True',
 'ccp_alpha': '0.0',
 'criterion': 'squared_error',
 'max_depth': '20',
 'max_features': '1.0',
 'max_leaf_nodes': 'None',
 'max_samples': 'None',
 'min_impurity_decrease': '0.0',
 'min_samples_leaf': '2',
 'min_samples_split': '4',
 'min_weight_fraction_leaf': '0.0',
 'n_estimators': '33',
 'n_jobs': 'None',
 'oob_score': 'False',
 'random_state': '42',
 'verbose': '0',
 'warm_start': 'False'}, tags={'estimator_class': 'sklearn.ensemble._forest.RandomForestRegressor',
 'estimator_name': 'RandomForestRegressor',
 'mlflow.log-model.history': '[{"run_id": "2ba916b08e4747e780a78

Created version '1' of model 'random-forest-regressor'.
