In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import subprocess
from pathlib import Path
from urllib.parse import urlparse

In [3]:
import pandas as pd

In [4]:
import pickle

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [7]:
import mlflow
from mlflow import MlflowClient
from mlflow.entities import ViewType

In [8]:
from homework_nb import preprocess_data, train, hpo, register_model



In [9]:
# Project layout. WORK_DIR is two levels above the current working directory,
# so the notebook must be launched from its own folder for these paths to
# resolve as intended.
WORK_DIR = Path(os.getcwd()).parent.parent

DATA_DIR = WORK_DIR / 'data'
MODEL_DIR = WORK_DIR / 'models'
# DEST_DIR is handed to the preprocessing/training/HPO/registration calls
# below; it is intentionally not created here (behavior unchanged).
DEST_DIR = WORK_DIR / 'output'

# Directories that must exist before the rest of the notebook runs.
DIR_LIST = [
    DATA_DIR,
    MODEL_DIR,
]

# `directory` rather than `dir`, so the builtin `dir()` is not shadowed in the
# kernel namespace. mkdir(parents=True, exist_ok=True) is idempotent and avoids
# the check-then-create race of `exists()` followed by `os.mkdir`.
for directory in DIR_LIST:
    directory.mkdir(parents=True, exist_ok=True)

In [10]:
# Monthly trip-record files to download.
# Source: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
data_uri_list = [
    'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
    'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet',
    'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet',
    'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet',
    'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet',
    'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-03.parquet',
    ]

# Download each file into DATA_DIR unless a file with the same name is already
# there, so re-running the cell is cheap.
for data_uri in data_uri_list:
    data_uri_name = os.path.basename(urlparse(data_uri).path)
    target_path = DATA_DIR / data_uri_name
    if target_path.exists():
        print('file already exist, skipping download...')
        continue
    try:
        # Argument list instead of a shell string: a DATA_DIR containing spaces
        # or shell metacharacters is passed to wget verbatim. check=True turns
        # a non-zero wget exit status into CalledProcessError instead of
        # letting the notebook continue with missing data.
        subprocess.run(['wget', data_uri, '-P', str(DATA_DIR)], check=True)
    except subprocess.CalledProcessError:
        # Remove any partial file so the next run retries the download rather
        # than treating the fragment as "already exists".
        target_path.unlink(missing_ok=True)
        raise

file already exist, skipping download...
file already exist, skipping download...
file already exist, skipping download...
file already exist, skipping download...
file already exist, skipping download...
file already exist, skipping download...


In [11]:
# Run the data-preparation step from homework_nb.preprocess_data, passing the
# download directory and the output directory as plain strings.
# NOTE(review): presumably reads the parquet files in DATA_DIR and writes the
# prepared datasets into DEST_DIR — confirm against run_data_prep.
preprocess_data.run_data_prep(str(DATA_DIR), str(DEST_DIR))

In [12]:
# Train via homework_nb.train using the DEST_DIR path. The return value is kept
# because a later cell reads `run.info.run_id` to look up what was logged.
run = train.run_train(str(DEST_DIR))





In [13]:
def fetch_logged_data(run_id):
    """Collect what MLflow recorded for one run.

    Args:
        run_id: Identifier of the MLflow run to inspect.

    Returns:
        A 4-tuple ``(params, metrics, tags, artifacts)`` where ``params`` and
        ``metrics`` are taken unchanged from the run's data, ``tags`` is a dict
        of the run's tags excluding any whose key starts with ``"mlflow."``,
        and ``artifacts`` is a list of the paths listed under the run's
        ``"model"`` artifact directory.
    """
    mlflow_client = MlflowClient()
    run_data = mlflow_client.get_run(run_id).data

    # Keep only tags whose key is not in the "mlflow." namespace.
    user_tags = {}
    for tag_key, tag_value in run_data.tags.items():
        if tag_key.startswith("mlflow."):
            continue
        user_tags[tag_key] = tag_value

    # Paths of everything listed under the "model" artifact directory.
    artifact_paths = []
    for file_info in mlflow_client.list_artifacts(run_id, "model"):
        artifact_paths.append(file_info.path)

    return run_data.params, run_data.metrics, user_tags, artifact_paths

In [14]:
# Fetch what MLflow logged for the training run: its params, metrics,
# tags outside the "mlflow." namespace, and the artifact paths under "model".
params, metrics, tags, artifacts = fetch_logged_data(run.info.run_id)

In [15]:
# Display the parameters recorded for the training run.
params

{'warm_start': 'False',
 'monotonic_cst': 'None',
 'max_samples': 'None',
 'n_estimators': '100',
 'random_state': '0',
 'n_jobs': 'None',
 'oob_score': 'False',
 'min_impurity_decrease': '0.0',
 'verbose': '0',
 'max_features': '1.0',
 'min_samples_split': '2',
 'max_leaf_nodes': 'None',
 'max_depth': '10',
 'min_weight_fraction_leaf': '0.0',
 'ccp_alpha': '0.0',
 'min_samples_leaf': '1',
 'criterion': 'squared_error',
 'bootstrap': 'True'}

In [None]:
# Run the hyperparameter-optimization step from homework_nb.hpo, passing the
# DEST_DIR path.
# NOTE(review): 15 is presumably the number of optimization trials — confirm
# against hpo.run_optimization and consider naming it as a constant.
hpo.run_optimization(str(DEST_DIR), 15)

In [16]:
# def load_pickle(filename):
#     with open(filename, "rb") as f_in:
#         return pickle.load(f_in)


# def train_and_log_model(data_path, params):
#     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
#     X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
#     X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

#     with mlflow.start_run():
#         for param in RF_PARAMS:
#             params[param] = int(params[param])

#         rf = RandomForestRegressor(**params)
#         rf.fit(X_train, y_train)

#         # Evaluate model on the validation and test sets
#         val_rmse = mean_squared_error(y_val, rf.predict(X_val), squared=False)
#         mlflow.log_metric("val_rmse", val_rmse)
#         test_rmse = mean_squared_error(y_test, rf.predict(X_test), squared=False)
#         mlflow.log_metric("test_rmse", test_rmse)

# def run_register_model(data_path: str, top_n: int):

#     client = MlflowClient()

#     # Retrieve the top_n model runs and log the models
#     experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
#     runs = client.search_runs(
#         experiment_ids=experiment.experiment_id,
#         run_view_type=ViewType.ACTIVE_ONLY,
#         max_results=top_n,
#         order_by=["metrics.rmse ASC"]
#     )
#     for run in runs:
#         train_and_log_model(data_path=data_path, params=run.data.params)

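#     # Select the model with the lowest test RMSE
#     # NOTE(review): the query below sorts by "metrics.rmse", while
#     # train_and_log_model logs "val_rmse" and "test_rmse" — confirm the
#     # order_by key before reviving this code.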
#     experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
#     best_run = client.search_runs(
#         experiment_ids=experiment.experiment_id,
#         run_view_type=ViewType.ACTIVE_ONLY,
#         max_results=top_n,
#         order_by=["metrics.rmse ASC"]
#     )[0]
    
#     # Register the best model
#     run_id = best_run.info.run_id
#     model_uri = f"runs:/{run_id}/model"
#     mlflow.register_model(model_uri, name="rf-best-model")

In [17]:
# HPO_EXPERIMENT_NAME = "random-forest-hyperopt"

# client = MlflowClient("http://127.0.0.1:5000")

# # Retrieve the top_n model runs and log the models
# experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)

In [13]:
# Run the model-registration step from homework_nb.register_model, passing the
# DEST_DIR path.
# NOTE(review): 5 appears to be `top_n` (how many top HPO runs are considered),
# judging by the commented-out copy of run_register_model earlier in this
# notebook — confirm against the module.
register_model.run_register_model(str(DEST_DIR),5)

Registered model 'rf-best-model' already exists. Creating a new version of this model...
2024/05/26 04:52:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rf-best-model, version 2
Created version '2' of model 'rf-best-model'.
