In [1]:
import os
# Show the kernel's working directory before the next cell changes it.
%pwd

'/Users/whysocurious/Documents/MLDSAIProjects/e2e-mlops-gcp/research'

In [2]:
os.chdir("../")
%pwd

'/Users/whysocurious/Documents/MLDSAIProjects/e2e-mlops-gcp'

In [3]:

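# Reference only: ModelTrainer.train() sets the tracking URI (from the config)
# and this experiment name itself, so these calls are not needed here.
# mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.set_experiment("random-forest-hyperopt")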


In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the model-training stage.

    Built by ``ConfigurationManager.get_model_trainer_config`` from the
    ``model_trainer`` section of the loaded config and consumed by
    ``ModelTrainer``.

    NOTE(review): the values are copied from the loaded config without
    conversion, so the ``Path`` fields may actually hold plain strings at
    runtime - confirm against ``read_yaml``.
    """
    # Stage directory; created by get_model_trainer_config before this object is built.
    root_dir: Path
    # Passed to hyperopt's fmin as max_evals.
    num_trials: int
    # Directory that ModelTrainer.train joins with "train.pkl".
    train_data_path: Path
    # Directory that ModelTrainer.train joins with "val.pkl".
    test_data_path: Path
    # Passed to mlflow.set_tracking_uri.
    mlflow_uri: str

In [5]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
            schema_filepath: Location of the schema YAML.
        """
        # Read the three files in the same order as before: config, params, schema.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_filepath, params_filepath, schema_filepath)
        )

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Create the trainer's directory and return its settings.

        Returns:
            A ``ModelTrainerConfig`` whose fields are copied from the
            ``model_trainer`` section of the loaded config.
        """
        trainer_section = self.config.model_trainer

        create_directories([trainer_section.root_dir])

        field_names = (
            "root_dir",
            "num_trials",
            "train_data_path",
            "test_data_path",
            "mlflow_uri",
        )
        return ModelTrainerConfig(
            **{name: getattr(trainer_section, name) for name in field_names}
        )

In [6]:
import os
from mlProject import logger
import pickle
import mlflow
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

class ModelTrainer:
    """Tunes a DictVectorizer + RandomForestRegressor pipeline with hyperopt.

    Every trial is recorded as its own MLflow run (params, RMSE and the
    fitted pipeline).
    """

    def __init__(self, config: ModelTrainerConfig):
        """Keep the stage settings for later use by ``train``."""
        self.config = config

    def load_pickle(self, filename: str):
        """Return the object unpickled from ``filename``.

        NOTE(security): ``pickle.load`` can execute arbitrary code while
        deserializing. Only point this at files produced by this project's
        own pipeline, never at untrusted input.
        """
        with open(filename, "rb") as f_in:
            return pickle.load(f_in)

    def train(self):
        """Run the hyperparameter search and log each trial to MLflow.

        Loads ``train.pkl`` / ``val.pkl`` from the configured directories,
        points MLflow at ``config.mlflow_uri`` and the
        ``random-forest-hyperopt`` experiment, then runs
        ``config.num_trials`` TPE trials that minimise validation RMSE.

        Returns:
            The value returned by ``hyperopt.fmin`` (the best point found
            in the search space). Earlier versions discarded it and returned
            ``None``; callers that ignore the return value are unaffected.
        """
        X_train, y_train = self.load_pickle(os.path.join(self.config.train_data_path, "train.pkl"))
        X_val, y_val = self.load_pickle(os.path.join(self.config.test_data_path, "val.pkl"))

        mlflow.set_tracking_uri(self.config.mlflow_uri)
        mlflow.set_experiment("random-forest-hyperopt")

        def objective(params):
            """Fit one candidate pipeline and report its validation RMSE."""
            with mlflow.start_run():
                mlflow.set_tag("model", "randomforest")
                mlflow.log_params(params)

                pipeline = make_pipeline(
                    DictVectorizer(),
                    RandomForestRegressor(**params)
                )
                pipeline.fit(X_train, y_train)
                y_pred = pipeline.predict(X_val)

                # mean_squared_error's `squared=False` flag is deprecated in
                # scikit-learn 1.4 and removed in 1.6. Taking the root here
                # works on every version and gives the same value for a
                # single-target regression.
                rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
                mlflow.log_metric("rmse", rmse)
                mlflow.sklearn.log_model(pipeline, artifact_path="model")
            return {'loss': rmse, 'status': STATUS_OK}

        search_space = {
            'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
            'n_estimators': scope.int(hp.quniform('n_estimators', 5, 30, 1)),
            'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
            'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
            'random_state': 42
        }

        rstate = np.random.default_rng(42)  # for reproducible results
        best_params = fmin(
            fn=objective,
            space=search_space,
            algo=tpe.suggest,
            max_evals=self.config.num_trials,
            trials=Trials(),
            rstate=rstate
        )

        return best_params



In [7]:
# Build the stage config, then run the hyperparameter search.
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    # Keep the trainer under its own name: rebinding `model_trainer_config`
    # to the ModelTrainer instance made one name stand for two different things.
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.train()
except Exception:
    # Record the failure with its traceback, then re-raise the original
    # exception unchanged (a bare `raise` adds no extra frame).
    logger.exception("Model training stage failed")
    raise


[2024-07-05 12:44:39,673: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-05 12:44:39,676: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-05 12:44:39,677: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-07-05 12:44:39,678: INFO: common: created directory at: artifacts]
[2024-07-05 12:44:39,678: INFO: common: created directory at: artifacts/model_trainer]


2024/07/05 12:44:41 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-hyperopt' does not exist. Creating a new experiment.


  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?][2024-07-05 12:44:42,042: INFO: tpe: build_posterior_wrapper took 0.001980 seconds]
[2024-07-05 12:44:42,044: INFO: tpe: TPE using 0 trials]





 20%|██        | 1/5 [04:37<18:28, 277.21s/trial, best loss: 5.081163597340483][2024-07-05 12:49:19,252: INFO: tpe: build_posterior_wrapper took 0.001757 seconds]
[2024-07-05 12:49:19,253: INFO: tpe: TPE using 1/1 trials with best loss 5.081164]





 40%|████      | 2/5 [05:04<06:30, 130.03s/trial, best loss: 5.081163597340483][2024-07-05 12:49:46,253: INFO: tpe: build_posterior_wrapper took 0.001174 seconds]
[2024-07-05 12:49:46,254: INFO: tpe: TPE using 2/2 trials with best loss 5.081164]







 60%|██████    | 3/5 [05:34<02:49, 84.53s/trial, best loss: 5.081163597340483] [2024-07-05 12:50:16,648: INFO: tpe: build_posterior_wrapper took 0.001235 seconds]
[2024-07-05 12:50:16,649: INFO: tpe: TPE using 3/3 trials with best loss 5.081164]







 80%|████████  | 4/5 [08:43<02:05, 125.91s/trial, best loss: 5.073795325986875][2024-07-05 12:53:25,976: INFO: tpe: build_posterior_wrapper took 0.001225 seconds]
[2024-07-05 12:53:25,977: INFO: tpe: TPE using 4/4 trials with best loss 5.073795]







100%|██████████| 5/5 [10:19<00:00, 123.95s/trial, best loss: 5.073795325986875]






