In [4]:
!pwd
import os
os.chdir('../')
!pwd

/Users/Bingumalla Likith/Desktop/MLOPS/Project-2/research
/Users/Bingumalla Likith/Desktop/MLOPS/Project-2


### Modular Programming

In [9]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class ModelTrainerConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    target_column: str
    params: dict
    best_params_path: Path

In [10]:
from src.data_science import logger
from src.data_science.utils.common import read_yaml, write_yaml, create_directories
from src.data_science.constants import *

class ConfigurationManager:
    def __init__(self,
                config_path = CONFIG_FILE_PATH,
                params_path = PARAMS_FILE_PATH,
                schema_path = SCHEMA_FILE_PATH):
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)
        self.schema = read_yaml(schema_path)
        
        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.RandomForest
        target_column = self.schema.TARGET.name

        create_directories([config.root_dir])
        model_trainer_config = ModelTrainerConfig(
            **config,
            target_column=target_column,
            params = params
        )

        return model_trainer_config

In [13]:
import pandas as pd
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import joblib
import mlflow
from mlflow.models import infer_signature
from dotenv import load_dotenv
from urllib.parse import urlparse

load_dotenv()

class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config
        self.runs = 0

    def train(self):
        mlflow.set_tracking_uri(os.getenv('MLFLOW_TRACKING_URI'))
        mlflow.set_experiment('Taxi_Fare_Prediction')

        with mlflow.start_run(run_name=f"run-{self.runs + 1}"):
            train_data = pd.read_csv(self.config.train_data_path)
            
            y_train = train_data[self.config.target_column].values
            x_train = train_data.drop(columns=[self.config.target_column]).values

            signature = infer_signature(x_train, y_train)

            model = RandomForestRegressor()
            gscv = GridSearchCV(model, self.config.params, scoring="neg_mean_squared_error", cv=2)

            gscv.fit(x_train, y_train)
            logger.info("Model has been trained successfully !!")
            best_model = gscv.best_estimator_
            best_score = gscv.best_score_
            best_params ={ 
                "best_params" : gscv.best_params_
            }

            write_yaml(best_params, Path(os.path.join(self.config.root_dir, 'best_params.yaml')))
            mlflow.log_params(gscv.best_params_)
            logger.info("Saved best model parameters.")
            
            logger.info(f"Best training score : {abs(best_score)}")
            mlflow.log_metric("Training error", abs(best_score))

            url_type = urlparse(mlflow.get_tracking_uri()).scheme

            if url_type != "file":
                mlflow.sklearn.log_model(best_model, "model", signature=signature, registered_model_name = f"Best Model-{self.runs+1}")
            else:
                mlflow.sklearn.log_model(best_model, "model", signature=signature)

            joblib.dump(best_model, os.path.join(self.config.root_dir, self.config.model_name))
            logger.info("Best model has been saved successfully !!")

In [14]:
try: 
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(model_trainer_config)

    model_trainer.train()
except Exception as e:
    raise e

[2025-01-17 18:55:49,580 : INFO : common : Yaml file : config/config.yaml loaded successfully !!]
[2025-01-17 18:55:49,583 : INFO : common : Yaml file : params.yaml loaded successfully !!]
[2025-01-17 18:55:49,587 : INFO : common : Yaml file : schema.yaml loaded successfully !!]
[2025-01-17 18:55:49,588 : INFO : common : Created directory at : artifacts]
[2025-01-17 18:55:49,589 : INFO : common : Created directory at : artifacts/model_trainer]
[2025-01-17 18:56:49,231 : INFO : 764716464 : Model has been trained successfully !!]
[2025-01-17 18:56:49,234 : INFO : common : Dumped data into artifacts/model_trainer/best_params.yaml successfully !!]
[2025-01-17 18:56:49,722 : INFO : 764716464 : Saved best model parameters.]
[2025-01-17 18:56:49,724 : INFO : 764716464 : Best training score : 100.43783216216468]


Registered model 'Best Model-1' already exists. Creating a new version of this model...
2025/01/17 18:56:57 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best Model-1, version 2


[2025-01-17 18:56:57,323 : INFO : 764716464 : Best model has been saved successfully !!]


Created version '2' of model 'Best Model-1'.


🏃 View run run-1 at: https://dagshub.com/blikith86/DataScienceProject.mlflow/#/experiments/0/runs/e2c64e3e6bc54a98a1401923adfa7065
🧪 View experiment at: https://dagshub.com/blikith86/DataScienceProject.mlflow/#/experiments/0


### Research

In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from src.data_science.utils.common import read_yaml

In [4]:
train_data = pd.read_csv('artifacts/data_transformation/train.csv')

x_train = train_data.values
y_train = train_data['Trip_Price']
model = RandomForestRegressor()

params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}
gscv = GridSearchCV(model, params, scoring="neg_mean_squared_error", cv=2)

gscv.fit(x_train, y_train)

In [5]:
gscv.best_params_

{'max_depth': 20,
 'max_features': None,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 200}

In [27]:
{'best_params' : gscv.best_params_}

{'best_params': {'max_depth': None,
  'max_features': None,
  'min_samples_leaf': 2,
  'min_samples_split': 2,
  'n_estimators': 100}}