In [1]:
import os
%pwd

'/Users/whysocurious/Documents/MLDSAIProjects/e2e-mlops-gcp/research'

In [2]:
os.chdir("../")
%pwd

'/Users/whysocurious/Documents/MLDSAIProjects/e2e-mlops-gcp'

In [3]:
# os.environ["MLFLOW_TRACKING_URI"]="https://dagshub.com/entbappy/End-to-end-Machine-Learning-Project-with-MLflow.mlflow"
# os.environ["MLFLOW_TRACKING_USERNAME"]="entbappy"
# os.environ["MLFLOW_TRACKING_PASSWORD"]="6824692c47a369aa6f9eac5b10041d5c8edbcef0"

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path
    pred_data_path: Path
    model_path: Path
    year: int
    month: int

In [5]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories, save_json

class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    
    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        config = self.config.model_evaluation
        params = self.params.dataDetails

        create_directories([config.root_dir])

        model_evaluation_config = ModelEvaluationConfig(
            root_dir=config.root_dir,
            pred_data_path=config.pred_data_path,
            model_path = config.model_path,
            year=params.year,
            month=params.month            
        )

        return model_evaluation_config

In [10]:
import os
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import pickle
from mlProject import logger

class ModelEvaluation:
    def __init__(self, config: ModelEvaluationConfig):
        self.config = config

    
    def predict_data(self):
        logger.info (f"predicting using model from {self.config.model_path}")
        with open(self.config.model_path, 'rb') as f_in:
            dv, model = pickle.load(f_in)

        dftmp = pd.read_csv(self.config.pred_data_path)
        categorical = ['PULocationID', 'DOLocationID']
        dicts = dftmp[categorical].to_dict(orient='records')
        X_val = dv.transform(dicts)
        y_pred = model.predict(X_val)

        prdStd = y_pred.std()
        prdMean = y_pred.mean()
        logger.info (f"Standard deviation of preds - {prdStd}")
        logger.info (f"Mean of preds - {prdMean}")

        logger.info ("creating results dataframe...")
        df_res = pd.DataFrame({
            'ride_id': dftmp['ride_id'],
            'predicted_duration': y_pred
        })

        logger.info (f'writing results to - {self.config.root_dir}')
        df_res.to_parquet(
            self.config.root_dir + f'/yellow_{self.config.year:04d}-{self.config.month:02d}.parquet',
            engine='pyarrow',
            compression=None,
            index=False
        )
        sz = os.path.getsize(self.config.root_dir) / (1024*1024)
        logger.info(f'df_results file-size - {sz}')
        logger.info("Results file written.")

In [11]:
try:
    config = ConfigurationManager()
    model_evaluation_config = config.get_model_evaluation_config()
    model_evaluation_config = ModelEvaluation(config=model_evaluation_config)
    model_evaluation_config.predict_data()
except Exception as e:
    raise e

[2024-07-02 13:43:41,642: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-02 13:43:41,644: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-02 13:43:41,645: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-07-02 13:43:41,647: INFO: common: created directory at: artifacts]
[2024-07-02 13:43:41,648: INFO: common: created directory at: artifacts/model_evaluation]
[2024-07-02 13:43:41,649: INFO: 2447913764: predicting using model from research/model.bin]


  dftmp = pd.read_csv(self.config.pred_data_path)


[2024-07-02 13:43:49,916: INFO: 2447913764: Standard deviation of preds - 6.750155989720952e-14]
[2024-07-02 13:43:49,918: INFO: 2447913764: Mean of preds - 23.19714924577499]
[2024-07-02 13:43:49,918: INFO: 2447913764: creating results dataframe...]
[2024-07-02 13:43:49,974: INFO: 2447913764: writing results to - artifacts/model_evaluation]
[2024-07-02 13:43:50,252: INFO: 2447913764: df_results file-size - 9.1552734375e-05]
[2024-07-02 13:43:50,252: INFO: 2447913764: Results file written.]
