In [1]:
import os

In [2]:
%pwd

'/home/adhitizki/playground/pacmann/mlops_credit_card/mlops-credit-card/notebooks'

In [3]:
# Change to the project's main directory so that relative paths
# (config/, artifacts/, metrics/) are resolved from there.
# Guarded so that re-running this cell does not climb one more level up.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [4]:
# with open('.env') as f:
#     os.environ.update(
#         line.strip().split('=') for line in f
# )

In [5]:
%pwd

'/home/adhitizki/playground/pacmann/mlops_credit_card/mlops-credit-card'

### Evaluation Config

This code will be applied in `src/MLProject/entity/config_entity.py`.

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class TrainEvaluationConfig:
    """Immutable settings consumed by the train-evaluation stage.

    Instances are frozen, so attributes cannot be reassigned after creation.
    The field order is part of the interface (positional construction).
    """
    # Directory of this stage; created by the configuration manager.
    root_dir: Path
    # Input data of each split, loaded with joblib during evaluation.
    input_train_path: Path
    input_test_path: Path
    input_valid_path: Path
    # Actual output (target) data of each split, loaded with joblib.
    output_train_path: Path
    output_test_path: Path
    output_valid_path: Path
    # Scaled input data of each split; this is what the model predicts on.
    scaled_train_path: Path
    scaled_test_path: Path
    scaled_valid_path: Path
    # Scaler file that is logged to MLflow as an artifact.
    scaler_model_path: Path
    # Trained model file, loaded with joblib.
    model_path: Path
    # Destinations of the classification reports saved as JSON.
    train_score_path: Path
    test_score_path: Path
    valid_score_path: Path
    # Local CSV holding the sampled dataset before it is uploaded to MinIO.
    mlflow_dataset_path: Path
    # Columns kept in the MLflow dataset; index 1 is used as the target
    # column and index 2 as the prediction column.
    mlflow_dataset_column: list
    # MinIO (S3-compatible) connection settings.
    minio_endpoint_url: str
    minio_access_key_id: str
    minio_secret_access_key: str
    # MLflow server, experiment, dataset bucket and run-name prefix.
    mlflow_tracking_uri: str
    mlflow_exp_name: str
    mlflow_dataset_bucket: str
    mlflow_run_name: str

### Evaluation Config Manager

This code will be applied in `src/MLProject/config/configurations.py`.

In [7]:
from MLProject.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from MLProject.utils.common import read_yaml, create_directories, save_json

In [8]:
class ConfigurationManager:
    """Read the project YAML files and build the stage configuration objects.

    Args:
        config_filepath: location of the main configuration YAML file.
        params_filepath: location of the parameters YAML file.
    """

    # Environment variables read by ``get_train_eval_config``. They are
    # validated together so a missing one is reported with a clear message.
    _REQUIRED_ENV_VARS = (
        "MLFLOW_S3_ENDPOINT_URL",
        "MINIO_ACCESS_KEY",
        "MINIO_SECRET_ACCESS_KEY",
        "MLFLOW_TRACKING_URI",
        "PROJECT_BUCKET",
    )

    def __init__(self, 
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Make sure the artifacts root exists before any stage writes into it.
        create_directories([self.config.artifacts_root])

    @classmethod
    def _read_required_env(cls) -> dict:
        """Collect the required environment variables.

        Returns:
            dict: variable name -> value for every required variable.

        Raises:
            KeyError: when one or more variables are unset; the message lists
                all of them instead of only the first one encountered.
        """
        missing = [name for name in cls._REQUIRED_ENV_VARS if name not in os.environ]
        if missing:
            raise KeyError(
                "Missing required environment variable(s): " + ", ".join(missing)
            )

        return {name: os.environ[name] for name in cls._REQUIRED_ENV_VARS}
    
    def get_train_eval_config(self) -> TrainEvaluationConfig:
        """read training evaluation config file and store as 
        config entity then apply the dataclasses
        
        Returns:
            config: TrainEvaluationConfig type

        Raises:
            KeyError: when a required environment variable is not set.
        """
        data_dump_config = self.config.dump_data
        scaler_config = self.config.scale_data
        train_config = self.config.train_model
        eval_config = self.config.evaluation

        create_directories([eval_config.root_dir])

        env = self._read_required_env()

        config = TrainEvaluationConfig(
            # Wrapped in Path like every other path field of the entity.
            root_dir=Path(eval_config.root_dir),
            input_train_path=Path(data_dump_config.input_train_path),
            input_test_path=Path(data_dump_config.input_test_path),
            input_valid_path=Path(data_dump_config.input_valid_path),
            output_train_path=Path(data_dump_config.output_train_path),
            output_test_path=Path(data_dump_config.output_test_path),
            output_valid_path=Path(data_dump_config.output_valid_path),
            scaled_train_path=Path(scaler_config.scaled_train_path),
            scaled_test_path=Path(scaler_config.scaled_test_path),
            scaled_valid_path=Path(scaler_config.scaled_valid_path),
            scaler_model_path=Path(scaler_config.scaler_model_path),
            model_path=Path(train_config.model_path),
            train_score_path=Path(eval_config.train_score_path),
            test_score_path=Path(eval_config.test_score_path),
            valid_score_path=Path(eval_config.valid_score_path),
            mlflow_dataset_path=Path(eval_config.mlflow_dataset_path),
            mlflow_dataset_column=eval_config.mlflow_dataset_column,
            minio_endpoint_url=env['MLFLOW_S3_ENDPOINT_URL'],
            minio_access_key_id=env['MINIO_ACCESS_KEY'],
            minio_secret_access_key=env['MINIO_SECRET_ACCESS_KEY'],
            mlflow_tracking_uri=env["MLFLOW_TRACKING_URI"],
            mlflow_exp_name=eval_config.mlflow_exp_name,
            mlflow_dataset_bucket=env["PROJECT_BUCKET"],
            mlflow_run_name=eval_config.mlflow_run_name
        )

        return config

### Config in `src/MLProject/components/model_evaluation.py`

Logging is tied to a run, which is **one cycle of training and evaluating a model**.
To start logging, we have to give MLflow **a context**, which is our **current run**.

Steps:
+ Load the train, test, and validation data (raw and scaled), their target data, and the trained model.
+ Point the MLflow client in our program to our MLflow server.
+ Set experiment.
+ Set MLflow to run and start logging.

In [9]:
import boto3
import json
import joblib
import mlflow
import string
import random
import pandas as pd

from mlflow.data.pandas_dataset import PandasDataset
from mlflow.data.dataset_source import DatasetSource
from sklearn.metrics import classification_report

from MLProject import logger

class TrainEvaluation:
    """Evaluate the trained model on every data split and log the run to MLflow.

    Args:
        config (TrainEvaluationConfig): paths, MinIO credentials and MLflow
            settings used by this stage.
    """

    # Keys that ``classification_report(..., output_dict=True)`` uses for its
    # aggregate rows; every other key that maps to a dict is a per-class row.
    _AGGREGATE_REPORT_KEYS = frozenset(
        {"accuracy", "micro avg", "macro avg", "weighted avg", "samples avg"}
    )

    def __init__(self, config: TrainEvaluationConfig):
        self.config = config

    def get_prediction(self, model, X_input_scaled) -> pd.Series:
        """predict the input data with the model
        
        Args:
            model (Any): the machine learning model
            X_input_scaled (Any): the scaled input data
        
        Returns:
            pd.Series: prediction result, in the row order of the input and
                with a fresh default index
        """
        y_predict = pd.Series(model.predict(X_input_scaled))
        
        return y_predict
    
    def get_report(self, y_output, y_predict, score_path, data_type='train') -> dict:
        """generate the classification report and dump the report as json
        
        Args:
            y_output (pd.Series): the actual output data
            y_predict (pd.Series): the prediction result
            score_path (Path): Path score location
            data_type (str): data type are train, test, or validation
        
        Returns:
            dict: classification report in dict format
        """
        metrics = classification_report(y_output, y_predict, output_dict=True)
        
        logger.info("Save report as json.")
        save_json(path=score_path, data=metrics)
        
        # The text form is printed for the reader; the dict form is returned.
        logger.info(f"Show the {data_type} report.")
        print(f"\n{classification_report(y_output, y_predict)}")
        
        return metrics
    
    def get_mlflow_metrics(self, metrics, data_type='train') -> dict:
        """flatten the per-class rows of a classification report for MLflow

        Every per-class row becomes ``<data_type>_<metric>_<label>`` entries.
        Rows are found by their content instead of assuming that the labels
        are "0", "1", ... and that exactly three aggregate rows exist, so any
        label naming is supported. Aggregate rows (accuracy and averages) are
        skipped.

        Args:
            metrics (dict): the classification report
            data_type (str): data type are train, test, or validation
        
        Returns:
            dict: flat mapping of metric name to value
        """
        mlflow_metrics = {}

        for label, data_metric in metrics.items():
            # "accuracy" maps to a plain float, the averages are aggregates.
            if label in self._AGGREGATE_REPORT_KEYS or not isinstance(data_metric, dict):
                continue

            for name, value in data_metric.items():
                mlflow_metrics[f"{data_type}_{name}_{label}"] = value
            
        return mlflow_metrics
    
    def get_dataset(self, X_input, y_output, y_predict, sample_size=10) -> pd.DataFrame:
        """construct the dataset and save as dataframe and csv file
        
        Args:
            X_input (pd.DataFrame): the input data
            y_output (pd.Series): the actual output data
            y_predict (pd.Series): the prediction result, row-ordered like X_input
            sample_size (int): maximum number of random rows to keep
        
        Returns:
            pd.Dataframe: sampled rows with the actual and predicted output
        """
        train_eval_result = X_input.copy()
        train_eval_result["Class"] = y_output
        # Predictions carry a fresh default index (see get_prediction), so
        # they are assigned by position. Index alignment would mis-assign
        # them, or produce NaN, whenever X_input keeps a non-default index.
        train_eval_result["Prediction"] = pd.Series(y_predict).to_numpy()

        selected = train_eval_result[self.config.mlflow_dataset_column]
        # Never ask for more rows than exist; ``sample`` would raise.
        train_eval_result = selected.sample(min(sample_size, len(selected)))
        train_eval_result.to_csv(self.config.mlflow_dataset_path, index=False)
        
        return train_eval_result
        
    def get_mlflow_dataset(self, mlflow_dataset, run_name) -> PandasDataset:
        """convert the dataset into MLflow's dataset format
        
        The local CSV at ``mlflow_dataset_path`` is removed afterwards, so it
        must already have been uploaded when this method is called.

        Args:
            mlflow_dataset (pd.DataFrame): the project dataset to train and the result
            run_name (str): the name of MLflow runs
        
        Returns:
            PandasDataset: the dataset in Pandas MLflow format
        """
        # ``from_pandas`` accepts the source as a URI string. The previous
        # ``DatasetSource.load(uri)`` called an abstract instance method on
        # the class, which does not build a source for the URI.
        dataset_uri = f"s3://{self.config.mlflow_dataset_bucket}/{run_name}.csv"

        mlflow_dataset: PandasDataset=mlflow.data.from_pandas(
            mlflow_dataset,
            source=dataset_uri,
            name=f"{run_name}",
            targets=self.config.mlflow_dataset_column[1],
            predictions=self.config.mlflow_dataset_column[2]
        )
        
        logger.info(f"Remove {self.config.mlflow_dataset_path} file from local.")
        os.remove(self.config.mlflow_dataset_path)
        
        return mlflow_dataset
    
    def s3_upload_mlflow_dataset(self, run_name) -> None:
        """upload the dataset into MinIO with MLflow run_name
        
        Args:
            run_name (str): the name of MLflow runs

        Raises:
            Exception: any upload error is logged and re-raised unchanged
        """
        s3 = boto3.client('s3',
                              endpoint_url=self.config.minio_endpoint_url,
                              aws_access_key_id=self.config.minio_access_key_id,
                              aws_secret_access_key=self.config.minio_secret_access_key)
        
        try:
            s3.upload_file(
                # Passed as a plain string so it works regardless of whether
                # the installed boto3 accepts path-like objects.
                str(self.config.mlflow_dataset_path), 
                self.config.mlflow_dataset_bucket, 
                f'{run_name}.csv'
            )    
        except Exception as e:
            logger.error(e)
            # Bare raise keeps the original traceback untouched.
            raise
    
    def mlflow_log_train(self) -> None:
        """perform experimentation with MLflow to evaluate the training result

        Loads the data splits and the model, predicts every split, saves the
        classification reports, then logs parameters, metrics, the sampled
        dataset, the scaler and the model inside one MLflow run.
        """
        # NOTE(review): joblib.load unpickles the files; only load artifacts
        # produced by this project's own pipeline.
        logger.info(f"Load scaled data train from {self.config.scaled_train_path}.")
        X_train_scaled = joblib.load(self.config.scaled_train_path)
        X_test_scaled = joblib.load(self.config.scaled_test_path)
        X_valid_scaled = joblib.load(self.config.scaled_valid_path)
        
        logger.info(f"Load data train from {self.config.input_train_path}.")
        X_train = joblib.load(self.config.input_train_path)
        X_test = joblib.load(self.config.input_test_path)
        X_valid = joblib.load(self.config.input_valid_path)
        
        logger.info(f"Load data train output from {self.config.output_train_path}.")
        y_train = joblib.load(self.config.output_train_path)
        y_test = joblib.load(self.config.output_test_path)
        y_valid = joblib.load(self.config.output_valid_path)
        
        logger.info("Load the model.")
        model = joblib.load(self.config.model_path)
        
        logger.info("Predicting the data train test valid.")
        y_train_pred = self.get_prediction(model, X_train_scaled)
        y_test_pred = self.get_prediction(model, X_test_scaled)
        y_valid_pred = self.get_prediction(model, X_valid_scaled)
        
        logger.info("Generate classification report.")
        train_report = self.get_report(y_train, y_train_pred, self.config.train_score_path, "train")
        test_report = self.get_report(y_test, y_test_pred, self.config.test_score_path, "test")
        valid_report = self.get_report(y_valid, y_valid_pred, self.config.valid_score_path, "valid")
        
        logger.info("Set tracking URI.")
        mlflow.set_tracking_uri(self.config.mlflow_tracking_uri)
        
        logger.info("Set experiment name.")
        mlflow.set_experiment(self.config.mlflow_exp_name)
        
        # A random 5-character suffix keeps the run name (and therefore the
        # uploaded CSV key) distinct between runs.
        logger.info("Set run name.")
        flag = ''.join(random.choices(
            string.ascii_uppercase + string.ascii_lowercase + string.digits, 
            k=5))
        run_name = f"{self.config.mlflow_run_name}-{flag}"
        
        logger.info("Contruct report for MLflow.")
        train_metrics = self.get_mlflow_metrics(train_report, 'train')
        test_metrics = self.get_mlflow_metrics(test_report, 'test')
        valid_metrics = self.get_mlflow_metrics(valid_report, 'valid')
        
        logger.info(f"Contruct MLflow dataset file in {self.config.mlflow_dataset_path}.")
        mlflow_train_dataset = self.get_dataset(X_train, y_train, y_train_pred)

        # NOTE(review): the example is taken from the unscaled validation
        # input while predictions above use the scaled input -- confirm which
        # one the logged model is expected to receive.
        logger.info("Contruct MLflow input example")
        sample = 10
        input_example = X_valid[:sample]

        logger.info("Experiement tracking to evaluate model with MLflow.")
        with mlflow.start_run(run_name=run_name):
            # Upload first: get_mlflow_dataset removes the local CSV.
            logger.info(f"Upload {self.config.mlflow_dataset_path} file to MinIO.")
            self.s3_upload_mlflow_dataset(run_name)
            
            logger.info("Set MLflow dataset.")
            dataset = self.get_mlflow_dataset(mlflow_train_dataset, run_name)

            logger.info("Logging to MLflow as an experiment.")
            model_params = model.get_params()
            mlflow.log_params(model_params)
            mlflow.log_metrics(train_metrics)
            mlflow.log_metrics(test_metrics)
            mlflow.log_metrics(valid_metrics)
            mlflow.log_input(dataset, context="training")
            mlflow.log_artifact(self.config.scaler_model_path, "scaler")
            mlflow.sklearn.log_model(
                sk_model=model,
                artifact_path="models",
                serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_CLOUDPICKLE,
                registered_model_name="logistic_regression",
                input_example=input_example
            )
            
            mlflow.set_tags(
                {
                    "dataset": "credit card fraud",
                    "model": "logistic_regression"
                }
            )

### Evaluate the Model

In [10]:
# Run the whole stage: build the configuration, evaluate the model on every
# split and log the run to MLflow.
try:
    config = ConfigurationManager()
    eval_config = config.get_train_eval_config()
    evaluation = TrainEvaluation(config=eval_config)
    evaluation.mlflow_log_train()
except Exception as e:
    # logger.exception logs at ERROR level like logger.error, but also
    # records the traceback (assumes `logger` is a standard logging.Logger).
    logger.exception(e)
    # Bare raise re-raises the active exception with its traceback intact.
    raise

[2024-07-22 21:52:22,323: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-22 21:52:22,325: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-22 21:52:22,327: INFO: common: created directory at: artifacts]
[2024-07-22 21:52:22,329: INFO: common: created directory at: artifacts/models]
[2024-07-22 21:52:22,331: INFO: 1579474910: Load scaled data train from artifacts/preprocessing/X_train.csv.]
[2024-07-22 21:52:22,370: INFO: 1579474910: Load data train from artifacts/data/X_train.pkl.]
[2024-07-22 21:52:22,404: INFO: 1579474910: Load data train output from artifacts/data/y_train.pkl.]
[2024-07-22 21:52:22,414: INFO: 1579474910: Load the model.]
[2024-07-22 21:52:22,467: INFO: 1579474910: Predicting the data train test valid.]
[2024-07-22 21:52:22,477: INFO: 1579474910: Generate classification report.]
[2024-07-22 21:52:22,573: INFO: 1579474910: Save report as json.]
[2024-07-22 21:52:22,575: INFO: common: json file saved at: metrics/trai

MlflowException: API request to http://localhost:5000/api/2.0/mlflow-artifacts/artifacts/2/1def852e1d3547d8ab7e6641479458a8/artifacts/scaler/scaler.pkl failed with exception HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: /api/2.0/mlflow-artifacts/artifacts/2/1def852e1d3547d8ab7e6641479458a8/artifacts/scaler/scaler.pkl (Caused by ResponseError('too many 500 error responses'))

**Debug**: Check the dataset in MLflow and MinIO

by inspecting MLflow's last active run.

In [None]:
# Fetch the most recent run of this session and read the first dataset
# that was logged as an input of that run.
last_run_id = mlflow.last_active_run().info.run_id
run = mlflow.get_run(last_run_id)
dataset_info = run.inputs.dataset_inputs[0].dataset

# Print the same four properties, one per line.
for attribute in ("name", "digest", "profile", "schema"):
    print(f"Dataset {attribute}: {getattr(dataset_info, attribute)}")

Dataset name: train-eval-gERE0
Dataset digest: 406ae510
Dataset profile: {"num_rows": 83555, "num_elements": 250665}
Dataset schema: {"mlflow_colspec": [{"type": "string", "name": "reviewContents", "required": true}, {"type": "long", "name": "ratings", "required": true}, {"type": "long", "name": "predictions", "required": true}]}


In [None]:
try:
    # Rebuild the evaluation config to get the MinIO connection settings.
    config = ConfigurationManager()
    eval_config = config.get_train_eval_config()

    s3 = boto3.client(
        's3',
        endpoint_url=eval_config.minio_endpoint_url,
        aws_access_key_id=eval_config.minio_access_key_id,
        aws_secret_access_key=eval_config.minio_secret_access_key,
    )

    # The CSV is stored under the run name, which is also the dataset name.
    obj = s3.get_object(
        Bucket=eval_config.mlflow_dataset_bucket,
        Key=f"{dataset_info.name}.csv",
    )
    df = pd.read_csv(obj['Body'])
except Exception as e:
    logger.error(e)
    raise e

[2024-07-03 22:02:40,015: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-03 22:02:40,018: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-03 22:02:40,019: INFO: common: created directory at: artifacts]
[2024-07-03 22:02:40,022: INFO: common: created directory at: artifacts/models]


In [None]:
# Display the DataFrame read back from the CSV object stored in MinIO.
df

Unnamed: 0,reviewContents,ratings,predictions
0,persen kemarin sore ...jam 2 siang datang...ga...,5,5
1,dah sampai... nyobanya nunggu pulang kerja... ...,5,5
2,Recommended seller..,5,5
3,"Pengiriman cepat sekali 2 hari sampai, packing...",5,5
4,iish..keren pisaan.. kuy laen beli.. gak akan ...,5,5
...,...,...,...
83550,Produk sesuai dengan yang ditawarkan Kendala h...,4,4
83551,fast respon 2hari smpe...tv nya bgus,4,4
83552,sesuai deskripsi produk,4,5
83553,bagus,4,5


In [None]:
# Summarize the columns, non-null counts and dtypes of the downloaded dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83555 entries, 0 to 83554
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   reviewContents  83555 non-null  object
 1   ratings         83555 non-null  int64 
 2   predictions     83555 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.9+ MB
