In [1]:
import os

In [2]:
# Show the kernel's starting directory (the `notebooks/` folder in the recorded output).
%pwd

'/home/adhitizki/playground/pacmann/mlops_credit_card/mlops-credit-card/notebooks'

In [3]:
# Change to the main directory
# So, it's executed from main directory
# Guarded on the folder name so that re-running this cell does not climb
# one more level on every execution.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [4]:
# with open('.env') as f:
#     os.environ.update(
#         line.strip().split('=') for line in f
# )

In [5]:
# Confirm the working directory after the chdir above (the project root in the recorded output).
%pwd

'/home/adhitizki/playground/pacmann/mlops_credit_card/mlops-credit-card'

### Evaluation Config

This code will be applied in `src/MLProject/entity/config_entity.py`.

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class TrainEvaluationConfig:
    """Immutable settings consumed by the train-evaluation step.

    Instances are built by ``ConfigurationManager.get_train_eval_config``,
    which fills the fields from the YAML config sections and from
    environment variables as noted below.
    """
    # Directory created for the evaluation step (`evaluation.root_dir`).
    root_dir: Path
    # Feature splits, from the `dump_data` config section.
    input_train_path: Path
    input_test_path: Path
    input_valid_path: Path
    # Target splits, from the `dump_data` config section.
    output_train_path: Path
    output_test_path: Path
    output_valid_path: Path
    # Scaled feature splits and the fitted scaler, from the `scale_data` section.
    scaled_train_path: Path
    scaled_test_path: Path
    scaled_valid_path: Path
    scaler_model_path: Path
    # Trained model, from the `train_model` section.
    model_path: Path
    # Where each split's classification report is saved as JSON.
    train_score_path: Path
    test_score_path: Path
    valid_score_path: Path
    # Local CSV written for the sampled dataset before it is uploaded.
    mlflow_dataset_path: Path
    # Columns kept in the sampled dataset; index 1 is passed to MLflow as the
    # target column and index 2 as the prediction column.
    mlflow_dataset_column: list
    # MinIO connection settings, read from environment variables
    # (MLFLOW_S3_ENDPOINT_URL, MINIO_ACCESS_KEY, MINIO_SECRET_ACCESS_KEY).
    minio_endpoint_url: str
    minio_access_key_id: str
    minio_secret_access_key: str
    # MLflow tracking server URI, read from MLFLOW_TRACKING_URI.
    mlflow_tracking_uri: str
    # Experiment name, from the `evaluation` config section.
    mlflow_exp_name: str
    # Bucket that receives the sampled dataset, read from PROJECT_BUCKET.
    mlflow_dataset_bucket: str
    # Run-name prefix, from the `evaluation` section; a random suffix is appended per run.
    mlflow_run_name: str

### Evaluation Config Manager

This code will be applied in `src/MLProject/config/configurations.py`.

In [7]:
from MLProject.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from MLProject.utils.common import read_yaml, create_directories, save_json

In [8]:
class ConfigurationManager:
    """Read the project YAML files and build typed config entities from them."""

    # Environment variables that get_train_eval_config() reads.
    REQUIRED_ENV_VARS = (
        "MLFLOW_S3_ENDPOINT_URL",
        "MINIO_ACCESS_KEY",
        "MINIO_SECRET_ACCESS_KEY",
        "MLFLOW_TRACKING_URI",
        "PROJECT_BUCKET",
    )

    def __init__(self, 
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        """Load the config and params YAML files and create the artifacts root.

        Args:
            config_filepath: path of the main config YAML file
            params_filepath: path of the params YAML file
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])
    
    def get_train_eval_config(self) -> TrainEvaluationConfig:
        """read training evaluation config file and store as 
        config entity then apply the dataclasses
        
        Returns:
            config: TrainEvaluationConfig type

        Raises:
            KeyError: if any variable in ``REQUIRED_ENV_VARS`` is not set;
                the message lists every missing name at once
        """
        data_dump_config = self.config.dump_data
        scaler_config = self.config.scale_data
        train_config = self.config.train_model
        eval_config = self.config.evaluation

        create_directories([eval_config.root_dir])

        # Report all missing variables together instead of failing on the
        # first lookup with a bare KeyError.
        missing = [name for name in self.REQUIRED_ENV_VARS if name not in os.environ]
        if missing:
            raise KeyError(
                "Missing required environment variable(s): " + ", ".join(missing)
            )

        config = TrainEvaluationConfig(
            # Wrapped in Path so the value matches the entity's declared type.
            root_dir=Path(eval_config.root_dir),
            input_train_path=Path(data_dump_config.input_train_path),
            input_test_path=Path(data_dump_config.input_test_path),
            input_valid_path=Path(data_dump_config.input_valid_path),
            output_train_path=Path(data_dump_config.output_train_path),
            output_test_path=Path(data_dump_config.output_test_path),
            output_valid_path=Path(data_dump_config.output_valid_path),
            scaled_train_path=Path(scaler_config.scaled_train_path),
            scaled_test_path=Path(scaler_config.scaled_test_path),
            scaled_valid_path=Path(scaler_config.scaled_valid_path),
            scaler_model_path=Path(scaler_config.scaler_model_path),
            model_path=Path(train_config.model_path),
            train_score_path=Path(eval_config.train_score_path),
            test_score_path=Path(eval_config.test_score_path),
            valid_score_path=Path(eval_config.valid_score_path),
            mlflow_dataset_path=Path(eval_config.mlflow_dataset_path),
            mlflow_dataset_column=eval_config.mlflow_dataset_column,
            minio_endpoint_url=os.environ['MLFLOW_S3_ENDPOINT_URL'],
            minio_access_key_id=os.environ['MINIO_ACCESS_KEY'],
            minio_secret_access_key=os.environ['MINIO_SECRET_ACCESS_KEY'],
            mlflow_tracking_uri=os.environ["MLFLOW_TRACKING_URI"],
            mlflow_exp_name=eval_config.mlflow_exp_name,
            mlflow_dataset_bucket=os.environ["PROJECT_BUCKET"],
            mlflow_run_name=eval_config.mlflow_run_name
        )

        return config

### Config in `src/MLProject/components/model_evaluation.py`

Logging is tied to a run, which is **one cycle of training and evaluating the model**.
To start logging, we have to give mlflow **a context**, which is our **current run**.

Steps:
+ Load the train, test, and validation data (raw and scaled), their target data, and the trained model.
+ Point the MLflow client in our program to our MLflow server.
+ Set experiment.
+ Set MLflow to run and start logging.

In [9]:
import boto3
import json
import joblib
import mlflow
import string
import random
import pandas as pd

from mlflow.data.pandas_dataset import PandasDataset
from mlflow.data.dataset_source import DatasetSource
from sklearn.metrics import classification_report

from MLProject import logger

class TrainEvaluation:
    """Evaluate the trained model on every split and log the result to MLflow.

    The class loads the persisted splits, scaled features, and model from the
    paths in ``config``, saves one classification report per split, uploads a
    small sample of the training data to MinIO, and records parameters,
    metrics, the dataset, the scaler, and the model in a single MLflow run.
    """

    # Maximum number of rows sampled into the dataset that is logged to MLflow.
    DATASET_SAMPLE_SIZE = 10

    def __init__(self, config: TrainEvaluationConfig):
        """Store the evaluation settings.

        Args:
            config (TrainEvaluationConfig): paths, MinIO credentials, and
                MLflow settings read by the methods of this class
        """
        self.config = config

    def get_prediction(self, model, X_input_scaled) -> pd.Series:
        """predict the input data with the model
        
        Args:
            model (Any): the machine learning model
            X_input_scaled (Any): the scaled input data
        
        Returns:
            pd.Series: prediction result, indexed 0..n-1 in the row order of
                ``X_input_scaled``
        """
        return pd.Series(model.predict(X_input_scaled))
    
    def get_report(self, y_output, y_predict, score_path, data_type='train') -> dict:
        """generate the classification report and dump the report as json
        
        Args:
            y_output (pd.Series): the actual output data
            y_predict (pd.Series): the prediction result
            score_path (Path): Path score location
            data_type (str): data type are train, test, or validation
        
        Returns:
            dict: classification report in dict format
        """
        metrics = classification_report(y_output, y_predict, output_dict=True)
        
        logger.info("Save report as json.")
        save_json(path=score_path, data=metrics)
        
        # The text form is computed separately; it is only printed, never stored.
        logger.info(f"Show the {data_type} report.")
        print(f"\n{classification_report(y_output, y_predict)}")
        
        return metrics
    
    def get_mlflow_metrics(self, metrics, data_type='train') -> dict:
        """flatten the per-class part of a classification report for MLflow

        Only per-class entries are kept. The scalar ``accuracy`` entry and the
        aggregate ``"... avg"`` rows are skipped. Classes are taken from the
        report's own keys, so labels need not be the integers 0..k-1.

        Args:
            metrics (dict): the classification report
            data_type (str): data type are train, test, or validation
        
        Returns:
            dict: ``{"<data_type>_<metric>_<label>": value}`` for every class
        """
        mlflow_metrics = {}

        for label, data_metric in metrics.items():
            is_aggregate = str(label).endswith(" avg")
            if not isinstance(data_metric, dict) or is_aggregate:
                continue
            for name, value in data_metric.items():
                mlflow_metrics[data_type + "_" + name + "_" + str(label)] = value
            
        return mlflow_metrics
    
    def get_dataset(self, X_input, y_output, y_predict) -> pd.DataFrame:
        """construct the dataset and save as dataframe and csv file
        
        Args:
            X_input (pd.DataFrame): the input data
            y_output (pd.Series): the actual output data
            y_predict (pd.Series): the prediction result, in the row order
                of ``X_input``
        
        Returns:
            pd.Dataframe: a random sample of at most ``DATASET_SAMPLE_SIZE``
                rows, restricted to ``config.mlflow_dataset_column``
        """
        train_eval_result = X_input.copy()
        # NOTE(review): y_output is still aligned on index; this assumes it
        # shares X_input's index — confirm against the data-dump step.
        train_eval_result["Class"] = y_output
        # Predictions carry a fresh 0..n-1 index (see get_prediction), so they
        # are assigned by position. Index alignment would attach them to the
        # wrong rows, or produce NaN, whenever X_input keeps another index.
        train_eval_result["Prediction"] = pd.Series(y_predict).to_numpy()

        selected = train_eval_result[self.config.mlflow_dataset_column]
        # Cap the sample so inputs with fewer rows do not raise.
        sample_size = min(self.DATASET_SAMPLE_SIZE, len(selected))
        train_eval_result = selected.sample(sample_size)
        train_eval_result.to_csv(self.config.mlflow_dataset_path, index=False)
        
        return train_eval_result
        
    def get_mlflow_dataset(self, mlflow_dataset, run_name) -> PandasDataset:
        """convert the dataset into MLflow's dataset format

        Also deletes the local CSV at ``config.mlflow_dataset_path``, so it
        must be called after the file has been uploaded.
        
        Args:
            mlflow_dataset (pd.DataFrame): the project dataset to train and the result
            run_name (str): the name of MLflow runs
        
        Returns:
            PandasDataset: the dataset in Pandas MLflow format
        """
        # The URI is passed as a string; `from_pandas` resolves it into a
        # dataset source. `DatasetSource.load` loads the data a source refers
        # to and does not build a source from a URI, so the previous call did
        # not record this location.
        source_uri = f"s3://{self.config.mlflow_dataset_bucket}/{run_name}.csv"
        dataset: PandasDataset = mlflow.data.from_pandas(
            mlflow_dataset,
            source=source_uri,
            name=f"{run_name}",
            # NOTE(review): assumes the configured column list holds the
            # target at index 1 and the prediction at index 2 — confirm.
            targets=self.config.mlflow_dataset_column[1],
            predictions=self.config.mlflow_dataset_column[2]
        )
        
        logger.info(f"Remove {self.config.mlflow_dataset_path} file from local.")
        os.remove(self.config.mlflow_dataset_path)
        
        return dataset
    
    def s3_upload_mlflow_dataset(self, run_name) -> None:
        """upload the dataset into MinIO with MLflow run_name
        
        Args:
            run_name (str): the name of MLflow runs

        Raises:
            Exception: any upload error is logged and re-raised unchanged
        """
        s3 = boto3.client('s3',
                          endpoint_url=self.config.minio_endpoint_url,
                          aws_access_key_id=self.config.minio_access_key_id,
                          aws_secret_access_key=self.config.minio_secret_access_key)
        
        try:
            s3.upload_file(
                # Passed as str so boto3 versions without path-like support also work.
                str(self.config.mlflow_dataset_path),
                self.config.mlflow_dataset_bucket, 
                f'{run_name}.csv'
            )    
        except Exception as e:
            logger.error(e)
            raise
    
    def mlflow_log_train(self) -> None:
        """perform experimentation with MLflow to evaluate the training result

        Loads the splits and the model, predicts on train/test/valid, saves a
        classification report per split, then opens one MLflow run that logs
        the model parameters, the per-class metrics of all three splits, a
        sampled training dataset, the scaler artifact, and the model itself.
        """
        logger.info(f"Load scaled data train from {self.config.scaled_train_path}.")
        X_train_scaled = joblib.load(self.config.scaled_train_path)
        X_test_scaled = joblib.load(self.config.scaled_test_path)
        X_valid_scaled = joblib.load(self.config.scaled_valid_path)
        
        logger.info(f"Load data train from {self.config.input_train_path}.")
        X_train = joblib.load(self.config.input_train_path)
        # X_test is loaded as in the original flow; only X_train and X_valid
        # are used further down.
        X_test = joblib.load(self.config.input_test_path)
        X_valid = joblib.load(self.config.input_valid_path)
        
        logger.info(f"Load data train output from {self.config.output_train_path}.")
        y_train = joblib.load(self.config.output_train_path)
        y_test = joblib.load(self.config.output_test_path)
        y_valid = joblib.load(self.config.output_valid_path)
        
        logger.info("Load the model.")
        model = joblib.load(self.config.model_path)
        
        logger.info("Predicting the data train test valid.")
        y_train_pred = self.get_prediction(model, X_train_scaled)
        y_test_pred = self.get_prediction(model, X_test_scaled)
        y_valid_pred = self.get_prediction(model, X_valid_scaled)
        
        logger.info("Generate classification report.")
        train_report = self.get_report(y_train, y_train_pred, self.config.train_score_path, "train")
        test_report = self.get_report(y_test, y_test_pred, self.config.test_score_path, "test")
        valid_report = self.get_report(y_valid, y_valid_pred, self.config.valid_score_path, "valid")
        
        logger.info("Set tracking URI.")
        mlflow.set_tracking_uri(self.config.mlflow_tracking_uri)
        
        logger.info("Set experiment name.")
        mlflow.set_experiment(self.config.mlflow_exp_name)
        
        # A random 5-character suffix keeps run names (and the uploaded CSV
        # key derived from them) distinct between executions.
        logger.info("Set run name.")
        flag = ''.join(random.choices(
            string.ascii_uppercase + string.ascii_lowercase + string.digits, 
            k=5))
        run_name = f"{self.config.mlflow_run_name}-{flag}"
        
        logger.info("Contruct report for MLflow.")
        train_metrics = self.get_mlflow_metrics(train_report, 'train')
        test_metrics = self.get_mlflow_metrics(test_report, 'test')
        valid_metrics = self.get_mlflow_metrics(valid_report, 'valid')
        
        logger.info(f"Contruct MLflow dataset file in {self.config.mlflow_dataset_path}.")
        mlflow_train_dataset = self.get_dataset(X_train, y_train, y_train_pred)

        logger.info("Contruct MLflow input example")
        sample = 10
        # NOTE(review): the example comes from the unscaled X_valid while the
        # model predicts on scaled inputs — confirm this is intended.
        input_example = X_valid[:sample]

        logger.info("Experiement tracking to evaluate model with MLflow.")
        with mlflow.start_run(run_name=run_name):
            # Upload first: get_mlflow_dataset() deletes the local CSV.
            logger.info(f"Upload {self.config.mlflow_dataset_path} file to MinIO.")
            self.s3_upload_mlflow_dataset(run_name)
            
            logger.info("Set MLflow dataset.")
            dataset = self.get_mlflow_dataset(mlflow_train_dataset, run_name)

            logger.info("Logging to MLflow as an experiment.")
            model_params = model.get_params()
            mlflow.log_params(model_params)
            mlflow.log_metrics(train_metrics)
            mlflow.log_metrics(test_metrics)
            mlflow.log_metrics(valid_metrics)
            mlflow.log_input(dataset, context="training")
            mlflow.log_artifact(self.config.scaler_model_path, "scaler")
            mlflow.sklearn.log_model(
                sk_model=model,
                artifact_path="models",
                serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_CLOUDPICKLE,
                registered_model_name="logistic_regression",
                input_example=input_example
            )
            
            mlflow.set_tags(
                {
                    "dataset": "credit card fraud",
                    "model": "logistic_regression"
                }
            )

### Evaluate the Model

In [10]:
# Build the evaluation config, then run the MLflow-tracked evaluation.
# Any failure is logged once and re-raised so the cell still fails visibly.
try:
    config = ConfigurationManager()
    eval_config = config.get_train_eval_config()
    evaluation = TrainEvaluation(config=eval_config)
    evaluation.mlflow_log_train()
except Exception as error:
    logger.error(error)
    raise

[2024-07-24 21:09:14,395: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-24 21:09:14,402: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-24 21:09:14,405: INFO: common: created directory at: artifacts]
[2024-07-24 21:09:14,407: INFO: common: created directory at: artifacts/models]
[2024-07-24 21:09:14,409: INFO: 3572388525: Load scaled data train from artifacts/preprocessing/X_train.csv.]
[2024-07-24 21:09:14,423: INFO: 3572388525: Load data train from artifacts/data/X_train.pkl.]
[2024-07-24 21:09:14,447: INFO: 3572388525: Load data train output from artifacts/data/y_train.pkl.]
[2024-07-24 21:09:14,452: INFO: 3572388525: Load the model.]
[2024-07-24 21:09:14,497: INFO: 3572388525: Predicting the data train test valid.]
[2024-07-24 21:09:14,502: INFO: 3572388525: Generate classification report.]
[2024-07-24 21:09:14,575: INFO: 3572388525: Save report as json.]
[2024-07-24 21:09:14,576: INFO: common: json file saved at: metrics/trai

EndpointConnectionError: Could not connect to the endpoint URL: "http://localhost:9000/credit-card-artifact/eval-hpo-lr-wyjVr.csv"

**Debug**: Check the dataset in MLflow and MinIO

by inspecting the last active MLflow run.

In [None]:
# Re-fetch the most recent run of this kernel from the tracking server and
# read back the first dataset that was logged as one of its inputs.
last_run_id = mlflow.last_active_run().info.run_id
run = mlflow.get_run(last_run_id)
dataset_info = run.inputs.dataset_inputs[0].dataset

print(f"Dataset name: {dataset_info.name}")
print(f"Dataset digest: {dataset_info.digest}")
print(f"Dataset profile: {dataset_info.profile}")
print(f"Dataset schema: {dataset_info.schema}")

Dataset name: eval-hpo-lr-MHHeH
Dataset digest: c3dc01be
Dataset profile: {"num_rows": 10, "num_elements": 290}
Dataset schema: {"mlflow_colspec": [{"type": "double", "name": "V1", "required": true}, {"type": "double", "name": "V2", "required": true}, {"type": "double", "name": "V3", "required": true}, {"type": "double", "name": "V4", "required": true}, {"type": "double", "name": "V5", "required": true}, {"type": "double", "name": "V6", "required": true}, {"type": "double", "name": "V7", "required": true}, {"type": "double", "name": "V8", "required": true}, {"type": "double", "name": "V9", "required": true}, {"type": "double", "name": "V10", "required": true}, {"type": "double", "name": "V11", "required": true}, {"type": "double", "name": "V12", "required": true}, {"type": "double", "name": "V13", "required": true}, {"type": "double", "name": "V14", "required": true}, {"type": "double", "name": "V15", "required": true}, {"type": "double", "name": "V16", "required": true}, {"type": "dou

In [None]:
# Fetch "<dataset name>.csv" from the project bucket in MinIO and load it
# into `df`; `dataset_info` comes from the previous cell.
try:
    config = ConfigurationManager()
    eval_config = config.get_train_eval_config()

    s3 = boto3.client(
        's3',
        endpoint_url=eval_config.minio_endpoint_url,
        aws_access_key_id=eval_config.minio_access_key_id,
        aws_secret_access_key=eval_config.minio_secret_access_key,
    )

    obj = s3.get_object(
        Bucket=eval_config.mlflow_dataset_bucket,
        Key=f"{dataset_info.name}.csv",
    )
    df = pd.read_csv(obj['Body'])
except Exception as error:
    logger.error(error)
    raise

[2024-07-22 22:26:29,206: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-22 22:26:29,210: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-22 22:26:29,214: INFO: common: created directory at: artifacts]
[2024-07-22 22:26:29,216: INFO: common: created directory at: artifacts/models]


In [None]:
# Display the dataset sample downloaded from MinIO in the previous cell.
df

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-0.032303,-0.386967,1.511421,-0.988467,0.414566,0.380053,0.435772,-0.099581,0.728787,0.305628,...,-0.225327,0.026373,0.932249,-0.298604,-0.532033,-0.374621,-0.598212,-0.034143,0.287776,12063.94
1,0.943512,-0.058707,-0.006549,0.139426,0.647875,-0.114745,0.558106,-0.164096,0.541422,-0.074425,...,-0.216519,-0.256622,-0.719603,-0.159472,-0.551657,1.048536,-0.73834,-0.119532,0.198217,21130.84
2,-0.564052,-0.404712,-0.367157,0.486011,-0.969659,1.280288,0.242152,-0.653118,-0.008776,-0.560175,...,0.442817,-0.107201,-0.302427,-3.115838,0.812421,-1.03751,0.033089,1.36195,-0.284536,3243.84
3,0.315491,0.104304,-0.335941,-0.514284,0.920041,-0.926235,1.086116,-0.191913,0.242453,0.171162,...,-0.330082,0.06226,1.481722,-0.129557,-0.250499,-1.713549,-0.957352,0.397976,0.838481,6474.84
4,1.863527,-0.963914,0.562754,-1.145705,-0.274742,0.551275,-0.0749,-0.068351,0.972102,1.059937,...,-0.302025,0.060678,1.148732,0.136445,-0.866815,-0.794387,-0.15659,-0.190612,-0.226225,7177.84
5,-0.489227,-1.396069,0.544977,-1.066563,0.101555,-0.231559,1.024158,-0.320495,0.180348,0.929183,...,-0.955429,-0.336163,-0.410506,0.087418,0.98109,-0.973555,1.645646,-0.135608,0.877584,21776.28
6,0.051572,-0.220515,0.995936,-0.207053,0.186624,0.610326,0.265219,-0.030209,0.382442,0.466535,...,0.086363,0.04692,1.065216,-0.136018,1.44664,-1.069795,0.300829,-0.051949,0.149318,3797.12
7,0.838087,-0.456157,1.63041,0.414084,0.000183,1.061429,0.211962,-0.05805,0.951568,0.68741,...,-0.350664,-0.212956,-0.354492,0.027609,0.232269,0.480217,-0.059722,-0.150388,-0.02154,6684.95
8,-1.990717,2.223395,-1.979267,1.179433,-2.302765,-2.323477,-1.885489,3.22518,-1.219738,-1.368326,...,1.333778,0.545636,-1.092073,-0.733639,0.256599,1.873922,-0.541419,2.215048,1.106875,6328.28
9,-0.752077,0.25148,-0.074932,0.209477,-0.034418,-0.244705,-0.042581,-0.667819,1.583562,0.701444,...,-0.488894,0.516356,-0.02458,-0.350563,1.003687,-0.596477,-1.499484,-2.621421,-1.500549,22538.72


In [None]:
# Column dtypes and non-null counts of the downloaded sample.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 29 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      10 non-null     float64
 1   V2      10 non-null     float64
 2   V3      10 non-null     float64
 3   V4      10 non-null     float64
 4   V5      10 non-null     float64
 5   V6      10 non-null     float64
 6   V7      10 non-null     float64
 7   V8      10 non-null     float64
 8   V9      10 non-null     float64
 9   V10     10 non-null     float64
 10  V11     10 non-null     float64
 11  V12     10 non-null     float64
 12  V13     10 non-null     float64
 13  V14     10 non-null     float64
 14  V15     10 non-null     float64
 15  V16     10 non-null     float64
 16  V17     10 non-null     float64
 17  V18     10 non-null     float64
 18  V19     10 non-null     float64
 19  V20     10 non-null     float64
 20  V21     10 non-null     float64
 21  V22     10 non-null     float64
 22  V23  

In [11]:
import os
import boto3
from botocore.exceptions import NoCredentialsError, PartialCredentialsError, EndpointConnectionError

# Minio credentials and endpoint
# Probe the endpoint the pipeline itself is configured with
# (MLFLOW_S3_ENDPOINT_URL); fall back to the local default when it is unset.
MINIO_ENDPOINT = os.environ.get("MLFLOW_S3_ENDPOINT_URL", "http://localhost:9000")
MINIO_ACCESS_KEY = os.environ['MINIO_ROOT_USER']
MINIO_SECRET_KEY = os.environ['MINIO_ROOT_PASSWORD']

# Create a boto3 client for S3
s3_client = boto3.client(
    's3',
    endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=MINIO_ACCESS_KEY,
    aws_secret_access_key=MINIO_SECRET_KEY,
    # Derived from the URL scheme, so no manual edit is needed for HTTPS.
    use_ssl=MINIO_ENDPOINT.startswith("https://")
)

def check_minio_connection():
    """Print whether the module-level ``s3_client`` can reach MinIO.

    On success the bucket names are listed. Every failure is reported with a
    printed message rather than raised, so the function always returns None.
    """
    # Checked in order; the first matching type decides the message.
    known_failures = (
        (NoCredentialsError, "Credentials not available."),
        (PartialCredentialsError, "Incomplete credentials provided."),
        (EndpointConnectionError, "Could not connect to the Minio endpoint."),
    )

    try:
        # Listing buckets is the cheapest call that proves the connection works.
        listing = s3_client.list_buckets()
        print("Connection to Minio successful. Buckets:")
        for entry in listing['Buckets']:
            print(f'  - {entry["Name"]}')
    except Exception as error:
        for error_type, message in known_failures:
            if isinstance(error, error_type):
                print(message)
                break
        else:
            print(f"An error occurred: {error}")


In [14]:
import boto3
from botocore.client import Config

# Create a session using MinIO credentials
session = boto3.session.Session()

# Create an S3 client with MinIO endpoint
# Reuses MINIO_ENDPOINT from the cell above instead of repeating the URL,
# so both connection checks always probe the same server.
s3 = session.client('s3',
                    endpoint_url=MINIO_ENDPOINT,
                    aws_access_key_id=MINIO_ACCESS_KEY,
                    aws_secret_access_key=MINIO_SECRET_KEY,
                    config=Config(signature_version='s3v4'))

# List buckets to test connection
try:
    response = s3.list_buckets()
    print("Buckets:", response['Buckets'])
except Exception as e:
    print("Error:", e)


Error: Could not connect to the endpoint URL: "http://localhost:9000/"


In [12]:
# Run the connectivity check defined above; it prints the outcome.
check_minio_connection()

Could not connect to the Minio endpoint.


In [None]:
# Show the boto3 version reported by `pip freeze`.
!pip freeze | grep boto3

boto3==1.34.117


In [None]:
# Probe MinIO's readiness endpoint directly with curl, bypassing boto3.
!curl -X GET http://localhost:9000/minio/health/ready

curl: (7) Failed to connect to localhost port 9000 after 0 ms: Connection refused


In [None]:
# Check whether anything answers on port 3000.
# NOTE(review): the service expected on this port is not identified in this notebook — confirm.
!curl http://localhost:3000

curl: (7) Failed to connect to localhost port 3000 after 0 ms: Connection refused
