1. 抽象化（Abstraction）
    - 數據準備: 讀取和準備紅酒品質數據集，並將其分為訓練集和測試集。
    - 模型選擇: 選擇了ElasticNet作為主模型，並用DummyRegressor作為基線模型進行比較。
2. 模式識別（Pattern Recognition）
    - MLflow Workflow: 標準工作流程：設置實驗、開始運行、訓練模型、記錄參數、記錄評估指標，並將模型保存為artifact。
3. 算法思維（Algorithmic Thinking）
    - 模型訓練和預測: 使用ElasticNet進行模型訓練並進行預測。
    - 評估指標: 計算和記錄了RMSE、MAE和R^2等指標來評估模型性能。
4. 分解（Decomposition）
    - 數據處理: 將數據集分解為特徵（train_x、test_x）和目標（train_y、test_y）。
    - 模型保存和加載: 使用joblib將訓練好的模型保存下來，並使用自定義的PythonModel類在MLflow中記錄模型。
5. 評估（Evaluation）
    - 模型性能: 通過計算和比較主模型和基線模型的性能指標（如RMSE、MAE）來評估模型的有效性。
    - MLflow評估: 使用mlflow.evaluate進行更複雜的性能評估，包括自定義指標和視覺化。
6. 通用性（Generalization）
    - MLflow的通用性: 使用MLflow進行機器學習模型管理的通用方法，這可以應用於不同的模型和數據集。

In [None]:
import warnings
import argparse
import logging
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
import mlflow
import mlflow.sklearn
from pathlib import Path
import os
from mlflow.models.signature import ModelSignature, infer_signature
from mlflow.types.schema import Schema, ColSpec
import sklearn
import joblib
import cloudpickle
from mlflow.models import make_metric
import matplotlib.pyplot as plt
from sklearn.dummy import DummyRegressor
from mlflow.models import MetricThreshold

## data process

In [None]:
class DataProcessor:
    """Load a CSV dataset and split it into train/test features and target.

    The methods are meant to be called in order: ``load_data()``, then
    ``split_data()``, then ``get_train_test_data()``.

    Args:
        filepath: Path handed to ``pandas.read_csv``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.
        target_col: Name of the column used as the prediction target.
            Defaults to ``"quality"``.
    """

    def __init__(self, filepath, test_size=0.25, random_state=40, target_col="quality"):
        self.filepath = filepath
        self.test_size = test_size
        self.random_state = random_state
        self.target_col = target_col
        # Populated by load_data() / split_data(); None until then.
        self.data = None
        self.train = None
        self.test = None

    def load_data(self):
        """Read the CSV at ``self.filepath`` into ``self.data``."""
        self.data = pd.read_csv(self.filepath)

    def split_data(self):
        """Split ``self.data`` into ``self.train`` and ``self.test``.

        Raises:
            RuntimeError: If ``load_data()`` has not been called yet.
        """
        if self.data is None:
            raise RuntimeError("load_data() must be called before split_data()")
        self.train, self.test = train_test_split(
            self.data,
            test_size=self.test_size,
            random_state=self.random_state,
        )

    def get_train_test_data(self):
        """Return ``(train_x, train_y, test_x, test_y)``.

        The feature frames are the splits without the target column; the
        target frames are single-column DataFrames holding only the target.

        Raises:
            RuntimeError: If ``split_data()`` has not been called yet.
        """
        if self.train is None or self.test is None:
            raise RuntimeError(
                "split_data() must be called before get_train_test_data()"
            )
        target = self.target_col
        train_x = self.train.drop([target], axis=1)
        test_x = self.test.drop([target], axis=1)
        # Double brackets keep the targets as DataFrames rather than Series.
        train_y = self.train[[target]]
        test_y = self.test[[target]]
        return train_x, train_y, test_x, test_y


## model training and validation

In [None]:
class ModelTrainer:
    """Train an ElasticNet model next to a DummyRegressor baseline.

    Args:
        alpha: Forwarded to ``ElasticNet`` as ``alpha``.
        l1_ratio: Forwarded to ``ElasticNet`` as ``l1_ratio``.
    """

    def __init__(self, alpha=0.9, l1_ratio=0.9):
        # The main model uses a fixed random_state of 42; the baseline is a
        # DummyRegressor built with its default arguments.
        self.model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        self.baseline_model = DummyRegressor()

    def train(self, train_x, train_y):
        """Fit the main model, then the baseline, on the same training data."""
        for estimator in (self.model, self.baseline_model):
            estimator.fit(train_x, train_y)

    def predict(self, test_x):
        """Return ``(main_predictions, baseline_predictions)`` for ``test_x``."""
        main_pred = self.model.predict(test_x)
        baseline_pred = self.baseline_model.predict(test_x)
        return main_pred, baseline_pred

    def evaluate(self, actual, pred):
        """Return ``(rmse, mae, r2)`` comparing ``pred`` against ``actual``."""
        mse = mean_squared_error(actual, pred)
        return (
            np.sqrt(mse),
            mean_absolute_error(actual, pred),
            r2_score(actual, pred),
        )


## Custom model package

In [None]:
class SklearnWrapper(mlflow.pyfunc.PythonModel):
    """MLflow ``PythonModel`` that serves a joblib-serialized model.

    Args:
        artifacts_name: Key looked up in ``context.artifacts`` to locate the
            serialized model file.
    """

    def __init__(self, artifacts_name):
        self.artifacts_name = artifacts_name

    def load_context(self, context):
        """Load the model stored under ``self.artifacts_name`` with joblib."""
        model_path = context.artifacts[self.artifacts_name]
        self.sklearn_model = joblib.load(model_path)

    def predict(self, context, model_input):
        """Return the wrapped model's predictions for ``model_input``.

        ``model_input`` must expose ``.values``; that value, rather than the
        original object, is what the wrapped model receives.
        """
        features = model_input.values
        return self.sklearn_model.predict(features)

## MLflow manager

In [None]:
class MLflowManager:
    """Convenience wrapper around the MLflow fluent tracking API.

    Args:
        experiment_name: Name passed to ``mlflow.set_experiment``.
        model: Main model; stored on the instance as ``self.model``.
        baseline_model: Baseline model; stored as ``self.baseline_model``.

    Attributes:
        conda_env: Conda environment description used as the default
            environment when logging a pyfunc model.
    """

    # Must stay a string: the float literal 3.10 formats as "3.1", which
    # would pin the wrong interpreter version in the conda environment.
    PYTHON_VERSION = "3.10"

    def __init__(self, experiment_name, model, baseline_model):
        self.experiment_name = experiment_name
        self.model = model
        self.baseline_model = baseline_model

        # Pin the library versions of the current process so the logged
        # environment matches the one the model was trained in.
        self.conda_env = {
            "channels": ["defaults"],
            "dependencies": [
                f"python={self.PYTHON_VERSION}",
                "pip",
                {
                    "pip": [
                        f"mlflow=={mlflow.__version__}",
                        f"scikit-learn=={sklearn.__version__}",
                        f"cloudpickle=={cloudpickle.__version__}",
                    ],
                },
            ],
            "name": "sklearn_env",
        }

    def start_experiment(self):
        """Set the active experiment, start a run, and print experiment info."""
        exp = mlflow.set_experiment(experiment_name=self.experiment_name)
        mlflow.start_run()
        print("Name: {}".format(exp.name))
        print("Experiment_id: {}".format(exp.experiment_id))
        print("Artifact Location: {}".format(exp.artifact_location))
        print("Tags: {}".format(exp.tags))
        print("Lifecycle_stage: {}".format(exp.lifecycle_stage))
        print("Creation timestamp: {}".format(exp.creation_time))

    def set_tracking_uri(self, uri=""):
        """Choose where MLflow stores run data, then print the effective URI.

        Args:
            uri: Tracking URI handed to ``mlflow.set_tracking_uri``. A local
                directory can be given as a full path with the ``file:``
                prefix, e.g. ``file:C:\\path\\to\\mytracks``.
        """
        mlflow.set_tracking_uri(uri)
        print("The set tracking uri is ", mlflow.get_tracking_uri())

    def set_tags(self, tags):
        """Attach the ``tags`` mapping (key -> value) via ``mlflow.set_tags``."""
        mlflow.set_tags(tags)

    def set_log_params_metrics(self, params, metrics):
        """Log a mapping of parameters and a mapping of metrics."""
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)

    def save_model(self, model, model_instance, model_name):
        """Dump the model to disk with joblib and log it as an sklearn model.

        Args:
            model: Fitted estimator to persist.
            model_instance: File path the model is written to by
                ``joblib.dump`` (a path, despite the parameter name).
            model_name: Second argument to ``mlflow.sklearn.log_model``,
                naming the logged model.
        """
        joblib.dump(model, model_instance)
        mlflow.sklearn.log_model(model, model_name)

    def set_pyfunc_model(self, model_instance, model_name, artifacts, conda_env=None):
        """Log a ``SklearnWrapper`` pyfunc model.

        Args:
            model_instance: Unused; kept so existing call sites keep working.
            model_name: Used both as the ``artifact_path`` and as the key the
                wrapper looks up in ``artifacts`` to find the model file.
            artifacts: Mapping of artifact name to local path; it must
                contain ``model_name``.
            conda_env: Environment to log with the model. Falls back to
                ``self.conda_env`` when ``None``.
        """
        # Honor an explicitly supplied environment instead of silently
        # replacing it with the instance default.
        env = self.conda_env if conda_env is None else conda_env
        # NOTE(review): code_path is hard-coded to "main.py" - confirm that
        # file exists relative to the working directory.
        mlflow.pyfunc.log_model(
            artifact_path=model_name,
            python_model=SklearnWrapper(model_name),
            artifacts=artifacts,
            code_path=["main.py"],
            conda_env=env,
        )

    def end_experiment(self):
        """End the currently active run."""
        mlflow.end_run()


## main()

In [43]:
def main():
    """Train ElasticNet and a baseline on the red-wine data and log to MLflow.

    Steps: load and split the CSV, start an MLflow run, fit both models,
    print and log their RMSE/MAE/R2, then save and log both models. The run
    is always closed: with status ``FAILED`` if any step raises, otherwise
    through ``MLflowManager.end_experiment``.
    """
    # Data preparation
    data_processor = DataProcessor("data/red-wine-quality.csv")
    data_processor.load_data()
    data_processor.split_data()
    train_x, train_y, test_x, test_y = data_processor.get_train_test_data()

    # Model creation
    alpha = 0.9
    l1_ratio = 0.9
    model_trainer = ModelTrainer(alpha=alpha, l1_ratio=l1_ratio)

    # MLflow setup
    mlflow_manager = MLflowManager(
        "experiment_custom_metrics",
        model_trainer.model,
        model_trainer.baseline_model,
    )
    mlflow_manager.set_tracking_uri()  # where tracking data is stored (default URI)
    mlflow_manager.start_experiment()  # select the experiment and start a run

    try:
        # Training and evaluation
        model_trainer.train(train_x, train_y)
        predicted, baseline_predicted = model_trainer.predict(test_x)
        rmse, mae, r2 = model_trainer.evaluate(test_y, predicted)
        base_rmse, base_mae, base_r2 = model_trainer.evaluate(
            test_y, baseline_predicted
        )

        print(f"compare_model: RMSE: {rmse}  MAE: {mae}  R2: {r2}")
        print(f"base_model: RMSE: {base_rmse}  MAE: {base_mae}  R2: {base_r2}")

        # Run-level tags
        tags = {
            "engineering": "ML platform",
            "release.candidate": "RC1",
            "release.version": "2.0",
        }

        # Hyperparameters to record
        log_params = {
            "alpha": alpha,
            "l1_ratio": l1_ratio,
        }

        # Metrics of the main model and of the baseline
        metrics_params = {
            "rmse": rmse,
            "mae": mae,
            "r2": r2,
        }
        base_metrics_params = {
            "baseline rmse": base_rmse,
            "baseline mae": base_mae,
            "baseline r2": base_r2,
        }

        mlflow_manager.set_tags(tags)

        # NOTE(review): autolog is enabled after both models are already
        # fitted, so it presumably records nothing for this run - confirm
        # whether it should move before train().
        mlflow.sklearn.autolog(
            log_input_examples=False,
            log_model_signatures=False,
            log_models=False,
        )

        # Log the parameters once, together with both metric sets.
        mlflow_manager.set_log_params_metrics(
            log_params,
            {**metrics_params, **base_metrics_params},
        )

        # Persist and log both models
        model_instance = "sklearn_model.pkl"
        model_name = "ElasticNetModel"
        base_model_instance = "baseline_sklearn_model.pkl"
        base_model_name = "BaseModel"

        mlflow_manager.save_model(model_trainer.model, model_instance, model_name)
        mlflow_manager.save_model(
            model_trainer.baseline_model, base_model_instance, base_model_name
        )

        # Nothing in this function writes to this directory; presumably it
        # is reserved for pyfunc artifacts - confirm before removing.
        data_dir = 'red-wine-data'
        os.makedirs(data_dir, exist_ok=True)
    except BaseException:
        # Close the run as failed so no active run is left behind (a leftover
        # run would block the next start_run call in the same session).
        mlflow.end_run(status="FAILED")
        raise
    else:
        mlflow_manager.end_experiment()

if __name__ == "__main__":
    # Seed NumPy's global RNG and silence all warnings before the pipeline runs.
    np.random.seed(40)
    warnings.filterwarnings("ignore")

    main()




The set tracking uri is  


Name: experiment_custom_metrics
Experiment_id: 370280449364329886
Artifact Location: file:///C:/Users/xdxd2/Sunny_VS_worksapce/Sunny_python/ML/MLOps_fundamentals/MLflow/basic/mlruns/370280449364329886
Tags: {}
Lifecycle_stage: active
Creation timestamp: 1706675300388
compare_model: RMSE: 0.8312296853893981  MAE: 0.6673520215793272  R2: 0.02101549378688994
base_model: RMSE: 0.8426394848604892  MAE: 0.7025396163469558  R2: -0.006044846368814971
