In [2]:
import warnings
import logging
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, Ridge, Lasso
import mlflow
import mlflow.sklearn
from pathlib import Path
import os


In [3]:
#process log tracking
logging.basicConfig(level=logging.DEBUG,
                    filename='./logfile.log',
                    filemode='w', # 'w' 表示寫模式, 'a' 表示追加模式, 'w' 表示如果文件已存在，先将其清空。如果你想在不清空现有日志的情况下向文件追加日志，可以使用 'a' 模式。
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

logger = logging.getLogger(__name__)


In [4]:
#evaluation function
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2


## RAW data preparation and split data for training

In [5]:
warnings.filterwarnings("ignore")
np.random.seed(40)

# Read the wine-quality csv file from local
logger.debug("raw data ingestion")
data = pd.read_csv("data/red-wine-quality.csv")
data.to_csv("data/red-wine-quality.csv", index=False)

# Split the data into training and test sets. (0.75, 0.25) split.
logger.debug("data prep for training and testing")
train, test = train_test_split(data)
train.to_csv("data/train.csv")
test.to_csv("data/test.csv")

# The predicted column is "quality" which is a scalar from [3, 9]
train_x = train.drop(["quality"], axis=1)
test_x = test.drop(["quality"], axis=1)
train_y = train[["quality"]]
test_y = test[["quality"]]



In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            1599 non-null   int64  
 1   fixed acidity         1599 non-null   float64
 2   volatile acidity      1599 non-null   float64
 3   citric acid           1599 non-null   float64
 4   residual sugar        1599 non-null   float64
 5   chlorides             1599 non-null   float64
 6   free sulfur dioxide   1599 non-null   float64
 7   total sulfur dioxide  1599 non-null   float64
 8   density               1599 non-null   float64
 9   pH                    1599 non-null   float64
 10  sulphates             1599 non-null   float64
 11  alcohol               1599 non-null   float64
 12  quality               1599 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 162.5 KB


## Model training and prediction


### tracking uri


In [6]:
# set tracking folder
mlflow.set_tracking_uri(uri="")

# 全路徑寫法 file:xxxx
# mlflow.set_tracking_uri(uri=r"file:C:\Users\xdxd2\Sunny_VS_worksapce\Sunny_python\ML\mytracks")

print("The set tracking uri is ", mlflow.get_tracking_uri())



The set tracking uri is  


## multiple exps and runs

application field

1. hyperparameter tuning
2. incremental training
3. model checkpoint
4. feature engineering
5. cross validation


In [8]:
model_list = ["ElasticNet", "Ridge", "Lasso"]


In [9]:
for model in model_list:
    for i in range(1,4):

        exp = mlflow.set_experiment( experiment_name= model)

        print(f"Name: {exp.name}")
        print(f"Experiment_id: {exp.experiment_id}")


        mlflow.start_run(run_name=f"run_{i}")

        # hyper parameter tuning
        alpha = 0.3 * i
        l1_ratio = 0.3 * i

        # add exp tags
        tags = {
        "engineering": "ML platform",
        "release.candidate": "RC1",
        "release.version":"2.0"
        }
        mlflow.set_tags(tags)

        # check run id
        current_run = mlflow.active_run()
        print(f"active run id is {current_run.info.run_id}")
        print(f"active run name is {current_run.info.run_name}")

        if model == "ElasticNet":
            lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=62)
            lr.fit(train_x, train_y)
            print(f"{model} model (alpha={alpha}, l1_ratio={l1_ratio})")

            #log parameters
            params = {
                "alpha" : alpha,
                "l1_ratio": l1_ratio
            }
            mlflow.log_params(params)

        elif model == "Ridge":
            lr = Ridge(alpha=alpha, random_state=62)
            lr.fit(train_x, train_y)    
            print(f"{model} model (alpha={alpha})")

            #log parameters
            params = {
                "alpha" : alpha
            }
            mlflow.log_params(params)

        elif model == "Lasso":
            lr = Lasso(alpha=alpha, random_state=62)
            lr.fit(train_x, train_y)     
            print(f"{model} model (l1_ratio={alpha})") 

            #log parameters
            params = {
                "alpha" : alpha
            }
            mlflow.log_params(params)

        else:
            pass

        predicted_qualities = lr.predict(test_x)
        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)


        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)


        #log metrics
        metrics = {
            "rmse":rmse,
            "r2":r2,
            "mae":mae
        }
        mlflow.log_metrics(metrics)

        #log model
        mlflow.sklearn.log_model(lr, "ML-lr-tunning_knob-matrix")
        mlflow.log_artifacts("data/")

        
        artifact_uri = mlflow.get_artifact_uri()
        print(f"artifact uri: {artifact_uri}")
        print("--" * 20)

        mlflow.end_run()
        

run = mlflow.last_active_run()
print(f"active run id is {run.info.run_id}")
print(f"active run name is {run.info.run_name}")


2024/01/29 16:39:30 INFO mlflow.tracking.fluent: Experiment with name 'ElasticNet' does not exist. Creating a new experiment.


Name: ElasticNet
Experiment_id: 686166054761385735
active run id is 58d1bee11aac4eb99e27eee8ee153c2b
active run name is run_1
ElasticNet model (alpha=0.3, l1_ratio=0.3)
  RMSE: 0.7442929001520973
  MAE: 0.5763000946156918
  R2: 0.21508707276848893
artifact uri: file:///C:/Users/xdxd2/Sunny_VS_worksapce/Sunny_python/ML/MLOps_fundamentals/MLflow/basic/mlruns/686166054761385735/58d1bee11aac4eb99e27eee8ee153c2b/artifacts
----------------------------------------
Name: ElasticNet
Experiment_id: 686166054761385735
active run id is 4fea0c0d4525400c8f9509b1849d0abc
active run name is run_2
ElasticNet model (alpha=0.6, l1_ratio=0.6)
  RMSE: 0.8307275887203359
  MAE: 0.6623342053053785
  R2: 0.02219782981672136
artifact uri: file:///C:/Users/xdxd2/Sunny_VS_worksapce/Sunny_python/ML/MLOps_fundamentals/MLflow/basic/mlruns/686166054761385735/4fea0c0d4525400c8f9509b1849d0abc/artifacts
----------------------------------------
Name: ElasticNet
Experiment_id: 686166054761385735
active run id is 47f937dd

2024/01/29 16:39:47 INFO mlflow.tracking.fluent: Experiment with name 'Ridge' does not exist. Creating a new experiment.


artifact uri: file:///C:/Users/xdxd2/Sunny_VS_worksapce/Sunny_python/ML/MLOps_fundamentals/MLflow/basic/mlruns/686166054761385735/47f937dd4ce74b32a53f613c6962c347/artifacts
----------------------------------------
Name: Ridge
Experiment_id: 546786012574735131
active run id is 0995bd758df449c7a00edc9ad55f3bf2
active run name is run_1
Ridge model (alpha=0.3)
  RMSE: 0.661419687997459
  MAE: 0.5101467525798572
  R2: 0.3801480589838746
artifact uri: file:///C:/Users/xdxd2/Sunny_VS_worksapce/Sunny_python/ML/MLOps_fundamentals/MLflow/basic/mlruns/546786012574735131/0995bd758df449c7a00edc9ad55f3bf2/artifacts
----------------------------------------
Name: Ridge
Experiment_id: 546786012574735131
active run id is 0568a6cc4752443bb6c412d9b8f5212e
active run name is run_2
Ridge model (alpha=0.6)
  RMSE: 0.6618772171205209
  MAE: 0.5103134649089909
  R2: 0.3792902121168855
artifact uri: file:///C:/Users/xdxd2/Sunny_VS_worksapce/Sunny_python/ML/MLOps_fundamentals/MLflow/basic/mlruns/5467860125747351

2024/01/29 16:40:01 INFO mlflow.tracking.fluent: Experiment with name 'Lasso' does not exist. Creating a new experiment.


artifact uri: file:///C:/Users/xdxd2/Sunny_VS_worksapce/Sunny_python/ML/MLOps_fundamentals/MLflow/basic/mlruns/546786012574735131/eae5c38aa82f4264937a50de7f557d7a/artifacts
----------------------------------------
Name: Lasso
Experiment_id: 936210546216507297
active run id is be452396ed27457090bed3a9a3a8ceaa
active run name is run_1
Lasso model (l1_ratio=0.3)
  RMSE: 0.8061985902117953
  MAE: 0.6403263978999106
  R2: 0.0790887083567905
artifact uri: file:///C:/Users/xdxd2/Sunny_VS_worksapce/Sunny_python/ML/MLOps_fundamentals/MLflow/basic/mlruns/936210546216507297/be452396ed27457090bed3a9a3a8ceaa/artifacts
----------------------------------------
Name: Lasso
Experiment_id: 936210546216507297
active run id is 0f9215f2c2404efcbfd58f9919f90dca
active run name is run_2
Lasso model (l1_ratio=0.6)
  RMSE: 0.8314405470350451
  MAE: 0.6658128647027597
  R2: 0.020518744282618573
artifact uri: file:///C:/Users/xdxd2/Sunny_VS_worksapce/Sunny_python/ML/MLOps_fundamentals/MLflow/basic/mlruns/9362105

## EXP -1 
- alpha_values = [0.01, 0.1, 1, 10]
- l1_ratio_values = [0.1, 0.5, 0.9]

In [12]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
import numpy as np


warnings.filterwarnings("ignore")
np.random.seed(40)

# Read the wine-quality csv file from local
df = pd.read_csv("data/red-wine-quality.csv")
X = df.drop('quality', axis=1)
y = df['quality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# 設定實驗
mlflow.set_experiment("wine_quality_prediction_v2")

alpha_values = [0.01, 0.1, 1, 10]
l1_ratio_values = [0.1, 0.5, 0.9]

i = 0

for alpha in alpha_values:
    for l1_ratio in l1_ratio_values:
        with mlflow.start_run(run_name = f"run-{i}_alpha_{alpha}_l1_ratio_{l1_ratio}"):
            # 設定tags和run name
            mlflow.set_tag("model", "ElasticNet")
            mlflow.set_tag("run_name", f"alpha_{alpha}_l1_{l1_ratio}")

            # 訓練模型
            model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
            model.fit(X_train, y_train)

            # 預測並計算指標
            predictions = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, predictions))
            mae = mean_absolute_error(y_test, predictions)
            r2 = r2_score(y_test, predictions)


            # 記錄實驗參數和指標
            mlflow.log_param("alpha", alpha)
            mlflow.log_param("l1_ratio", l1_ratio)
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("mae", mae)
            mlflow.log_metric("r2", r2)

            # 記錄模型
            mlflow.sklearn.log_model(model, "model")

            # 上傳任何額外的資料件
            mlflow.log_artifacts("data/")
            artifact_uri = mlflow.get_artifact_uri()
            print(f"artifact uri: {artifact_uri}")

            # 獲取實驗ID和運行名稱
            current_run = mlflow.active_run()
            experiment_id = current_run.info.experiment_id
            run_id = current_run.info.run_id
            run_name = current_run.info.run_name
            print(f"Experiment ID: {experiment_id}, Run ID: {run_id},Run Name: {run_name}, run-{i}")
            print(f"RMSE = {rmse}, MAE = {mae}, R2 = {r2}")
            print("--" * 20)
            i += 1


run = mlflow.last_active_run()
print(f"active run id is {run.info.run_id}")
print(f"active run name is {run.info.run_name}")


# 訪問MLflow UI來查看和比較所有運行

2024/01/29 17:56:05 INFO mlflow.tracking.fluent: Experiment with name 'wine_quality_prediction_v2' does not exist. Creating a new experiment.


artifact uri: file:///C:/Users/xdxd2/Sunny_VS_worksapce/Sunny_python/ML/MLOps_fundamentals/MLflow/basic/mlruns/100994538763612379/42329ad2b7dd4678b98d4d2baaf3936f/artifacts
Experiment ID: 100994538763612379, Run ID: 42329ad2b7dd4678b98d4d2baaf3936f,Run Name: run-0_alpha_0.01_l1_ratio_0.1, run-0
RMSE = 0.6391318094321226, MAE = 0.5151371402062789, R2 = 0.37492607913469955
----------------------------------------
artifact uri: file:///C:/Users/xdxd2/Sunny_VS_worksapce/Sunny_python/ML/MLOps_fundamentals/MLflow/basic/mlruns/100994538763612379/aa1b698a584741a4bb1edfe2dc60a35c/artifacts
Experiment ID: 100994538763612379, Run ID: aa1b698a584741a4bb1edfe2dc60a35c,Run Name: run-1_alpha_0.01_l1_ratio_0.5, run-1
RMSE = 0.6426585655547938, MAE = 0.5177576535521613, R2 = 0.3680086781839418
----------------------------------------
artifact uri: file:///C:/Users/xdxd2/Sunny_VS_worksapce/Sunny_python/ML/MLOps_fundamentals/MLflow/basic/mlruns/100994538763612379/eeb1812854854473bbafef0d24224ee8/artifact

## EXP -2 
- alpha_values = [0.001, 0.005, 0.01, 0.05]
- l1_ratio_values = [0.1, 0.3, 0.5, 0.7, 0.9]

In [16]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
import numpy as np


warnings.filterwarnings("ignore")
np.random.seed(40)

# Read the wine-quality csv file from local
df = pd.read_csv("data/red-wine-quality.csv")
X = df.drop('quality', axis=1)
y = df['quality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 設定MLflow實驗
mlflow.set_experiment("optimized_wine_quality_prediction")

# 調整參數範圍
alpha_values = [0.001, 0.005, 0.01, 0.05]
l1_ratio_values = [0.1, 0.3, 0.5, 0.7, 0.9]

i = 0

for alpha in alpha_values:
    for l1_ratio in l1_ratio_values:
        with mlflow.start_run(run_name = f"run-{i}_alpha_{alpha}_l1_ratio_{l1_ratio}"):
            # 設定tags和run name
            mlflow.set_tag("model", "ElasticNet")
            mlflow.set_tag("run_name", f"alpha_{alpha}_l1_{l1_ratio}")

            # 訓練模型
            model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
            model.fit(X_train, y_train)

            # 預測並計算指標
            predictions = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, predictions))
            mae = mean_absolute_error(y_test, predictions)
            r2 = r2_score(y_test, predictions)


            # 記錄實驗參數和指標
            mlflow.log_param("alpha", alpha)
            mlflow.log_param("l1_ratio", l1_ratio)
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("mae", mae)
            mlflow.log_metric("r2", r2)

            # 記錄模型
            mlflow.sklearn.log_model(model, "model")

            # 上傳任何額外的資料件
            mlflow.log_artifacts("data/")
            artifact_uri = mlflow.get_artifact_uri()
            print(f"artifact uri: {artifact_uri}")

            # 獲取實驗ID和運行名稱
            current_run = mlflow.active_run()
            experiment_id = current_run.info.experiment_id
            run_id = current_run.info.run_id
            run_name = current_run.info.run_name
            print(f"Experiment ID: {experiment_id}, Run ID: {run_id},Run Name: {run_name}, run-{i}")
            print(f"RMSE = {rmse}, MAE = {mae}, R2 = {r2}")
            print("--" * 20)
            i += 1


run = mlflow.last_active_run()
print(f"active run id is {run.info.run_id}")
print(f"active run name is {run.info.run_name}")

artifact uri: file:///C:/Users/xdxd2/Sunny_VS_worksapce/Sunny_python/ML/MLOps_fundamentals/MLflow/basic/mlruns/489810883218037476/ed1203f1060e4ac3b533212ff2317781/artifacts
Experiment ID: 489810883218037476, Run ID: ed1203f1060e4ac3b533212ff2317781,Run Name: run-0_alpha_0.001_l1_ratio_0.1, run-0
RMSE = 0.6259603066121296, MAE = 0.5050468510983732, R2 = 0.4004241892663577
----------------------------------------
artifact uri: file:///C:/Users/xdxd2/Sunny_VS_worksapce/Sunny_python/ML/MLOps_fundamentals/MLflow/basic/mlruns/489810883218037476/c8185011ce534a17be79e4a298647ad1/artifacts
Experiment ID: 489810883218037476, Run ID: c8185011ce534a17be79e4a298647ad1,Run Name: run-1_alpha_0.001_l1_ratio_0.3, run-1
RMSE = 0.6260900285833417, MAE = 0.5051649445208537, R2 = 0.4001756552475598
----------------------------------------
artifact uri: file:///C:/Users/xdxd2/Sunny_VS_worksapce/Sunny_python/ML/MLOps_fundamentals/MLflow/basic/mlruns/489810883218037476/a4c1562ed9764a65bb8f5cfb6458cc32/artifac

## EXP -3 (BayesSearch) 
- n_iter_values = [10, 20, 30]
- cv_values = [3, 5, 10]

In [19]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from skopt import BayesSearchCV
from skopt.space import Real
import pandas as pd
import numpy as np

warnings.filterwarnings("ignore")
np.random.seed(40)

# Read the wine-quality csv file from local
df = pd.read_csv("data/red-wine-quality.csv")
X = df.drop('quality', axis=1)
y = df['quality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 设置 MLflow 实验
mlflow.set_experiment("bayesian_optimization_n_iter_cv_experiment")

# 設定貝葉斯搜索的參數範圍
search_spaces = {
    'alpha': Real(1e-3, 1e+1, prior='log-uniform'),
    'l1_ratio': Real(0.01, 1.0, prior='uniform')
}

# 要测试的 n_iter 和 cv 组合
n_iter_values = [10, 20, 30]
cv_values = [3, 5, 10]

i = 0
for n_iter in n_iter_values:
    for cv in cv_values:
        with mlflow.start_run(run_name = f"run-{i}_n_iter_{n_iter}_cv_values_{cv}"):
            # 使用贝叶斯优化的ElasticNet模型
            model = ElasticNet(random_state=42)
            opt = BayesSearchCV(
                model,
                search_spaces,
                n_iter=n_iter,
                scoring='neg_mean_squared_error',
                cv=cv,
                n_jobs=-1,
                random_state=42
            )

            # 进行贝叶斯优化
            opt.fit(X_train, y_train)

            # 最佳参数和得分
            best_params = opt.best_params_
            best_score = -opt.best_score_

            # 在测试集上评估模型
            best_model = opt.best_estimator_
            predictions = best_model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, predictions))
            mae = mean_absolute_error(y_test, predictions)
            r2 = r2_score(y_test, predictions)

            # 记录到MLflow
            mlflow.log_param("n_iter", n_iter)
            mlflow.log_param("cv", cv)
            mlflow.log_params(best_params)
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("mae", mae)
            mlflow.log_metric("r2", r2)
            mlflow.sklearn.log_model(best_model, "model")

            # 獲取實驗ID和運行名稱
            current_run = mlflow.active_run()
            experiment_id = current_run.info.experiment_id
            run_id = current_run.info.run_id
            run_name = current_run.info.run_name
            print(f"Experiment ID: {experiment_id}, Run ID: {run_id},Run Name: {run_name}, run-{i}")
            print(f"RMSE = {rmse}, MAE = {mae}, R2 = {r2}")
            print("--" * 20)

            i += 1

run = mlflow.last_active_run()
print(f"active run id is {run.info.run_id}")
print(f"active run name is {run.info.run_name}") 



Experiment ID: 182028557459085572, Run ID: 68cbb2ea8a1448ca958cdfd4a2b885c4,Run Name: run-0_n_iter_10_cv_values_3, run-0
RMSE = 0.626517709960659, MAE = 0.5054937104000669, R2 = 0.3993558967172063
----------------------------------------
Experiment ID: 182028557459085572, Run ID: 9430ea8567744dbbbcee351e6e21d104,Run Name: run-1_n_iter_10_cv_values_5, run-1
RMSE = 0.626517709960659, MAE = 0.5054937104000669, R2 = 0.3993558967172063
----------------------------------------
Experiment ID: 182028557459085572, Run ID: bf88f5472cc9407d8203447c101ff907,Run Name: run-2_n_iter_10_cv_values_10, run-2
RMSE = 0.626517709960659, MAE = 0.5054937104000669, R2 = 0.3993558967172063
----------------------------------------
Experiment ID: 182028557459085572, Run ID: 47875f19098042dfb891601285208b7c,Run Name: run-3_n_iter_20_cv_values_3, run-3
RMSE = 0.6259046748712929, MAE = 0.5049928348900881, R2 = 0.4005307582086455
----------------------------------------
Experiment ID: 182028557459085572, Run ID: 274