In [1]:
project_name = 'experiment_5'

In [2]:
import sys
import os
import logging
import mlflow

notebook_dir = os.path.dirname(os.path.abspath('__file__'))
src_dir = os.path.join(notebook_dir, '../../')
sys.path.append(src_dir)

logger = logging.getLogger(__name__)
logging.basicConfig(filename=f'../../logs/{project_name}.log',
                    level=logging.INFO,
                    format="[%(asctime)s] %(levelname)s [%(name)s.%(funcName)s:%(lineno)d] %(message)s",
                    datefmt="%d/%b/%Y %H:%M:%S"
                   )

mlflow.set_experiment(project_name)
mlflow.autolog()

2025/02/27 23:19:24 INFO mlflow.tracking.fluent: Experiment with name 'experiment_5' does not exist. Creating a new experiment.


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from src.metrics.metrics import compute_metrics, default_regression_metrics
from src.preprocessing.preprocessing import Preprocessor
from src.io.input import load_artifacts
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.preprocessing import StandardScaler
from src.pipelines.training_pipelines import StandardTrainerPipeline
from src.train.training import train_multiple_models
from src.models.default import get_default_regression_models
from src.train.cross_validation import cross_validate
from src.metrics.metrics import summarize_cv_metrics
from src.pipelines.hypertune_pipelines import HyperTunnerPipeline
from src.hypertune.hypertuning import objective_score
from hyperopt import hp
#from src.monitor.default_drift import get_default_drift_report

import warnings
warnings.filterwarnings('ignore')

2025/02/27 23:19:25 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/02/27 23:19:25 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/02/27 23:19:25 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.


In [4]:
df = pd.read_csv('../../data/training_data.csv')
print(df.shape)

(800, 21)


In [5]:
df.head(2)

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,target
0,432.475954,289.373016,481.3156,358.755566,802.659004,176.761177,72.648102,720.969179,36.327684,83.768878,...,4.385848,516.789458,19.624422,13.16244,42.351948,35.920392,20.755984,13.8143,384.497136,14.364922
1,517.59625,330.448341,585.920055,22.684031,169.81324,335.60164,284.451476,748.101047,73.701438,358.147215,...,5.563334,2.960064,20.721878,17.740184,1.726915,167.576065,75.492679,2.480979,303.710869,19.984801


### 1. Exploratory Data Analysis

In [6]:
#report = ProfileReport(df)
#report.to_file('report.html')

In [7]:
df_train, df_val = train_test_split(df, test_size=0.2)
print(df_train.shape, df_val.shape)

(640, 21) (160, 21)


### 2. Preprocessing

In [8]:
# Preprocessing params
target_column = 'target'
numerical_features = list(df_train.drop(columns=[target_column]).columns)
categorical_features = []
preprocessing_graph = ['individual_num', 'combinated_num']
artifacts = {}
# Fit preprocessors in training mode
train_pipeline = Preprocessor(df_train,
                              'train',
                              numerical_features=numerical_features,
                              categorical_features=categorical_features,
                              target_column=target_column,
                              graph_preprocess=preprocessing_graph,
                              artifacts=artifacts
                             )
df_train_preproc = train_pipeline.preprocess()

# Load generated artifacts
artifacts = load_artifacts('../../artifacts/')
# Apply same preprocess logic to validation set
validation_pipeline = Preprocessor(df_val,
                                   'val',
                                   numerical_features=numerical_features,
                                   categorical_features=categorical_features,
                                   target_column=target_column,
                                   graph_preprocess=preprocessing_graph,
                                   artifacts=artifacts
                                  )
df_val_preproc = validation_pipeline.preprocess()

### 3. Metrics choice

In [9]:
metrics_config = default_regression_metrics()

### 4. Model training

In [10]:
list_models = [XGBRegressor(), LGBMRegressor(verbose=-1),
               RandomForestRegressor(), LinearRegression(),
               Ridge(), Lasso()#, create_neural_network(df_train_preproc.shape[1]-1),
               ]
list_model_names = ['xgboost', 'lightgbm', 'random_forest',
                    'linear_regression', 'ridge', 'lasso'#, 'dnn'
                   ]

metrics_df, fitted_models = train_multiple_models(df_train_preproc,
                                                  df_val_preproc,
                                                  list_models,
                                                  list_model_names,
                                                  target_column,
                                                  metrics_config
                                                 )

2025/02/27 23:19:50 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '3fe03fc1ec0a400cb39618dd20b4f213', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow
2025/02/27 23:21:05 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '25c5d5ff2e7f4db88dcf134e1e428c82', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow
2025/02/27 23:24:04 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '0d294d2e1e4b4967a857bc110f0b255c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2025/02/27 23:28:50 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '179bcf0ddb654a559ae1a65439cd87d8', which will track hyperparameters, performance metrics, model artifacts, and lineage 

In [11]:
metrics_df

Unnamed: 0,model,dataset,mae,mse,rmse,mape,r2
0,xgboost,train,0.0002072425,6.835532e-08,0.0002614485,2.064265e-05,1.0
0,xgboost,validation,1.806035,5.284825,2.298875,0.1428545,0.788489
0,lightgbm,train,0.04576032,0.01888961,0.1374395,0.01310353,0.999273
0,lightgbm,validation,1.581541,4.046624,2.011622,0.1285436,0.838044
0,random_forest,train,0.7759161,0.9600577,0.9798254,0.0808506,0.963058
0,random_forest,validation,1.884376,5.76585,2.401218,0.1539849,0.769237
0,linear_regression,train,3.622442e-09,2.3370200000000002e-17,4.834273e-09,3.023829e-10,1.0
0,linear_regression,validation,6.242786,258.8877,16.08999,0.5230357,-9.361307
0,ridge,train,0.9947995,1.650402,1.28468,0.09054498,0.936494
0,ridge,validation,3.892876,99.14693,9.957255,0.3073648,-2.968099


### 5. Complete training pipeline

In [12]:
target_column = 'target'
numerical_features = list(df.drop(columns=[target_column]).columns)
categorical_features = []
preprocessing_graph = ['individual_num']#, 'combinated_num']
artifacts = {}
test_size = 0.2

list_models, list_model_names = get_default_regression_models()
metrics_config = default_regression_metrics()

training_pipeline = StandardTrainerPipeline(df.copy(),
                                            numerical_features,
                                            categorical_features,
                                            target_column,
                                            preprocessing_graph,
                                            artifacts.copy(),
                                            list_models.copy(),
                                            list_model_names,
                                            metrics_config,
                                            test_size
                                           )
training_pipeline.run()

2025/02/27 23:29:51 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'c4312892e9f84886aab44a3f749612c2', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow
2025/02/27 23:29:54 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '86be6eecfc784117b7c770d0093477d4', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow
2025/02/27 23:29:58 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'baa192a666344becba4dbc8445583dd8', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2025/02/27 23:30:04 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f2e5c7974a1241ceacad1e1efd8f75f2', which will track hyperparameters, performance metrics, model artifacts, and lineage 

In [13]:
training_pipeline.metrics_df

Unnamed: 0,model,dataset,mae,mse,rmse,mape,r2
0,xgboost,train,0.000545,5.986409e-07,0.000774,4.2e-05,1.0
0,xgboost,validation,1.912291,5.696732,2.386783,0.167361,0.761718
0,lightgbm,train,0.119704,0.03834291,0.195813,0.015717,0.998545
0,lightgbm,validation,1.56739,3.704633,1.924742,0.134581,0.845043
0,random_forest,train,0.843664,1.157487,1.075866,0.088283,0.956084
0,random_forest,validation,2.022006,6.390756,2.527994,0.176872,0.732689
0,linear_regression,train,1.266085,2.737394,1.654507,0.110059,0.896141
0,linear_regression,validation,1.970527,7.788063,2.79071,0.163299,0.674242
0,ridge,train,1.366923,3.074664,1.753472,0.124462,0.883344
0,ridge,validation,1.687121,4.900097,2.213616,0.144087,0.79504


### 6. Cross-validation

In [14]:
target_column = 'target'
numerical_features = list(df.drop(columns=[target_column]).columns)
categorical_features = []
preprocessing_graph = ['individual_num']#, 'combinated_num']
artifacts = {}
test_size = 0.2
num_folds = 5

list_models, list_model_names = get_default_regression_models()
metrics_config = default_regression_metrics()

metrics_df, all_models = cross_validate(df.copy(),
                                        list_models,
                                        list_model_names,
                                        target_column,
                                        metrics_config,
                                        num_folds,
                                        artifacts,
                                        numerical_features,
                                        categorical_features,
                                        preprocessing_graph,
                                        test_size
                                       )

2025/02/27 23:30:09 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '96127c3b676b4529bc385a415b9243ef', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow
2025/02/27 23:30:12 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a1208327b3e7462686a1bf8bebe74886', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow
2025/02/27 23:30:15 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '42e8d05d538047dd9cf089986e929941', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2025/02/27 23:30:21 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e4601260b20c4a0ca08d3787b0e754ea', which will track hyperparameters, performance metrics, model artifacts, and lineage 

In [15]:
summarize_cv_metrics(metrics_df, 'mape')

Unnamed: 0,dataset,model,mae_cv,mse_cv,rmse_cv,mape_cv,r2_cv
,,,,,,,
5.0,train,xgboost,0.0009 +/- 0.0005,0.0 +/- 0.0,0.0012 +/- 0.0007,0.0001 +/- 0.0,1.0 +/- 0.0
1.0,train,lightgbm,0.1173 +/- 0.0031,0.0396 +/- 0.0035,0.1989 +/- 0.0085,0.0149 +/- 0.0036,0.9985 +/- 0.0002
3.0,train,random_forest,0.8203 +/- 0.0067,1.1015 +/- 0.034,1.0494 +/- 0.0162,0.0819 +/- 0.0069,0.9579 +/- 0.0007
2.0,train,linear_regression,1.2375 +/- 0.0222,2.5967 +/- 0.0492,1.6114 +/- 0.0153,0.1054 +/- 0.005,0.9008 +/- 0.0017
4.0,train,ridge,1.3459 +/- 0.0237,2.9658 +/- 0.0565,1.7221 +/- 0.0164,0.1207 +/- 0.0068,0.8866 +/- 0.0027
0.0,train,lasso,1.5394 +/- 0.0289,3.9031 +/- 0.1081,1.9755 +/- 0.0272,0.1376 +/- 0.0057,0.8508 +/- 0.0048
6.0,validation,lasso,1.7045 +/- 0.0721,6.1551 +/- 2.5046,2.446 +/- 0.4639,0.1536 +/- 0.019,0.7481 +/- 0.1044
10.0,validation,ridge,1.7975 +/- 0.0401,6.9687 +/- 3.066,2.597 +/- 0.5294,0.1734 +/- 0.036,0.7145 +/- 0.1281
7.0,validation,lightgbm,1.747 +/- 0.0476,5.0478 +/- 0.3888,2.2454 +/- 0.0874,0.1813 +/- 0.0463,0.7924 +/- 0.0291


### 6. Hyperparameter tunning

In [16]:
target_column = 'target'
numerical_features = list(df.drop(columns=[target_column]).columns)
categorical_features = []
preprocessing_graph = ['individual_num']#, 'combinated_num']
artifacts = {}
test_size = 0.2
optimized_metric = 'mse'
model2tune = LGBMRegressor
model_name = 'lgbm'
max_iterations = 20

list_models, list_model_names = get_default_regression_models()
metrics_config = default_regression_metrics()

search_space = {}
search_space['max_depth'] = hp.uniformint('max_depth', 1, 500, q=1)
search_space['num_leaves'] = hp.uniformint('num_leaves', 30, 1000, q=1)
search_space['boosting_type'] = hp.choice('boosting_type', ['gbdt', 'dart'])
search_space['colsample_bytree'] = hp.uniform('colsample_bytree', 0.6, 1.0)
search_space['learning_rate'] = 0.1
search_space['reg_alpha'] = hp.uniform('reg_alpha', 0.0, 1000.0)
search_space['reg_lambda'] = hp.uniform('reg_lambda', 0.0, 1000.0)
search_space['verbose'] = -1



hypertune_pipeline = HyperTunnerPipeline(df,
                                        numerical_features,
                                        categorical_features,
                                        target_column,
                                        preprocessing_graph,
                                        artifacts,
                                        search_space,
                                        optimized_metric,
                                        model2tune,
                                        model_name,
                                        metrics_config,
                                        objective_score,
                                        test_size,
                                        max_iterations
                                        )
hypertune_pipeline.run()

  0%|                                      | 0/20 [00:00<?, ?trial/s, best loss=?]

2025/02/27 23:40:21 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '5ac83188b9b944dc97fa03e1c96dd45d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 2.7269087338056, 'mse': 11.354900249288036, 'rmse': 3.369703288019293, 'mape': 0.24368422913658994, 'r2': 0.515718159843556}
  5%|▌           | 1/20 [00:03<01:00,  3.16s/trial, best loss: 11.354900249288036]

2025/02/27 23:40:24 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'bc84aaf952c74779acf140f30ce0985f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 21.95028543820006, 'mse': 505.96963843627225, 'rmse': 22.4937688802093, 'mape': 1.5458673824791191, 'r2': -20.579397633244056}
 10%|█▏          | 2/20 [00:06<00:56,  3.16s/trial, best loss: 11.354900249288036]

2025/02/27 23:40:27 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '83c9e63c79a94e6b8f648c1eb0972884', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 3.2700964801708183, 'mse': 15.573575478671648, 'rmse': 3.94633696973176, 'mape': 0.29028526942310134, 'r2': 0.33579339095477556}
 15%|█▊          | 3/20 [00:09<00:55,  3.26s/trial, best loss: 11.354900249288036]

2025/02/27 23:40:31 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1df6302ea2ee4b48bb7bf21f5d284d5c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 4.056373147003034, 'mse': 23.56567864707521, 'rmse': 4.854449366001793, 'mape': 0.35285800722347427, 'r2': -0.005066532438853688}
 20%|██▍         | 4/20 [00:12<00:50,  3.13s/trial, best loss: 11.354900249288036]

2025/02/27 23:40:34 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'c8b948e905dc4e2dbd74d1d481841b5c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 18.989731174797402, 'mse': 381.85009781301335, 'rmse': 19.540985077856575, 'mape': 1.3273900695535157, 'r2': -15.285750114308499}
 25%|███         | 5/20 [00:15<00:46,  3.08s/trial, best loss: 11.354900249288036]

2025/02/27 23:40:37 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '90f2f17ca55c40d7bf2a43e7b6f6ead0', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 2.835031122862147, 'mse': 12.138960551814444, 'rmse': 3.4841011110205233, 'mape': 0.25251542202104027, 'r2': 0.4822783094032216}
 30%|███▌        | 6/20 [00:18<00:43,  3.12s/trial, best loss: 11.354900249288036]

2025/02/27 23:40:40 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '73d5e06082634c7fb38903b4f51bd4b3', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 3.1446257822175308, 'mse': 14.5632868741326, 'rmse': 3.81618747890255, 'mape': 0.27975360533369215, 'r2': 0.3788817857230109}
 35%|████▏       | 7/20 [00:22<00:41,  3.18s/trial, best loss: 11.354900249288036]

2025/02/27 23:40:43 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '5692f22e326c4f06ad44d3f6a4b7a40e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 17.767060996255786, 'mse': 331.59051985734084, 'rmse': 18.209627120216954, 'mape': 1.252743932606839, 'r2': -13.142199720778143}
 40%|████▊       | 8/20 [00:25<00:38,  3.19s/trial, best loss: 11.354900249288036]

2025/02/27 23:40:46 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e2d5dea13cbe4e56a2a65528eac0a94a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 18.237298381494334, 'mse': 350.47568970411686, 'rmse': 18.720995959192898, 'mape': 1.2820612523371628, 'r2': -13.947644471879071}
 45%|█████▍      | 9/20 [00:28<00:35,  3.25s/trial, best loss: 11.354900249288036]

2025/02/27 23:40:50 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f89e338b0d324102b465d6827c62c5bf', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 2.192704845858244, 'mse': 7.884987564000727, 'rmse': 2.808022002050683, 'mape': 0.19278542552043643, 'r2': 0.6637085132170688}
 50%|██████      | 10/20 [00:31<00:32,  3.23s/trial, best loss: 7.884987564000727]

2025/02/27 23:40:53 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ea8518d956014303bdb79a131f59c7ff', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 3.9248928122635567, 'mse': 21.991197314232473, 'rmse': 4.689477296483316, 'mape': 0.3407567163948494, 'r2': 0.062084450874175245}
 55%|██████▌     | 11/20 [00:35<00:29,  3.33s/trial, best loss: 7.884987564000727]

2025/02/27 23:40:56 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '2864e5f329d04bbe8f910bc73093cbf8', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 3.875845890902254, 'mse': 21.370942523421387, 'rmse': 4.622871674989626, 'mape': 0.33604704637229094, 'r2': 0.08853806339962889}
 60%|███████▏    | 12/20 [00:38<00:26,  3.28s/trial, best loss: 7.884987564000727]

2025/02/27 23:41:00 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '8739bdc7c86843559ccfa30358de4179', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 4.056373147003034, 'mse': 23.56567864707521, 'rmse': 4.854449366001793, 'mape': 0.35285800722347427, 'r2': -0.005066532438853688}
 65%|███████▊    | 13/20 [00:41<00:23,  3.30s/trial, best loss: 7.884987564000727]

2025/02/27 23:41:03 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '044eacf2aca4472089893fc610a2bca9', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 21.935064238836844, 'mse': 504.79390613728646, 'rmse': 22.467619058041876, 'mape': 1.5462387400752764, 'r2': -20.529253132739097}
 70%|████████▍   | 14/20 [00:45<00:19,  3.33s/trial, best loss: 7.884987564000727]

2025/02/27 23:41:06 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6410a571df2a49be96c90d75b208c50b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 2.503290276105858, 'mse': 9.837511807709188, 'rmse': 3.136480799831108, 'mape': 0.22356215757357667, 'r2': 0.5804341547520977}
 75%|█████████   | 15/20 [00:48<00:16,  3.27s/trial, best loss: 7.884987564000727]

2025/02/27 23:41:10 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '908e79678c7249bbb53e3d5c58b09be4', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 21.95169496887367, 'mse': 505.95188811728696, 'rmse': 22.493374315946618, 'mape': 1.5462828774178203, 'r2': -20.57864058941691}
 80%|█████████▌  | 16/20 [00:51<00:12,  3.22s/trial, best loss: 7.884987564000727]

2025/02/27 23:41:13 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '44ef5e781a384368a827f022c9374157', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 2.920680136794103, 'mse': 12.77499601286269, 'rmse': 3.574212642367923, 'mape': 0.26059007938357037, 'r2': 0.45515165776218147}
 85%|██████████▏ | 17/20 [00:55<00:10,  3.36s/trial, best loss: 7.884987564000727]

2025/02/27 23:41:16 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '74334a4d192642a2a79a90c4f631feaf', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 21.93641265797064, 'mse': 505.43733939348095, 'rmse': 22.481933622210544, 'mape': 1.5441839087727163, 'r2': -20.55669529731005}
 90%|██████████▊ | 18/20 [00:58<00:06,  3.30s/trial, best loss: 7.884987564000727]

2025/02/27 23:41:19 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1ddc17be0f394621bcc09b31ef4711c6', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 21.95598040272435, 'mse': 506.1229422417964, 'rmse': 22.49717631708025, 'mape': 1.5465842291893868, 'r2': -20.585935977695552}
 95%|███████████▍| 19/20 [01:01<00:03,  3.21s/trial, best loss: 7.884987564000727]

2025/02/27 23:41:22 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6e3dec57f50f4e3d9de8008c93adc77c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow



{'mae': 4.056373147003034, 'mse': 23.56567864707521, 'rmse': 4.854449366001793, 'mape': 0.35285800722347427, 'r2': -0.005066532438853688}
100%|████████████| 20/20 [01:04<00:00,  3.22s/trial, best loss: 7.884987564000727]


In [17]:
hypertune_pipeline.best_hyperparams

{'boosting_type': 0,
 'colsample_bytree': 0.6852279340179045,
 'max_depth': 149.0,
 'num_leaves': 525.0,
 'reg_alpha': 63.61983523606229,
 'reg_lambda': 343.9529944500177}

In [18]:
hypertune_pipeline.trials.best_trial

{'state': 2,
 'tid': 9,
 'spec': None,
 'result': {'loss': 7.884987564000727, 'status': 'ok'},
 'misc': {'tid': 9,
  'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'workdir': None,
  'idxs': {'boosting_type': [9],
   'colsample_bytree': [9],
   'max_depth': [9],
   'num_leaves': [9],
   'reg_alpha': [9],
   'reg_lambda': [9]},
  'vals': {'boosting_type': [0],
   'colsample_bytree': [0.6852279340179045],
   'max_depth': [149.0],
   'num_leaves': [525.0],
   'reg_alpha': [63.61983523606229],
   'reg_lambda': [343.9529944500177]}},
 'exp_key': None,
 'owner': None,
 'version': 0,
 'book_time': datetime.datetime(2025, 2, 28, 4, 40, 50, 222000),
 'refresh_time': datetime.datetime(2025, 2, 28, 4, 40, 53, 417000)}

### 7. Drift evaluation

In [None]:
df = pd.read_csv('../../data/training_data.csv')
df_test = pd.read_csv('../../data/blind_test_data.csv')

drift_report = get_default_drift_report(df, df_test)
drift_report