In [None]:
import sys
import os
import logging

# Resolve the project root (two levels above the notebook's working directory)
# so the `src` package can be imported. The original quoted '__file__' was a
# plain string, so it already resolved to the working directory; say so directly.
notebook_dir = os.getcwd()
src_dir = os.path.abspath(os.path.join(notebook_dir, '..', '..'))
if src_dir not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.append(src_dir)

# logging.basicConfig opens the log file immediately and raises if the parent
# directory is missing, so create it first.
log_dir = os.path.join(src_dir, 'logs')
os.makedirs(log_dir, exist_ok=True)

logger = logging.getLogger(__name__)
logging.basicConfig(filename=os.path.join(log_dir, 'log_example.log'),
                    level=logging.INFO,
                    format="[%(asctime)s] %(levelname)s [%(name)s.%(funcName)s:%(lineno)d] %(message)s",
                    datefmt="%d/%b/%Y %H:%M:%S"
                   )

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from src.metrics.metrics import compute_metrics, default_regression_metrics
from src.preprocessing.preprocessing import Preprocessor
from src.io.input import load_artifacts
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.preprocessing import StandardScaler
from src.pipelines.training_pipelines import StandardTrainerPipeline
from src.train.training import train_multiple_models
from src.models.default import get_default_regression_models
from src.train.cross_validation import cross_validate
from src.metrics.metrics import summarize_cv_metrics

import warnings
# Silences every warning for the rest of the session: no category or module
# filter is given, so deprecation and convergence warnings are hidden as well.
warnings.filterwarnings('ignore')

In [4]:
# Read the training table and report its (rows, columns) shape.
training_csv = '../../data/training_data.csv'
df = pd.read_csv(training_csv)
print(df.shape)

(800, 21)


In [5]:
# Preview the first two rows of the loaded table.
df.head(2)

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,target
0,432.475954,289.373016,481.3156,358.755566,802.659004,176.761177,72.648102,720.969179,36.327684,83.768878,...,4.385848,516.789458,19.624422,13.16244,42.351948,35.920392,20.755984,13.8143,384.497136,14.364922
1,517.59625,330.448341,585.920055,22.684031,169.81324,335.60164,284.451476,748.101047,73.701438,358.147215,...,5.563334,2.960064,20.721878,17.740184,1.726915,167.576065,75.492679,2.480979,303.710869,19.984801


### 1. Exploratory Data Analysis

In [5]:
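# Optional profiling report, left disabled. ProfileReport is not imported in this
# notebook, so import it first (e.g. from your profiling package) before uncommenting.
#report = ProfileReport(df)
#report.to_file('report.html')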

In [6]:
# Hold out 20% of the rows for validation. The split is seeded so that the
# partition, and every metric computed from it in later cells, is reproducible
# across kernel restarts.
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)
print(df_train.shape, df_val.shape)

(640, 21) (160, 21)


### 2. Preprocessing

In [7]:
# Column roles and the preprocessing graph handed to Preprocessor
target_column = 'target'
categorical_features = []
numerical_features = df_train.drop(columns=target_column).columns.tolist()
preprocessing_graph = ['individual_num', 'combinated_num']

# Keyword arguments shared by the training and validation preprocessors
shared_preproc_kwargs = dict(
    numerical_features=numerical_features,
    categorical_features=categorical_features,
    target_column=target_column,
    graph_preprocess=preprocessing_graph,
)

# Training mode: start from an empty artifacts dict
artifacts = {}
train_pipeline = Preprocessor(df_train, 'train', artifacts=artifacts, **shared_preproc_kwargs)
df_train_preproc = train_pipeline.preprocess()

# Validation mode: reuse the artifacts read back from disk
# (presumably written by the training-mode run above; confirm in Preprocessor)
artifacts = load_artifacts('../../artifacts/')
validation_pipeline = Preprocessor(df_val, 'val', artifacts=artifacts, **shared_preproc_kwargs)
df_val_preproc = validation_pipeline.preprocess()

### 3. Metrics choice

In [8]:
# Default regression metrics configuration; passed to train_multiple_models below.
metrics_config = default_regression_metrics()

### 4. Model training

In [14]:
# Candidate regressors keyed by display name. Keeping each name next to its
# estimator prevents the two parallel lists from drifting out of sync.
# A 'dnn' entry built with create_neural_network(df_train_preproc.shape[1]-1)
# is currently disabled.
candidate_models = {
    'xgboost': XGBRegressor(),
    'lightgbm': LGBMRegressor(verbose=-1),
    'random_forest': RandomForestRegressor(random_state=42),  # seeded so results are reproducible
    'linear_regression': LinearRegression(),
    'ridge': Ridge(),
    'lasso': Lasso(),
}
list_model_names = list(candidate_models.keys())
list_models = list(candidate_models.values())

# Fit every candidate on the preprocessed training set and score it on both splits.
metrics_df, fitted_models = train_multiple_models(df_train_preproc,
                                                  df_val_preproc,
                                                  list_models,
                                                  list_model_names,
                                                  target_column,
                                                  metrics_config
                                                 )

In [15]:
# Metrics table returned by train_multiple_models for the models fitted above.
metrics_df

Unnamed: 0,model,dataset,mae,mse,rmse,mape,r2
0,xgboost,train,0.0002020502,6.616232e-08,0.0002572204,1.876692e-05,1.0
0,xgboost,validation,1.913595,5.441831,2.332773,0.1676546,0.793309
0,lightgbm,train,0.04522762,0.01520974,0.1233278,0.01232418,0.999407
0,lightgbm,validation,1.574255,4.173158,2.042831,0.1449852,0.841496
0,random_forest,train,0.7781453,0.9677586,0.9837472,0.08180614,0.962256
0,random_forest,validation,1.997142,6.256958,2.501391,0.1829239,0.762349
0,linear_regression,train,1.809481e-14,5.563086e-28,2.358619e-14,1.889896e-15,1.0
0,linear_regression,validation,8.820487,3005.985,54.82686,0.5928812,-113.173023
0,ridge,train,0.0004208623,3.016522e-07,0.0005492288,3.471786e-05,1.0
0,ridge,validation,8.818468,3004.199,54.81057,0.5927545,-113.105171


### 5. Complete training pipeline

In [6]:
# Inputs for the end-to-end run (only the 'individual_num' step is enabled here)
target_column = 'target'
categorical_features = []
numerical_features = df.drop(columns=target_column).columns.tolist()
preprocessing_graph = ['individual_num']#, 'combinated_num']
artifacts = {}
test_size = 0.2

list_models, list_model_names = get_default_regression_models()
metrics_config = default_regression_metrics()

# Positional arguments in the order StandardTrainerPipeline is called with.
# The frame, artifacts and model list are copied before being handed over.
pipeline_args = (
    df.copy(),
    numerical_features,
    categorical_features,
    target_column,
    preprocessing_graph,
    artifacts.copy(),
    list_models.copy(),
    list_model_names,
    metrics_config,
    test_size,
)
training_pipeline = StandardTrainerPipeline(*pipeline_args)
training_pipeline.run()

In [12]:
# Metrics table stored on the pipeline object after run().
training_pipeline.metrics_df

Unnamed: 0,model,dataset,mae,mse,rmse,mape,r2
0,xgboost,train,0.001036,2e-06,0.001466,7.9e-05,1.0
0,xgboost,validation,2.034968,6.360839,2.52207,0.195321,0.749091
0,lightgbm,train,0.114356,0.036674,0.191504,0.015787,0.998571
0,lightgbm,validation,1.66083,4.114566,2.028439,0.156451,0.837697
0,random_forest,train,0.839668,1.141729,1.068517,0.089564,0.955517
0,random_forest,validation,2.15108,6.787356,2.605255,0.210413,0.732267
0,linear_regression,train,1.241687,2.711592,1.646691,0.110095,0.894353
0,linear_regression,validation,2.066803,7.832096,2.798588,0.180543,0.691056
0,ridge,train,1.351632,3.093961,1.758966,0.12826,0.879456
0,ridge,validation,1.841866,6.258797,2.501759,0.156608,0.753116


### 6. Cross-validation

In [16]:
# Inputs for the cross-validation run
target_column = 'target'
categorical_features = []
numerical_features = df.drop(columns=target_column).columns.tolist()
preprocessing_graph = ['individual_num']#, 'combinated_num']
artifacts = {}
test_size = 0.2
num_folds = 5

list_models, list_model_names = get_default_regression_models()
metrics_config = default_regression_metrics()

# Positional arguments in the order cross_validate is called with;
# only the data frame is copied before being handed over.
cv_args = (
    df.copy(),
    list_models,
    list_model_names,
    target_column,
    metrics_config,
    num_folds,
    artifacts,
    numerical_features,
    categorical_features,
    preprocessing_graph,
    test_size,
)
metrics_df, all_models = cross_validate(*cv_args)

Fold 0
Fold 1
Fold 2
Fold 3
Fold 4


In [19]:
# Summarise the metrics returned by cross_validate (metrics_df here is the
# cross-validation result, not the earlier train_multiple_models table).
# NOTE(review): 'mape' presumably selects the metric used to order the rows; confirm.
summarize_cv_metrics(metrics_df, 'mape')

Unnamed: 0,dataset,model,mae_cv,mse_cv,rmse_cv,mape_cv,r2_cv
,,,,,,,
5.0,train,xgboost,0.0007 +/- 0.0002,0.0 +/- 0.0,0.001 +/- 0.0003,0.0001 +/- 0.0,1.0 +/- 0.0
1.0,train,lightgbm,0.1184 +/- 0.0022,0.0362 +/- 0.0018,0.1901 +/- 0.0048,0.0137 +/- 0.0027,0.9986 +/- 0.0001
3.0,train,random_forest,0.8177 +/- 0.021,1.0849 +/- 0.0445,1.0414 +/- 0.0214,0.0794 +/- 0.0058,0.958 +/- 0.0025
2.0,train,linear_regression,1.2475 +/- 0.0381,2.6353 +/- 0.1154,1.623 +/- 0.0355,0.1044 +/- 0.0052,0.898 +/- 0.0045
4.0,train,ridge,1.3451 +/- 0.0351,2.9698 +/- 0.1202,1.723 +/- 0.035,0.1182 +/- 0.0072,0.8851 +/- 0.0045
0.0,train,lasso,1.5444 +/- 0.0272,3.9225 +/- 0.1556,1.9802 +/- 0.0395,0.137 +/- 0.0056,0.8482 +/- 0.0055
6.0,validation,lasso,1.7002 +/- 0.1184,4.8368 +/- 0.7539,2.1937 +/- 0.1749,0.1563 +/- 0.0206,0.8119 +/- 0.0323
10.0,validation,ridge,1.8606 +/- 0.1969,7.4262 +/- 3.2361,2.6752 +/- 0.5807,0.1768 +/- 0.0361,0.7131 +/- 0.1205
8.0,validation,linear_regression,2.1468 +/- 0.2304,11.332 +/- 5.624,3.2959 +/- 0.7656,0.1974 +/- 0.028,0.5647 +/- 0.2017


### 7. Hyperparameter tuning

In [5]:
from src.pipelines.hypertune_pipelines import HyperTunnerPipeline
from src.hypertune.hypertuning import objective_score
from hyperopt import hp

In [6]:
# Data and preprocessing configuration for the tuning run
target_column = 'target'
numerical_features = list(df.drop(columns=[target_column]).columns)
categorical_features = []
preprocessing_graph = ['individual_num']#, 'combinated_num']
artifacts = {}
test_size = 0.2

# Tuning configuration: the estimator class to tune, the metric passed as the
# optimisation target, and the iteration budget handed to the pipeline.
optimized_metric = 'mse'
model2tune = LGBMRegressor
model_name = 'lgbm'
max_iterations = 20

metrics_config = default_regression_metrics()

# Search space. Plain values (learning_rate, verbose) are fixed; the hp.*
# entries are sampled. hp.uniformint already yields integers, so no explicit
# quantisation step is passed.
search_space = {
    'max_depth': hp.uniformint('max_depth', 1, 500),
    'num_leaves': hp.uniformint('num_leaves', 30, 1000),
    'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart']),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'learning_rate': 0.1,
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1000.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1000.0),
    'verbose': -1,
}

# Pass a copy of the frame, as the other pipeline cells do, so the shared `df`
# cannot be modified by the run.
hypertune_pipeline = HyperTunnerPipeline(df.copy(),
                                         numerical_features,
                                         categorical_features,
                                         target_column,
                                         preprocessing_graph,
                                         artifacts,
                                         search_space,
                                         optimized_metric,
                                         model2tune,
                                         model_name,
                                         metrics_config,
                                         objective_score,
                                         test_size,
                                         max_iterations
                                         )
hypertune_pipeline.run()

{'mae': 2.013912528997095, 'mse': 6.179009285307078, 'rmse': 2.4857613089971204, 'mape': 0.37909733763756914, 'r2': 0.7732236030206548}
{'mae': 4.277315883460948, 'mse': 27.248318971801474, 'rmse': 5.219992238672532, 'mape': 0.7784082732162317, 'r2': -4.3099929082135446e-05}
{'mae': 4.2573636643957675, 'mse': 26.951971138219317, 'rmse': 5.191528786226589, 'mape': 0.7751689514138849, 'r2': 0.010833189593932535}
{'mae': 2.9158053401898014, 'mse': 12.567635930046412, 'rmse': 3.5450861668013673, 'mape': 0.5574069870871776, 'r2': 0.5387540197518172}
{'mae': 4.277315883460948, 'mse': 27.248318971801474, 'rmse': 5.219992238672532, 'mape': 0.7784082732162317, 'r2': -4.3099929082135446e-05}
{'mae': 4.277315883460948, 'mse': 27.248318971801474, 'rmse': 5.219992238672532, 'mape': 0.7784082732162317, 'r2': -4.3099929082135446e-05}
{'mae': 4.119546378784932, 'mse': 24.888429071648915, 'rmse': 4.988830431238259, 'mape': 0.7517344904432655, 'r2': 0.08656743974061953}
{'mae': 21.684063113573366, 'mse'

In [7]:
# Best point found by the search.
# NOTE(review): in the saved output 'boosting_type' is an integer and the integer
# parameters are floats, which looks like hyperopt's un-decoded result (hp.choice
# reports the index into its option list). Confirm that the pipeline decodes these
# (e.g. with hyperopt.space_eval) before they are used to refit a model.
hypertune_pipeline.best_hyperparams

{'boosting_type': np.int64(0),
 'colsample_bytree': np.float64(0.7812079549846587),
 'max_depth': np.float64(40.0),
 'num_leaves': np.float64(630.0),
 'reg_alpha': np.float64(9.158947964783447),
 'reg_lambda': np.float64(250.1575746763408)}

In [11]:
# Record of the lowest-loss trial (loss, sampled values, timestamps);
# `trials` is presumably a hyperopt Trials object (confirm in HyperTunnerPipeline).
hypertune_pipeline.trials.best_trial

{'state': 2,
 'tid': 0,
 'spec': None,
 'result': {'loss': 6.179009285307078, 'status': 'ok'},
 'misc': {'tid': 0,
  'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'workdir': None,
  'idxs': {'boosting_type': [np.int64(0)],
   'colsample_bytree': [np.int64(0)],
   'max_depth': [np.int64(0)],
   'num_leaves': [np.int64(0)],
   'reg_alpha': [np.int64(0)],
   'reg_lambda': [np.int64(0)]},
  'vals': {'boosting_type': [np.int64(0)],
   'colsample_bytree': [np.float64(0.7812079549846587)],
   'max_depth': [np.float64(40.0)],
   'num_leaves': [np.float64(630.0)],
   'reg_alpha': [np.float64(9.158947964783447)],
   'reg_lambda': [np.float64(250.1575746763408)]}},
 'exp_key': None,
 'owner': None,
 'version': 0,
 'book_time': datetime.datetime(2025, 2, 27, 15, 43, 24, 816000),
 'refresh_time': datetime.datetime(2025, 2, 27, 15, 43, 25, 280000)}