In [1]:
import sys
import os
import logging

notebook_dir = os.path.dirname(os.path.abspath('__file__'))
src_dir = os.path.join(notebook_dir, '../../')
sys.path.append(src_dir)

logger = logging.getLogger(__name__)
logging.basicConfig(filename='../../logs/log_example.log',
                    level=logging.INFO,
                    format="[%(asctime)s] %(levelname)s [%(name)s.%(funcName)s:%(lineno)d] %(message)s",
                    datefmt="%d/%b/%Y %H:%M:%S"
                   )

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from src.metrics.metrics import compute_metrics, default_regression_metrics
from src.preprocessing.preprocessing import Preprocessor
from src.io.input import load_artifacts
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.preprocessing import StandardScaler
from src.pipelines.training_pipelines import StandardTrainerPipeline
from src.train.training import train_multiple_models
from src.models.default import get_default_regression_models
from src.train.cross_validation import cross_validate
from src.metrics.metrics import summarize_cv_metrics
from src.pipelines.hypertune_pipelines import HyperTunnerPipeline
from src.hypertune.hypertuning import objective_score
from hyperopt import hp

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('../../data/training_data.csv')
print(df.shape)

(800, 21)


In [4]:
df.head(2)

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,target
0,432.475954,289.373016,481.3156,358.755566,802.659004,176.761177,72.648102,720.969179,36.327684,83.768878,...,4.385848,516.789458,19.624422,13.16244,42.351948,35.920392,20.755984,13.8143,384.497136,14.364922
1,517.59625,330.448341,585.920055,22.684031,169.81324,335.60164,284.451476,748.101047,73.701438,358.147215,...,5.563334,2.960064,20.721878,17.740184,1.726915,167.576065,75.492679,2.480979,303.710869,19.984801


### 1. Exploratory Data Analysis

In [5]:
#report = ProfileReport(df)
#report.to_file('report.html')

In [6]:
df_train, df_val = train_test_split(df, test_size=0.2)
print(df_train.shape, df_val.shape)

(640, 21) (160, 21)


### 2. Preprocessing

In [7]:
# Preprocessing params
target_column = 'target'
numerical_features = list(df_train.drop(columns=[target_column]).columns)
categorical_features = []
preprocessing_graph = ['individual_num', 'combinated_num']
artifacts = {}
# Fit preprocessors in training mode
train_pipeline = Preprocessor(df_train,
                              'train',
                              numerical_features=numerical_features,
                              categorical_features=categorical_features,
                              target_column=target_column,
                              graph_preprocess=preprocessing_graph,
                              artifacts=artifacts
                             )
df_train_preproc = train_pipeline.preprocess()

# Load generated artifacts
artifacts = load_artifacts('../../artifacts/')
# Apply same preprocess logic to validation set
validation_pipeline = Preprocessor(df_val,
                                   'val',
                                   numerical_features=numerical_features,
                                   categorical_features=categorical_features,
                                   target_column=target_column,
                                   graph_preprocess=preprocessing_graph,
                                   artifacts=artifacts
                                  )
df_val_preproc = validation_pipeline.preprocess()

### 3. Metrics choice

In [8]:
metrics_config = default_regression_metrics()

### 4. Model training

In [9]:
list_models = [XGBRegressor(), LGBMRegressor(verbose=-1),
               RandomForestRegressor(), LinearRegression(),
               Ridge(), Lasso()#, create_neural_network(df_train_preproc.shape[1]-1),
               ]
list_model_names = ['xgboost', 'lightgbm', 'random_forest',
                    'linear_regression', 'ridge', 'lasso'#, 'dnn'
                   ]

metrics_df, fitted_models = train_multiple_models(df_train_preproc,
                                                  df_val_preproc,
                                                  list_models,
                                                  list_model_names,
                                                  target_column,
                                                  metrics_config
                                                 )

In [10]:
metrics_df

Unnamed: 0,model,dataset,mae,mse,rmse,mape,r2
0,xgboost,train,0.0002000934,6.572867e-08,0.000256376,1.924151e-05,1.0
0,xgboost,validation,2.044858,6.605981,2.57021,0.148957,0.750471
0,lightgbm,train,0.04167624,0.01254246,0.1119931,0.01052817,0.999512
0,lightgbm,validation,1.618354,4.084498,2.021014,0.1164372,0.845716
0,random_forest,train,0.7852281,1.006756,1.003373,0.08564308,0.960811
0,random_forest,validation,2.013989,6.285308,2.507052,0.1499589,0.762584
0,linear_regression,train,8.794399e-09,1.232425e-16,1.110146e-08,7.610787e-10,1.0
0,linear_regression,validation,13.54208,9185.672,95.84191,0.7656274,-345.971479
0,ridge,train,0.7569835,0.9557422,0.9776207,0.06724569,0.962797
0,ridge,validation,12.912,6850.396,82.76712,0.7087164,-257.760823


### 5. Complete training pipeline

In [11]:
target_column = 'target'
numerical_features = list(df.drop(columns=[target_column]).columns)
categorical_features = []
preprocessing_graph = ['individual_num']#, 'combinated_num']
artifacts = {}
test_size = 0.2

list_models, list_model_names = get_default_regression_models()
metrics_config = default_regression_metrics()

training_pipeline = StandardTrainerPipeline(df.copy(),
                                            numerical_features,
                                            categorical_features,
                                            target_column,
                                            preprocessing_graph,
                                            artifacts.copy(),
                                            list_models.copy(),
                                            list_model_names,
                                            metrics_config,
                                            test_size
                                           )
training_pipeline.run()

In [12]:
training_pipeline.metrics_df

Unnamed: 0,model,dataset,mae,mse,rmse,mape,r2
0,xgboost,train,0.000676,8.57031e-07,0.000926,5.8e-05,1.0
0,xgboost,validation,2.29122,8.544879,2.923163,0.214728,0.704612
0,lightgbm,train,0.116742,0.03950342,0.198755,0.016529,0.998425
0,lightgbm,validation,1.866171,5.912409,2.431545,0.168658,0.795614
0,random_forest,train,0.801075,1.046045,1.022763,0.085238,0.958299
0,random_forest,validation,2.439763,9.4181,3.068892,0.222355,0.674426
0,linear_regression,train,1.187992,2.393707,1.547161,0.103781,0.904574
0,linear_regression,validation,2.623948,21.21799,4.606299,0.219833,0.266516
0,ridge,train,1.300193,2.796739,1.672346,0.121267,0.888507
0,ridge,validation,2.233309,16.70154,4.086751,0.183678,0.422645


### 6. Cross-validation

In [13]:
target_column = 'target'
numerical_features = list(df.drop(columns=[target_column]).columns)
categorical_features = []
preprocessing_graph = ['individual_num']#, 'combinated_num']
artifacts = {}
test_size = 0.2
num_folds = 5

list_models, list_model_names = get_default_regression_models()
metrics_config = default_regression_metrics()

metrics_df, all_models = cross_validate(df.copy(),
                                        list_models,
                                        list_model_names,
                                        target_column,
                                        metrics_config,
                                        num_folds,
                                        artifacts,
                                        numerical_features,
                                        categorical_features,
                                        preprocessing_graph,
                                        test_size
                                       )

In [14]:
summarize_cv_metrics(metrics_df, 'mape')

Unnamed: 0,dataset,model,mae_cv,mse_cv,rmse_cv,mape_cv,r2_cv
,,,,,,,
5.0,train,xgboost,0.0007 +/- 0.0004,0.0 +/- 0.0,0.001 +/- 0.0005,0.0001 +/- 0.0,1.0 +/- 0.0
1.0,train,lightgbm,0.1189 +/- 0.0022,0.0392 +/- 0.003,0.1978 +/- 0.0078,0.016 +/- 0.0018,0.9985 +/- 0.0001
3.0,train,random_forest,0.8196 +/- 0.008,1.0814 +/- 0.0104,1.0399 +/- 0.005,0.0865 +/- 0.003,0.9586 +/- 0.0007
2.0,train,linear_regression,1.2425 +/- 0.0118,2.6531 +/- 0.0686,1.6287 +/- 0.0211,0.1114 +/- 0.0027,0.8985 +/- 0.0028
4.0,train,ridge,1.3464 +/- 0.0089,3.0119 +/- 0.0583,1.7354 +/- 0.0168,0.1264 +/- 0.0036,0.8847 +/- 0.0026
0.0,train,lasso,1.5355 +/- 0.0047,3.8992 +/- 0.0413,1.9746 +/- 0.0105,0.1418 +/- 0.0014,0.8508 +/- 0.0017
6.0,validation,lasso,1.7254 +/- 0.0383,5.0452 +/- 0.7184,2.2419 +/- 0.1538,0.139 +/- 0.0052,0.7937 +/- 0.0438
7.0,validation,lightgbm,1.7746 +/- 0.0859,4.9837 +/- 0.4841,2.2303 +/- 0.1093,0.1451 +/- 0.0163,0.7984 +/- 0.0122
10.0,validation,ridge,1.8446 +/- 0.0995,6.7185 +/- 1.5039,2.5785 +/- 0.2959,0.1481 +/- 0.0095,0.7259 +/- 0.0695


### 6. Hyperparameter tunning

In [15]:
from src.pipelines.hypertune_pipelines import HyperTunnerPipeline
from src.hypertune.hypertuning import objective_score
from hyperopt import hp

In [16]:
target_column = 'target'
numerical_features = list(df.drop(columns=[target_column]).columns)
categorical_features = []
preprocessing_graph = ['individual_num']#, 'combinated_num']
artifacts = {}
test_size = 0.2
optimized_metric = 'mse'
model2tune = LGBMRegressor
model_name = 'lgbm'
max_iterations = 20

list_models, list_model_names = get_default_regression_models()
metrics_config = default_regression_metrics()

search_space = {}
search_space['max_depth'] = hp.uniformint('max_depth', 1, 500, q=1)
search_space['num_leaves'] = hp.uniformint('num_leaves', 30, 1000, q=1)
search_space['boosting_type'] = hp.choice('boosting_type', ['gbdt', 'dart'])
search_space['colsample_bytree'] = hp.uniform('colsample_bytree', 0.6, 1.0)
search_space['learning_rate'] = 0.1
search_space['reg_alpha'] = hp.uniform('reg_alpha', 0.0, 1000.0)
search_space['reg_lambda'] = hp.uniform('reg_lambda', 0.0, 1000.0)
search_space['verbose'] = -1



hypertune_pipeline = HyperTunnerPipeline(df,
                                        numerical_features,
                                        categorical_features,
                                        target_column,
                                        preprocessing_graph,
                                        artifacts,
                                        search_space,
                                        optimized_metric,
                                        model2tune,
                                        model_name,
                                        metrics_config,
                                        objective_score,
                                        test_size,
                                        max_iterations
                                        )
hypertune_pipeline.run()

{'mae': 4.488446757024763, 'mse': 28.384234235724193, 'rmse': 5.327685635970294, 'mape': 0.7575678850710423, 'r2': -0.036105942289546}
{'mae': 3.6208510879219773, 'mse': 18.984520266703868, 'rmse': 4.35712293454108, 'mape': 0.6393040446886932, 'r2': 0.30701057155553957}
{'mae': 3.8434929666779523, 'mse': 21.325336199573563, 'rmse': 4.617936357245903, 'mape': 0.6710344892423423, 'r2': 0.22156407764238573}
{'mae': 3.763562778092821, 'mse': 20.37553624092825, 'rmse': 4.513926920202436, 'mape': 0.6582637934512159, 'r2': 0.2562344997142345}
{'mae': 4.488446757024763, 'mse': 28.384234235724193, 'rmse': 5.327685635970294, 'mape': 0.7575678850710423, 'r2': -0.036105942289546}
{'mae': 2.2947307398561216, 'mse': 8.230221496745532, 'rmse': 2.868836261752408, 'mape': 0.4133440167944106, 'r2': 0.699573314949441}
{'mae': 3.1557401094190873, 'mse': 14.696454762607033, 'rmse': 3.833595539778164, 'mape': 0.5681267169519197, 'r2': 0.4635372586179567}
{'mae': 3.6545549623227984, 'mse': 19.306218278729546

In [17]:
hypertune_pipeline.best_hyperparams

{'boosting_type': np.int64(0),
 'colsample_bytree': np.float64(0.8238073679710151),
 'max_depth': np.float64(229.0),
 'num_leaves': np.float64(463.0),
 'reg_alpha': np.float64(202.7434171464968),
 'reg_lambda': np.float64(103.88259065308114)}

In [18]:
hypertune_pipeline.trials.best_trial

{'state': 2,
 'tid': 5,
 'spec': None,
 'result': {'loss': 8.230221496745532, 'status': 'ok'},
 'misc': {'tid': 5,
  'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'workdir': None,
  'idxs': {'boosting_type': [np.int64(5)],
   'colsample_bytree': [np.int64(5)],
   'max_depth': [np.int64(5)],
   'num_leaves': [np.int64(5)],
   'reg_alpha': [np.int64(5)],
   'reg_lambda': [np.int64(5)]},
  'vals': {'boosting_type': [np.int64(0)],
   'colsample_bytree': [np.float64(0.8238073679710151)],
   'max_depth': [np.float64(229.0)],
   'num_leaves': [np.float64(463.0)],
   'reg_alpha': [np.float64(202.7434171464968)],
   'reg_lambda': [np.float64(103.88259065308114)]}},
 'exp_key': None,
 'owner': None,
 'version': 0,
 'book_time': datetime.datetime(2025, 2, 27, 16, 32, 33, 204000),
 'refresh_time': datetime.datetime(2025, 2, 27, 16, 32, 33, 340000)}