### Advanced modeling

In [52]:
# linear algebra
import numpy as np

#working with data in table structers
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# data visualization
import seaborn as sns
import matplotlib.pyplot as plt


# working with files
import sys
import os

# to off warnings
import warnings
warnings.filterwarnings('ignore')

# pipeline schema 
import time
from datetime import timedelta, datetime
from sklearn.model_selection import TimeSeriesSplit as tscv
from collections import defaultdict
from sklearn.metrics import mean_squared_error as mse
from hyperopt import hp, fmin, tpe, Trials
from sklearn.model_selection import train_test_split
from scipy.stats import randint, uniform
import optuna

# models
from catboost import CatBoostRegressor, Pool
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# # using ray engine for parallel calculation(for oprimization)
# %env MODIN_ENGINE=ray
# import modin.pandas as mpd

### Read data

In [3]:
# add data path to sys.path 
train_test_data_path = "C:\\Repository\\DS-Intership-data\\train_test_data\\"
sys.path.append(train_test_data_path)

# initiate dict for data
to_read_train_test_data = {}

# fill to_read
for dir_name, _, files in os.walk(train_test_data_path):
    for file in files:
        to_read_train_test_data[file] = dir_name + file

In [4]:
# check to_read
to_read_train_test_data

{'submission_example.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\submission_example.csv',
 'test_data.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\test_data.csv',
 'train_data.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\train_data.csv'}

In [5]:
%%time
data = {}
# read data
for file, path in to_read_train_test_data.items():
    data[file.split('.')[0]] = pd.read_csv(os.path.join(os.path.dirname(path), file), index_col=0)

CPU times: total: 21.5 s
Wall time: 22.1 s


### Machine learning pipeline modeling

In [83]:
class Pipeline:
    def __init__(self,
                 train_data,
                 test_data,
                 submission_example,
                 metrics=['rmse'],
                 model=DecisionTreeRegressor(max_depth=1, random_state=42),
                 check_nans = True,
                 dropna = False,
                 check_infs = True,
                 feature_importance_layer=True,
                 hyperparametr_optimization_layer=True,
                 params = None,
                 explainability_layer=True,
                 error_analysis_layer=True,
                 ):
        self.train_data = train_data
        self.test_data = test_data
        self.submission_example = submission_example
        self.metrics = metrics
        self.model = model
        self.params = params
        self.__feature_importance_layer__ = feature_importance_layer
        self.__hyperparametr_optimization_layer__=hyperparametr_optimization_layer
        self.__explainability_layer__=explainability_layer
        self.__error_analysis_layer__=error_analysis_layer

        # Check data for valid columns
        assert set([
                    'date_block_num',
                    'shop_id',
                    'item_category_id',
                    'item_id',
                    'item_cnt_month'
                   ]).issubset(train_data.columns),\
                "Invalid data"
        
        assert set([
                    'shop_id',
                    'item_category_id',
                    'item_id',
                   ]).issubset(test_data.columns),\
                "Invalid data"
        
        assert set([
                    'shop_id',
                    'item_id',
                    'ID'
                    ]).issubset(submission_example.columns),\
                "Invalid data"

        # Check for valid variables
        if dropna:
            self.train_data = self.train_data.dropna()
            self.test_data = self.test_data.dropna()
            
        if check_nans:
            assert self.train_data.isna().sum().sum() == 0, 'Train data have NaNs'
        if check_infs:
            assert np.isfinite(self.train_data).sum().sum() != 0, 'Train data have Infs'
        if check_nans:
            assert self.test_data.isna().sum().sum() == 0, 'Test data have NaNs'
        if check_infs:
            assert np.isfinite(self.test_data).sum().sum() != 0, 'Test data have Infs'
        
        # Сheck for sorting by timeseries data
        amount_of_unsorted_rows = len(self.train_data) - (self.train_data.date_block_num.diff().fillna(0) >= 0).sum()
        if amount_of_unsorted_rows != 0:
            print(f"Data is not sorted by time ({amount_of_unsorted_rows} rows), it will be further sorted automatically")
            self.train_data = self.train_data.sort_values(by=['date_block_num'])
        
         # Split to X and y
        self.X = self.train_data.drop(columns='item_cnt_month')
        self.y = self.train_data[['item_id', 'shop_id', 'item_cnt_month']]
        
    def calculate_metrics(self, y_pred, y_true):
        rmse = mse(y_true, y_pred, squared=True)
        return rmse
    
    # Predict sales for target month (November 2015)
    def predict_target(self):
        """
            Return target predictions in accordance with submission example
        """
        X_train, y_train = self.train_data.drop(columns=['item_cnt_month']),\
                           self.train_data.item_cnt_month
        X_test = self.test_data
        X_test['date_block_num'] = 34
        model = self.model
        model.fit(X_train, y_train)
        result = X_test.join(pd.DataFrame(index=X_test.index, data=model.predict(X_test.values),\
                                       columns=['item_cnt_month']))\
                                            [['item_id', 'shop_id', 'item_cnt_month']].\
                            merge(self.submission_example, on=['shop_id', 'item_id'], how='right')\
                .drop_duplicates(['item_id', 'shop_id'])[['ID', 'item_cnt_month']].sort_values(by='ID')
        result.item_cnt_month = result.item_cnt_month.clip(0, 20).fillna(0) 
        return result

    def validate(self, model, n_splits=5):
        
        # Initialize variables and tscv
        score = []
        step = 0
        tscv = TimeSeriesSplit(n_splits=n_splits)
        
        # Validation loop
        for train, val in tscv.split(self.y):

            step += 1

            # Split data step
            y_tr, y_val = self.y.iloc[train], self.y.iloc[val]
            X_tr, X_val = self.X.iloc[train], self.X.iloc[val]

            # Train step
            rng = np.random.RandomState(42)
            model = model
            model.fit(X_tr.values, y_tr.item_cnt_month.values)

            # Evaluation step
            y_tr_pr = pd.DataFrame(index=X_tr.index, data=model.predict(X_tr.values), columns=['item_cnt_month'])
            y_val_pr = pd.DataFrame(index=X_val.index, data=model.predict(X_val.values), columns=['item_cnt_month'])
            
            # Extract step     
            y_tr = y_tr.merge(self.test_data, on=['shop_id', 'item_id'], how='right')
            y_tr = y_tr.fillna(0)
            y_val = y_val.merge(self.test_data, on=['shop_id', 'item_id'], how='right')
            y_val = y_val.fillna(0)
            y_tr_pr = X_tr.join(y_tr_pr)[['item_id', 'shop_id', 'item_cnt_month']].\
                                        merge(self.test_data, on=['shop_id', 'item_id'], how='right')
            y_tr_pr = y_tr_pr.fillna(0)
            y_val_pr = X_val.join(y_val_pr)[['item_id', 'shop_id', 'item_cnt_month']].\
                                        merge(self.test_data, on=['shop_id', 'item_id'], how='right')
            y_val_pr = y_val_pr.fillna(0)

            # Validation score calculation step
            score.append(self.calculate_metrics(y_val_pr.item_cnt_month, y_val.item_cnt_month))
            
        return score.mean()
    
    
    def feature_importance_layer(self, selector="Boruta", sample_size=None):
        if sample_size is None:
            sample_size = self.train_data.shape[0]
        if selector=="Boruta":
            from boruta import BorutaPy
            # select sample of data
            X = self.train_data[self.train_data.date_block_num != 34].dropna().drop(columns='item_cnt_month')[:sample_size]
            y = self.train_data[self.train_data.date_block_num != 34].dropna()['item_cnt_month'][:sample_size]
            np.int = np.int_
            np.float = np.float_
            np.bool=np.bool_

            # init selector
            feat_selector = BorutaPy(RandomForestRegressor(max_depth=5, n_jobs=-1, n_estimators=20), 
                                     n_estimators=20, 
                                     verbose=0, 
                                     max_iter=20,
                                     random_state=42,
                                 )

            # fit selector
            feat_selector.fit(X.values, y.values)

            # extract usefull features
            mask = np.array(feat_selector.support_).reshape(4,8)
            plt.imshow(mask);
            plt.title("Feature selection")
            plt.show()
            important_features = data['train_data'].drop(columns=['item_cnt_month']).iloc[:, feat_selector.support_]

            # save info about usefull/useless features
            feature_importnce_report = {
                "important_columns": data['train_data'].drop(columns=['item_cnt_month'])\
                                                                       .iloc[:, feat_selector.support_].columns,
                "unimportant_columns": data['train_data'].drop(columns=['item_cnt_month'])\
                                                                       .iloc[:, ~feat_selector.support_].columns
            }
            print(feature_importance_report)
            
            return important_features
        
    def hyperparametr_optimization_layer(self, optimazer="Grid"):
        assert self.params is None, "Parameter are not initialized"
        
        if optimazer=="Optuna":
            print(1)
            def objective(trial):
                params = {
                    "iterations": 1000,
                    "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
                    "depth": trial.suggest_int("depth", 1, 10),
                    "subsample": trial.suggest_float("subsample", 0.05, 1.0),
                    "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
                    "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
                }

                model = cb.CatBoostRegressor(**params, silent=True)
                model.fit(X_train, y_train)
                predictions = model.predict(X_val)
                rmse = mean_squared_error(y_val, predictions, squared=False)
                return rmse
        if optimazer=="Hyperopt":
            print(2)
            # Optimization function
            def objective(self.params):
                model = model(
                    iterations=int(self.params['iterations']),
                    learning_rate=self.params['learning_rate'],
                    depth=int(self.params['depth']),
                    l2_leaf_reg=self.params['l2_leaf_reg'],
                    loss_function='RMSE',
                    random_seed=42
                )
                cv_results = self.validate(model)
                best_rmse = np.min(cv_results['test-RMSE-mean'])
                return best_rmse

            # Инициализируйте Trials для отслеживания результатов оптимизации
            trials = Trials()

            # Запустите оптимизацию гиперпараметров с помощью TPE-алгоритма
            best = fmin(fn=objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=100,  # Количество итераций оптимизации
                        trials=trials,
                        rstate=np.random.default_rng(42))

            # Выведите лучшие найденные гиперпараметры
            print("Лучшие гиперпараметры:")
            print(best)
        
    def explainability_layer(self):
        pass
    def error_analysis_layer(self):
        pass
    
    def evaluate(self):
        # optimal_hyperparametres = self.hyperparametr_optimization_layer()
        if self.__feature_importance_layer__:
            important_features = self.feature_importance_layer()
        if self.__feature_importance_layer__ and self.__hyperparametr_optimization_layer__:
            predictions = self.predict_target(optimal_hyperparametres, important_features)
        elif self.__feature_importance_layer__:
            predictions = self.predict_target(important_features)
        elif self.__hyperparametr_optimization_layer__:
            predictions = self.predict_target(optimal_hyperparametres)
        else:
            predictions = self.predict_target()
        # model_explanation = self.explainability_layer()
        # error_analysis = self.error_analysis_layer()
        return predictions

In [74]:
%%time
test_pipeline = Pipeline(train_data=data['train_data'],
                 test_data=data['test_data'],
                 submission_example=data['submission_example'],
                 metrics=['rmse'],
                 model = DecisionTreeRegressor(max_depth=2, random_state=42),
                 check_nans=False,
                 feature_importance_layer=False,
                 hyperparametr_optimization_layer=False
                )

Data is not sorted by time (18781 rows), it will be further sorted automatically
CPU times: total: 6.19 s
Wall time: 7.18 s


In [61]:
#%%time
#test_pipeline.validate()

In [78]:
params = {
        "iterations": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }

CatBoostTest = Pipeline(train_data=data['train_data'],
                 test_data=data['test_data'],
                 submission_example=data['submission_example'],
                 metrics=['rmse'],
                 model = CatBoostRegressor(max_depth=2, random_state=42),
                 check_nans=False,
                 feature_importance_layer=False,
                 hyperparametr_optimization_layer=True,
                 params = params
                )

Data is not sorted by time (18781 rows), it will be further sorted automatically


In [79]:
CatBoostTest.evaluate()

NameError: name 'optimal_hyperparametres' is not defined

In [51]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from hyperopt import hp, fmin, tpe, Trials

# Определите пространство поиска для гиперпараметров
space = {
    'iterations': hp.quniform('iterations', 100, 1000, 1),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'depth': hp.quniform('depth', 4, 10, 1),
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, 3),
}

# Optimization function
def objective(params):
    model = CatBoostRegressor(
        iterations=int(params['iterations']),
        learning_rate=params['learning_rate'],
        depth=int(params['depth']),
        l2_leaf_reg=params['l2_leaf_reg'],
        loss_function='RMSE',
        random_seed=42
    )
    cv_results = test_pipeline.validate(model)
    best_rmse = np.min(cv_results['test-RMSE-mean'])
    return best_rmse

# Инициализируйте Trials для отслеживания результатов оптимизации
trials = Trials()

# Запустите оптимизацию гиперпараметров с помощью TPE-алгоритма
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,  # Количество итераций оптимизации
            trials=trials,
            rstate=np.random.default_rng(42))

# Выведите лучшие найденные гиперпараметры
print("Лучшие гиперпараметры:")
print(best)

  0%|                                                                          | 0/100 [00:00<?, ?trial/s, best loss=?]0:	learn: 1.5607971	total: 266ms	remaining: 3m 57s
1:	learn: 1.5146829	total: 356ms	remaining: 2m 38s
2:	learn: 1.4864273	total: 449ms	remaining: 2m 13s
3:	learn: 1.4621804	total: 566ms	remaining: 2m 6s
4:	learn: 1.4485856	total: 668ms	remaining: 1m 59s
5:	learn: 1.4398749	total: 759ms	remaining: 1m 52s
6:	learn: 1.4310919	total: 857ms	remaining: 1m 48s
7:	learn: 1.4251730	total: 945ms	remaining: 1m 44s
8:	learn: 1.4207594	total: 1.04s	remaining: 1m 42s
9:	learn: 1.4170501	total: 1.13s	remaining: 1m 39s
10:	learn: 1.4092206	total: 1.22s	remaining: 1m 37s
11:	learn: 1.4031659	total: 1.31s	remaining: 1m 36s
12:	learn: 1.3805405	total: 1.41s	remaining: 1m 35s
13:	learn: 1.3763168	total: 1.5s	remaining: 1m 34s
14:	learn: 1.3485989	total: 1.6s	remaining: 1m 33s
15:	learn: 1.3312456	total: 1.72s	remaining: 1m 34s
16:	learn: 1.3275381	total: 1.83s	remaining: 1m 34s
17:	learn:

156:	learn: 1.1235099	total: 15.4s	remaining: 1m 12s
157:	learn: 1.1187275	total: 15.5s	remaining: 1m 12s
158:	learn: 1.1159338	total: 15.6s	remaining: 1m 12s
159:	learn: 1.1147657	total: 15.7s	remaining: 1m 12s
160:	learn: 1.1140587	total: 15.8s	remaining: 1m 12s
161:	learn: 1.1137153	total: 15.9s	remaining: 1m 12s
162:	learn: 1.1129091	total: 16s	remaining: 1m 12s
163:	learn: 1.1125109	total: 16.1s	remaining: 1m 11s
164:	learn: 1.1122252	total: 16.2s	remaining: 1m 11s
165:	learn: 1.1119425	total: 16.3s	remaining: 1m 11s
166:	learn: 1.1114571	total: 16.4s	remaining: 1m 11s
167:	learn: 1.1113410	total: 16.5s	remaining: 1m 11s
168:	learn: 1.1110795	total: 16.6s	remaining: 1m 11s
169:	learn: 1.1108222	total: 16.7s	remaining: 1m 11s
170:	learn: 1.1095467	total: 16.8s	remaining: 1m 11s
171:	learn: 1.1092863	total: 16.9s	remaining: 1m 11s
172:	learn: 1.1078751	total: 17s	remaining: 1m 11s
173:	learn: 1.1075387	total: 17.1s	remaining: 1m 11s
174:	learn: 1.1072008	total: 17.2s	remaining: 1m 1

316:	learn: 1.0509036	total: 30.5s	remaining: 55.8s
317:	learn: 1.0506526	total: 30.6s	remaining: 55.7s
318:	learn: 1.0504283	total: 30.7s	remaining: 55.5s
319:	learn: 1.0502837	total: 30.8s	remaining: 55.4s
320:	learn: 1.0501392	total: 30.9s	remaining: 55.3s
321:	learn: 1.0494085	total: 31s	remaining: 55.2s
322:	learn: 1.0477882	total: 31.1s	remaining: 55.1s
323:	learn: 1.0472326	total: 31.2s	remaining: 55s
324:	learn: 1.0471009	total: 31.3s	remaining: 54.9s
325:	learn: 1.0463454	total: 31.3s	remaining: 54.8s
326:	learn: 1.0446162	total: 31.5s	remaining: 54.7s
327:	learn: 1.0443206	total: 31.5s	remaining: 54.6s
328:	learn: 1.0441481	total: 31.6s	remaining: 54.5s
329:	learn: 1.0439451	total: 31.7s	remaining: 54.4s
330:	learn: 1.0438738	total: 31.8s	remaining: 54.3s
331:	learn: 1.0433586	total: 31.9s	remaining: 54.3s
332:	learn: 1.0420446	total: 32s	remaining: 54.2s
333:	learn: 1.0417867	total: 32.1s	remaining: 54.1s
334:	learn: 1.0414869	total: 32.2s	remaining: 54s
335:	learn: 1.041177

476:	learn: 1.0109609	total: 45.1s	remaining: 39.7s
477:	learn: 1.0109360	total: 45.2s	remaining: 39.5s
478:	learn: 1.0108953	total: 45.3s	remaining: 39.4s
479:	learn: 1.0107852	total: 45.4s	remaining: 39.3s
480:	learn: 1.0103959	total: 45.5s	remaining: 39.2s
481:	learn: 1.0097854	total: 45.6s	remaining: 39.1s
482:	learn: 1.0096040	total: 45.7s	remaining: 39s
483:	learn: 1.0090803	total: 45.8s	remaining: 39s
484:	learn: 1.0087740	total: 45.9s	remaining: 38.9s
485:	learn: 1.0086910	total: 46s	remaining: 38.8s
486:	learn: 1.0086328	total: 46s	remaining: 38.7s
487:	learn: 1.0086254	total: 46.1s	remaining: 38.6s
488:	learn: 1.0086175	total: 46.2s	remaining: 38.5s
489:	learn: 1.0085461	total: 46.3s	remaining: 38.4s
490:	learn: 1.0085021	total: 46.4s	remaining: 38.3s
491:	learn: 1.0083886	total: 46.5s	remaining: 38.2s
492:	learn: 1.0083318	total: 46.6s	remaining: 38.1s
493:	learn: 1.0083104	total: 46.6s	remaining: 38s
494:	learn: 1.0082269	total: 46.7s	remaining: 37.9s
495:	learn: 1.0077152	

637:	learn: 0.9789505	total: 1m	remaining: 24.4s
638:	learn: 0.9787957	total: 1m	remaining: 24.4s
639:	learn: 0.9787400	total: 1m	remaining: 24.3s
640:	learn: 0.9786427	total: 1m	remaining: 24.2s
641:	learn: 0.9785901	total: 1m	remaining: 24.1s
642:	learn: 0.9784861	total: 1m	remaining: 24s
643:	learn: 0.9784373	total: 1m 1s	remaining: 23.9s
644:	learn: 0.9779915	total: 1m 1s	remaining: 23.8s
645:	learn: 0.9778920	total: 1m 1s	remaining: 23.7s
646:	learn: 0.9777785	total: 1m 1s	remaining: 23.6s
647:	learn: 0.9777411	total: 1m 1s	remaining: 23.5s
648:	learn: 0.9776630	total: 1m 1s	remaining: 23.4s
649:	learn: 0.9774571	total: 1m 1s	remaining: 23.3s
650:	learn: 0.9773378	total: 1m 1s	remaining: 23.2s
651:	learn: 0.9772668	total: 1m 1s	remaining: 23.1s
652:	learn: 0.9772304	total: 1m 1s	remaining: 23s
653:	learn: 0.9771540	total: 1m 1s	remaining: 22.9s
654:	learn: 0.9769391	total: 1m 2s	remaining: 22.8s
655:	learn: 0.9768674	total: 1m 2s	remaining: 22.7s
656:	learn: 0.9768148	total: 1m 2s

796:	learn: 0.9608392	total: 1m 15s	remaining: 9.34s
797:	learn: 0.9607779	total: 1m 15s	remaining: 9.24s
798:	learn: 0.9607390	total: 1m 15s	remaining: 9.14s
799:	learn: 0.9605620	total: 1m 15s	remaining: 9.05s
800:	learn: 0.9603055	total: 1m 15s	remaining: 8.96s
801:	learn: 0.9602806	total: 1m 15s	remaining: 8.86s
802:	learn: 0.9601189	total: 1m 15s	remaining: 8.77s
803:	learn: 0.9600515	total: 1m 15s	remaining: 8.67s
804:	learn: 0.9599136	total: 1m 15s	remaining: 8.58s
805:	learn: 0.9598590	total: 1m 15s	remaining: 8.48s
806:	learn: 0.9598170	total: 1m 16s	remaining: 8.39s
807:	learn: 0.9597777	total: 1m 16s	remaining: 8.29s
808:	learn: 0.9596230	total: 1m 16s	remaining: 8.2s
809:	learn: 0.9595451	total: 1m 16s	remaining: 8.1s
810:	learn: 0.9592348	total: 1m 16s	remaining: 8.01s
811:	learn: 0.9591518	total: 1m 16s	remaining: 7.92s
812:	learn: 0.9590965	total: 1m 16s	remaining: 7.83s
813:	learn: 0.9590416	total: 1m 16s	remaining: 7.73s
814:	learn: 0.9564604	total: 1m 16s	remaining: 7

58:	learn: 1.2823661	total: 10.7s	remaining: 2m 32s
59:	learn: 1.2812840	total: 10.9s	remaining: 2m 32s
60:	learn: 1.2794969	total: 11.1s	remaining: 2m 31s
61:	learn: 1.2788335	total: 11.3s	remaining: 2m 31s
62:	learn: 1.2777612	total: 11.5s	remaining: 2m 31s
63:	learn: 1.2653108	total: 11.7s	remaining: 2m 31s
64:	learn: 1.2641008	total: 11.8s	remaining: 2m 31s
65:	learn: 1.2632231	total: 12s	remaining: 2m 30s
66:	learn: 1.2615446	total: 12.2s	remaining: 2m 30s
67:	learn: 1.2609385	total: 12.4s	remaining: 2m 30s
68:	learn: 1.2601617	total: 12.6s	remaining: 2m 30s
69:	learn: 1.2598567	total: 12.8s	remaining: 2m 30s
70:	learn: 1.2589999	total: 13s	remaining: 2m 30s
71:	learn: 1.2584055	total: 13.1s	remaining: 2m 30s
72:	learn: 1.2575787	total: 13.3s	remaining: 2m 30s
73:	learn: 1.2566904	total: 13.5s	remaining: 2m 29s
74:	learn: 1.2545481	total: 13.7s	remaining: 2m 29s
75:	learn: 1.2538286	total: 13.9s	remaining: 2m 29s
76:	learn: 1.2420056	total: 14.1s	remaining: 2m 30s
77:	learn: 1.241

KeyboardInterrupt: 