### Advanced modeling

In [1]:
# linear algebra
import numpy as np

# working with data in table structures
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# data visualization
import seaborn as sns
import matplotlib.pyplot as plt


# working with files
import sys
import os

# turn off warnings
import warnings
warnings.filterwarnings('ignore')

# validation schema 
import time
from datetime import timedelta, datetime
from sklearn.model_selection import TimeSeriesSplit
from collections import defaultdict
from sklearn.metrics import mean_squared_error as mse
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# # using the Ray engine for parallel computation (for optimization)
# %env MODIN_ENGINE=ray
# import modin.pandas as mpd

### Read data

In [3]:
# Directory that holds the train/test CSV files.
# NOTE(review): absolute, machine-specific path — consider moving it to a config cell.
train_test_data_path = "C:\\Repository\\DS-Intership-data\\train_test_data\\"
# add data path to sys.path
# NOTE(review): only needed if Python modules are imported from this folder — confirm.
sys.path.append(train_test_data_path)

# Map each file name found under the data directory to its full path.
to_read_train_test_data = {}

# os.walk yields directory names WITHOUT a trailing separator for sub-directories,
# so the path must be built with os.path.join rather than string concatenation.
for dir_name, _, files in os.walk(train_test_data_path):
    for file in files:
        to_read_train_test_data[file] = os.path.join(dir_name, file)

In [4]:
# check to_read
to_read_train_test_data

{'sample_submission.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\sample_submission.csv',
 'test_data.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\test_data.csv',
 'train_data.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\train_data.csv'}

In [5]:
%%time
# Load every discovered CSV into a dict keyed by the file name up to its first dot.
data = {}
for csv_name, csv_path in to_read_train_test_data.items():
    table_name = csv_name.split('.')[0]
    csv_location = os.path.join(os.path.dirname(csv_path), csv_name)
    # The first CSV column is used as the DataFrame index.
    data[table_name] = pd.read_csv(csv_location, index_col=0)

CPU times: total: 41 s
Wall time: 43.8 s


### Machine learning pipeline modeling

In [13]:
class Pipeline:
    """Train a regressor on monthly sales history and predict the target month.

    The pipeline validates the input frames, optionally selects features with
    Boruta, fits ``model`` on every month except the target one and returns the
    predictions joined onto ``test_data``.

    The ``*_layer`` constructor arguments are on/off switches.  They are stored
    as ``use_*_layer`` attributes so that they do not shadow the methods of the
    same name.
    """

    # ``date_block_num`` of the month to predict (November 2015).
    TARGET_BLOCK_NUM = 34
    # Name of the regression target column.
    TARGET_COLUMN = 'item_cnt_month'
    # Columns predict_target always needs: the month split, the merge keys and the target.
    _KEY_COLUMNS = ('date_block_num', 'shop_id', 'item_id', 'item_cnt_month')

    def __init__(self, train_data,
                 test_data, metrics=None,
                 model=None,
                 check_nans=True,
                 check_infs=True,
                 feature_importance_layer=True,
                 hyperparametr_optimization_layer=True,
                 explainability_layer=True,
                 error_analysis_layer=True
                 ):
        """Validate the inputs and store the configuration.

        Parameters
        ----------
        train_data : pandas.DataFrame
            Must contain ``date_block_num``, ``shop_id``, ``item_category_id``,
            ``item_id`` and ``item_cnt_month``.
        test_data : pandas.DataFrame
            Must contain ``shop_id``, ``item_id`` and ``ID``.
        metrics : list of str, optional
            Metric names; defaults to ``['rmse']``.
        model : estimator with ``fit``/``predict``, optional
            Defaults to ``DecisionTreeRegressor(max_depth=1, random_state=42)``.
            A fresh instance is created per pipeline so that fitted state is
            never shared between pipelines.
        check_nans, check_infs : bool
            Assert that ``train_data`` has no NaN / infinite values.
        feature_importance_layer, hyperparametr_optimization_layer,
        explainability_layer, error_analysis_layer : bool
            Enable or disable the corresponding layer.

        Raises
        ------
        AssertionError
            If required columns are missing or a requested value check fails.
        """
        self.train_data = train_data
        self.test_data = test_data
        # None sentinels avoid sharing one mutable default between instances.
        self.metrics = ['rmse'] if metrics is None else metrics
        self.model = (DecisionTreeRegressor(max_depth=1, random_state=42)
                      if model is None else model)

        # Layer switches live under ``use_*`` names; storing them under the
        # method names would replace the methods on the instance.
        self.use_feature_importance_layer = feature_importance_layer
        self.use_hyperparametr_optimization_layer = hyperparametr_optimization_layer
        self.use_explainability_layer = explainability_layer
        self.use_error_analysis_layer = error_analysis_layer

        # Check data for valid columns
        assert set([
            'date_block_num',
            'shop_id',
            'item_category_id',
            'item_id',
            'item_cnt_month'
        ]).issubset(train_data.columns),\
            "Invalid data"

        assert set([
            'shop_id',
            'item_id',
            'ID'
        ]).issubset(test_data.columns),\
            "Invalid data"

        # Check for valid values
        if check_nans:
            assert train_data.isna().sum().sum() == 0, 'Data have NaNs'
        if check_infs:
            # Only numeric columns can hold +/-inf.  NaN is not treated as inf,
            # so this check stays independent of ``check_nans``.
            numeric = train_data.select_dtypes(include=[np.number])
            numeric_values = numeric.to_numpy(dtype=float, na_value=np.nan)
            assert not np.isinf(numeric_values).any(), 'Data have Infs'

        self.X = train_data.drop(columns=self.TARGET_COLUMN)
        self.y = train_data[['item_id', 'shop_id', 'item_cnt_month']]

    def calculate_metrics(self, y_pred, y_true):
        """Return the root mean squared error between ``y_true`` and ``y_pred``."""
        # Take the square root explicitly instead of relying on the ``squared``
        # keyword, which is not available in every scikit-learn version.
        rmse = np.sqrt(mse(y_true, y_pred))
        return rmse

    # Predict sales for target month (November 2015)
    def predict_target(self, train_data=None):
        """Fit the model on all other months and predict the target month.

        Parameters
        ----------
        train_data : pandas.DataFrame, optional
            Frame containing at least ``date_block_num``, ``shop_id``,
            ``item_id`` and ``item_cnt_month``; every other column is used as a
            feature.  Defaults to the frame given to the constructor.

        Returns
        -------
        pandas.DataFrame
            ``item_id``, ``shop_id`` and the predicted ``item_cnt_month``
            right-merged onto ``test_data`` (one row per test pair; pairs
            without a prediction get NaN).

        Raises
        ------
        ValueError
            If ``train_data`` has no rows for the target month.
        """
        if train_data is None:
            train_data = self.train_data

        is_target_month = train_data['date_block_num'] == self.TARGET_BLOCK_NUM
        features = train_data.drop(columns=[self.TARGET_COLUMN])
        X_train = features[~is_target_month]
        y_train = train_data.loc[~is_target_month, self.TARGET_COLUMN]
        X_test = features[is_target_month]
        if X_test.empty:
            raise ValueError(
                "train_data has no rows with date_block_num == %s" % self.TARGET_BLOCK_NUM
            )

        model = self.model
        model.fit(X_train, y_train)
        # Predictions are attached positionally, so a non-unique index cannot
        # misalign or duplicate rows.
        predictions = X_test[['item_id', 'shop_id']].assign(
            item_cnt_month=model.predict(X_test)
        )
        return predictions.merge(self.test_data, on=['shop_id', 'item_id'], how='right')\
                          .drop_duplicates()

    def feature_importance_layer(self, selector="Boruta", sample_size=None):
        """Select informative features and return them as a DataFrame.

        Parameters
        ----------
        selector : str
            Only ``"Boruta"`` is implemented; any other value returns ``None``.
        sample_size : int, optional
            Number of leading training rows used to fit the selector.
            ``None`` (default) uses all of them.

        Returns
        -------
        pandas.DataFrame or None
            The important feature columns of ``self.train_data`` (all rows,
            target column excluded), or ``None`` for an unknown selector.
        """
        if selector == "Boruta":
            # Optional dependency, imported only when the selector is used.
            from boruta import BorutaPy

            # select sample of data (target month and rows with NaNs excluded)
            history = self.train_data[
                self.train_data['date_block_num'] != self.TARGET_BLOCK_NUM
            ].dropna()
            X = history.drop(columns=self.TARGET_COLUMN).iloc[:sample_size]
            y = history[self.TARGET_COLUMN].iloc[:sample_size]

            # Boruta may reference the ``np.int``/``np.float``/``np.bool``
            # aliases that newer NumPy removed; restore only the missing ones.
            for alias, builtin_type in (('int', int), ('float', float), ('bool', bool)):
                if not hasattr(np, alias):
                    setattr(np, alias, builtin_type)

            # init selector
            feat_selector = BorutaPy(RandomForestRegressor(max_depth=5, n_jobs=-1, n_estimators=20),
                                     n_estimators=20,
                                     verbose=0,
                                     max_iter=20,
                                     random_state=42,
                                     )

            # fit selector
            feat_selector.fit(X.values, y.values)

            # extract useful features
            support = np.asarray(feat_selector.support_, dtype=bool)
            feature_names = X.columns

            # One-row heat map: works for any number of features.
            plt.imshow(support.reshape(1, -1), aspect='auto');
            plt.xticks(range(len(feature_names)), feature_names, rotation=90)
            plt.yticks([])
            plt.title("Feature selection")
            plt.show()

            # save info about useful/useless features
            feature_importance_report = {
                "important_columns": feature_names[support],
                "unimportant_columns": feature_names[~support]
            }
            self.feature_importance_report = feature_importance_report
            print(feature_importance_report)

            important_features = self.train_data.loc[:, feature_names[support]]
            return important_features

    def hyperparametr_optimization_layer(self, optimazer="Grid"):
        """Placeholder for hyper-parameter search; no optimizer is implemented yet."""
        if optimazer == "Optuna":
            pass
        if optimazer == "Hyperopt":
            pass

    def explainability_layer(self):
        """Placeholder for model explanation; not implemented yet."""
        pass

    def error_analysis_layer(self):
        """Placeholder for error analysis; not implemented yet."""
        pass

    def evaluate(self):
        """Run the enabled layers and return the target-month predictions."""
        # optimal_hyperparametres = self.hyperparametr_optimization_layer()
        train_data = self.train_data
        if self.use_feature_importance_layer:
            important_features = self.feature_importance_layer()
            if important_features is not None:
                # Keep the selected features plus the columns predict_target
                # needs for the month split, the merge and the target.
                selected = set(important_features.columns) | set(self._KEY_COLUMNS)
                train_data = train_data[[col for col in train_data.columns if col in selected]]
        predictions = self.predict_target(train_data)
        # model_explanation = self.explainability_layer()
        # error_analysis = self.error_analysis_layer()
        return predictions

In [14]:
%%time
# Build the pipeline around a depth-2 decision tree; the NaN check is skipped.
test_pipeline = Pipeline(
    train_data=data['train_data'],
    test_data=data['test_data'],
    metrics=['rmse'],
    model=DecisionTreeRegressor(max_depth=2, random_state=42),
    check_nans=False,
    feature_importance_layer=True,
)

CPU times: total: 1.59 s
Wall time: 2.05 s


In [16]:
%%time
# Run the pipeline; the returned predictions are displayed as the cell output.
test_pipeline.evaluate()