### Validation schema

In [1]:
# linear algebra
import numpy as np

#working with data in table structers
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# data visualization
import seaborn as sns
import matplotlib.pyplot as plt


# working with files
import sys
import os

# to off warnings
import warnings
warnings.filterwarnings('ignore')

# validation schema 
import time
from datetime import timedelta, datetime
from sklearn.model_selection import TimeSeriesSplit
from collections import defaultdict
from sklearn.metrics import mean_squared_error as mse
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# using ray engine for parallel calculation(for oprimization)
%env MODIN_ENGINE=ray
import modin.pandas as mpd

env: MODIN_ENGINE=ray


### Read data

In [3]:
# add data path to sys.path 
train_test_data_path = "C:\\Repository\\DS-Intership-data\\train_test_data\\"
sys.path.append(train_test_data_path)

# initiate dict for data
to_read_train_test_data = {}

# fill to_read
for dir_name, _, files in os.walk(train_test_data_path):
    for file in files:
        to_read_train_test_data[file] = dir_name + file

In [4]:
# check to_read
to_read_train_test_data

{'sample_submission.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\sample_submission.csv',
 'test_data.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\test_data.csv',
 'train_data.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\train_data.csv'}

In [5]:
%%time
data = {}
# read data
for file, path in to_read_train_test_data.items():
    data[file.split('.')[0]] = pd.read_csv(os.path.join(os.path.dirname(path), file), index_col=0)

CPU times: total: 10.1 s
Wall time: 10.5 s


### Validation schema creation

In [152]:
"""
    The following indexes will be used:
      date_block_num
      shop_id
      item_category_id
      item_id
      item_cnt_month
      
    Concept:
    Apply expanding window validation (except last month - target of competition)
    Monthly predictions
"""

class Validation:
    def __init__(self, train_data,
                 test_data, metrics=['rmse'],
                 n_splits=2,
                 model=DecisionTreeRegressor(max_depth=1, random_state=42)
                ):
        self.train_data = train_data
        self.test_data = test_data
        self.metrics = metrics
        self.n_splits = n_splits
        self.model = model

        # Check data for valid columns
        assert set([
                    'date_block_num',
                    'shop_id',
                    'item_category_id',
                    'item_id',
                    'item_cnt_month'
                   ]).issubset(train_data.columns),\
                "Invalid data"
        
        assert set([
                    'shop_id',
                    'item_id',
                    'ID'
                    ]).issubset(test_data.columns),\
                "Invalid data"
        
        # Check for valid variables
        assert train_data.isna().sum().sum() == 0, 'Data have NaNs'
        assert np.isfinite(train_data).sum().sum() != 0, 'Data have Infs'
        
        self.X = train_data.drop(columns='item_cnt_month')
        self.y = train_data[['item_id', 'shop_id', 'item_cnt_month']]
        
        
    def calculate_metrics(self, y_pred, y_true):
        rmse = mse(y_true, y_pred, squared=True)
        return rmse
    
    # Predict sales for November 2015
    def fit_predict(self, predictions_by_ID = True):
        pass

    def evaluate(self, predictions_by_ID = True):
        eval_report = {}
        tscv = TimeSeriesSplit(n_splits=self.n_splits)
        step = 0
        for train, test in tscv.split(self.y):
            step += 1

            # Split data step
            y_tr, y_ts = self.y.iloc[train], self.y.iloc[test]
            X_tr, X_ts = self.X.iloc[train], self.X.iloc[test]

            # Train step
            rng = np.random.RandomState(42)
            model = self.model
            model.fit(X_tr.values, y_tr.item_cnt_month.values)

            # Evaluation step
            y_tr_pr = pd.DataFrame(index=X_tr.index, data=model.predict(X_tr.values), columns=['item_cnt_month'])
            y_ts_pr = pd.DataFrame(index=X_ts.index, data=model.predict(X_ts.values), columns=['item_cnt_month'])
            
            # Extract step (predictions by ID)
            if predictions_by_ID == True:
                nan_report = {}
                y_tr = y_tr.merge(self.test_data, on=['shop_id', 'item_id'], how='right')
                nan_report['y_train'] = f"{y_tr.item_cnt_month.isna().sum() / y_tr.shape[0] * 100} %"
                y_tr = y_tr.fillna(0)
                y_ts = y_ts.merge(self.test_data, on=['shop_id', 'item_id'], how='right')
                nan_report['y_test'] = f"{y_ts.item_cnt_month.isna().sum() / y_ts.shape[0] * 100} %"
                y_ts = y_ts.fillna(0)
                y_tr_pr = X_tr.join(y_tr_pr)[['item_id', 'shop_id', 'item_cnt_month']].\
                                            merge(self.test_data, on=['shop_id', 'item_id'], how='right')
                nan_report['y_train_pred'] = f"{y_tr_pr.item_cnt_month.isna().sum() / y_tr_pr.shape[0] * 100} %"
                y_tr_pr = y_tr_pr.fillna(0)
                y_ts_pr = X_ts.join(y_ts_pr)[['item_id', 'shop_id', 'item_cnt_month']].\
                                            merge(self.test_data, on=['shop_id', 'item_id'], how='right')
                nan_report['y_test_pred'] = f"{y_ts_pr.item_cnt_month.isna().sum() / y_ts_pr.shape[0] * 100} %"
                y_ts_pr = y_ts_pr.fillna(0)
            
            # Metrics calucaltion step
            eval_report["step"+str(step)] = {
                "train\test limits" : f"TRAIN: from {train.min()} to  {train.max()}  (size: {train.max() - train.min()} ) " +
                                      f"TEST: from {test.min()} to  {test.max()}  (size: {test.max() - test.min()} )",
                "error" : [self.calculate_metrics(y_tr_pr.item_cnt_month, y_tr.item_cnt_month),
                           self.calculate_metrics(y_ts_pr.item_cnt_month, y_ts.item_cnt_month)],
                "feature_importance" : model.feature_importances_,
                "nan_report" : nan_report,
                "__________________" : "_________________________________________________________________________________"
            }
            
        return eval_report, y_ts_pr       

In [153]:
%%time
val = Validation(train_data=data['train_data'],
                 test_data=data['test_data'],
                 n_splits=5,
                 model = DecisionTreeRegressor(max_depth=1, random_state=42))

CPU times: total: 562 ms
Wall time: 567 ms


In [154]:
%%time
val.evaluate()

CPU times: total: 25.8 s
Wall time: 26.2 s


({'step1': {'train\test limits': 'TRAIN: from 0 to  1408978  (size: 1408978 ) TEST: from 1408979 to  2817954  (size: 1408975 )',
   'error': [3.958650855160097, 13.684399963683227],
   'feature_importance': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
          0.]),
   'nan_report': {'y_train': '52.728930571837964 %',
    'y_test': '46.75204073568098 %',
    'y_train_pred': '52.728930571837964 %',
    'y_test_pred': '46.75204073568098 %'},
   '__________________': '_________________________________________________________________________________'},
  'step2': {'train\test limits': 'TRAIN: from 0 to  2817954  (size: 2817954 ) TEST: from 2817955 to  4226930  (size: 1408975 )',
   'error': [7.500121799609886, 7.089370749074403],
   'feature_importance': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0.]),
   'nan_report': {'y_train': '32.206109978031144 %',
    'y_test': '36.75200293560027 %',
    'y_train_pred': '32.206