### Validation schema

In [1]:
# linear algebra
import numpy as np

#working with data in table structers
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# data visualization
import seaborn as sns
import matplotlib.pyplot as plt


# working with files
import sys
import os

# to off warnings
import warnings
warnings.filterwarnings('ignore')

# validation schema 
import time
from datetime import timedelta, datetime
from sklearn.model_selection import TimeSeriesSplit
from collections import defaultdict
from sklearn.metrics import mean_squared_error as mse
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# using ray engine for parallel calculation(for oprimization)
%env MODIN_ENGINE=ray
import modin.pandas as mpd

### Read data

In [3]:
# add data path to sys.path 
train_test_data_path = "C:\\Repository\\DS-Intership-data\\train_test_data\\"
sys.path.append(train_test_data_path)

# initiate dict for data
to_read_train_test_data = {}

# fill to_read
for dir_name, _, files in os.walk(train_test_data_path):
    for file in files:
        to_read_train_test_data[file] = dir_name + file

In [4]:
# check to_read
to_read_train_test_data

{'sample_submission.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\sample_submission.csv',
 'test_data.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\test_data.csv',
 'train_data.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\train_data.csv'}

In [5]:
%%time
data = {}
# read data
for file, path in to_read_train_test_data.items():
    data[file.split('.')[0]] = pd.read_csv(os.path.join(os.path.dirname(path), file), index_col=0)

CPU times: total: 10.2 s
Wall time: 10.5 s


### Validation schema creation

In [20]:
"""
    The following indexes will be used:
      date_block_num
      shop_id
      item_category_id
      item_id
      item_cnt_month
      
    Concept:
    Apply expanding window validation
    Monthly predictions
"""

class Validation:
    def __init__(self, data, metrics=['rmse'], n_splits=10):
        self.data = data
        self.metrics = metrics
        self.n_splits = n_splits

        assert set([
                    'date_block_num',
                    'shop_id',
                    'item_category_id',
                    'item_id',
                    'item_cnt_month'
                   ]).issubset(data.columns),\
                "Invalid data"
        
    def calculate_metrics(self, y_pred, y_true):
        rmse = mse(y_true, y_pred, squared=True)
        return rmse

    def evaluate(self, model=RandomForestRegressor(max_depth=5, n_estimators=5, random_state=42)):
        eval_report = {}
        tscv = TimeSeriesSplit(n_splits=self.n_splits)
        data['train_data'] = data['train_data'].dropna()
        X = data['train_data'].drop(columns='item_cnt_month')
        y = data['train_data'].item_cnt_month
        step = 0
        for train, test in tscv.split(y):
            step += 1

            # Split data step
            y_tr, y_ts = y.iloc[train].values, y.iloc[test].values
            X_tr, X_ts = X.iloc[train].values, X.iloc[test].values

            # Train step
            rng = np.random.RandomState(42)
            model = model
            model.fit(X_tr, y_tr)

            # Evaluation step
            y_tr_pr = model.predict(X_tr)
            y_ts_pr = model.predict(X_ts)
            
            # Metrics
            eval_report["step"+str(step)] = {
                "train\test limits" : f"TRAIN: from {train.min()} to  {train.max()}  (size: {train.max() - train.min()} ) " +
                                      f"TEST: from {test.min()} to  {test.max()}  (size: {test.max() - test.min()} )",
                "error" : [self.calculate_metrics(y_tr_pr, y_tr), self.calculate_metrics(y_ts_pr, y_ts)],
                "feature_importance" : model.feature_importances_,
                "__________________" : "_________________________________________________________________________________"
            }
        return eval_report

In [21]:
%%time
val = Validation(data['train_data'])
val.evaluate()

CPU times: total: 8min 37s
Wall time: 9min 24s


{'step1': {'train\test limits': 'TRAIN: from 0 to  768538  (size: 768538 ) TEST: from 768539 to  1537070  (size: 768531 )',
  'error': [0.8865417115365465, 2.716969283081017],
  'feature_importance': array([3.09234013e-03, 0.00000000e+00, 7.68077350e-04, 4.20136059e-03,
         1.30872375e-02, 0.00000000e+00, 1.25157869e-03, 2.14183098e-03,
         0.00000000e+00, 0.00000000e+00, 7.15144810e-04, 8.75384390e-01,
         1.04999068e-03, 6.30949907e-02, 2.08190351e-02, 1.03446718e-03,
         3.26573926e-03, 1.00938171e-02]),
  '__________________': '_________________________________________________________________________________'},
 'step2': {'train\test limits': 'TRAIN: from 0 to  1537070  (size: 1537070 ) TEST: from 1537071 to  2305602  (size: 768531 )',
  'error': [1.225768874098291, 1.9906718851848355],
  'feature_importance': array([1.70830842e-03, 0.00000000e+00, 3.85108747e-03, 0.00000000e+00,
         9.11041125e-03, 0.00000000e+00, 4.88950110e-03, 2.93348027e-04,
         1