### Validation schema

In [1]:
# linear algebra
import numpy as np

#working with data in table structers
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# data visualization
import seaborn as sns
import matplotlib.pyplot as plt


# working with files
import sys
import os

# to off warnings
import warnings
warnings.filterwarnings('ignore')

# validation schema 
import time
from datetime import timedelta, datetime
from sklearn.model_selection import TimeSeriesSplit
from collections import defaultdict
from sklearn.metrics import mean_squared_error as mse
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# # using ray engine for parallel calculation(for oprimization)
# %env MODIN_ENGINE=ray
# import modin.pandas as mpd

### Read data

In [3]:
# add data path to sys.path 
train_test_data_path = "C:\\Repository\\DS-Intership-data\\train_test_data\\"
sys.path.append(train_test_data_path)

# initiate dict for data
to_read_train_test_data = {}

# fill to_read
for dir_name, _, files in os.walk(train_test_data_path):
    for file in files:
        to_read_train_test_data[file] = dir_name + file

In [4]:
# check to_read
to_read_train_test_data

{'sample_submission.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\sample_submission.csv',
 'test_data.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\test_data.csv',
 'train_data.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\train_data.csv'}

In [5]:
%%time
data = {}
# read data
for file, path in to_read_train_test_data.items():
    data[file.split('.')[0]] = pd.read_csv(os.path.join(os.path.dirname(path), file), index_col=0)

CPU times: total: 5.36 s
Wall time: 5.45 s


In [6]:
# need to fix in data preprocessing (bag of memory usage reduce)
data['train_data'] = data['train_data'][data['train_data'].profit != float('inf')]

In [7]:
# to select predicts we only need for submision
data['train_data'].merge(data['test_data'], on=['item_id', 'shop_id'], how='left').ID

0                NaN
1                NaN
2                NaN
3                NaN
4           20995.00
              ...   
10902713   149575.00
10902714   148799.00
10902715   149242.00
10902716   152328.00
10902717   148377.00
Name: ID, Length: 10902718, dtype: float64

### Validation schema creation

In [8]:
"""
    The following indexes will be used:
      date_block_num
      shop_id
      item_category_id
      item_id
      item_cnt_month
      
    Concept:
    Apply expanding window validation
    Monthly predictions
"""

class Validation:
    def __init__(self, data, metrics=['rmse'], n_splits=10):
        self.data = data
        self.metrics = metrics
        self.n_splits = n_splits

        assert set([
                    'date_block_num',
                    'shop_id',
                    'item_category_id',
                    'item_id',
                    'item_cnt_month'
                   ]).issubset(data.columns),\
                "Invalid data"
        
    def calculate_metrics(self, y_pred, y_true):
        rmse = mse(y_true, y_pred, squared=True)
        return rmse

    def evaluate(self, model=DecisionTreeRegressor(max_depth=5)):
        eval_report = {}
        tscv = TimeSeriesSplit(n_splits=self.n_splits)
        X = data['train_data'].drop(columns='item_cnt_month')
        y = data['train_data'].item_cnt_month
        step = 0
        for train, test in tscv.split(y):
            step += 1

            # Split data step
            y_tr, y_ts = y.iloc[train].values, y.iloc[test].values
            X_tr, X_ts = X.iloc[train].values, X.iloc[test].values

            # Train step
            rng = np.random.RandomState(42)
            model = model
            model.fit(X_tr, y_tr)

            # Evaluation step
            y_tr_pr = model.predict(X_tr)
            y_ts_pr = model.predict(X_ts)
            
            # Metrics
            eval_report["step"+str(step)] = {
                "train\test limits" : f"TRAIN: from {train.min()} to  {train.max()}  (size: {train.max() - train.min()} ) " +
                                      f"TEST: from {test.min()} to  {test.max()}  (size: {test.max() - test.min()} )",
                "error" : [self.calculate_metrics(y_tr_pr, y_tr), self.calculate_metrics(y_ts_pr, y_ts)],
                "feature_importance" : model.feature_importances_,
                "__________________" : "_________________________________________________________________________________"
            }
        return eval_report

In [9]:
%%time
val = Validation(data['train_data'])
val.evaluate()

CPU times: total: 1min 12s
Wall time: 1min 19s


{'step1': {'train\test limits': 'TRAIN: from 0 to  991157  (size: 991157 ) TEST: from 991158 to  1982313  (size: 991155 )',
  'error': [2.7755703599570585, 2.877392044566422],
  'feature_importance': array([4.51336510e-02, 1.43183142e-01, 3.38873785e-01, 6.66430652e-03,
         0.00000000e+00, 0.00000000e+00, 5.01583643e-02, 2.26861119e-02,
         1.41614549e-07, 3.33613351e-01, 5.96871475e-02, 0.00000000e+00]),
  '__________________': '_________________________________________________________________________________'},
 'step2': {'train\test limits': 'TRAIN: from 0 to  1982313  (size: 1982313 ) TEST: from 1982314 to  2973469  (size: 991155 )',
  'error': [2.7439262660143213, 5.836563234664521],
  'feature_importance': array([0.01768405, 0.13007455, 0.22700182, 0.49410234, 0.        ,
         0.        , 0.        , 0.06125952, 0.02491591, 0.01919449,
         0.02576732, 0.        ]),
  '__________________': '________________________________________________________________________