### Advanced modeling

In [9]:
# linear algebra
import numpy as np

# working with data in table structures
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# data visualization
import seaborn as sns
import matplotlib.pyplot as plt


# working with files
import sys
import os

# turn off warnings
import warnings
warnings.filterwarnings('ignore')

# validation schema 
import time
from datetime import timedelta, datetime
from sklearn.model_selection import TimeSeriesSplit
from collections import defaultdict
from sklearn.metrics import mean_squared_error as mse
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [10]:
# # using ray engine for parallel calculation (for optimization)
# %env MODIN_ENGINE=ray
# import modin.pandas as mpd

### Read data

In [11]:
# Directory that holds the train/test CSV files.
train_test_data_path = "C:\\Repository\\DS-Intership-data\\train_test_data\\"
# NOTE(review): sys.path only influences module imports, not data loading;
# the append is kept so the cell's side effects stay unchanged.
sys.path.append(train_test_data_path)

# Maps each discovered file name to its full path.
to_read_train_test_data = {}

# os.walk yields sub-directory names WITHOUT a trailing separator, so the
# path is built with os.path.join instead of plain string concatenation
# (concatenation only worked for the root, which happens to end in "\\").
for dir_name, _, files in os.walk(train_test_data_path):
    for file in files:
        to_read_train_test_data[file] = os.path.join(dir_name, file)

In [12]:
# check to_read
to_read_train_test_data

{'sample_submission.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\sample_submission.csv',
 'test_data.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\test_data.csv',
 'train_data.csv': 'C:\\Repository\\DS-Intership-data\\train_test_data\\train_data.csv'}

In [13]:
%%time
# Load every discovered CSV into `data`, keyed by the file name up to its
# first dot; the first CSV column is used as the DataFrame index.
data = {}
for file, path in to_read_train_test_data.items():
    table_name = file.split('.')[0]
    csv_path = os.path.join(os.path.dirname(path), file)
    data[table_name] = pd.read_csv(csv_path, index_col=0)

CPU times: total: 41 s
Wall time: 47.6 s


### Machine learning pipeline modeling

In [14]:
class Pipeline:
    """Fit a regressor on monthly sales history and predict a target month.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain ``date_block_num``, ``shop_id``, ``item_category_id``,
        ``item_id`` and ``item_cnt_month``. Rows whose ``date_block_num``
        equals the target block form the prediction set; all other rows are
        used for fitting.
    test_data : pandas.DataFrame
        Must contain ``shop_id``, ``item_id`` and ``ID``.
    metrics : list of str, optional
        Stored on the instance as ``self.metrics``; defaults to ``['rmse']``.
    model : object with ``fit`` / ``predict``, optional
        Defaults to a fresh ``DecisionTreeRegressor(max_depth=1,
        random_state=42)`` created per instance.
    check_nans : bool
        When true, assert that ``train_data`` contains no NaN values.
    check_infs : bool
        When true, assert that the numeric columns of ``train_data`` contain
        no +/-inf values.
    feature_importance_layer, hyperparametr_optimization_layer,
    explainability_layer, error_analysis_layer : bool
        On/off flags for the (not yet implemented) layers. They are stored in
        ``self.enabled_layers`` so that they do not overwrite the methods of
        the same name.

    Raises
    ------
    AssertionError
        If a required column is missing or an enabled data check fails.
    """

    def __init__(self, train_data,
                 test_data, metrics=None,
                 model=None,
                 check_nans=True,
                 check_infs=True,
                 feature_importance_layer=True,
                 hyperparametr_optimization_layer=True,
                 explainability_layer=True,
                 error_analysis_layer=True
                 ):
        self.train_data = train_data
        self.test_data = test_data
        # Defaults are created per instance: a default written in the
        # signature would be a single object shared (and mutated by fit)
        # across every Pipeline that relies on it.
        self.metrics = ['rmse'] if metrics is None else metrics
        self.model = (
            DecisionTreeRegressor(max_depth=1, random_state=42)
            if model is None else model
        )
        # Keep the flags in a dict: assigning them to attributes named like
        # the layer methods would shadow those methods with plain booleans.
        self.enabled_layers = {
            'feature_importance_layer': feature_importance_layer,
            'hyperparametr_optimization_layer': hyperparametr_optimization_layer,
            'explainability_layer': explainability_layer,
            'error_analysis_layer': error_analysis_layer,
        }

        # Check data for valid columns
        assert set([
            'date_block_num',
            'shop_id',
            'item_category_id',
            'item_id',
            'item_cnt_month'
        ]).issubset(train_data.columns),\
            "Invalid data"

        assert set([
            'shop_id',
            'item_id',
            'ID'
        ]).issubset(test_data.columns),\
            "Invalid data"

        # Check for valid variables
        if check_nans:
            assert train_data.isna().sum().sum() == 0, 'Data have NaNs'
        if check_infs:
            # Only numeric columns can hold infinities. np.isinf (rather than
            # "not isfinite") keeps NaNs out of this check, so that
            # check_nans=False still lets NaN-containing data through.
            numeric_values = train_data.select_dtypes(include='number')\
                                       .to_numpy(dtype=float, na_value=np.nan)
            assert not np.isinf(numeric_values).any(), 'Data have Infs'

        self.X = train_data.drop(columns='item_cnt_month')
        self.y = train_data[['item_id', 'shop_id', 'item_cnt_month']]

    def calculate_metrics(self, y_pred, y_true):
        """Return the root mean squared error between ``y_true`` and ``y_pred``."""
        # The square root is taken explicitly instead of relying on the
        # ``squared`` keyword, which newer scikit-learn releases dropped.
        rmse = np.sqrt(mse(y_true, y_pred))
        return rmse

    # Predict sales for target month (November 2015)
    def predict_target(self, predictions_by_ID=True, target_block=34):
        """Fit ``self.model`` and predict ``item_cnt_month`` for one month.

        Parameters
        ----------
        predictions_by_ID : bool
            If true, return a DataFrame with the predictions right-merged
            onto ``self.test_data`` by ``shop_id`` / ``item_id`` (duplicates
            dropped). Otherwise return the raw output of ``model.predict``.
        target_block : int
            ``date_block_num`` value of the month to predict; every other
            row of ``train_data`` is used for fitting.

        Returns
        -------
        pandas.DataFrame or array-like
        """
        is_target = self.train_data['date_block_num'] == target_block
        features = self.train_data.drop(columns=['item_cnt_month'])

        X_train = features[~is_target]
        y_train = self.train_data.loc[~is_target, 'item_cnt_month']
        X_test = features[is_target]

        model = self.model
        model.fit(X_train, y_train)
        # Predict on the DataFrame itself so fit and predict receive the
        # same kind of input (the original mixed DataFrame and ndarray).
        predicted = model.predict(X_test)

        if not predictions_by_ID:
            return predicted

        forecast = X_test[['item_id', 'shop_id']].assign(item_cnt_month=predicted)
        return forecast.merge(self.test_data, on=['shop_id', 'item_id'], how='right')\
                       .drop_duplicates()

    def feature_importance_layer(self):
        """Placeholder: not implemented yet."""
        pass

    def hyperparametr_optimization_layer(self):
        """Placeholder: not implemented yet."""
        pass

    def explainability_layer(self):
        """Placeholder: not implemented yet."""
        pass

    def error_analysis_layer(self):
        """Placeholder: not implemented yet."""
        pass

    def evaluate(self):
        """Run the pipeline and return the target-month predictions by ID."""
        # optimal_hyperparametres = self.hyperparametr_optimization_layer()
        # important_features = self.feature_importance_layer()
        predictions = self.predict_target() # optimal_hyperparametres, important_features)
        # model_explanation = self.explainability_layer()
        # error_analysis = self.error_analysis_layer()
        return predictions

In [15]:
%%time
# Build the pipeline on the loaded train/test frames with a depth-2 tree;
# the NaN check is switched off for this run.
test_pipeline = Pipeline(
    train_data=data['train_data'],
    test_data=data['test_data'],
    metrics=['rmse'],
    model=DecisionTreeRegressor(max_depth=2, random_state=42),
    check_nans=False,
)

CPU times: total: 2.05 s
Wall time: 2.21 s


In [16]:
%%time
# Fit the model and show the target-month predictions; the returned
# DataFrame is rendered as this cell's output.
test_pipeline.evaluate()

CPU times: total: 1min 2s
Wall time: 1min 24s


Unnamed: 0,item_id,shop_id,item_cnt_month,ID,date_block_num
0,5037,5,0.33,0,34
2,5320,5,0.33,1,34
3,5233,5,0.33,2,34
5,5232,5,0.33,3,34
7,5268,5,0.33,4,34
...,...,...,...,...,...
309449,18454,45,0.33,214195,34
309450,16188,45,0.33,214196,34
309451,15757,45,0.33,214197,34
309452,19648,45,0.33,214198,34
