In [1]:
# Import library and dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import base

import warnings
warnings.filterwarnings('ignore')

In [2]:
raw = pd.read_csv('dss_mock_1.csv')

In [3]:
raw.describe()

Unnamed: 0,date,food_1,food_2,food_3,food_4,is_holiday
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1571316000.0,587.609,343.696,331.703,226.048,0.188
std,2286453.0,24.6876,18.659415,19.067731,15.228053,0.390908
min,1567297000.0,496.0,293.0,277.0,181.0,0.0
25%,1569300000.0,571.0,331.0,319.0,216.0,0.0
50%,1571338000.0,588.0,344.0,332.0,226.0,0.0
75%,1573352000.0,604.0,356.0,344.0,236.0,0.0
max,1575155000.0,672.0,401.0,396.0,275.0,1.0


In [4]:
# cleaning dataset
# convert 'epoch' date into appropriate datetime
from datetime import datetime
tf = raw.copy()
tf['date_conv'] = raw['date'].apply(lambda x : datetime.fromtimestamp(x).strftime("%Y-%m-%d"))

In [5]:
tf.drop('date', axis=1, inplace=True)

In [6]:
tf['day'] = tf['date_conv'].apply(lambda x : x.split('-')[2]).astype('int64')
tf['month'] = tf['date_conv'].apply(lambda x: x.split('-')[1]).astype('int64')
tf['year'] = tf['date_conv'].apply(lambda x: x.split('-')[0]).astype('int64')
tf = tf.sort_values(by='date_conv').reset_index(drop=True)

In [7]:
tf.drop_duplicates(inplace=True, keep='first', subset='date_conv')

In [8]:
tf = tf.reset_index(drop=True)
tf.drop('date_conv', axis=1, inplace=True)

In [9]:
tf_sales = tf.drop(['is_holiday','day','month','year'], axis=1)
tf_date = tf.drop(['food_1','food_2','food_3','food_4'],axis=1)

In [10]:
tf_sales_c = tf_sales.T.unstack().reset_index(level=1, name='sales').rename(columns={'level_1':'menu'})
tf_sales_c['menu'] = tf_sales_c['menu'].str.extract('(\d+)', expand=False).astype(int)
tf_complete = pd.concat([tf_sales_c,tf_date],axis=1, join='inner').reset_index(drop=True)

In [11]:
tf_complete

Unnamed: 0,menu,sales,is_holiday,day,month,year
0,1,584,0,1,9,2019
1,2,354,0,1,9,2019
2,3,355,0,1,9,2019
3,4,208,0,1,9,2019
4,1,610,1,2,9,2019
5,2,360,1,2,9,2019
6,3,346,1,2,9,2019
7,4,231,1,2,9,2019
8,1,611,0,3,9,2019
9,2,323,0,3,9,2019


# To Supervised

In [12]:
class ToSupervised(base.BaseEstimator,base.TransformerMixin):
# to supervised
    def __init__(self,col,groupCol,numLags,dropna=False):

        self.col = col
        self.groupCol = groupCol
        self.numLags = numLags
        self.dropna = dropna

    def fit(self,X,y=None):
        self.X = X
        return self

    def transform(self,X):
        tmp = self.X.copy()
        for i in range(1,self.numLags+1):
            tmp[str(i)+'_days_ago'+"_"+self.col] = tmp.groupby([self.groupCol])[self.col].shift(i) 

        if self.dropna:
            tmp = tmp.dropna()
            tmp = tmp.reset_index(drop=True)



        return tmp

In [13]:
class ToSupervisedDiff(base.BaseEstimator,base.TransformerMixin):
    
    def __init__(self,col,groupCol,numLags,dropna=False):
        
        self.col = col
        self.groupCol = groupCol
        self.numLags = numLags
        self.dropna = dropna
        
    def fit(self,X,y=None):
        self.X = X
        return self
    
    def transform(self,X):
        tmp = self.X.copy()
        for i in range(1,self.numLags+1):
            tmp[str(i)+'_days_ago_diff_'+"_"+self.col] = tmp.groupby([self.groupCol])[self.col].diff(i) 
            
        if self.dropna:
            tmp = tmp.dropna()
            tmp = tmp.reset_index(drop=True)
            
        return tmp

# Time Series K-Fold

In [14]:
from itertools import chain
class Kfold_time(object):
    
    def __init__(self,**options):
        
        
        self.target     = options.pop('target', None)
        self.date_col   = options.pop('date_col', None)
        self.date_init  = options.pop('date_init', None)
        self.date_final = options.pop('date_final', None)

        if options:
            raise TypeError("Invalid parameters passed: %s" % str(options))
            
        if ((self.target==None )| (self.date_col==None )| (self.date_init==None ) | (self.date_final==None )):
            raise TypeError("Incomplete inputs")
    
    def _train_test_split_time(self,X):
        n_arrays = len(X)
        if n_arrays == 0:
            raise ValueError("At least one array required as input")

        for i in range(self.date_init,self.date_final):

            train = X[X[self.date_col] < i]
            val   = X[X[self.date_col] == i]

            X_train, X_test = train.drop([self.target], axis=1), val.drop([self.target], axis=1)
            y_train, y_test = train[self.target].values, val[self.target].values

            yield X_train, X_test, y_train, y_test

    
    def split(self,X):
        cv_t = self._train_test_split_time(X)
        return chain(cv_t)

# Metric: RMSLE

In [15]:
def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_log_error(ytrue, ypred))

# Baseline Estimator

In [16]:
class BaseEstimator(base.BaseEstimator, base.RegressorMixin):
    def __init__(self, predCol):
        """
            As a base model we assume the number of sales last week and this week are the same
            Input: 
                    predCol: l-week ago sales
        """
        self.predCol = predCol

        
    def fit(self, X, y):
        return self


    def predict(self, X):
        prediction = X[self.predCol].values
        return prediction

    def score(self, X, y,scoring):
        
        prediction = self.predict(X)
    
        error =scoring(y, prediction)# np.sqrt(mean_squared_log_error(y, prediction))
        return error

# Time Series Regression

In [50]:
class TimeSeriesRegressor(base.BaseEstimator, base.RegressorMixin):
    
    def __init__(self,model,cv,scoring,verbosity=True):
        self.model = model
        self.cv = cv
        self.verbosity = verbosity
        self.scoring = scoring 
        
            
    def fit(self,X,y=None):
        return self
        
    
#     def predict(self,X=None):
        
#         pred = {}
#         for indx,fold in enumerate(self.cv.split(X)):

#             X_train, X_test, y_train, y_test = fold    
#             self.model.fit(X_train, y_train)
#             pred[str(indx)+'_fold'] = self.model.predict(X_test)
            
#         prediction = pd.DataFrame(pred)
    
#         return prediction
    
    def predict(self,X=None):
        pred = {}
        x_train, x_test, y_train, y_test = fold
        self.model.fit(x_train, y_train)
        pred = self.model.predict(x_test)
        prediction = pd.DataFrame(pred)
        return prediction

    def score(self,X,y=None):


        errors = []
        for indx,fold in enumerate(self.cv.split(X)):

            X_train, X_test, y_train, y_test = fold    
            self.model.fit(X_train, y_train)
            prediction = self.model.predict(X_test)
            error = self.scoring(y_test, prediction)
            errors.append(error)

            if self.verbosity:
                print("Fold: {}, Error: {:.4f}".format(indx,error))

        if self.verbosity:
            print('Total Error {:.4f}'.format(np.mean(errors)))

        return errors

In [37]:
class TimeSeriesRegressorLog(base.BaseEstimator, base.RegressorMixin):
    
    def __init__(self,model,cv,scoring,verbosity=True):
        self.model = model
        self.cv = cv
        self.verbosity = verbosity
        self.scoring = scoring
        
            
    def fit(self,X,y=None):
        return self
        
    
    def predict(self,X=None):
        
        pred = {}
        for indx,fold in enumerate(self.cv.split(X)):

            X_train, X_test, y_train, y_test = fold    
            self.model.fit(X_train, y_train)
            pred[str(indx)+'_fold'] = self.model.predict(X_test)
            
        prediction = pd.DataFrame(pred)
    
        return prediction

    
    def score(self,X,y=None):#**options):


        errors = []
        for indx,fold in enumerate(self.cv.split(X)):

            X_train, X_test, y_train, y_test = fold    
            self.model.fit(X_train, np.log1p(y_train))
            prediction = np.expm1(self.model.predict(X_test))
            error = self.scoring(y_test, prediction)
            errors.append(error)

            if self.verbosity:
                print("Fold: {}, Error: {:.4f}".format(indx,error))

        if self.verbosity:
                print('Total Error {:.4f}'.format(np.mean(errors)))

        return errors

# Steps Tuning

In [19]:
def getDataFramePipeline(i):
    steps = [(str(i)+'_step',ToSupervised('sales','menu',i))]
    for j in range(1,i+1):
        if i==j:

            pp = (str(j)+'_step_diff',
                  ToSupervisedDiff(str(i)+'_days_ago_sales','menu',1,dropna=True))
            steps.append(pp)
        else:

            pp = (str(j)+'_step_diff',
                  ToSupervisedDiff(str(i)+'_days_ago_sales','menu',1))
            steps.append(pp)
            
    return steps

In [20]:
from tqdm import tqdm
def stepsTune(X,model,num_steps,init=1):
    scores = []
    for i in tqdm(range(init,num_steps+1)):
        steps = []
        steps.extend(getDataFramePipeline(i))
        steps.append(('predic_1',model))
        super_ = Pipeline(steps).fit(X)
        score_ = np.mean(super_.score(X))
        scores.append((i,score_))
        
    return scores

# Tune Hyperparameter

In [21]:
from collections.abc import Mapping, Sequence, Iterable
from itertools import product
from functools import partial, reduce
import operator

class TimeGridBasic(base.BaseEstimator, base.RegressorMixin):
    
    def __init__(self,param_grid):
        
    
        if not isinstance(param_grid, (Mapping, Iterable)):
                raise TypeError('Parameter grid is not a dict or '
                                'a list ({!r})'.format(param_grid))

        if isinstance(param_grid, Mapping):
                # wrap dictionary in a singleton list to support either dict
                # or list of dicts
                param_grid = [param_grid]


        if isinstance(param_grid, Mapping):
                # wrap dictionary in a singleton list to support either dict
                # or list of dicts
                param_grid = [param_grid]

        # check if all entries are dictionaries of lists
        for grid in param_grid:
            if not isinstance(grid, dict):
                raise TypeError('Parameter grid is not a '
                                'dict ({!r})'.format(grid))
            for key in grid:
                if not isinstance(grid[key], Iterable):
                    raise TypeError('Parameter grid value is not iterable '
                                    '(key={!r}, value={!r})'
                                    .format(key, grid[key]))


        self.param_grid = param_grid
                
    def __iter__(self):
        """Iterate over the points in the grid.
        Returns
        -------
        params : iterator over dict of string to any
            Yields dictionaries mapping each estimator parameter to one of its
            allowed values.
        """
        for p in self.param_grid:
            # Always sort the keys of a dictionary, for reproducibility
            items = sorted(p.items())
            if not items:
                yield {}
            else:
                keys, values = zip(*items)
                for v in product(*values):
                    params = dict(zip(keys, v))
                    yield params

In [22]:
class TimeSeriesGridSearch(TimeGridBasic,base.BaseEstimator, base.RegressorMixin):
    
    
    def __init__(self,**options):
        
        self.model      = options.pop('model', None)
        self.cv         = options.pop('cv', None)
        self.verbosity  = options.pop('verbosity', False)
        self.scoring    = options.pop('scoring', None)
        param_grid      = options.pop('param_grid', None)
        self.param_grid = TimeGridBasic(param_grid)
        
        if options:
            raise TypeError("Invalid parameters passed: %s" % str(options))

        if ((self.model==None )| (self.cv==None)):
            raise TypeError("Incomplete inputs")
            
            
    def fit(self,X,y=None):
        self.X = X
        return self


    def _get_score(self,param):

        errors = []
        for indx,fold in enumerate(self.cv.split(self.X)):

            X_train, X_test, y_train, y_test = fold    
            self.model.set_params(**param).fit(X_train, np.log1p(y_train))
            prediction = np.expm1(self.model.predict(X_test))
            error = self.scoring(y_test, prediction)
            errors.append(error)

            if self.verbosity:
                print("Fold: {}, Error: {:.4f}".format(indx,error))

        if self.verbosity:
                print('Total Error {:.4f}'.format(np.mean(errors)))
                
        
        return errors

    def score(self):

        errors=[]
        get_param = []
        for param in self.param_grid:
            
            if self.verbosity:
                print(param)
                
            errors.append(np.mean(self._get_score(param)))
            get_param.append(param)

        self.sorted_errors,self.sorted_params = (list(t) for t in zip(*sorted(zip(errors,get_param))))
        
        return self.sorted_errors,self.sorted_params
    
    
    def best_estimator(self,verbosity=False):

        if verbosity:
            print('error: {:.4f} \n'.format(self.sorted_errors[0]))
            print('Best params:')
            print(self.sorted_params[0])

        return self.sorted_params[0]

In [23]:
tf_complete

Unnamed: 0,menu,sales,is_holiday,day,month,year
0,1,584,0,1,9,2019
1,2,354,0,1,9,2019
2,3,355,0,1,9,2019
3,4,208,0,1,9,2019
4,1,610,1,2,9,2019
5,2,360,1,2,9,2019
6,3,346,1,2,9,2019
7,4,231,1,2,9,2019
8,1,611,0,3,9,2019
9,2,323,0,3,9,2019


In [38]:
# Data preparation
steps = [('1_step',ToSupervised('sales','menu',1)),
         ('1_step_diff',ToSupervisedDiff('1_days_ago_sales','menu',1,dropna=True))]
super_1 = Pipeline(steps).fit_transform(tf_complete)

In [39]:
tf_complete.drop(['is_holiday','month','year'],axis=1, inplace=True)

KeyError: "['is_holiday' 'month' 'year'] not found in axis"

In [40]:
tf_complete

Unnamed: 0,menu,sales,day
0,1,584,1
1,2,354,1
2,3,355,1
3,4,208,1
4,1,610,2
5,2,360,2
6,3,346,2
7,4,231,2
8,1,611,3
9,2,323,3


# First Model

In [41]:
kf = Kfold_time(target='sales',date_col = 'day',date_init=17,date_final=30)

In [42]:
print(kf)

<__main__.Kfold_time object at 0x00000178EDF67DD8>


In [43]:
# one step
model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)

In [48]:
steps_1 = [('1_step',ToSupervised('sales','menu',1)),
         ('1_step_diff',ToSupervisedDiff('1_days_ago_sales','menu',1,dropna=True)),
         ('predic_1',TimeSeriesRegressor(model=model,cv=kf,scoring=rmsle))]
super_1_p = Pipeline(steps_1).fit(tf_complete)

In [45]:
super_1_p.score(tf_complete)

Fold: 0, Error: 0.0644
Fold: 1, Error: 0.0666
Fold: 2, Error: 0.0925
Fold: 3, Error: 0.0736
Fold: 4, Error: 0.0862
Fold: 5, Error: 0.0712
Fold: 6, Error: 0.0766
Fold: 7, Error: 0.0664
Fold: 8, Error: 0.0726
Fold: 9, Error: 0.0453
Fold: 10, Error: 0.0627
Fold: 11, Error: 0.0568
Fold: 12, Error: 0.0571
Total Error 0.0686


[0.06443201363977456,
 0.0666116454669814,
 0.0924508117558433,
 0.07364802981722209,
 0.08622271693796084,
 0.07117343107298457,
 0.07656515541667958,
 0.06637773070487314,
 0.07257802440138317,
 0.04530767884188991,
 0.06265650473988088,
 0.056755943525360245,
 0.05711466589988496]

In [52]:
super_1_p.predict(tf_complete)

Unnamed: 0,0_fold,1_fold,2_fold,3_fold,4_fold,5_fold,6_fold,7_fold,8_fold,9_fold,10_fold,11_fold,12_fold
0,585.194,571.517,566.809,565.53,580.48,589.95,579.365,588.096,577.292,611.446,582.48,630.408,597.993
1,340.414,332.44,333.009,337.329,348.877,349.154,374.909,332.651,353.16,339.27,351.987,352.972,336.205
2,328.115,330.08,325.584,333.111,349.309,321.418,336.925,318.965,310.502,313.941,308.973,311.634,318.325
3,225.788,237.493,222.097,240.748,236.038,228.514,233.303,224.489,219.148,218.236,219.76,221.406,216.728
4,576.0,580.475,595.758,534.938,591.953,581.851,589.856,600.038,582.955,562.914,585.829,594.095,587.998
5,338.0,338.615,330.164,353.307,338.879,356.477,358.775,331.297,366.145,331.815,337.594,341.902,336.05
6,321.931,334.992,332.817,330.08,334.079,355.005,340.981,323.183,320.51,311.378,318.052,315.011,327.489
7,220.89,240.24,226.7,225.714,238.412,221.323,223.701,224.887,221.927,216.939,225.932,218.525,213.542
8,562.647,592.03,596.385,537.252,590.687,588.894,590.382,603.59,580.243,586.309,598.481,607.469,600.078
9,329.299,332.94,350.467,329.226,337.672,362.744,381.469,351.913,333.073,346.264,343.153,344.988,343.397


# Second Model

In [None]:
steps_3_log = [('1_step',ToSupervised('sales','menu',3)),
         ('1_step_diff',ToSupervisedDiff('1_days_ago_sales','menu',1)),
         ('2_step_diff',ToSupervisedDiff('2_days_ago_sales','menu',1)),
         ('3_step_diff',ToSupervisedDiff('3_days_ago_sales','menu',1,dropna=True)),
         ('predic_3',TimeSeriesRegressorLog(model=model,cv=kf,scoring=rmsle))]
super_3_p_log = Pipeline(steps_3_log).fit(tf_complete)

In [None]:
Model_3_Log_Error = super_3_p_log.score(tf_complete)