In [13]:
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd
import scipy.linalg as sc
from sklearn import preprocessing
from utils import qualityMACAPE, qualityRMSE
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm_notebook
from sklearn.utils.extmath import softmax
%matplotlib inline

In [30]:
# softmax([[1.,3,3],[3.,1.,0] ])
# >>> array([[0.06337894, 0.46831053, 0.46831053],
# >>> [0.84379473, 0.1141952 , 0.04201007]])

# datetime.strptime('17DEC2018','%d%b%Y').toordinal()
# >>> 737045

In [3]:
# Keys identifying one time series, and the columns holding its per-period values.
TS_KEYS = ['PRODUCT_ID', 'STORE_LOCATION_ID']
SERIES_COLS = ['period_start_dt', 'demand_qty']


def _to_ordinal(dates):
    '''Convert date strings such as '17DEC2018' to proleptic Gregorian ordinals.'''
    return [datetime.strptime(date, '%d%b%Y').toordinal() for date in dates]


def _as_list(col):
    '''Keep every value of the group, in order.'''
    return list(col)


def _first_if_constant(col):
    '''Collapse a group to its single value when it is constant, else keep the list.'''
    values = list(col)
    return values[0] if np.unique(col).shape[0] == 1 else values


ts = pd.read_csv('./EGG_train.csv', encoding="cp1251", sep=';', decimal=',', parse_dates=True, dayfirst=True)
# parse dates
ts['period_start_dt'] = _to_ordinal(ts['period_start_dt'])
ts = ts.sort_values(by=TS_KEYS + ['period_start_dt'])
# One row per (product, store). The date and demand columns must always stay
# lists: collapsing them when all values coincide (constant demand, or a
# single-row group) would turn the series into a scalar and break the
# len()/slicing done in arbitrating_composition.
ts = ts.groupby(TS_KEYS).agg({
    col: (_as_list if col in SERIES_COLS else _first_if_constant)
    for col in ts.columns if col not in TS_KEYS
})

base_algs = pd.read_csv('./EGG_BA_FCST.csv', encoding="cp1251", sep=';',
                        decimal=',', parse_dates=True, dayfirst=True)
# parse dates
base_algs['dt'] = _to_ordinal(base_algs['dt'])
# One row per (product, store); every remaining column becomes a date-ordered list.
base_algs = base_algs.sort_values(by=['PRODUCT_LVL_ID', 'STORE_LOCATION_LVL_ID', 'dt']).groupby(
    ['PRODUCT_LVL_ID', 'STORE_LOCATION_LVL_ID']).agg(_as_list)

# Product/store descriptors: every aggregated column except dates and demand.
prod_feat = [col for col in ts.columns if col not in SERIES_COLS]
prod_feat

['STORE_LOCATION_LVL_NM2',
 'STORE_LOCATION_LVL_NM7',
 'ST_LOC_DIVISION_NM',
 'ST_LOC_MNGMT_CLSTR',
 'PRODUCT_LVL_NM8',
 'PRODUCT_LVL_NM13',
 'PRODUCT_DESC']

In [25]:
def _meta_samples(x, forecast_ba, feat_codes, k, h):
    ''' Build the meta-learner samples of one time series.

    Parameters:
    x <1-D float array> - observed demand
    forecast_ba <2-D float array> - base algorithms forecasts, one row per algorithm
    feat_codes <1-D array> - label-encoded product features of the series
    k <integer scalar> - window length
    h <integer scalar> - forecasting delay

    Returns (xx, yy): row i of xx is the window x[i:i+k] followed by feat_codes;
    column i of yy is the absolute error of every base algorithm at time i + k-1+h.
    The caller guarantees len(x) > k-1+h.
    '''
    n = x.shape[0]
    offset = k - 1 + h
    n_samples = n - offset
    # hankel(c, r): first column c, last row r -> row i is x[i:i+k]
    windows = sc.hankel(x[:n - k + 1], x[n - k:])[:n_samples]
    xx = np.hstack([windows, np.tile(feat_codes, (n_samples, 1))])
    yy = np.abs(forecast_ba[:, offset:offset + n_samples] - x[offset:])
    return xx, yy


def arbitrating_composition(ts, base_algs, prod_feat, h, params=None):
    ''' Combine base forecasts with weights given by per-algorithm error models.

    For every base algorithm a random forest learns its absolute error from the
    last k demand values and the encoded product features. At time t the
    forecast is the softmax-weighted mix of the base forecasts, the weights
    coming from the negated predicted errors. Forests are refitted every `step`
    time points on all previously processed series plus the already observed
    part of the current one.

    Parameters:
    ts <Pandas DataFrame> - data frame of time series with features (=prod_feat),
        indexed by (product id, store id); 'demand_qty' and 'period_start_dt'
        hold one list per row
    base_algs <Pandas DataFrame> - data frame of base algorithms answers with the
        same index; assumed to be aligned position-by-position with 'demand_qty'
        (TODO confirm against the data)
    prod_feat <list of str> - features of product (don't include demand, date)
    h <integer scalar> - forecasting delay
    params <dict or None> - optional settings, every key has a default:
        'step' (7) - refit the forests every `step` time points
        'k' (8) - number of past demand values fed to the meta-learner
        'base_alg_cols' (('REGULAR', 'SAS_FAW', 'SAS_MINER')) - columns of base_algs
        'rf_kwargs' ({}) - keyword arguments for RandomForestRegressor

    Returns:
    <2-D float array> of shape (len(ts), longest series + h); NaN wherever no
    forecast was produced.

    Raises:
    ValueError - if k < 1, step < 1 or h < 0
    '''
    params = params or {}
    step = int(params.get('step', 7))
    k = int(params.get('k', 8))
    base_alg_cols = tuple(params.get('base_alg_cols', ('REGULAR', 'SAS_FAW', 'SAS_MINER')))
    rf_kwargs = dict(params.get('rf_kwargs', {}))
    if k < 1 or step < 1 or h < 0:
        raise ValueError('k and step must be >= 1 and h must be >= 0')

    base_algs_num = len(base_alg_cols)
    offset = k - 1 + h  # index of the first time point that has a full window behind it

    # np.size copes with both list cells and (unexpectedly) scalar cells
    max_len = max((np.size(dates) for dates in ts['period_start_dt']), default=0)
    forecast = np.full((ts.shape[0], max_len + h), np.nan)

    X_train, Y_train = None, None
    rf = [ RandomForestRegressor(**rf_kwargs) for _ in range(base_algs_num) ]  # random forest
    le = [ (preprocessing.LabelEncoder()).fit(ts[pf]) for pf in prod_feat ]
    fitted = [False] * base_algs_num
    is_fitted = False

    print(f'Wait {len(ts)} iterations')
    for ts_n, ((pr_id, st_id), row) in tqdm_notebook(enumerate(ts.iterrows()), total=len(ts)):
        x = np.atleast_1d(np.asarray(row['demand_qty'], dtype=float))
        if x.shape[0] <= offset:
            # too short to form a single (window, target) pair: leave the row as NaN
            continue

        ba = base_algs.loc[(pr_id, st_id), :]
        # builtin float: the np.float alias was removed in NumPy 1.24
        forecast_ba = np.array([ ba[col] for col in base_alg_cols ], dtype=float)
        feat_codes = np.array([ le[j].transform([row[pf]])[0] for j, pf in enumerate(prod_feat) ])

        xx, yy = _meta_samples(x, forecast_ba, feat_codes, k, h)

        for t in range(x.shape[0]):
            if (t >= offset) and is_fitted:
                x_t = xx[t - offset]
                if not (np.isnan(yy[:, t - offset]).any() or np.isnan(x_t).any()):
                    pred_err = np.array([ model.predict([x_t])[0] for model in rf ])
                    # negate: the smaller the expected error, the larger the weight
                    weights = softmax(-pred_err.reshape(1, -1))[0]
                    forecast[ts_n, t] = np.dot(weights, forecast_ba[:, t])
            if (t % step == 0) and (t != 0):
                # Samples of this series whose target is already observed at t.
                # Clamped at 0: a negative bound would slice from the end and
                # leak future values into training.
                n_known = max(t - offset, 0)
                if X_train is None:
                    batch_x, batch_y = xx[:n_known], yy[:, :n_known]
                else:
                    batch_x = np.vstack( (X_train, xx[:n_known]) )
                    batch_y = np.hstack( (Y_train, yy[:, :n_known]) )
                rows_ok = ~np.isnan(batch_x).any(axis=1)
                for rf_i in range(base_algs_num):
                    # drop samples with missing values: fit() rejects NaN targets
                    keep = rows_ok & ~np.isnan(batch_y[rf_i])
                    if keep.any():
                        rf[rf_i] = RandomForestRegressor(**rf_kwargs)
                        rf[rf_i].fit( batch_x[keep], batch_y[rf_i, keep] )
                        fitted[rf_i] = True
                # predict only once every forest has been fitted at least once
                is_fitted = all(fitted)
        if X_train is None:
            X_train, Y_train = xx, yy
        else:
            X_train, Y_train = np.vstack( (X_train, xx) ), np.hstack( (Y_train, yy) )
    return forecast

In [28]:
# Run the composition over every series with a forecasting delay of h=1 and an
# empty params dict; the result is a 2-D array with one row per row of `ts`.
forecast = arbitrating_composition(ts, base_algs, prod_feat, 1, {})

In [5]:
#ts.iloc[9747] - only three time-series values
#ts[np.logical_and( (ts['PRODUCT_ID'] == 475861),ts['STORE_LOCATION_ID'] == 845840)]

In [27]:
# forecast[ np.logical_not(np.isnan(forecast)) ].shape

# for i, col in enumerate(ts.columns):
#     ts[col].plot(figsize=(15,5), color='green', label='real')
#     pd.Series(data=forecast[i, :-1], index=ts[col].index).plot(color='red', label='arbitrated')
    
#     plt.title(col)
#     plt.legend()
#     plt.show()