In [1]:
from process import SMAPE, WAPE
import numpy as np

### Примеры применений

In [3]:
print('multiple lists ',SMAPE([[1],[2],[3]],2))
print('prediction in one list ',SMAPE([2,3,1],[2,3,1]))
print('2-dim array, rows is predictions ',SMAPE(np.array([[1,2],[3,4],[5,6]]),[4,3]))
print('1-dim array ', SMAPE(np.array([1,2,3]),[2,1,3]))


multiple lists  [0.16666666666666666, 0.0, 0.1]
prediction in one list  [0.0]
2-dim array, rows is predictions  [0.21349206 0.16904762]
1-dim array  [0.1111111111111111]


In [4]:
print('multiple lists ',WAPE([[1],[2],[3]],2))
print('prediction in one list ',WAPE([2,3,1],[2,3,1]))
print('2-dim array, rows is predictions ',WAPE(np.array([[1,2],[3,4],[5,6]]),[4,3]))
print('1-dim array ', WAPE(np.array([1,2,3]),[2,1,3]))


multiple lists  [0.5, 0.0, 0.5]
prediction in one list  [0.0]
2-dim array, rows is predictions  [0.55555556 0.41666667]
1-dim array  [0.3333333333333333]


### пример пайплайна на примере предсказания сглаживания

In [6]:
import pandas as pd
import numpy as np

def fix_nan(csv_data):
    """

    :param csv_data: pd.DataFrame from test.csv
    :return: DataFrame with fixed nan in some columns and make data as datetime
    """
    csv_data["period_start_dt"] = pd.to_datetime(csv_data["period_start_dt"], format='%Y.%m.%d')
    bad_column = ["PROMO1_FLAG","PROMO2_FLAG","NUM_CONSULTANT","AUTORIZATION_FLAG"]
    for replace_name in bad_column:
        csv_data[replace_name] = csv_data[replace_name].fillna(0.0)
    csv_data = csv_data.drop(columns=['PROMO2_FLAG', 'NUM_CONSULTANT'])
    return csv_data
    
def percentile(n):
    '''Calculate n - percentile of data'''
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'pctl%s' % n
    return percentile_

def lagged_features(df
                    , lags = [7, 14, 21, 28]
                    , windows = [7, 14]
                    , aggregation_methods = {'mean', 'median', percentile(10),  percentile(90)}
                    , promo_filters = [0, 1]
                    , target_var = 'demand'):
    
    # loop by filter variables and window
    for w in windows:

        # check whether filtered df in not empty
        if len(df) > 0:
            
            # lagged features calculation
            lf_df = df.set_index(['product_rk', 'store_location_rk', 'period_start_dt']).\
                 demand.groupby(level=['product_rk','store_location_rk']).\
                apply(lambda x: x.rolling(window=w, min_periods=1).agg(aggregation_methods))

            # provide lags tranformations
            for l in lags:
                new_names = {x: "lag{0}_wdw{1}_{2}".
                              format(l, w, x) for x in lf_df.columns }

                df = df.merge(lf_df.shift(l).reset_index().rename(columns = new_names),
                    how='left', on =['product_rk', 'store_location_rk', 'period_start_dt'] )

    return df

def process(csv_data):
    """

    :param csv_data: pd.DataFrame from test.csv
    :return: fixed DataFrame with one-hot for product and store
    """
    csv_data = fix_nan(csv_data)
    
    def make_column_one_hot(data, column_name):
        data = pd.merge(data, pd.get_dummies(data[column_name], prefix=column_name), left_index=True, right_index=True)
        return data.drop(columns=column_name)

    def prepare_data(data):
        data = lagged_features(data)
        data = make_column_one_hot(data,"product_rk")
        data = make_column_one_hot(data,"store_location_rk")
        return data
        
    return prepare_data(csv_data)

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl


In [8]:
train = pd.read_csv('../data/train.csv')
raw_data = train
final = process(raw_data)
final

Unnamed: 0.1,Unnamed: 0,period_start_dt,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG,lag7_wdw7_pctl90,lag7_wdw7_mean,lag7_wdw7_median,...,store_location_rk_1191,store_location_rk_1202,store_location_rk_1203,store_location_rk_1281,store_location_rk_1316,store_location_rk_1326,store_location_rk_1328,store_location_rk_1347,store_location_rk_1363,store_location_rk_1380
0,0,2016-12-19,29.0,0.0,,,0.0,,,,...,0,0,0,0,0,0,0,0,0,0
1,1,2016-12-19,64.0,0.0,,,0.0,,,,...,0,0,0,0,0,0,0,0,0,0
2,2,2016-12-19,32.0,0.0,,,0.0,,,,...,0,0,0,0,0,0,0,0,0,0
3,3,2016-12-19,10.0,0.0,,,0.0,,,,...,0,0,0,0,0,0,0,0,0,0
4,4,2016-12-19,15.0,0.0,,,0.0,,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35339,35537,2019-12-30,,0.0,1000.00,1000.0,1.0,,1.333333,1.0,...,0,0,0,0,0,0,0,0,0,1
35340,35538,2019-12-30,,0.0,2000.00,2000.0,1.0,,20.666667,20.0,...,0,0,0,0,0,0,0,0,0,1
35341,35539,2019-12-30,,0.0,3000.00,3000.0,1.0,,9.000000,9.0,...,0,0,0,0,0,0,0,0,0,1
35342,35540,2019-12-30,,1.0,284.29,199.0,1.0,,4.666667,5.0,...,0,0,0,0,0,0,0,0,0,1


In [12]:
product = raw_data[(raw_data["product_rk"] == 40369) & (raw_data['store_location_rk'] == 425)]
demand = product['demand'].to_numpy()


In [13]:
def fully_smooth_precision(X, m=52, known_to=80, pred_to=160):
    alpha = beta = gamma = 0.0005
    n = X.shape[0]
    l = np.zeros(pred_to+1)
    b = np.zeros(pred_to+1)
    s = np.zeros(pred_to+1)
    y_hat = np.zeros(pred_to+1)
    l[0] = sum(X[0:known_to:m]) / (known_to//m + 1)
    for i in range(m):
        s[i] = X[i]/l[0]
    b[0] = np.exp(1/m*(np.log(X[m:2*m].sum()) - np.log(X[0:m].sum())))
    for t in range(1, pred_to):
        if t < known_to:
            y_hat[t] = X[t]
        l[t] = alpha*y_hat[t]
        if (t >= m):
            l[t] /= s[t-m]
            s[t] = gamma*y_hat[t]/(l[t-1]*b[t-1]) + (1 - gamma)*s[t-m]
        l[t] += (1 - alpha)*l[t-1]*b[t-1]
        b[t] = beta*l[t]/l[t-1] + (1-beta)*b[t-1]
        #print(t,' ',l[t],' ',b[t],' ',s[t])
        y_hat[t+1] = l[t]*b[t]
        if (t - m + 1 >= 0):
            y_hat[t+1] *= s[t-m+1]
    return y_hat

In [34]:
pred = fully_smooth_precision(demand, known_to=55)
print("SMAPE - {} \n WAPE - {}".format(SMAPE(pred[:154],product['demand'][:154]),WAPE(pred[:154],product['demand'][:154])))


SMAPE - [0.09591625859835859] 
 WAPE - [0.24899808379603655]
