### Requirements for WRMSSE metric:
* Function to convert level-12 series predictions into predictions for all levels by aggregation
* Actual labels for all levels
* Series ID as part of each DataLoader output
* Function to create a dictionary of weights for all levels, according to series ID

In [1]:
import torch
import numpy as np
import pandas as pd
import category_encoders as ce
from tqdm import notebook
import matplotlib.pyplot as plt
import gc
import pickle as pkl
from itertools import product
%matplotlib inline

In [2]:
# Load the CSV inputs from ../../data. Within this section only
# weights_validation is referenced again (it is re-read before the merge that
# compares it with the recomputed weights); the other frames are not used by
# the code visible here.
train_data = pd.read_csv('../../data/sales_train_validation.csv')
sell_prices = pd.read_csv('../../data/sell_prices.csv')
calendar = pd.read_csv('../../data/calendar.csv')
sample_submission = pd.read_csv('../../data/sample_submission.csv')
weights_validation = pd.read_csv('../../data/weights_validation.csv')

In [3]:
# Load the pre-processed arrays from a pickle file.
# NOTE(review): pickle can execute arbitrary code while loading - only open
# files produced by this project.
# NOTE(review): this path starts with '../data' while the CSV cell reads from
# '../../data' - confirm both resolve as intended from the notebook's folder.
with open('../data/data.pickle', 'rb') as f:
    data_dict = pkl.load(f)

# Unpack the entries into module-level names. Below, sales_data_ids is indexed
# as a 2-D array with five id columns, Y is passed as the sales matrix and
# X_enc_dec_feats[:, :, 0] as the sell price; the remaining entries are not
# used in this section.
sales_data_ids = data_dict['sales_data_ids']
calendar_index = data_dict['calendar_index']
X_prev_day_sales = data_dict['X_prev_day_sales']
X_enc_only_feats = data_dict['X_enc_only_feats']
X_enc_dec_feats = data_dict['X_enc_dec_feats']
X_calendar = data_dict['X_calendar']
X_calendar_cols = data_dict['X_calendar_cols']
Y = data_dict['Y']

In [10]:
# Show the first row of the id table; its columns are
# (item_id, dept_id, cat_id, store_id, state_id).
# Index the array directly instead of copying every row into a list first.
sales_data_ids[0]

array(['HOBBIES_1_001', 'HOBBIES_1', 'HOBBIES', 'CA_1', 'CA'],
      dtype=object)

#### Get all aggregated series from level 12 series

In [4]:
def _sum_rows_by_group(sales, codes, num_groups):
    """
    Sums the rows of `sales` that share the same integer group code

    Input data format:
    sales: np array of shape (num_series, num_timesteps)
    codes: np int array of shape (num_series,) with values in [0, num_groups)
    num_groups: number of output rows; a group without members yields zeros

    Returns a float np array of shape (num_groups, num_timesteps)
    """

    # A stable sort keeps the members of each group in their original row
    # order, so each group is reduced over exactly the rows (in the same
    # order) that a boolean mask on the ids would select.
    order = np.argsort(codes, kind='stable')
    bounds = np.searchsorted(codes[order], np.arange(num_groups + 1))

    group_sums = np.empty((num_groups, sales.shape[1]))
    for group in range(num_groups):
        members = order[bounds[group]:bounds[group + 1]]
        group_sums[group] = sales[members].sum(0)

    return group_sums


def get_aggregated_series(sales, item_id, dept_id, cat_id, store_id, state_id):
    """
    Aggregates the level 12 (item x store) series into the series of all 12
    hierarchy levels (30,490 -> 42,840 series for the M5 data). The number of
    output series is derived from the ids, so any number of input series works.

    Input data format:
    sales: np array of shape (num_series, num_timesteps)
    all id arguments: np arrays of shape (num_series,)

    Returns:
    aggregated_series: float np array of shape (num_all_series, num_timesteps)
    aggregated_series_id: np string array of shape (num_all_series,) with ids
        'Level{n}_{key_1}_{key_2}' ('X' as key_2 for single-key levels and
        'Level1_Total_X' for the grand total)
    Both arrays are sorted by id.

    For the two-key levels (6-9 and 11) every combination of the unique key
    values gets a row; combinations without any series are all zeros.

    Raises ValueError if sales is not 2-D or an id array does not have one
    entry per series.
    """

    sales = np.asarray(sales)
    if sales.ndim != 2:
        raise ValueError(f'sales must be 2-D (num_series, num_timesteps), got shape {sales.shape}')
    num_series, num_timesteps = sales.shape

    # Factorise every id column once: sorted unique values + per-row codes
    columns, keys = {}, {}
    for name, values in (('item', item_id), ('dept', dept_id), ('cat', cat_id),
                         ('store', store_id), ('state', state_id)):
        values = np.asarray(values)
        if values.shape != (num_series,):
            raise ValueError(f'{name}_id must have shape ({num_series},), got {values.shape}')
        elements, codes = np.unique(values, return_inverse=True)
        columns[name] = values
        keys[name] = (elements, codes.reshape(-1))

    # Levels 2-11 in hierarchy order, each defined by one or two grouping keys
    hierarchy = (
        (2, ('state',)),
        (3, ('store',)),
        (4, ('cat',)),
        (5, ('dept',)),
        (6, ('state', 'cat')),
        (7, ('state', 'dept')),
        (8, ('store', 'cat')),
        (9, ('store', 'dept')),
        (10, ('item',)),
        (11, ('state', 'item')),
    )

    # Level 1
    series_blocks = [sales.sum(0)[np.newaxis, :]]
    series_ids = ['Level1_Total_X']

    # Levels 2 - 11
    for level, names in hierarchy:
        if len(names) == 1:
            elements, codes = keys[names[0]]
            num_groups = len(elements)
            series_ids.extend(f'Level{level}_{element}_X' for element in elements)
        else:
            (outer, outer_codes), (inner, inner_codes) = keys[names[0]], keys[names[1]]
            # Row-major pair code: matches the order of product(outer, inner)
            codes = outer_codes * len(inner) + inner_codes
            num_groups = len(outer) * len(inner)
            series_ids.extend(f'Level{level}_{agg_1}_{agg_2}'
                              for agg_1, agg_2 in product(outer, inner))
        series_blocks.append(_sum_rows_by_group(sales, codes, num_groups))

    # Level 12
    series_blocks.append(sales)
    series_ids.extend(f'Level12_{item}_{store}'
                      for item, store in zip(columns['item'], columns['store']))

    # Stack all levels into one float array; the id array takes the width of
    # the longest id, so no id is truncated
    aggregated_series = np.empty((len(series_ids), num_timesteps))
    start = 0
    for block in series_blocks:
        aggregated_series[start:start + len(block)] = block
        start += len(block)
    aggregated_series_id = np.array(series_ids)

    # Return the arrays sorted according to ids
    sort_idx = aggregated_series_id.argsort()

    return aggregated_series[sort_idx], aggregated_series_id[sort_idx]

In [5]:
# Aggregate the full sales history Y to every hierarchy level. The first five
# columns of sales_data_ids are passed, in order, as
# item_id, dept_id, cat_id, store_id and state_id.
agg_series, agg_series_id = get_aggregated_series(Y, *sales_data_ids[:, :5].T)

#### Calculate weights for 42,840 series

In [29]:
def get_weights_all_levels(sales, sell_price, item_id, dept_id, cat_id, store_id, state_id):
    """
    Generates weights for all aggregated series (42,840 for the M5 data):
    the share of each series in the total dollar sales of the 28-day window

    Input data format:
    sales: np array of shape (30490, 28)
    sell_price: np array of shape (30490, 28)

    all id arguments: np arrays of shape (30490,)

    Returns:
    weights: np array with one weight per aggregated series
    agg_series_id: np array of series ids, in the same (id-sorted) order

    Raises AssertionError if the two arrays differ in shape or do not have
    exactly 28 timesteps.
    """

    assert (sales.shape == sell_price.shape), "Sell price and Sales arrays have different sizes"
    assert (sales.shape[1] == 28), "Number of timesteps provided weight calculation is not equal to 28"

    # Get actual dollar sales for the 28 days for all aggregated series
    dollar_sales = sales * sell_price
    agg_series, agg_series_id = get_aggregated_series(dollar_sales, item_id, dept_id, cat_id, store_id, state_id)

    # Sum up the actual dollar sales for all 28 timesteps
    agg_series = agg_series.sum(1)

    # Every level is a regrouping of the same level 12 series, so the total of
    # each level equals the Level 1 grand total; one denominator serves all
    # levels. ('Level1_' with the underscore does not match Level10-12 ids.)
    grand_total = agg_series[np.char.startswith(agg_series_id, 'Level1_')].sum()

    # Calculate weight for each series
    weights = agg_series / grand_total

    return weights, agg_series_id

In [45]:
# Weights from the 28 timesteps [-84, -56) of the sales history Y (the
# function asserts a window of exactly 28). Feature 0 of X_enc_dec_feats is
# passed as the sell price - presumably laid out as (time, series, feature),
# hence the transpose; TODO confirm. The five id columns are passed in order as
# item_id, dept_id, cat_id, store_id, state_id.
weights, agg_series_id = get_weights_all_levels(
    Y[:, -84:-56],
    X_enc_dec_feats[:, :, 0].T[:, -84:-56],
    *sales_data_ids[:, :5].T
)

8.25 s ± 57.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [39]:
# Re-read the reference weights so the merge cell below starts from a frame
# without the 'id' / 'my_weight' columns it adds.
weights_validation = pd.read_csv('../../data/weights_validation.csv')

In [41]:
# Wrap the recomputed weights in a frame keyed by series id.
# NOTE: this rebinds `weights` from an array to a DataFrame, so the cell is
# not safe to run twice without recomputing the weights first.
weights = pd.DataFrame({'id': agg_series_id, 'my_weight': weights})

# Build the same 'Level{n}_{key_1}_{key_2}' id for the reference rows, then
# attach the recomputed weight to each of them (unmatched ids get NaN).
weights_validation['id'] = (
    weights_validation['Level_id']
    + '_' + weights_validation['Agg_Level_1']
    + '_' + weights_validation['Agg_Level_2']
)
weights_validation = weights_validation.merge(weights, how='left', on='id')

In [44]:
# Row positions where the recomputed weight is NOT within 1e-7 of the reference
# weight. The absolute difference is compared so that deviations in either
# direction are caught; rows without a match (NaN my_weight) are reported too,
# because NaN < 1e-7 is False.
np.where(~(np.abs((weights_validation.Weight - weights_validation.my_weight).values) < 1e-7))

(array([], dtype=int64),)

#### DISCARDED: Calculate weights for 30,490 series by adding upper level weights to level 12 weights according to hierarchy 

In [22]:
# Inspect the merged table: reference Weight next to the recomputed my_weight.
weights_validation

Unnamed: 0,Level_id,Agg_Level_1,Agg_Level_2,Weight,id,my_weight
0,Level1,Total,X,1.000000,Level1_Total_X,1.000000
1,Level2,CA,X,0.442371,Level2_CA_X,0.442371
2,Level2,TX,X,0.269297,Level2_TX_X,0.269297
3,Level2,WI,X,0.288332,Level2_WI_X,0.288332
4,Level3,CA_1,X,0.110888,Level3_CA_1_X,0.110888
...,...,...,...,...,...,...
42835,Level12,HOUSEHOLD_2_516,TX_2,0.000013,Level12_HOUSEHOLD_2_516_TX_2,0.000013
42836,Level12,HOUSEHOLD_2_516,TX_3,0.000008,Level12_HOUSEHOLD_2_516_TX_3,0.000008
42837,Level12,HOUSEHOLD_2_516,WI_1,0.000002,Level12_HOUSEHOLD_2_516_WI_1,0.000002
42838,Level12,HOUSEHOLD_2_516,WI_2,0.000002,Level12_HOUSEHOLD_2_516_WI_2,0.000002


In [48]:
# Toy hierarchy with two child series A and B and their parent Total (7 + 17).
# The cells below use 7/24 and 17/24 as the children's weights - presumably
# these are the sales of the weighting window.
A_act_prev, B_act_prev = 7, 17
Total_act_prev = 24

In [46]:
# Actual values of the toy series; Total_act is their sum (10 + 15).
A_act, B_act = 10, 15
Total_act = 25

In [56]:
# Scenario 1 forecasts: both children below their actuals (8 < 10, 12 < 15);
# Total_pred is their sum (8 + 12).
A_pred, B_pred = 8, 12
Total_pred = 20

In [57]:
# Level-2 score for scenario 1, with the numbers typed in by hand: child
# errors |10 - 8| = 2 and |15 - 12| = 3, weighted by 7/24 and 17/24.
# NOTE(review): appears to use plain absolute errors as a stand-in for the
# scaled RMSE - confirm.
WRMSE_level_2 = ((2) * (7/24)) + ((3) * (17/24))
WRMSE_level_2

2.7083333333333335

In [58]:
# Level-1 score for scenario 1: total error |25 - 20| = 5 with weight 1.
WRMSE_level_1 = (5) * (1)
WRMSE_level_1

5

In [59]:
# Overall score: unweighted mean of the two level scores.
WRMSE_total = (WRMSE_level_1 + WRMSE_level_2) / 2
WRMSE_total

3.854166666666667

In [61]:
# Scenario 2 forecasts: A below its actual (8 < 10), B above it (18 > 15);
# Total_pred is their sum (8 + 18).
A_pred, B_pred = 8, 18
Total_pred = 26

In [62]:
# Level-2 score for scenario 2: the child errors are again |10 - 8| = 2 and
# |15 - 18| = 3, so the value is the same as in scenario 1.
WRMSE_level_2 = ((2) * (7/24)) + ((3) * (17/24))
WRMSE_level_2

2.7083333333333335

In [63]:
# Level-1 score for scenario 2: the child errors have opposite signs and
# partly cancel in the total, |25 - 26| = 1, with weight 1.
WRMSE_level_1 = (1) * (1)
WRMSE_level_1

1

In [64]:
# Overall score for scenario 2: unweighted mean of the two level scores. It is
# lower than in scenario 1 although the child errors have the same size.
WRMSE_total = (WRMSE_level_1 + WRMSE_level_2) / 2
WRMSE_total

1.8541666666666667

In [41]:
# Ratio of the reference weight in the third-to-last row to the weight in
# row 3 (Level2 / WI in the table shown above) - presumably comparing one
# level 12 series with the state it belongs to; verify the row choice.
weights_validation.iloc[-3]['Weight'] / weights_validation.iloc[3]['Weight'] 

5.479793004911743e-06

In [None]:
def get_weights_modified_level_12(sales, sell_price, item_id, dept_id, cat_id, store_id, state_id):
    """
    DISCARDED / unfinished experiment.

    Intended to generate weights for the 30,490 level 12 series by adding
    upper level weights to level 12 weights according to the hierarchy.
    That redistribution step was never implemented; what this function
    actually returns is, for every aggregated series of all 12 levels, its
    share of the dollar sales of its own level.

    Input data format:
    sales: np array of shape (30490, 28)
    sell_price: np array of shape (30490, 28)

    all id arguments: np arrays of shape (30490,)

    Returns:
    dict mapping series id -> weight within its level

    Raises AssertionError if the two arrays differ in shape or do not have
    exactly 28 timesteps.
    """

    assert (sales.shape == sell_price.shape), "Sell price and Sales arrays have different sizes"
    assert (sales.shape[1] == 28), "Number of timesteps provided weight calculation is not equal to 28"

    # Get actual dollar sales for the 28 days for all aggregated series
    dollar_sales = sales * sell_price
    agg_series, agg_series_id = get_aggregated_series(dollar_sales, item_id, dept_id, cat_id, store_id, state_id)

    # Sum up the actual dollar sales for all 28 timesteps
    agg_series = agg_series.sum(1)

    # TODO: get the sales contribution of each level 12 series to its upper
    # hierarchical series and add the upper level series' weights to the
    # corresponding level 12 series in the same ratio (not implemented).

    # Level number of every series, parsed from its 'Level{n}_...' id
    series_level = np.array([int(series[len('Level'):series.find('_')])
                             for series in agg_series_id])

    # Calculate total sales for each level
    level_totals = {level: agg_series[series_level == level].sum()
                    for level in range(1, 13)}

    # Calculate weight for each series
    weights_dict = {}
    for series_sales, series, level in zip(agg_series, agg_series_id, series_level):
        weights_dict[series] = (series_sales / level_totals[level])

    return weights_dict