### Requirements for WRMSSE metric:
* Function to convert level 12 series predictions into predictions for all levels by aggregating them
* Actual labels for all levels
* Series ID as part of each DataLoader output
* Function to create a dictionary with weights for all levels according to series ID

In [4]:
import torch
import numpy as np
import pandas as pd
import category_encoders as ce
from tqdm import notebook
import matplotlib.pyplot as plt
import gc
import pickle as pkl
from itertools import product
%matplotlib inline

In [2]:
# Load the raw CSV inputs.
# NOTE(review): these paths use '../../data/' while the pickle cell below uses '../data/' — confirm
# which working directory the notebook is meant to run from.
train_data = pd.read_csv('../../data/sales_train_validation.csv')
sell_prices = pd.read_csv('../../data/sell_prices.csv')
calendar = pd.read_csv('../../data/calendar.csv')
sample_submission = pd.read_csv('../../data/sample_submission.csv')
# Reference weights; the same file is read again further down before the comparison.
weights_validation = pd.read_csv('../../data/weights_validation.csv')

In [5]:
# Load the pickled dictionary of arrays (presumably written by a separate preprocessing step).
# NOTE: unpickling can execute arbitrary code — only load this file if it comes from a trusted source.
with open('../data/data.pickle', 'rb') as f:
    data_dict = pkl.load(f)
    
# Unpack the dictionary entries into notebook-level names.
sales_data_ids = data_dict['sales_data_ids']
calendar_index = data_dict['calendar_index']
X_prev_day_sales = data_dict['X_prev_day_sales']
X_enc_only_feats = data_dict['X_enc_only_feats']
X_enc_dec_feats = data_dict['X_enc_dec_feats']
X_calendar = data_dict['X_calendar']
X_calendar_cols = data_dict['X_calendar_cols']
Y = data_dict['Y']

#### Get all aggregated series from level 12 series

In [102]:
def get_aggregated_series(sales, sales_data_ids):
    """
    Aggregates the level 12 (item x store) series into the series of all 12 levels
    (e.g. 30,490 level 12 series become 42,840 series in total)

    Input data format:
    sales: np array of shape (num_series, num_timesteps)
    sales_data_ids: np array of shape (num_series, 5)
                    with 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id' as the columns

    Output data format (returned as a tuple, in this order):
    agg_series: np array of shape (num_total_series, num_timesteps); levels are stacked
                in order 1..12, levels 2-11 follow the sorted group keys and
                level 12 keeps the order of the input rows
    agg_series_id: np array of unicode ids, 'Level<k>_<key>_X' for single-key levels
                   and 'Level<k>_<key1>_<key2>' for two-key levels
    affected_hierarchy_ids: int32 np array of shape (num_series, 12); entry [i, k - 1] is the
                            row of agg_series holding the level k series that input series i
                            contributes to
    """

    id_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    # Group-by keys of levels 2-11, listed in level order
    grouped_levels = [
        ['state_id'],
        ['store_id'],
        ['cat_id'],
        ['dept_id'],
        ['state_id', 'cat_id'],
        ['state_id', 'dept_id'],
        ['store_id', 'cat_id'],
        ['store_id', 'dept_id'],
        ['item_id'],
        ['state_id', 'item_id'],
    ]

    num_series = sales.shape[0]
    df = pd.DataFrame({col: sales_data_ids[:, i] for i, col in enumerate(id_cols)})
    df = pd.concat([df, pd.DataFrame(sales)], axis=1)
    data_cols = list(range(sales.shape[1]))

    agg_series, agg_series_id = [], []
    # Sized from the input instead of a fixed 30490 so that subsets of the data work too
    affected_hierarchy_ids = np.empty((num_series, 12), np.int32)

    # Level 1: a single series holding the grand total; it always sits in row 0
    agg_series.append(sales.sum(0).reshape(1, -1))
    agg_series_id.append(np.array(['Level1_Total_X']))
    affected_hierarchy_ids[:, 0] = 0
    next_id = 1  # row of agg_series where the next level starts

    # Levels 2-11
    for level, keys in enumerate(grouped_levels, start=2):
        grouped = df.groupby(keys)[data_cols]
        totals = grouped.sum()
        agg_series.append(totals.values)

        prefix = f'Level{level}_'
        if len(keys) == 1:
            agg_series_id.append(prefix + totals.index.values + '_X')
        else:
            agg_series_id.append(prefix + totals.index.get_level_values(0)
                                 + '_' + totals.index.get_level_values(1))

        # ngroup() numbers the groups in the same (sorted) order as the rows of `totals`,
        # so offsetting it by next_id gives each input row the index of its aggregate
        affected_hierarchy_ids[:, level - 1] = next_id + grouped.ngroup().values
        next_id += len(totals)

    # Level 12: the input series themselves, in input order
    leaf = df.set_index(['item_id', 'store_id'])[data_cols]
    agg_series.append(leaf.values)
    agg_series_id.append('Level12_' + leaf.index.get_level_values(0)
                         + '_' + leaf.index.get_level_values(1))
    affected_hierarchy_ids[:, 11] = next_id + np.arange(num_series)

    # astype(str) sizes the unicode dtype to the longest id, so no id gets truncated
    return (np.concatenate(agg_series, axis=0),
            np.concatenate(agg_series_id, axis=0).astype(str),
            affected_hierarchy_ids)

In [96]:
# Aggregate the full label array Y into all levels and keep the row -> aggregate index mapping.
agg_series, agg_series_id, aff_hier_ids = get_aggregated_series(Y, sales_data_ids)

In [99]:
%%timeit
# Time one full aggregation pass over Y; the hierarchy-id output is discarded here.
agg_series, agg_series_id, _ = get_aggregated_series(Y, sales_data_ids)

3.99 s ± 59.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [2]:
def unison_shuffled_copies(a, b):
    """
    Shuffles two equally long arrays with one shared random permutation.

    Returns the shuffled copy of a, the shuffled copy of b and the permutation
    that was applied (so the original order can be restored later).
    """
    length = len(a)
    assert length == len(b)
    order = np.random.permutation(length)
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b, order

In [6]:
# Shuffle the labels and their ids with one shared permutation; p_s is that permutation.
Y_s, ids_s, p_s = unison_shuffled_copies(Y, sales_data_ids)

In [15]:
# Check the integer dtype of the inverse-permutation index array.
print(np.argsort(p_s).dtype)

int64


In [143]:
# np.argsort(p_s) is the inverse of the permutation p_s, so indexing with it
# puts the shuffled rows back into their original order.
Y_s = Y_s[np.argsort(p_s)]
ids_s = ids_s[np.argsort(p_s)]

In [144]:
# Aggregate the shuffled-then-restored arrays (presumably to compare against the unshuffled result).
agg_series_s, agg_series_id_s, aff_hier_ids_s = get_aggregated_series(Y_s, ids_s)

In [132]:
# Display the generated series ids.
agg_series_id

array(['Level1_Total_X', 'Level2_CA_X', 'Level2_TX_X', ...,
       'Level12_FOODS_3_825_WI_3', 'Level12_FOODS_3_826_WI_3',
       'Level12_FOODS_3_827_WI_3'], dtype='<U28')

#### Calculate weights for 42,840 series

In [105]:
def get_weights_all_levels(sales, sell_price, sales_data_ids):
    """
    Generates dollar-sales weights for the series of all 12 levels
    (42,840 series for the full 30,490 level 12 series)

    Input data format:
    sales: np array of shape (num_series, 28)
    sell_price: np array of shape (num_series, 28)

    sales_data_ids: np array of shape (num_series, 5)
                with 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id' as the columns

    Output data format (returned as a tuple, in this order):
    weights: np array with one weight per aggregated series, i.e. the series' dollar sales
             over the 28 timesteps divided by the grand total dollar sales
    agg_series_id: np array of series ids, aligned with weights

    Raises AssertionError if the two arrays differ in shape or do not span 28 timesteps
    """

    assert (sales.shape == sell_price.shape), "Sell price and Sales arrays have different sizes"
    assert (sales.shape[1] == 28), "Number of timesteps provided weight calculation is not equal to 28"

    # Get actual dollar sales for the 28 timesteps for the series of all levels
    dollar_sales = sales * sell_price
    agg_series, agg_series_id, _ = get_aggregated_series(dollar_sales, sales_data_ids)

    # Sum up the actual dollar sales over all 28 timesteps
    series_totals = agg_series.sum(1)

    # Level 1 is the single grand-total series. Every level splits the same rows into groups,
    # so this one total is the denominator for the series of every level.
    # ('Level1_' does not match 'Level10_'..'Level12_' because of the trailing underscore.)
    is_level1 = np.char.startswith(agg_series_id, 'Level1_')
    grand_total = series_totals[is_level1].sum()

    # Calculate weight for each series
    weights = series_totals / grand_total

    return weights, agg_series_id

In [107]:
# Compute weights from the 28-timestep window [-84:-56).
# NOTE(review): presumably X_enc_dec_feats[:, :, 0] holds the sell price with timesteps on the
# first axis (hence the transpose) — confirm against the preprocessing code.
weights, agg_series_id = get_weights_all_levels(Y[:, -84:-56], X_enc_dec_feats[:, :, 0].T[:, -84:-56], 
                                 sales_data_ids)

In [108]:
# Re-read the reference weights so the merge below starts from a fresh frame.
weights_validation = pd.read_csv('../../data/weights_validation.csv')

In [109]:
# Wrap the computed weights in a frame keyed by series id.
# NOTE: this rebinds `weights` from an array to a DataFrame, so re-running the cell
# would feed the DataFrame back in as the 'my_weight' column.
weights = pd.DataFrame({'id': agg_series_id, 'my_weight': weights})

# Build an id from the reference file's columns in the same '<Level_id>_<key1>_<key2>' layout.
weights_validation['id'] = weights_validation['Level_id'] + '_' \
                            + weights_validation['Agg_Level_1'] + '_' + weights_validation['Agg_Level_2']
# Left join keeps every reference row; ids without a computed weight get NaN in my_weight.
weights_validation = weights_validation.merge(right=weights, on='id', how='left')

In [112]:
# Row positions where the computed weight differs from the reference weight by 1e-7 or more.
# The absolute value catches deviations in both directions; rows with a NaN difference
# (e.g. ids that found no match in the merge) compare False and are therefore reported too.
np.where(~(np.abs((weights_validation.Weight - weights_validation.my_weight).values) < 1e-7))

(array([], dtype=int64),)