# Model Training
Finally we get to the good stuff. We will use linear regression which will probably suck, Adaboost which I have absolutely no underestanding of (apparently widely used for time series forecasting), and lgbm which I actually did some research on. I expect linear to be awful but hey, noone said this project has to win the competition that has been closed for like 2 years anyway.

### WRMSSE Calculation
As mentioned in notebook 1, these metrics will be used to determine our models score.
Big thanks to sakami for this implementation:
https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/133834

Note that due to having a competition leaderboard, we will forgo making a baseline model in lieu of a simple score comparison.

I will also refrain from reformatting the results into the proper submission format as there is nothing to submit anyway. The WRMSSE scores should be good enough to judge the models, I might go back to this and add it later, but for now this will have to do.

In [3]:
from typing import Union

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm


class WRMSSEEvaluator(object):

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, calendar: pd.DataFrame, prices: pd.DataFrame):
        train_y = train_df.loc[:, train_df.columns.str.startswith('d_')]
        train_target_columns = train_y.columns.tolist()
        weight_columns = train_y.iloc[:, -28:].columns.tolist()

        train_df['all_id'] = 0  # for lv1 aggregation

        id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')].columns.tolist()
        valid_target_columns = valid_df.loc[:, valid_df.columns.str.startswith('d_')].columns.tolist()

        if not all([c in valid_df.columns for c in id_columns]):
            valid_df = pd.concat([train_df[id_columns], valid_df], axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices

        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        weight_df = self.get_weight_df()

        self.group_ids = (
            'all_id',
            'state_id',
            'store_id',
            'cat_id',
            'dept_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            'item_id',
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(tqdm(self.group_ids)):
            train_y = train_df.groupby(group_id)[train_target_columns].sum()
            scale = []
            for _, row in train_y.iterrows():
                series = row.values[np.argmax(row.values != 0):]
                scale.append(((series[1:] - series[:-1]) ** 2).mean())
            setattr(self, f'lv{i + 1}_scale', np.array(scale))
            setattr(self, f'lv{i + 1}_train_df', train_y)
            setattr(self, f'lv{i + 1}_valid_df', valid_df.groupby(group_id)[valid_target_columns].sum())

            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{i + 1}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].set_index(['item_id', 'store_id'])
        weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']
        weight_df = weight_df.loc[zip(self.train_df.item_id, self.train_df.store_id), :].reset_index(drop=True)
        weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)
        return weight_df

    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        valid_y = getattr(self, f'lv{lv}_valid_df')
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        scale = getattr(self, f'lv{lv}_scale')
        return (score / scale).map(np.sqrt)

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds], axis=1, sort=False)

        all_scores = []
        for i, group_id in enumerate(self.group_ids):
            lv_scores = self.rmsse(valid_preds.groupby(group_id)[self.valid_target_columns].sum(), i + 1)
            weight = getattr(self, f'lv{i + 1}_weight')
            lv_scores = pd.concat([weight, lv_scores], axis=1, sort=False).prod(axis=1)
            all_scores.append(lv_scores.sum())

        return np.mean(all_scores)

## Imports and reading data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from downcast import reduce
import seaborn as sns
from sklearn.model_selection import train_test_split
import random
from typing import Union
from tqdm.notebook import tqdm_notebook as tqdm
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [9]:
train = pd.read_pickle('df_train_final.pkl')
valid = pd.read_pickle('df_valid_final.pkl')
test = pd.read_pickle('df_test_final.pkl')

train = train.reset_index().set_index('date')
valid = valid.reset_index().set_index('date')
test = test.reset_index().set_index('date')

train = reduce(train)
valid = reduce(valid)
test = reduce(test)

# Feature selection and final splits

In [12]:
train.columns

Index(['index', 'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
       'wm_yr_wk', 'wday', 'month', 'year', 'items_sold', 'sell_price',
       'event_num', 'snap', 'weekend', 'season', 'item_id_label',
       'dept_id_label', 'cat_id_label', 'store_id_label', 'state_id_label',
       'lag_7', 'lag_28', 'lag_35', 'lag_42', 'lag_60', 'lag_360', 'rmean_7_7',
       'rmean_28_7', 'rmean_35_7', 'rmean_42_7', 'rmean_60_7', 'rmean_360_7',
       'rmean_7_28', 'rmean_28_28', 'rmean_35_28', 'rmean_42_28',
       'rmean_60_28', 'rmean_360_28', 'rmean_7_35', 'rmean_28_35',
       'rmean_35_35', 'rmean_42_35', 'rmean_60_35', 'rmean_360_35',
       'rmean_7_42', 'rmean_28_42', 'rmean_35_42', 'rmean_42_42',
       'rmean_60_42', 'rmean_360_42', 'rmean_7_60', 'rmean_28_60',
       'rmean_35_60', 'rmean_42_60', 'rmean_60_60', 'rmean_360_60',
       'rmean_7_360', 'rmean_28_360', 'rmean_35_360', 'rmean_42_360',
       'rmean_60_360', 'rmean_360_360'],
      dtype='object')

In [16]:
unused = ['index', 'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id','wm_yr_wk','items_sold',]
features = list(set(train.columns)-set(unused))
target = ['items_sold']

In [15]:
features

['rmean_35_28',
 'rmean_42_35',
 'lag_28',
 'season',
 'event_num',
 'rmean_42_28',
 'rmean_35_35',
 'rmean_60_35',
 'rmean_60_42',
 'cat_id_label',
 'lag_360',
 'rmean_7_360',
 'month',
 'lag_42',
 'rmean_42_360',
 'rmean_42_60',
 'state_id_label',
 'lag_35',
 'rmean_360_42',
 'rmean_7_28',
 'rmean_35_60',
 'rmean_28_42',
 'dept_id_label',
 'year',
 'rmean_28_7',
 'rmean_42_42',
 'sell_price',
 'rmean_7_42',
 'lag_7',
 'rmean_360_28',
 'lag_60',
 'rmean_60_28',
 'rmean_360_360',
 'rmean_35_42',
 'rmean_60_360',
 'rmean_360_7',
 'store_id_label',
 'wday',
 'rmean_60_60',
 'rmean_360_35',
 'rmean_28_28',
 'rmean_42_7',
 'rmean_60_7',
 'rmean_35_7',
 'rmean_28_35',
 'rmean_7_60',
 'rmean_360_60',
 'snap',
 'item_id_label',
 'rmean_35_360',
 'rmean_7_35',
 'rmean_7_7',
 'weekend',
 'rmean_28_60',
 'rmean_28_360']

In [None]:
X_train = train[features]
y_train = train[target]

X_valid = valid[features]
y_valid = valid[target]

## 1. Linear Regression

In [None]:
regressor = LinearRegression(n_job=7, normalize=True)
regressor.fit