In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error
import requests
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import optuna
import xgboost as xgb
from dateutil.easter import easter
from datetime import timedelta

# Load

In [2]:
# Training extract: parse `date`, then derive '|'-joined composite keys for each
# combination of country / store / product. `series_id` aliases the full
# country|store|product key and identifies one time series.
sales_daily = (
    pd.read_csv("./data/external/train.csv")
    .assign(date=lambda raw: pd.to_datetime(raw['date']))
    .assign(
        country_store=lambda raw: raw['country'].str.cat(raw['store'], sep='|'),
        country_product=lambda raw: raw['country'].str.cat(raw['product'], sep='|'),
        store_product=lambda raw: raw['store'].str.cat(raw['product'], sep='|'),
        country_store_product=lambda raw: raw['country'].str.cat(
            [raw['store'], raw['product']], sep='|'
            ),
        )
    .assign(series_id=lambda keyed: keyed['country_store_product'])
)

In [None]:
# Preview the first rows of the loaded training frame, including the derived key columns.
sales_daily.head()

In [None]:
def extract_gdp_per_capita(country_code, year):
    """
    Request one World Bank observation of indicator NY.GDP.PCAP.CD for a country and year.

    Adapted from https://www.kaggle.com/competitions/playground-series-s5e1/discussion/554349.

    Parameters:
        country_code: country code placed in the URL path (e.g. 'CAN').
        year: year placed in the `date` query parameter.

    Returns:
        The `value` field of the first observation in the response.
        NOTE(review): the API may return null here for unpublished years — confirm.

    Raises:
        requests.HTTPError: the API answered with an HTTP error status.
        requests.Timeout: no answer arrived within the timeout.
        ValueError: the response carries no observation list.
    """

    url='https://api.worldbank.org/v2/country/{0}/indicator/NY.GDP.PCAP.CD?date={1}&format=json'

    # Without a timeout a stalled connection blocks the notebook indefinitely.
    response = requests.get(url.format(country_code, year), timeout=30)
    response.raise_for_status()
    payload = response.json()

    # The value is read from payload[1][0]; anything without a non-empty second
    # element cannot be indexed that way, so fail with a message naming the request.
    if not isinstance(payload, list) or len(payload) < 2 or not payload[1]:
        raise ValueError(
            f"World Bank API returned no observation for {country_code} in {year}: {payload!r}"
            )

    return payload[1][0]['value']

# per CountryCode-year: request GDP per capita.
# concatenate dataframe of CountryCode | Country | Year | GDP, for integration to Kaggle source

# Country name as it appears in the sales data -> code used in the World Bank URL.
countries_code_map = {
    'Canada': 'CAN', 
    'Finland': 'FIN',
    'Italy': 'ITA',
    'Kenya': 'KEN',
    'Norway': 'NOR',
    'Singapore': 'SGP'
    }

countries_gdp_yearly = []
for country_title, country_code in countries_code_map.items():

    # One record per requested year, turned into a frame in a single constructor
    # call (rather than one single-row DataFrame per year followed by a concat).
    values_yearly = pd.DataFrame([
        {'year': i, 'gdp_per_capita': extract_gdp_per_capita(country_code, i)}
        for i in range(2010, 2019+1)
        ])

    values_yearly = values_yearly.assign(
        country = country_title,
        country_code = country_code
        )

    countries_gdp_yearly.append(values_yearly)

    print(f"{country_title} ({country_code}) GDP Per Capita extraction complete.")

# ignore_index gives the combined table a clean 0..n-1 index.
countries_gdp_yearly = pd.concat(countries_gdp_yearly, axis=0, ignore_index=True)

countries_gdp_yearly = countries_gdp_yearly.assign(
    # Coerce to float first: a missing (None) value would otherwise leave an
    # object column on which np.log raises. Missing values become NaN here and
    # are caught by the null assertion when the table is merged into the sales data.
    gdp_per_capita = lambda df_: pd.to_numeric(df_['gdp_per_capita']),
    gdp_per_capita_log = lambda df_: np.log(df_['gdp_per_capita'])
    )

In [None]:
# GDP per capita by year, one line per country.
# The output-suppressing semicolon sits on the closing parenthesis: a line that
# holds only ';' is not a valid Python statement and raises SyntaxError.
(
    countries_gdp_yearly
    .set_index('year')
    .groupby('country')
    ['gdp_per_capita']
    .plot(legend=True)
);

In [6]:
# Easter date for each year 2010-2019, flagged with is_easter = 1.
days_easter0 = [easter(year) for year in range(2010, 2019+1)]
days_easter = pd.DataFrame({'date': days_easter0}).assign(is_easter = 1)

# Motivated by exploratory analysis of model errors: errors appear to
# concentrate on the days shortly after Easter, so each of Easter+2 .. Easter+7
# gets its own indicator column.
dfs_days_special_relative_easter = [days_easter] + [
    pd.DataFrame({
        'date': days_easter['date'] + timedelta(days=offset),
        f'is_easter_plus{offset}': days_easter['is_easter'],
        })
    for offset in range(2, 7 + 1)
    ]

# Stack the per-offset frames; a date carries 1 in its own flag column and,
# after the fill, 0 in every other flag column.
days_special_relative_easter = pd.concat(dfs_days_special_relative_easter, axis=0).fillna(0)
days_special_relative_easter = days_special_relative_easter.assign(
    date = pd.to_datetime(days_special_relative_easter['date'])
    )

# One row per calendar date is required for the later left merge on `date`.
assert days_special_relative_easter['date'].is_unique

FEATURES_EASTER = [col for col in days_special_relative_easter.columns if 'easter' in col]

# Data Understanding

## Data Description Report: "Surface Properties"

In [None]:
# Check whether `id` uniquely identifies each row.
sales_daily['id'].is_unique

### Volumetric Analyses

In [None]:
# (rows, columns) of the training frame.
sales_daily.shape

In [None]:
# Number of distinct country|store|product series.
sales_daily['series_id'].nunique()

In [None]:
# Number of distinct dates.
sales_daily['date'].nunique()

In [None]:
# Inner value_counts: rows per series. Outer value_counts: how many series share
# each row count — a single entry means every series has the same length.
sales_daily['series_id'].value_counts().value_counts()

In [None]:
# Rows per product, with missing values counted as their own category.
sales_daily['product'].value_counts(dropna=False)

In [None]:
# Rows per country, with missing values counted as their own category.
sales_daily['country'].value_counts(dropna=False)

In [None]:
# Rows per store, with missing values counted as their own category.
sales_daily['store'].value_counts(dropna=False)

### Fields' Types and Values

In [None]:
# Summary of the date column (count and range).
sales_daily['date'].describe()

In [None]:
# Summary statistics of the target column.
sales_daily['num_sold'].describe()

## Data Quality Report

In [None]:
# Share of missing values per column (mean of the boolean null mask).
sales_daily.isnull().mean()

In [None]:
# Are rows with a missing `num_sold` concentrated on particular dates?
# Counts missing-target rows per date; the author's reading is that they are not concentrated.
sales_daily.query("num_sold.isnull()")['date'].value_counts()

In [None]:
# Count missing-target rows per series, to see which series carry the nulls.
sales_daily.query("num_sold.isnull()")['series_id'].value_counts()

## Data Exploration Report

In [None]:
# Total units sold per series, largest first.
sales_daily.groupby('series_id')['num_sold'].sum().sort_values(ascending=False)

In [None]:
# Naive baseline: predict each series' mean daily sales over 2010-2013 and
# score it on everything from 2014-01-01 onward.

is_training = sales_daily['date'].between(
    pd.to_datetime('2010-01-01'), pd.to_datetime("2014-01-01"), inclusive='left'
)
is_validation = sales_daily['date'].ge(pd.to_datetime("2014-01-01"))

# is_training.sum(), is_validation.sum()

# Per-series average, computed strictly on the training segment.
predictions_naive = (
    sales_daily
    .loc[is_training]
    .groupby('series_id', as_index=False)
    ['num_sold']
    .mean()
)

# Attach each series' average to its validation rows as `yhat`.
predictions_naive_evaluate = (
    sales_daily
    .loc[is_validation]
    .merge(predictions_naive.rename(columns={'num_sold': 'yhat'}), how='left')
)

# expect a couple series with all null
predictions_naive.isnull().sum()
# Dropping rows with nulls before scoring is recommended in this discussion:
# https://www.kaggle.com/competitions/playground-series-s5e1/discussion/554553
predictions_naive_evaluate = predictions_naive_evaluate.dropna()

mean_absolute_percentage_error(
    y_true=predictions_naive_evaluate['num_sold'],
    y_pred=predictions_naive_evaluate['yhat']
)

In [None]:
# (rows, columns) of the scored validation rows that remain after dropping nulls.
predictions_naive_evaluate.shape

In [None]:
# Naive baseline, recent-history variant: predict each series' mean daily sales
# over 2013 only and score it on everything from 2014-01-01 onward.

is_training = sales_daily['date'].between(
    pd.to_datetime('2013-01-01'), pd.to_datetime("2014-01-01"), inclusive='left'
)
is_validation = sales_daily['date'].ge(pd.to_datetime("2014-01-01"))

# is_training.sum(), is_validation.sum()

# Per-series average, computed strictly on the training segment.
predictions_naive = (
    sales_daily
    .loc[is_training]
    .groupby('series_id', as_index=False)
    ['num_sold']
    .mean()
)

# Attach each series' average to its validation rows as `yhat`.
predictions_naive_evaluate = (
    sales_daily
    .loc[is_validation]
    .merge(predictions_naive.rename(columns={'num_sold': 'yhat'}), how='left')
)

# expect a couple series with all null
predictions_naive.isnull().sum()
# Dropping rows with nulls before scoring is recommended in this discussion:
# https://www.kaggle.com/competitions/playground-series-s5e1/discussion/554553
predictions_naive_evaluate = predictions_naive_evaluate.dropna()

mean_absolute_percentage_error(
    y_true=predictions_naive_evaluate['num_sold'],
    y_pred=predictions_naive_evaluate['yhat']
)

In [None]:
# One example series, plotted over whatever window `is_training` currently
# selects (the mask is defined in an earlier cell and aligned on the row index).
sales_sample_daily = sales_daily.query("series_id == 'Norway|Premium Sticker Mart|Kaggle'")

# The output-suppressing semicolon sits on the closing parenthesis: a line that
# holds only ';' is not a valid Python statement and raises SyntaxError.
(
    sales_sample_daily
    .loc[is_training]
    [['date', 'num_sold']]
    .set_index('date')
    .plot
    .line()
);

# Data Preparation

In [25]:
def transform_calendar_features(df):
    """
    Add calendar-derived columns and Easter-relative flags to a daily frame.

    Reads the module-level `days_special_relative_easter` table and the
    `FEATURES_EASTER` column list.

    Parameters:
        df: DataFrame with a datetime `date` column.

    Returns:
        A new DataFrame with the calendar columns (year, month, week_of_year,
        day_of_week, day_of_month, day_of_year, days_since_start), their
        sinusoidal encodings, `is_yearend`, and the Easter flag columns
        (0 on dates absent from the Easter table).

    Raises:
        AssertionError: no row of `df` matched any date in the Easter table.
    """

    df = (
        df
        .assign(
            year = lambda df_: df_['date'].dt.year,
            month = lambda df_: df_['date'].dt.month,
            week_of_year = lambda df_: df_['date'].dt.isocalendar().week,
            day_of_week = lambda df_: df_['date'].dt.day_name(),
            # President's Day is the 'third Monday in February'
            day_of_month = lambda df_: df_['date'].dt.day,
            day_of_year = lambda df_: df_['date'].dt.dayofyear,
            # week of month would be ambiguous because, one week may span 2 months,
            # Days elapsed since 2010-01-01. Uses the lambda's own argument `df_`
            # (not the enclosing `df`) so the expression depends only on the
            # frame that .assign is operating on.
            days_since_start = lambda df_: (df_['date'] - pd.to_datetime("2010-01-01")).dt.days
            )
        .assign(
            # as day_of_year rises, don't expect monotonic relationship with outcome.
            # rather, expect periodic (sinusoidal) relationship.
            # as sin(x) rises, so too does outcome ...
            # ensure one cycle over one year.
            # at baseline, one sinusoidal cycle occurs per 2π
            day_of_year_sin = lambda df_: np.sin(df_['day_of_year'] * 2 * np.pi / 365),
            day_of_year_cos = lambda df_: np.cos(df_['day_of_year'] * 2 * np.pi / 365),

            # one cycle per 30 days
            day_of_month_sin = lambda df_: np.sin(df_['day_of_month'] * 2 * np.pi / 30),
            day_of_month_cos = lambda df_: np.cos(df_['day_of_month'] * 2 * np.pi / 30),

            # exploratory visuals suggest ~2-year cycles
            days_since_start_macro_sin = lambda df_: np.sin(df_['days_since_start'] * 2 * np.pi / 730),
            days_since_start_macro_cos = lambda df_: np.cos(df_['days_since_start'] * 2 * np.pi / 730),
            )
        .assign(
            # 1 on December 28-31, else 0
            is_yearend = lambda df_: (
                (df_['month'] == 12) & (df_['day_of_month'].isin([28, 29, 30, 31]))
                ).astype(int)
            )

        )
    
    # Left merge on the shared column(s) keeps every input row; dates that are
    # not in the Easter table come back with NaN flags.
    df = pd.merge(df, days_special_relative_easter, how='left')
    # Sanity check: at least one row must have matched an Easter-relative date.
    assert df['is_easter'].notnull().any()
    df[FEATURES_EASTER] = df[FEATURES_EASTER].fillna(0)
    
    return df

def integrate_external_features(df):
    """
    Left-join the module-level `countries_gdp_yearly` table onto `df`.

    The join uses whatever columns the two frames have in common. Fails with
    an AssertionError if any resulting row has no `gdp_per_capita`.
    """

    enriched = df.merge(countries_gdp_yearly, how='left')

    # every row must have found its GDP record
    assert not enriched['gdp_per_capita'].isnull().any()

    return enriched

# Enrich the training frame with calendar/Easter features, then with GDP.
# NOTE(review): `sales_daily` is overwritten in place, so the exploratory cells
# above see a different frame if they are re-run after this cell.
sales_daily = transform_calendar_features(sales_daily)
sales_daily = integrate_external_features(sales_daily)

In [26]:
# Categorical source columns that the pipeline one-hot encodes.
FEATURES_TO_ONEHOT = [
    'country', 
    'store',
    'product',
    'country_store',
    'country_product',
    'store_product',
    'country_store_product',
    # year attempted, but then omitted, because year-grained shifts
    # should be explained by exogenous factors, 
    # out-of-sample forecasts that aren't flat
    'month', 
    'week_of_year', 
    'day_of_week'
    ]
# Numeric source columns that the pipeline standardizes.
FEATURES_NUMERIC_CONTINUOUS = [
    'gdp_per_capita_log',
    'day_of_month', 
    'day_of_month_sin',
    'day_of_month_cos',
    'day_of_year_sin',
    'day_of_year_cos',
    'day_of_year',
    'days_since_start_macro_sin',
    'days_since_start_macro_cos',
    'days_since_start'
    ]
# Columns that are already 0/1 indicators and pass through untouched.
FEATURES_ONEHOT = ['is_yearend'] + FEATURES_EASTER
FEATURES_SOURCE_FORM = FEATURES_TO_ONEHOT + FEATURES_NUMERIC_CONTINUOUS + FEATURES_ONEHOT

# Identifier columns carried alongside the features but never used as features.
ATTRIBUTES = ['series_id', 'date', 'id']

# Rows without a target cannot be used for fitting or scoring.
sales_daily_complete = sales_daily.dropna(subset='num_sold')

XY = sales_daily_complete[['num_sold'] + ATTRIBUTES + FEATURES_SOURCE_FORM]

# Models are fitted on the log of the target; predictions are exponentiated back.
XY = XY.assign(num_sold_log = lambda df_: np.log(df_['num_sold']))

feature_transform_pipeline = ColumnTransformer([
    ('transformer_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), FEATURES_TO_ONEHOT),
    ('transformer_std', StandardScaler(), FEATURES_NUMERIC_CONTINUOUS)
    ],
    verbose_feature_names_out=False,
    # target, attributes and indicator columns are passed through unchanged
    remainder='passthrough'
    )
feature_transform_pipeline.set_output(transform='pandas')

XY = feature_transform_pipeline.fit_transform(XY)

# Every transformed column that is neither an attribute nor a target.
# sorted() fixes the order: iterating a set of strings varies between
# interpreter sessions, which would change the column order fed to the models
# from one kernel restart to the next.
FEATURES_UNIVERSE = sorted( set(XY.columns).difference(set(ATTRIBUTES + ['num_sold', 'num_sold_log'])) )

FEATURES_GLOBAL_MODEL = [
    x for x in FEATURES_UNIVERSE
    # inclusion yields flat predictions.
    # moreover, sinusoidal transforms drag performance
    if not any(stem in x for stem in ['days_since_start'])
    ]

# Level/trend features: entity indicators and GDP, excluding the per-series
# (country_store_product) indicators.
FEATURES_GLOBAL_TREND_LEVEL_MODEL = [
    x for x in FEATURES_UNIVERSE 
    if any(stem in x for stem in ['country', 'store', 'product', 'gdp']) and ('country_store_product' not in x)
    ]

# Per-series models: entity indicators are dropped.
FEATURES_LOCAL_MODEL = [
    x for x in FEATURES_UNIVERSE 
    if not any(stem in x for stem in ['country', 'store', 'product'])
    ]

# Features shared by the local and global lists, minus GDP; order follows
# FEATURES_LOCAL_MODEL so it is deterministic as well.
FEATURES_LOCAL_REMAINDER_MODEL = [x for x in FEATURES_LOCAL_MODEL if x in set(FEATURES_GLOBAL_MODEL)]
FEATURES_LOCAL_REMAINDER_MODEL.remove('gdp_per_capita_log')

# Modeling

## Global Model

In [27]:
class TrendRemainderModelPipeline:
    """
    Two-stage regressor: a Ridge "trend/level" model plus a random forest
    fitted to the Ridge model's residuals.

    `predict` returns the sum of the two stages' predictions. Both stages
    select their own columns from the frame they are given, so `X` may carry
    extra columns.
    """

    def __init__(
        self, 
        trend_model_features, 
        trend_model_ridge_alpha, 
        remainder_model_features,
        remainder_model_random_state=None
        ):
        """
        Parameters:
            trend_model_features: column names fed to the Ridge stage.
            trend_model_ridge_alpha: first positional argument given to Ridge.
            remainder_model_features: column names fed to the random forest stage.
            remainder_model_random_state: seed forwarded to the random forest.
                The default None leaves the forest unseeded, as before; pass
                an int to make repeated fits reproducible.
        """

        self.trend_model_features = trend_model_features
        self.trend_level_model_ridge_alpha = trend_model_ridge_alpha

        self.remainder_model_features = remainder_model_features
        self.remainder_model_random_state = remainder_model_random_state

    def fit(self, X, y):
        """Fit the trend stage on `y`, then the remainder stage on `y` minus the trend prediction. Returns self."""

        self.fit_trend_level_model(X, y)
        yhat_trend_level = self.predict_trend_level_model(X)
        y_detrended = y - yhat_trend_level

        self.fit_remainder_model(X, y_detrended)

        return self

    def predict(self, X):
        """Return trend prediction + remainder prediction for the rows of `X`."""

        yhat_trend_level = self.predict_trend_level_model(X)
        yhat_remainder = self.predict_remainder_model(X)

        return yhat_trend_level + yhat_remainder

    def fit_trend_level_model(self, X, y):
        """Fit the Ridge stage on the trend feature columns and store it on `self.trend_level_model`."""

        model_trend_level = Ridge(self.trend_level_model_ridge_alpha)
        model_trend_level.fit(X[self.trend_model_features], y)

        self.trend_level_model = model_trend_level

    def fit_remainder_model(self, X, y):
        """Fit the random forest stage on the remainder feature columns and store it on `self.remainder_model`."""

        model_remainder = RandomForestRegressor(
            n_estimators=100,
            n_jobs=-1,
            random_state=self.remainder_model_random_state
            )
        model_remainder.fit(X[self.remainder_model_features], y)

        self.remainder_model = model_remainder

    def predict_trend_level_model(self, X):
        """Predict with the fitted Ridge stage."""
        return self.trend_level_model.predict(X[self.trend_model_features])
    
    def predict_remainder_model(self, X):
        """Predict with the fitted random forest stage."""
        return self.remainder_model.predict(X[self.remainder_model_features])

In [None]:
# Two expanding-window folds as (training mask, validation mask) pairs.
kfolds = [

    # validation set 2014-16 matches ultimate test set's length, 2017-19
    ( 
        ( (XY['date'] >= pd.to_datetime('2010-01-01')) & (XY['date'] < pd.to_datetime("2014-01-01")) ),
        XY['date'] >= pd.to_datetime("2014-01-01")
    ),

    ( 
        ( (XY['date'] >= pd.to_datetime('2010-01-01')) & (XY['date'] < pd.to_datetime("2016-01-01")) ),
        XY['date'] >= pd.to_datetime("2016-01-01")
    )

    ]

kfolds_evaluation = []
for is_training, is_validation in kfolds:

    # Seeded so that fold scores and the feature importances read from
    # `model_global` afterwards are reproducible across runs. 777 is the seed
    # already used by the permutation-importance snippet in this notebook.
    model_global = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=777)
    model_global.fit(
        XY.loc[is_training, FEATURES_GLOBAL_MODEL],
        XY.loc[is_training, 'num_sold_log']
        )
    
    # Predict every row (train and validation); the model works on the log
    # scale, so predictions are exponentiated back to units sold.
    predictions = (
        XY
        .copy()
        .assign(yhat = lambda df_: np.exp(model_global.predict(df_[FEATURES_GLOBAL_MODEL])))
        )
    
    scores = {
        'validation': mean_absolute_percentage_error( 
            predictions.loc[is_validation, 'num_sold'],
            predictions.loc[is_validation, 'yhat']
            ),
        'train': mean_absolute_percentage_error( 
            predictions.loc[is_training, 'num_sold'],
            predictions.loc[is_training, 'yhat']
            )
        }
    
    kfolds_evaluation.append(scores)

kfolds_evaluation

In [None]:
# Actual vs. predicted sales for one example series. `predictions` comes from
# the last fold of the loop above and holds both train and validation rows.
predictions_sample = predictions.query("series_id == 'Norway|Stickers for Less|Kaggle'")
# predictions_sample = predictions.query("series_id == 'Norway|Premium Sticker Mart|Kaggle'")

predictions_sample.set_index('date')[['num_sold', 'yhat']].plot.line(alpha=0.5);

In [30]:
# NOTE(review): this import sits mid-notebook and is only used by the
# commented-out permutation-importance call below.
from sklearn.inspection import permutation_importance

# Alternative kept for reference: permutation importance on the validation rows.
# result = permutation_importance(
#     model_global, 
#     XY.loc[is_validation, FEATURES_GLOBAL_MODEL],
#     XY.loc[is_validation, 'num_sold_log'], 
#     n_repeats=5, 
#     random_state=777, 
#     n_jobs=-1
# )
# importances = pd.Series(result.importances_mean, index=FEATURES_GLOBAL_MODEL)

# Importances read from whatever `model_global` currently holds; it must expose
# `feature_importances_` and have been fitted on FEATURES_GLOBAL_MODEL in this order.
importances = pd.Series(model_global.feature_importances_, index=FEATURES_GLOBAL_MODEL)

In [None]:
# The 25 features with the largest importance.
importances.sort_values(ascending=False).head(25)

In [None]:
# Two expanding-window folds as (training mask, validation mask) pairs: train
# from 2010-01-01 up to the cutoff, validate on everything from the cutoff on.
# The 2014 cutoff gives a 2014-16 validation window matching the length of the
# ultimate 2017-19 test set.
kfolds = [
    (
        (XY['date'] >= pd.to_datetime('2010-01-01')) & (XY['date'] < pd.to_datetime(cutoff)),
        XY['date'] >= pd.to_datetime(cutoff),
    )
    for cutoff in ("2014-01-01", "2016-01-01")
    ]

kfolds_evaluation = []
for is_training, is_validation in kfolds:

    # Global Ridge on the log target, fitted on the fold's training rows only.
    model_global = Ridge(1e-2)
    model_global.fit(
        XY.loc[is_training, FEATURES_GLOBAL_MODEL],
        XY.loc[is_training, 'num_sold_log']
        )

    # Predict every row and map back from the log scale.
    predictions = XY.copy()
    predictions['yhat'] = np.exp(model_global.predict(predictions[FEATURES_GLOBAL_MODEL]))

    scores = {
        split: mean_absolute_percentage_error(
            predictions.loc[mask, 'num_sold'],
            predictions.loc[mask, 'yhat']
            )
        for split, mask in (('validation', is_validation), ('train', is_training))
        }

    kfolds_evaluation.append(scores)

kfolds_evaluation

In [None]:
# Actual vs. predicted sales for one example series, using the `predictions`
# frame left behind by the last fold of the loop above.
predictions_sample = predictions.query("series_id == 'Norway|Stickers for Less|Kaggle'")
# predictions_sample = predictions.query("series_id == 'Norway|Premium Sticker Mart|Kaggle'")

predictions_sample.set_index('date')[['num_sold', 'yhat']].plot.line(alpha=0.5);

In [None]:
# Two expanding-window folds as (training mask, validation mask) pairs: train
# from 2010-01-01 up to the cutoff, validate on everything from the cutoff on.
# The 2014 cutoff gives a 2014-16 validation window matching the length of the
# ultimate 2017-19 test set.
kfolds = [
    (
        (XY['date'] >= pd.to_datetime('2010-01-01')) & (XY['date'] < pd.to_datetime(cutoff)),
        XY['date'] >= pd.to_datetime(cutoff),
    )
    for cutoff in ("2014-01-01", "2016-01-01")
    ]

kfolds_evaluation = []
for is_training, is_validation in kfolds:

    # Ridge on the level/trend features, random forest on its residuals.
    model_global_pipeline = TrendRemainderModelPipeline(
        FEATURES_GLOBAL_TREND_LEVEL_MODEL,
        1e-2,
        FEATURES_GLOBAL_MODEL
        )
    model_global_pipeline.fit(
        XY.loc[is_training].drop(columns='num_sold_log'),
        XY.loc[is_training, 'num_sold_log']
        )

    # Predict every row on the log scale, then map back to units sold.
    predictions = XY.copy()
    predictions['yhat_log'] = model_global_pipeline.predict(predictions)
    predictions['yhat'] = np.exp(predictions['yhat_log'])

    scores = {
        split: mean_absolute_percentage_error(
            predictions.loc[mask, 'num_sold'],
            predictions.loc[mask, 'yhat']
            )
        for split, mask in (('validation', is_validation), ('train', is_training))
        }

    kfolds_evaluation.append(scores)

kfolds_evaluation

In [None]:
# Actual vs. predicted sales for one example series, using the `predictions`
# frame left behind by the last fold of the loop above.
predictions_sample = predictions.query("series_id == 'Norway|Stickers for Less|Kaggle'")
# predictions_sample = predictions.query("series_id == 'Norway|Premium Sticker Mart|Kaggle'")

predictions_sample.set_index('date')[['num_sold', 'yhat']].plot.line(alpha=0.5);

In [36]:
# gradient boosting machine does not appear to improve global model.

# def objective(trial):

#     ROUNDS_COUNT = 100

#     param = {
#         "objective": "reg:squarederror",
#         "booster": trial.suggest_categorical(
#             "booster", ["gbtree", "gblinear", "dart"]
#         ),
#         "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
#         "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
#         "subsample": trial.suggest_float("subsample", 0.4, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
#     }

#     if param["booster"] in ["gbtree", "dart"]:
#         param["max_depth"] = trial.suggest_int("max_depth", 1, 9, step=2)

#         param["min_child_weight"] = trial.suggest_int("min_child_weight", 1, 10)
#         param["eta"] = trial.suggest_float("eta", 1e-5, 0.01, log=True)

#         param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
#         param["grow_policy"] = trial.suggest_categorical(
#             "grow_policy", ["depthwise", "lossguide"]
#         )

#     if param["booster"] == "dart":
#         param["sample_type"] = trial.suggest_categorical(
#             "sample_type", ["uniform", "weighted"]
#         )
#         param["normalize_type"] = trial.suggest_categorical(
#             "normalize_type", ["tree", "forest"]
#         )
#         param["rate_drop"] = trial.suggest_float(
#             "rate_drop", 1e-8, 1.0, log=True
#         )
#         param["skip_drop"] = trial.suggest_float(
#             "skip_drop", 1e-8, 1.0, log=True
#         )


#     kfolds_evaluation = []
#     for is_training, is_validation in kfolds:

#         dtrain = xgb.DMatrix(
#             XY.loc[is_training, FEATURES_GLOBAL_MODEL], 
#             label=XY.loc[is_training, 'num_sold_log']
#             )
#         dtest = xgb.DMatrix(XY[FEATURES_GLOBAL_MODEL])

#         model_global = xgb.train(param, dtrain, ROUNDS_COUNT)
        
#         yhat = model_global.predict(dtest)
#         predictions = (
#             XY
#             .copy()
#             .assign(yhat = lambda df_: np.exp(yhat))
#             )

#         scores = {
#             'validation': mean_absolute_percentage_error( 
#                 predictions.loc[is_validation, 'num_sold'],
#                 predictions.loc[is_validation, 'yhat']
#                 ),
#             'train': mean_absolute_percentage_error( 
#                 predictions.loc[is_training, 'num_sold'],
#                 predictions.loc[is_training, 'yhat']
#                 )
#             }
        
#         kfolds_evaluation.append(scores)

#     score_overall = np.mean([ 
#         kfolds_evaluation[0]['validation'], 
#         kfolds_evaluation[1]['validation'] 
#         ])
    
#     return score_overall

# study = optuna.create_study()

# study.optimize(
#     objective,
#     n_trials=50,
#     catch=(ValueError,),
#     n_jobs=-1,
#     timeout=12 * 60 * 60,
# )

In [None]:
# Final global model, refitted on every labelled row. The deployment cell falls
# back to it for series that have no local model.
model_global = TrendRemainderModelPipeline(
    trend_model_features=FEATURES_GLOBAL_TREND_LEVEL_MODEL,
    trend_model_ridge_alpha=1e-2,
    remainder_model_features=FEATURES_GLOBAL_MODEL,
)
model_global.fit(X=XY.drop(columns='num_sold_log'), y=XY['num_sold_log'])

## Local Models

In [38]:
# One model per series: motivated by the leaders of previous retail forecasting
# competitions and by the heterogeneity expected between series.

segments_XY = {series_id: segment for series_id, segment in XY.groupby('series_id')}

# The fold masks are built on the full frame. They still work on each
# per-series frame because boolean .loc selection aligns on the row index,
# and the split frames keep their original index labels.
# The 2014 cutoff gives a 2014-16 validation window matching the length of the
# ultimate 2017-19 test set.
kfolds = [
    (
        (XY['date'] >= pd.to_datetime('2010-01-01')) & (XY['date'] < pd.to_datetime(cutoff)),
        XY['date'] >= pd.to_datetime(cutoff),
    )
    for cutoff in ("2014-01-01", "2016-01-01")
    ]

kfolds_evaluation = []
for is_training, is_validation in kfolds:

    # Per series: fit on the fold's training rows.
    # WARNING: a fresh Ridge is constructed for every series on purpose.
    # Re-fitting one shared instance would not give separate model objects;
    # every dictionary entry would point at the same estimator.
    segments_models = {
        grp: Ridge(1e-1).fit(
            segment.loc[is_training, FEATURES_LOCAL_MODEL],
            segment.loc[is_training, 'num_sold_log']
            )
        for grp, segment in segments_XY.items()
        }

    # Per series: predict every row and map back from the log scale.
    segments_predictions = [
        segment.assign(
            yhat = np.exp(segments_models[grp].predict(segment[FEATURES_LOCAL_MODEL]))
            )
        for grp, segment in segments_XY.items()
        ]

    predictions = pd.concat(segments_predictions, axis=0)

    scores = {
        split: mean_absolute_percentage_error(
            predictions.loc[mask, 'num_sold'],
            predictions.loc[mask, 'yhat']
            )
        for split, mask in (('validation', is_validation), ('train', is_training))
        }

    kfolds_evaluation.append(scores)

In [None]:
# Validation and train MAPE per fold for the per-series Ridge models.
kfolds_evaluation

In [None]:
def objective(trial):
    """
    Optuna objective: mean validation MAPE of per-series Ridge models.

    Samples a Ridge `alpha` log-uniformly from [1e-4, 1], fits one Ridge per
    series on each fold's training rows, and returns the validation MAPE
    averaged over all folds in the module-level `kfolds`. Also reads the
    module-level `segments_XY` and `FEATURES_LOCAL_MODEL`.
    """

    parameters = {'alpha': trial.suggest_float("alpha", 1e-4, 1, log=True)}

    validation_scores = []
    for is_training, is_validation in kfolds:

        # per group: fit model on train, predict on validation
        segments_models = {
            grp: Ridge(**parameters).fit(
                segment.loc[is_training, FEATURES_LOCAL_MODEL], 
                segment.loc[is_training, 'num_sold_log']
                )
            for grp, segment in segments_XY.items()
            }

        segments_predictions = [
            segment.assign(
                yhat = lambda df_: np.exp(segments_models[grp].predict(df_[FEATURES_LOCAL_MODEL])) 
                )
            for grp, segment in segments_XY.items()
            ]

        predictions = pd.concat(segments_predictions, axis=0)

        # Only the validation score feeds the objective, so the train score is
        # not computed here.
        validation_scores.append(
            mean_absolute_percentage_error( 
                predictions.loc[is_validation, 'num_sold'],
                predictions.loc[is_validation, 'yhat']
                )
            )

    # Average over however many folds `kfolds` holds, not a fixed pair of indices.
    score_overall = np.mean(validation_scores)
    
    return score_overall

# Search for the Ridge alpha that minimizes the objective (mean validation MAPE).
# Trials that raise ValueError are recorded as failed instead of aborting the
# study; the search stops after 50 trials or 12 hours, whichever comes first.
study = optuna.create_study(direction="minimize")

study.optimize(
    objective,
    n_trials=50,
    timeout=12 * 60 * 60,
    n_jobs=-1,
    catch=(ValueError,),
)

In [None]:
# Best trial, its sampled parameters, and its objective value.
study.best_trial, study.best_params, study.best_value

In [42]:
# INCOMPLETE: stalled after ~20 minute runtime, and score not better than 0.1

# def objective(trial):

#     ROUNDS_COUNT = 100

#     param = {
#         "objective": "reg:squarederror",
#         "booster": trial.suggest_categorical(
#             "booster", ["gbtree", "gblinear", "dart"]
#         ),
#         "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
#         "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
#         "subsample": trial.suggest_float("subsample", 0.4, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
#     }

#     if param["booster"] in ["gbtree", "dart"]:
#         param["max_depth"] = trial.suggest_int("max_depth", 1, 9, step=2)

#         param["min_child_weight"] = trial.suggest_int("min_child_weight", 1, 10)
#         param["eta"] = trial.suggest_float("eta", 1e-5, 0.01, log=True)

#         param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
#         param["grow_policy"] = trial.suggest_categorical(
#             "grow_policy", ["depthwise", "lossguide"]
#         )

#     if param["booster"] == "dart":
#         param["sample_type"] = trial.suggest_categorical(
#             "sample_type", ["uniform", "weighted"]
#         )
#         param["normalize_type"] = trial.suggest_categorical(
#             "normalize_type", ["tree", "forest"]
#         )
#         param["rate_drop"] = trial.suggest_float(
#             "rate_drop", 1e-8, 1.0, log=True
#         )
#         param["skip_drop"] = trial.suggest_float(
#             "skip_drop", 1e-8, 1.0, log=True
#         )

#     kfolds_evaluation = []
#     for is_training, is_validation in kfolds:

#         segments_models = {}
#         segments_predictions = []
#         for grp, XY in segments_XY.items():

#             dtrain = xgb.DMatrix(
#                 XY.loc[is_training, FEATURES_LOCAL_MODEL], 
#                 label=XY.loc[is_training, 'num_sold_log']
#                 )
#             dtest = xgb.DMatrix(XY[FEATURES_LOCAL_MODEL])

#             model_global = xgb.train(param, dtrain, ROUNDS_COUNT)
#             segments_models[grp] = model_global
        
#             yhat = model_global.predict(dtest)
#             predictions = (
#                 XY
#                 .copy()
#                 .assign(yhat = lambda df_: np.exp(yhat))
#                 )
#             segments_predictions.append(predictions)

#         predictions = pd.concat(segments_predictions, axis=0)

#         scores = {
#             'validation': mean_absolute_percentage_error( 
#                 predictions.loc[is_validation, 'num_sold'],
#                 predictions.loc[is_validation, 'yhat']
#                 ),
#             'train': mean_absolute_percentage_error( 
#                 predictions.loc[is_training, 'num_sold'],
#                 predictions.loc[is_training, 'yhat']
#                 )
#             }
        
#         kfolds_evaluation.append(scores)

#     score_overall = np.mean([ 
#         kfolds_evaluation[0]['validation'], 
#         kfolds_evaluation[1]['validation'] 
#         ])
    
#     return score_overall

# study = optuna.create_study()

# study.optimize(
#     objective,
#     n_trials=25,
#     catch=(ValueError,),
#     n_jobs=-1,
#     timeout=60 * 10,
# )

# study.best_trial, study.best_params, study.best_value

In [None]:
# Ridge penalty chosen by the Optuna study above.
RIDGE_ALPHA_TUNED = study.best_params['alpha']

# One model per series: motivated by the leaders of previous retail forecasting
# competitions and by the heterogeneity expected between series.

segments_XY = {series_id: segment for series_id, segment in XY.groupby('series_id')}

# The fold masks are built on the full frame. They still work on each
# per-series frame because boolean .loc selection aligns on the row index,
# and the split frames keep their original index labels.
# The 2014 cutoff gives a 2014-16 validation window matching the length of the
# ultimate 2017-19 test set.
kfolds = [
    (
        (XY['date'] >= pd.to_datetime('2010-01-01')) & (XY['date'] < pd.to_datetime(cutoff)),
        XY['date'] >= pd.to_datetime(cutoff),
    )
    for cutoff in ("2014-01-01", "2016-01-01")
    ]

kfolds_evaluation = []
for is_training, is_validation in kfolds:

    # Per series: fit on the fold's training rows.
    # WARNING: a fresh Ridge is constructed for every series on purpose.
    # Re-fitting one shared instance would not give separate model objects;
    # every dictionary entry would point at the same estimator.
    segments_models = {
        grp: Ridge(RIDGE_ALPHA_TUNED).fit(
            segment.loc[is_training, FEATURES_LOCAL_MODEL],
            segment.loc[is_training, 'num_sold_log']
            )
        for grp, segment in segments_XY.items()
        }

    # Per series: predict every row and map back from the log scale.
    segments_predictions = [
        segment.assign(
            yhat = np.exp(segments_models[grp].predict(segment[FEATURES_LOCAL_MODEL]))
            )
        for grp, segment in segments_XY.items()
        ]

    predictions = pd.concat(segments_predictions, axis=0)

    scores = {
        split: mean_absolute_percentage_error(
            predictions.loc[mask, 'num_sold'],
            predictions.loc[mask, 'yhat']
            )
        for split, mask in (('validation', is_validation), ('train', is_training))
        }

    kfolds_evaluation.append(scores)

kfolds_evaluation

In [None]:
# Actual vs. predicted sales for one example series from the tuned per-series models.
predictions_sample = predictions.query("series_id == 'Norway|Premium Sticker Mart|Kaggle'")

predictions_sample.set_index('date')[['num_sold', 'yhat']].plot.line()

In [45]:
# Export the 100 rows with the largest absolute percentage error for offline review.
# The error is taken relative to the actual `num_sold`, the same denominator
# `mean_absolute_percentage_error` uses. Dividing by `yhat` instead understates
# over-predictions (actual 10, predicted 100 is a 900% error, not 90%).
# The column keeps the name `mape` so the CSV layout is unchanged.
(
    predictions
    .assign(mape = lambda df_: np.abs(100 * (df_['yhat']/df_['num_sold'] - 1)) )
    .sort_values('mape', ascending=False)
    .head(100)
    .to_csv("./data/processed/predictions_errors_local_model.csv", index=False)
)

In [None]:
# The 50 rows with the largest absolute percentage error.
# The error is taken relative to the actual `num_sold`, the same denominator
# `mean_absolute_percentage_error` uses. Dividing by `yhat` instead understates
# over-predictions (actual 10, predicted 100 is a 900% error, not 90%).
(
    predictions
    .assign(mape = lambda df_: np.abs(100 * (df_['yhat']/df_['num_sold'] - 1)) )
    .sort_values('mape', ascending=False)
    .head(50)
)

In [None]:
# Actual vs. predicted sales for one series picked for closer inspection.
predictions_sample = predictions.query("series_id == 'Norway|Premium Sticker Mart|Kaggle Tiers'")

predictions_sample.set_index('date')[['num_sold', 'yhat']].plot.line()

In [None]:
# Actual vs. predicted sales for another series picked for closer inspection.
predictions_sample = predictions.query("series_id == 'Kenya|Stickers for Less|Holographic Goose'")

predictions_sample.set_index('date')[['num_sold', 'yhat']].plot.line()

In [None]:
segments_XY = {series_id: segment for series_id, segment in XY.groupby('series_id')}

# The fold masks are built on the full frame. They still work on each
# per-series frame because boolean .loc selection aligns on the row index,
# and the split frames keep their original index labels.
# The 2014 cutoff gives a 2014-16 validation window matching the length of the
# ultimate 2017-19 test set.
kfolds = [
    (
        (XY['date'] >= pd.to_datetime('2010-01-01')) & (XY['date'] < pd.to_datetime(cutoff)),
        XY['date'] >= pd.to_datetime(cutoff),
    )
    for cutoff in ("2014-01-01", "2016-01-01")
    ]

kfolds_evaluation = []
# The trend stage of each local pipeline uses the full local feature list.
FEATURES_LOCAL_TREND_MODEL = FEATURES_LOCAL_MODEL
for is_training, is_validation in kfolds:

    # Per series: Ridge trend stage on the local features, random forest
    # remainder stage on `day_of_year` alone, fitted on the fold's training rows.
    segments_models = {
        grp: TrendRemainderModelPipeline(
            FEATURES_LOCAL_TREND_MODEL, RIDGE_ALPHA_TUNED, ['day_of_year']
            ).fit(
                segment.loc[is_training, FEATURES_LOCAL_MODEL],
                segment.loc[is_training, 'num_sold_log']
            )
        for grp, segment in segments_XY.items()
        }

    # Per series: predict every row and map back from the log scale.
    segments_predictions = [
        segment.assign(
            yhat = np.exp(segments_models[grp].predict(segment[FEATURES_LOCAL_MODEL]))
            )
        for grp, segment in segments_XY.items()
        ]

    predictions = pd.concat(segments_predictions, axis=0)

    scores = {
        split: mean_absolute_percentage_error(
            predictions.loc[mask, 'num_sold'],
            predictions.loc[mask, 'yhat']
            )
        for split, mask in (('validation', is_validation), ('train', is_training))
        }

    kfolds_evaluation.append(scores)

kfolds_evaluation

In [None]:
# Actual vs. predicted sales for one example series from the per-series trend/remainder pipelines.
predictions_sample = predictions.query("series_id == 'Norway|Premium Sticker Mart|Kaggle'")

predictions_sample.set_index('date')[['num_sold', 'yhat']].plot.line()

In [51]:
# Final per-series Ridge models, each refitted on all labelled rows of its series.
SEGMENTS_MODELS = {
    series_id: Ridge(RIDGE_ALPHA_TUNED).fit(
        X=segment[FEATURES_LOCAL_MODEL],
        y=segment['num_sold_log'],
        )
    for series_id, segment in segments_XY.items()
    }

# Deployment

In [52]:
# Test extract, prepared exactly like the training extract: parse `date`, then
# derive '|'-joined composite keys for each combination of country / store /
# product. `series_id` aliases the full country|store|product key.
sales_test_daily = (
    pd.read_csv("./data/external/test.csv")
    .assign(date=lambda raw: pd.to_datetime(raw['date']))
    .assign(
        country_store=lambda raw: raw['country'].str.cat(raw['store'], sep='|'),
        country_product=lambda raw: raw['country'].str.cat(raw['product'], sep='|'),
        store_product=lambda raw: raw['store'].str.cat(raw['product'], sep='|'),
        country_store_product=lambda raw: raw['country'].str.cat(
            [raw['store'], raw['product']], sep='|'
            ),
        )
    .assign(series_id=lambda keyed: keyed['country_store_product'])
)

In [None]:
# Preview the first rows of the loaded test frame.
sales_test_daily.head()

In [None]:
# Summary of the test date column (count and range).
sales_test_daily['date'].describe()

In [55]:
# Apply the same calendar/Easter and GDP enrichment that the training frame received.
# NOTE(review): `sales_test_daily` is overwritten in place.
sales_test_daily = transform_calendar_features(sales_test_daily)
sales_test_daily = integrate_external_features(sales_test_daily)

In [56]:
# The transformer was fitted on a frame that carried both target columns (as
# passthrough columns), so the test frame gets empty placeholders for them
# before it is transformed.
sales_test_daily = sales_test_daily.assign(
    **dict.fromkeys(['num_sold', 'num_sold_log'])
)
sales_test_enriched_daily = feature_transform_pipeline.transform(sales_test_daily)

In [57]:
segments_X_test = {
    series_id: segment
    for series_id, segment in sales_test_enriched_daily.groupby('series_id')
    }

# Score each test series with its own local model when one was fitted;
# otherwise fall back to the global model. Both predict on the log scale,
# so the result is exponentiated back to units sold.
segments_predictions_test = [
    segment.assign(
        yhat = np.exp(
            SEGMENTS_MODELS[series_id].predict(segment[FEATURES_LOCAL_MODEL])
            if series_id in SEGMENTS_MODELS
            else model_global.predict(segment[FEATURES_GLOBAL_MODEL])
            )
        )
    for series_id, segment in segments_X_test.items()
    ]

predictions_test = pd.concat(segments_predictions_test, axis=0)

In [58]:
# Submission layout: the row id plus the prediction published as `num_sold`.
predictions_test_submit = pd.DataFrame({
    'id': predictions_test['id'],
    'num_sold': predictions_test['yhat'],
    })

In [59]:
# Guard the submission before writing it: the expected number of rows, and no
# missing id or prediction.
assert len(predictions_test_submit) == 98_550
assert not predictions_test_submit.isnull().any().any()
predictions_test_submit.to_csv("./data/processed/submission4.csv", index=False)