In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error
import requests
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import numpy as np
import optuna
import xgboost as xgb
from dateutil.easter import easter
from datetime import timedelta
import plotnine as p9

# Load

In [2]:
# Read the raw training file, parse the date column, and derive '|'-joined
# composite keys for each combination of country / store / product.
sales_daily = (
    pd.read_csv("./data/external/train.csv")
    .assign(
        date=lambda raw: pd.to_datetime(raw['date']),
        country_store=lambda raw: raw['country'].str.cat(raw['store'], sep='|'),
        country_product=lambda raw: raw['country'].str.cat(raw['product'], sep='|'),
        store_product=lambda raw: raw['store'].str.cat(raw['product'], sep='|'),
        country_store_product=lambda raw: raw['country'].str.cat([raw['store'], raw['product']], sep='|'),
        # the full country|store|product key doubles as the time series identifier
        series_id=lambda raw: raw['country_store_product'],
    )
)

In [None]:
# Preview the first rows of the loaded frame, including the derived key columns.
sales_daily.head()

In [None]:
def extract_gdp_per_capita(country_code, year, timeout=30):
    """
    Fetch the World Bank `NY.GDP.PCAP.CD` indicator value for one country and one year.

    Adapted from https://www.kaggle.com/competitions/playground-series-s5e1/discussion/554349.

    Parameters
    ----------
    country_code : str
        Country code interpolated into the request URL (e.g. 'CAN').
    year : int
        Year interpolated into the `date` query parameter.
    timeout : float, default 30
        Seconds to wait for the API before giving up, so a stalled connection
        cannot hang the notebook indefinitely.

    Returns
    -------
    The `value` field of the first observation in the response. This may be
    None if the API reports the observation as null.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    ValueError
        If the response does not contain any observation.
    """

    url='https://api.worldbank.org/v2/country/{0}/indicator/NY.GDP.PCAP.CD?date={1}&format=json'
    response = requests.get(url.format(country_code, year), timeout=timeout)
    response.raise_for_status()
    payload = response.json()

    # The value is read from payload[1][0]; anything without that shape carries no usable data.
    if not isinstance(payload, list) or len(payload) < 2 or not payload[1]:
        raise ValueError(
            f"World Bank API returned no GDP per capita data for {country_code} in {year}: {payload!r}"
            )

    return payload[1][0]['value']

# per CountryCode-year: request GDP per capita.
# build one dataframe of Year | GDP | Country | CountryCode, for integration to Kaggle source

countries_code_map = {
    'Canada': 'CAN', 
    'Finland': 'FIN',
    'Italy': 'ITA',
    'Kenya': 'KEN',
    'Norway': 'NOR',
    'Singapore': 'SGP'
    }

# Collect one flat record per (country, year) and build the frame once,
# rather than creating a single-row DataFrame per request and concatenating
# (which also left every row with the same index label, 0).
gdp_records = []
for country_title, country_code in countries_code_map.items():

    for year in range(2010, 2019+1):
        gdp_records.append({
            'year': year,
            'gdp_per_capita': extract_gdp_per_capita(country_code, year),
            'country': country_title,
            'country_code': country_code
            })

    print(f"{country_title} ({country_code}) GDP Per Capita extraction complete.")

countries_gdp_yearly = (
    pd.DataFrame(gdp_records)
    # a null observation arrives as None; coerce to float so np.log yields NaN
    # (caught by the null check at merge time) instead of raising on an object column
    .assign(gdp_per_capita = lambda df_: pd.to_numeric(df_['gdp_per_capita']))
    .assign(gdp_per_capita_log = lambda df_: np.log(df_['gdp_per_capita']))
    )

In [None]:
# GDP per capita by year, one line per country.
# The output-suppressing `;` is attached to the closing parenthesis: a line
# holding only `;` is not valid Python on its own.
(
    countries_gdp_yearly
    .set_index('year')
    .groupby('country')
    ['gdp_per_capita']
    .plot(legend=True)
);

In [None]:
# Same plot restricted to Kenya, so its series gets its own y-axis scale.
# The output-suppressing `;` is attached to the closing parenthesis: a line
# holding only `;` is not valid Python on its own.
(
    countries_gdp_yearly
    .query("country == 'Kenya'")
    .set_index('year')
    .groupby('country')
    ['gdp_per_capita']
    .plot(legend=True)
);

In [7]:
# TODO: is there a better practice for holidays feature representation?
    # in Hyndman's Electricity Load Forecasting Kaggle (https://robjhyndman.com/papers/kaggle-competition.pdf),
    # "holiday effect modelled with a factor variable, taking value zero on a non-work day,
    # some non-zero value day before a non-work day, and a different value the day after a non-work day."
    # meaning -- holiday days pooled, before & after estimated separately?

# Easter date for each year 2010-2019 inclusive, flagged with is_easter = 1.
days_easter0 = [easter(year) for year in range(2010, 2020)]
days_easter = pd.DataFrame({'date': days_easter0}).assign(is_easter = 1)

# motivated by exploratory analysis of model errors.
# appears that errors concentrate on days shortly after Easter,
# so each offset of 2..7 days after Easter gets its own indicator column.
dfs_days_special_relative_easter = [days_easter]
for delta_days in range(2, 8):

    df_special = pd.DataFrame({
        'date': days_easter['date'] + timedelta(days=delta_days),
        f'is_easter_plus{delta_days}': days_easter['is_easter']
        })
    dfs_days_special_relative_easter.append(df_special)

# Stacking leaves NaN wherever a row does not belong to a given flag; those become 0.
days_special_relative_easter = pd.concat(dfs_days_special_relative_easter, axis=0).fillna(0)
days_special_relative_easter = days_special_relative_easter.assign(
    date = pd.to_datetime(days_special_relative_easter['date'])
    )

# one row per calendar date, so a left merge on date cannot duplicate sales rows
assert days_special_relative_easter['date'].is_unique

FEATURES_EASTER = [
    column for column in days_special_relative_easter.columns
    if 'easter' in column
    ]

In [8]:
def transform_calendar_features(df):
    """
    Add calendar-derived features to a frame that has a datetime `date` column.

    Adds year / month / week / day fields, sine and cosine encodings of
    day-of-year, day-of-month and days-since-2010-01-01, a year-end indicator,
    and the Easter-relative indicator columns listed in `FEATURES_EASTER`
    (joined from the module-level `days_special_relative_easter`).

    Returns a new DataFrame; the input is not modified.
    """

    df = (
        df
        .assign(
            year = lambda df_: df_['date'].dt.year,
            month = lambda df_: df_['date'].dt.month,
            week_of_year = lambda df_: df_['date'].dt.isocalendar().week,
            day_of_week = lambda df_: df_['date'].dt.day_name(),
            # President's Day is the 'third Monday in February'
            day_of_month = lambda df_: df_['date'].dt.day,
            day_of_year = lambda df_: df_['date'].dt.dayofyear,
            # week of month would be ambiguous because, one week may span 2 months,
            # read from the frame handed to the lambda (df_), like every other feature here,
            # rather than from the enclosing function's `df`
            days_since_start = lambda df_: (df_['date'] - pd.to_datetime("2010-01-01")).dt.days
            )
        .assign(
            # TODO: are periodic feature transforms too rigid?

            # as day_of_year rises, don't expect monotonic relationship with outcome.
            # rather, expect periodic (sinusoidal) relationship.
            # as sin(x) rises, so too does outcome ...
            # ensure one cycle over one year.
            # at baseline, one sinusoidal cycle occurs per 2π
            day_of_year_sin = lambda df_: np.sin(df_['day_of_year'] * 2 * np.pi / 365),
            day_of_year_cos = lambda df_: np.cos(df_['day_of_year'] * 2 * np.pi / 365),

            day_of_month_sin = lambda df_: np.sin(df_['day_of_month'] * 2 * np.pi / 30),
            day_of_month_cos = lambda df_: np.cos(df_['day_of_month'] * 2 * np.pi / 30),

            # exploratory visuals suggest ~2-year cycles
            days_since_start_macro_sin = lambda df_: np.sin(df_['days_since_start'] * 2 * np.pi / 730),
            days_since_start_macro_cos = lambda df_: np.cos(df_['days_since_start'] * 2 * np.pi / 730),
            )
        .assign(
            is_yearend = lambda df_: (
                (df_['month'] == 12) & (df_['day_of_month'].isin([28, 29, 30, 31]))
                ).astype(int)
            )

        )
    
    # Joins on whatever columns the two frames share.
    # validate guards against the Easter table ever duplicating rows of df.
    df = pd.merge(df, days_special_relative_easter, how='left', validate='many_to_one')
    # sanity check: at least one row must have matched a date in the Easter table
    assert df['is_easter'].notnull().any()
    # dates absent from the Easter table are not special days
    df[FEATURES_EASTER] = df[FEATURES_EASTER].fillna(0)
    
    return df

def integrate_external_features(df):
    """
    Left-join the module-level `countries_gdp_yearly` table onto `df`.

    The join uses the columns the two frames have in common. Every row of `df`
    must find a GDP value; otherwise an AssertionError is raised.

    Returns a new DataFrame with the GDP columns appended.
    """

    # validate guards against duplicate keys in the GDP table silently multiplying sales rows
    df = pd.merge(df, countries_gdp_yearly, how='left', validate='many_to_one')
    assert df['gdp_per_capita'].notnull().all(), "some rows have no GDP per capita after the merge"

    return df

def transform_lagged_predictors(df):
    """
    Sort by series and date, then add 1-row and 7-row lags of `num_sold` within each series.

    Lags are positional (previous rows of the same `series_id`), so the first
    rows of every series get NaN. Returns a new, sorted DataFrame.
    """

    # to ensure proper within-series outcome lags
    # TODO: with lagged features coming into play, how to enforce proper order via indexes?
    ordered = df.sort_values(['series_id', 'date'])
    sold_by_series = ordered.groupby('series_id')['num_sold']

    return ordered.assign(
        num_sold_lag1 = sold_by_series.shift(1),
        num_sold_lag7 = sold_by_series.shift(7)
        )

def transform_logs(df):
    """
    Add natural-log versions of `num_sold` and `num_sold_lag1`.

    Returns a new DataFrame with `num_sold_log` and `num_sold_lag1_log`
    appended; NaN inputs stay NaN.
    """

    # NOTE(review): np.log(0) is -inf, which dropna() does not remove --
    # confirm num_sold is strictly positive in the source data.
    log_columns = {
        'num_sold_log': np.log(df['num_sold']),
        'num_sold_lag1_log': np.log(df['num_sold_lag1']),
    }

    return df.assign(**log_columns)

# Apply the feature steps in order: the lag step needs the sorted series,
# and the log step needs the lag column to exist.
sales_daily = (
    sales_daily
    .pipe(transform_calendar_features)
    .pipe(integrate_external_features)
    .pipe(transform_lagged_predictors)
    .pipe(transform_logs)
)

# Data Understanding

## Data Description Report: "Surface Properties"

In [None]:
# Does `id` uniquely identify rows (also after the merges in the transform functions)?
sales_daily['id'].is_unique

### Volumetric Analyses

In [None]:
# (rows, columns) of the fully transformed frame.
sales_daily.shape

In [None]:
# Number of distinct country|store|product series.
sales_daily['series_id'].nunique()

In [None]:
# Number of distinct calendar dates present.
sales_daily['date'].nunique()

In [None]:
# Rows per series, then how many series share each row count
# (a single distinct value means every series has the same number of rows).
sales_daily['series_id'].value_counts().value_counts()

In [None]:
# Row counts per product, including a NaN bucket if any.
sales_daily['product'].value_counts(dropna=False)

In [None]:
# Row counts per country, including a NaN bucket if any.
sales_daily['country'].value_counts(dropna=False)

In [None]:
# Row counts per store, including a NaN bucket if any.
sales_daily['store'].value_counts(dropna=False)

### Fields' Types and Values

In [None]:
# Summary statistics of the date column.
sales_daily['date'].describe()

In [None]:
# Summary statistics of the target (count excludes nulls).
sales_daily['num_sold'].describe()

## Data Quality Report

In [None]:
# Share of null values per column.
sales_daily.isnull().mean()

In [None]:
# are null sales events concentrated on a particular date?
# doesn't appear so
# (counts rows with a null num_sold per date)
sales_daily.query("num_sold.isnull()")['date'].value_counts()

In [None]:
# Counts rows with a null num_sold per series, to see which series carry the nulls.
sales_daily.query("num_sold.isnull()")['series_id'].value_counts()

## Data Exploration Report

In [None]:
# Total units sold per series, largest first.
sales_daily.groupby('series_id')['num_sold'].sum().sort_values(ascending=False)

In [None]:
# a naive model: series' historical average sales, daily
# (training window: 2010 through 2013; validation: 2014 onward)

is_training = (
    (sales_daily['date'] >= pd.to_datetime('2010-01-01'))
    & (sales_daily['date'] < pd.to_datetime("2014-01-01"))
)

is_validation = sales_daily['date'] >= pd.to_datetime("2014-01-01")

# is_training.sum(), is_validation.sum()

# with original train data: using strictly train segment, groupby average
predictions_naive = (
    sales_daily
    .loc[is_training]
    .groupby('series_id')
    [['num_sold']]
    .agg('mean')
    .reset_index(drop=False)
)

predictions_naive_evaluate = pd.merge(
    sales_daily.loc[is_validation],
    predictions_naive.rename(columns={'num_sold': 'yhat'}),
    how='left',
    on='series_id'
    )

# expect a couple series with all null
# recommended in this discussion: https://www.kaggle.com/competitions/playground-series-s5e1/discussion/554553 
# Only drop rows missing what the metric needs. A bare dropna() would also discard rows
# whose unrelated lag/log feature columns are null, shrinking the evaluated sample.
predictions_naive_evaluate = predictions_naive_evaluate.dropna(subset=['num_sold', 'yhat'])

mean_absolute_percentage_error(
    predictions_naive_evaluate['num_sold'],
    predictions_naive_evaluate['yhat']
)

In [None]:
# (rows, columns) of the validation rows that were actually scored.
predictions_naive_evaluate.shape

In [None]:
# a naive model: series' historical average sales, daily
# (training window: 2013 only, i.e. the most recent year; validation: 2014 onward)

is_training = (
    (sales_daily['date'] >= pd.to_datetime('2013-01-01'))
    & (sales_daily['date'] < pd.to_datetime("2014-01-01"))
)

is_validation = sales_daily['date'] >= pd.to_datetime("2014-01-01")

# is_training.sum(), is_validation.sum()

# with original train data: using strictly train segment, groupby average
predictions_naive = (
    sales_daily
    .loc[is_training]
    .groupby('series_id')
    [['num_sold']]
    .agg('mean')
    .reset_index(drop=False)
)

predictions_naive_evaluate = pd.merge(
    sales_daily.loc[is_validation],
    predictions_naive.rename(columns={'num_sold': 'yhat'}),
    how='left',
    on='series_id'
    )

# expect a couple series with all null
# recommended in this discussion: https://www.kaggle.com/competitions/playground-series-s5e1/discussion/554553 
# Only drop rows missing what the metric needs. A bare dropna() would also discard rows
# whose unrelated lag/log feature columns are null, shrinking the evaluated sample.
predictions_naive_evaluate = predictions_naive_evaluate.dropna(subset=['num_sold', 'yhat'])

mean_absolute_percentage_error(
    predictions_naive_evaluate['num_sold'],
    predictions_naive_evaluate['yhat']
)

In [None]:
# Line plot of one series over the training window.
# `is_training` is the mask left by the most recently executed naive-model cell.
sales_sample_daily = sales_daily.query("series_id == 'Norway|Premium Sticker Mart|Kaggle'")

# The output-suppressing `;` is attached to the closing parenthesis: a line
# holding only `;` is not valid Python on its own.
(
    sales_sample_daily
    .loc[is_training]
    [['date', 'num_sold']]
    .set_index('date')
    .plot
    .line()
);

In [27]:
# descend level of abstraction -- 

    # across-year patterns (multi-year business cycles)
    # within-year,
        # month-of-year seasonality
        # week-of-year seasonality
        # day-of-month seasonality
        # day-of-week seasonality

# outcome vs predictors
    # lagged outcome
    # gdp
    # 

In [None]:
# Scatter of total units sold per date, summed across all series.
(
    p9.ggplot(sales_daily.groupby(['date'])[['num_sold']].agg('sum').reset_index(drop=False)) + 
    p9.theme_bw() + 
    p9.geom_point(p9.aes('date', 'num_sold'), alpha=0.25)
)

In [None]:
# Box plot of num_sold by month, pooling all series.
(
    p9.ggplot(sales_daily) + 
    p9.theme_bw() + 
    p9.geom_boxplot(p9.aes('month', 'num_sold', group='month'), alpha=0.1)
)

In [None]:
# Same month box plot for a single series, removing between-series level differences.
(
    p9.ggplot(sales_daily.query("series_id == 'Norway|Premium Sticker Mart|Kaggle'")) + 
    p9.theme_bw() + 
    p9.geom_boxplot(p9.aes('month', 'num_sold', group='month'), alpha=0.1)
)

In [None]:
# Box plot of num_sold by day of week; the explicit Categorical fixes the
# axis order to Sunday..Saturday instead of alphabetical.
(
    p9.ggplot((
        sales_daily
        .assign(day_of_week = lambda df_: pd.Categorical(
            df_['day_of_week'], 
            ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
            ))
    )) + 
    p9.theme_bw() + 
    p9.geom_boxplot(p9.aes('day_of_week', 'num_sold', group='day_of_week'), alpha=0.1)
)

In [None]:
# Scatter of num_sold against its 1-row lag for a single series.
(
    p9.ggplot(sales_daily.query("series_id == 'Norway|Premium Sticker Mart|Kaggle'")) + 
    p9.theme_bw() + 
    p9.geom_point(p9.aes('num_sold_lag1', 'num_sold'), alpha=0.1)
)

In [None]:
# Scatter of num_sold against its 7-row lag for a single series.
(
    p9.ggplot(sales_daily.query("series_id == 'Norway|Premium Sticker Mart|Kaggle'")) + 
    p9.theme_bw() + 
    p9.geom_point(p9.aes('num_sold_lag7', 'num_sold'), alpha=0.1)
)

# Data Preparation

The feature set varies by model type -- _global_ or _local_. 

"Universe" means all features that could be used. A subset of them enters each "building block" of the overall modeling system.

In [34]:
# Categorical columns that get one-hot encoded.
FEATURES_UNIVERSE_TO_ONEHOT = [
    'country', 
    'store',
    'product',
    'country_store',
    'country_product',
    'store_product',
    'country_store_product',
    # year attempted, but then omitted. Because 
    # exogenous factors should help explain year-to-year shifts,
    # so that out-of-sample years' forecasts aren't flat
    'month', 
    'week_of_year', 
    'day_of_week'
    ]

# local model fits by country-store-product segment, 
# so those onehots would be invariant
FEATURES_LOCAL_MODEL_TO_ONEHOT = [
    x for x in FEATURES_UNIVERSE_TO_ONEHOT 
    if not any(stem in x for stem in ['country', 'store', 'product'])
    ]

# Numeric columns that get standardized.
FEATURES_UNIVERSE_NUMERIC_CONTINUOUS = [
    'gdp_per_capita_log',
    'day_of_month', 
    'day_of_month_sin',
    'day_of_month_cos',
    'day_of_year_sin',
    'day_of_year_cos',
    'day_of_year',
    'days_since_start_macro_sin',
    'days_since_start_macro_cos',
    'days_since_start'
    ]

# 0/1 indicator columns that need no further transform.
FEATURES_UNIVERSE_ALREADY_ONEHOT = ['is_yearend'] + FEATURES_EASTER

FEATURES_AUTOREGRESSIVE = ['num_sold_lag1_log']

FEATURES_GLOBAL_MODEL = (
    FEATURES_UNIVERSE_TO_ONEHOT + 
    FEATURES_UNIVERSE_NUMERIC_CONTINUOUS + 
    FEATURES_UNIVERSE_ALREADY_ONEHOT
    )

FEATURES_LOCAL_MODEL = (
    FEATURES_LOCAL_MODEL_TO_ONEHOT + 
    FEATURES_UNIVERSE_NUMERIC_CONTINUOUS + 
    FEATURES_UNIVERSE_ALREADY_ONEHOT
    )

# Features shared by the local and global models, minus GDP.
# Built in FEATURES_LOCAL_MODEL order rather than through list(set(...)):
# set iteration order for strings can differ between kernel sessions,
# which would change column order from run to run.
FEATURES_LOCAL_REMAINDER_MODEL = list(dict.fromkeys(
    x for x in FEATURES_LOCAL_MODEL
    if x in FEATURES_GLOBAL_MODEL and x != 'gdp_per_capita_log'
    ))

# identifier columns carried alongside the features; never model inputs
ATTRIBUTES = ['series_id', 'date', 'id']

In [35]:
# Keep only rows with an observed target and a usable lag-1 log value.
REQUIRED_NON_NULL = ['num_sold', 'num_sold_lag1_log']
sales_daily_complete = sales_daily.dropna(subset=REQUIRED_NON_NULL)

# shorter alias
XY = sales_daily_complete

# from previous retail forecasting competitions' leaders,
# plus theoretically expected heterogeneity between series: 
# one model per segment

# series_id -> that series' rows (original index labels are kept)
segments_XY = dict(list(XY.groupby('series_id')))

# Modeling

## Global Model

In [36]:
class TrendRemainderModelPipeline:
    """
    Two-stage regressor: a Ridge "trend level" model plus a random forest fit on its residuals.

    `fit` trains the Ridge model on the one-hot features and standardized
    `gdp_per_capita_log`, subtracts its predictions from the target, and trains
    a random forest on what is left. `predict` returns the sum of both stages.

    Parameters
    ----------
    trend_model_ridge_alpha : float
        Regularization strength passed to the Ridge trend model.
    remainder_n_estimators : int, default 100
        Number of trees in the remainder random forest.
    random_state : int or None, default None
        Seed passed to the random forest. The default None leaves the forest
        unseeded; pass an int to make fits and scores reproducible.

    Notes
    -----
    The remainder transformer uses `remainder='passthrough'`, so the frame
    given to `fit` should contain feature columns only.
    """

    def __init__(self, trend_model_ridge_alpha, remainder_n_estimators=100, random_state=None):

        self.trend_level_model_ridge_alpha = trend_model_ridge_alpha
        self.remainder_n_estimators = remainder_n_estimators
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the trend model, then the remainder model on the detrended target. Returns self."""

        self.fit_trend_level_model(X, y)
        yhat_trend_level = self.predict_trend_level_model(X)
        # residual the trend model could not explain; this is the forest's target
        y_detrended = y - yhat_trend_level

        self.fit_remainder_model(X, y_detrended)

        return self

    def predict(self, X):
        """Return trend prediction plus remainder prediction, on the scale of the fitted target."""

        yhat_trend_level = self.predict_trend_level_model(X)
        yhat_remainder = self.predict_remainder_model(X)
        preds = yhat_trend_level + yhat_remainder

        return preds

    def fit_trend_level_model(self, X, y):
        """
        'Weak learner', strictly intended to explain trend (level) shifts between years.
        """

        # only the one-hot categoricals and GDP feed the trend model; everything else is dropped
        self.trend_level_feature_transformer = ColumnTransformer([
            ('transformer_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), FEATURES_UNIVERSE_TO_ONEHOT),
            ('transformer_std', StandardScaler(), ['gdp_per_capita_log'])
            ],
            verbose_feature_names_out=False,
            remainder='drop'
            )
        X = self.trend_level_feature_transformer.fit_transform(X)

        model_trend_level = Ridge(alpha=self.trend_level_model_ridge_alpha)
        model_trend_level.fit(X, y)

        self.trend_level_model = model_trend_level

    def fit_remainder_model(self, X, y):
        """Fit the random forest on the detrended target, using the full feature set."""

        # columns without a transformer pass through unchanged
        self.remainder_feature_transformer = ColumnTransformer([
            ('transformer_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), FEATURES_UNIVERSE_TO_ONEHOT),
            ('transformer_std', StandardScaler(), FEATURES_UNIVERSE_NUMERIC_CONTINUOUS)
            ],
            verbose_feature_names_out=False,
            remainder='passthrough'
            )
        X = self.remainder_feature_transformer.fit_transform(X)

        model_remainder = RandomForestRegressor(
            n_estimators=self.remainder_n_estimators,
            n_jobs=-1,
            random_state=self.random_state
            )
        model_remainder.fit(X, y)

        self.remainder_model = model_remainder

    def predict_trend_level_model(self, X):
        """Predict with the fitted trend model only."""
        X = self.trend_level_feature_transformer.transform(X)
        preds = self.trend_level_model.predict(X) 
        return preds
    
    def predict_remainder_model(self, X):
        """Predict with the fitted remainder model only."""
        X = self.remainder_feature_transformer.transform(X)
        preds = self.remainder_model.predict(X)
        return preds

In [None]:
# Expanding-window validation of a global random forest on log sales.
# Each fold is (training mask, validation mask) over XY's index.
kfolds = [

    # validation set 2014-16 matches ultimate test set's length, 2017-19
    ( 
        ( (XY['date'] >= pd.to_datetime('2010-01-01')) & (XY['date'] < pd.to_datetime("2014-01-01")) ),
        XY['date'] >= pd.to_datetime("2014-01-01")
    ),

    ( 
        ( (XY['date'] >= pd.to_datetime('2010-01-01')) & (XY['date'] < pd.to_datetime("2016-01-01")) ),
        XY['date'] >= pd.to_datetime("2016-01-01")
    )

    ]

kfolds_evaluation = []
for is_training, is_validation in kfolds:

    feature_transform_pipeline_global_model = ColumnTransformer([
        ('transformer_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), FEATURES_UNIVERSE_TO_ONEHOT),
        ('transformer_std', StandardScaler(), FEATURES_UNIVERSE_NUMERIC_CONTINUOUS)
        ],
        verbose_feature_names_out=False,
        # caller expected to provide some features that require no transforms.
        # don't drop those just because they received no special operations.
        # but do be sure to drop non-features in advance!
        remainder='passthrough'
        )
    feature_transform_pipeline_global_model.set_output(transform='pandas')
    
    pipeline_e2e = Pipeline([
        ('transform_features', feature_transform_pipeline_global_model), 
        # seeded so the fold scores are reproducible across runs
        ('model', RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=777))
        ])

    pipeline_e2e.fit(
        XY.loc[is_training, FEATURES_GLOBAL_MODEL],
        XY.loc[is_training, 'num_sold_log']
        )
    
    # the model predicts log sales; exponentiate to score on the original scale
    predictions = (
        XY
        .copy()
        .assign(yhat = lambda df_: np.exp(pipeline_e2e.predict(df_[FEATURES_GLOBAL_MODEL])))
        )
    
    scores = {
        'validation': {
            'nobs': predictions.loc[is_validation].shape[0],
            'score': mean_absolute_percentage_error( 
                predictions.loc[is_validation, 'num_sold'],
                predictions.loc[is_validation, 'yhat']
                )
            },
        'train': {
            'nobs': predictions.loc[is_training].shape[0],
            'score': mean_absolute_percentage_error( 
                predictions.loc[is_training, 'num_sold'],
                predictions.loc[is_training, 'yhat']
                )
            }
        }
    
    kfolds_evaluation.append(scores)

kfolds_evaluation

In [None]:
# Actual vs predicted sales for one series.
# `predictions` is whatever the most recently executed fold loop left behind (its last fold).
predictions_sample = predictions.query("series_id == 'Norway|Stickers for Less|Kaggle'")

predictions_sample.set_index('date')[['num_sold', 'yhat']].plot.line(alpha=0.5);

In [39]:
# from sklearn.inspection import permutation_importance

# result = permutation_importance(
#     model_global, 
#     XY.loc[is_validation, FEATURES_GLOBAL_MODEL],
#     XY.loc[is_validation, 'num_sold_log'], 
#     n_repeats=5, 
#     random_state=777, 
#     n_jobs=-1
# )
# importances = pd.Series(result.importances_mean, index=FEATURES_GLOBAL_MODEL)

# importances = pd.Series(model_global.feature_importances_, index=FEATURES_GLOBAL_MODEL)

In [None]:
# Expanding-window validation of a global Ridge regression on log sales.
# Each fold trains from 2010-01-01 up to a cutoff and validates on everything from the cutoff onward.
kfolds = [
    # validation set 2014-16 matches ultimate test set's length, 2017-19
    (
        (XY['date'] >= pd.to_datetime('2010-01-01')) & (XY['date'] < pd.to_datetime(cutoff)),
        XY['date'] >= pd.to_datetime(cutoff)
    )
    for cutoff in ["2014-01-01", "2016-01-01"]
    ]

kfolds_evaluation = []
for is_training, is_validation in kfolds:

    onehot_step = (
        'transformer_onehot',
        OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
        FEATURES_UNIVERSE_TO_ONEHOT
        )
    scaling_step = ('transformer_std', StandardScaler(), FEATURES_UNIVERSE_NUMERIC_CONTINUOUS)

    # columns without a transformer (the 0/1 indicators) pass through unchanged
    feature_transform_pipeline_global_model = ColumnTransformer(
        [onehot_step, scaling_step],
        verbose_feature_names_out=False,
        remainder='passthrough'
        )
    feature_transform_pipeline_global_model.set_output(transform='pandas')

    pipeline_e2e = Pipeline([
        ('transform_features', feature_transform_pipeline_global_model), 
        ('model', Ridge(1e-2))
        ])

    X_train = XY.loc[is_training, FEATURES_GLOBAL_MODEL]
    y_train = XY.loc[is_training, 'num_sold_log']
    pipeline_e2e.fit(X_train, y_train)

    # the model predicts log sales; exponentiate to score on the original scale
    predictions = XY.copy()
    predictions['yhat'] = np.exp(pipeline_e2e.predict(predictions[FEATURES_GLOBAL_MODEL]))

    scores = {
        split_name: {
            'nobs': predictions.loc[split_mask].shape[0],
            'score': mean_absolute_percentage_error(
                predictions.loc[split_mask, 'num_sold'],
                predictions.loc[split_mask, 'yhat']
                )
            }
        for split_name, split_mask in [('validation', is_validation), ('train', is_training)]
        }

    kfolds_evaluation.append(scores)

kfolds_evaluation

In [None]:
# Actual vs predicted sales for one series.
# `predictions` is whatever the most recently executed fold loop left behind (its last fold).
predictions_sample = predictions.query("series_id == 'Norway|Stickers for Less|Kaggle'")
# predictions_sample = predictions.query("series_id == 'Norway|Premium Sticker Mart|Kaggle'")

predictions_sample.set_index('date')[['num_sold', 'yhat']].plot.line(alpha=0.5);

In [None]:
# Expanding-window validation of the two-stage trend + remainder pipeline on log sales.
# Each fold trains from 2010-01-01 up to a cutoff and validates on everything from the cutoff onward.
kfolds = [
    # validation set 2014-16 matches ultimate test set's length, 2017-19
    (
        (XY['date'] >= pd.to_datetime('2010-01-01')) & (XY['date'] < pd.to_datetime(cutoff)),
        XY['date'] >= pd.to_datetime(cutoff)
    )
    for cutoff in ["2014-01-01", "2016-01-01"]
    ]

kfolds_evaluation = []
for is_training, is_validation in kfolds:

    X_train = XY.loc[is_training, FEATURES_GLOBAL_MODEL]
    y_train = XY.loc[is_training, 'num_sold_log']

    model_global_pipeline = TrendRemainderModelPipeline(trend_model_ridge_alpha=1e-2)
    model_global_pipeline.fit(X_train, y_train)

    # predict on the whole frame, then exponentiate back to the original sales scale
    predictions = XY.copy()
    predictions['yhat_log'] = model_global_pipeline.predict(predictions)
    predictions['yhat'] = np.exp(predictions['yhat_log'])

    scores = {
        split_name: {
            'nobs': predictions.loc[split_mask].shape[0],
            'score': mean_absolute_percentage_error(
                predictions.loc[split_mask, 'num_sold'],
                predictions.loc[split_mask, 'yhat']
                )
            }
        for split_name, split_mask in [('validation', is_validation), ('train', is_training)]
        }

    kfolds_evaluation.append(scores)

kfolds_evaluation

In [None]:
# Actual vs predicted sales for one series.
# `predictions` is whatever the most recently executed fold loop left behind (its last fold).
predictions_sample = predictions.query("series_id == 'Norway|Stickers for Less|Kaggle'")
# predictions_sample = predictions.query("series_id == 'Norway|Premium Sticker Mart|Kaggle'")

predictions_sample.set_index('date')[['num_sold', 'yhat']].plot.line(alpha=0.5);

In [44]:
# def objective(trial):

#     ROUNDS_COUNT = 500

#     param = {
#         "objective": "reg:squarederror",
#         "booster": trial.suggest_categorical(
#             "booster", ["gbtree", "gblinear", "dart"]
#         ),
#         "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
#         "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
#         "subsample": trial.suggest_float("subsample", 0.4, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
#         'device': 'cuda',
#         'tree_method': 'hist'
#     }

#     if param["booster"] in ["gbtree", "dart"]:
#         param["max_depth"] = trial.suggest_int("max_depth", 1, 9, step=2)

#         param["min_child_weight"] = trial.suggest_int("min_child_weight", 1, 10)
#         param["eta"] = trial.suggest_float("eta", 1e-5, 0.01, log=True)

#         param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
#         param["grow_policy"] = trial.suggest_categorical(
#             "grow_policy", ["depthwise", "lossguide"]
#         )

#     if param["booster"] == "dart":
#         param["sample_type"] = trial.suggest_categorical(
#             "sample_type", ["uniform", "weighted"]
#         )
#         param["normalize_type"] = trial.suggest_categorical(
#             "normalize_type", ["tree", "forest"]
#         )
#         param["rate_drop"] = trial.suggest_float(
#             "rate_drop", 1e-8, 1.0, log=True
#         )
#         param["skip_drop"] = trial.suggest_float(
#             "skip_drop", 1e-8, 1.0, log=True
#         )


#     kfolds_evaluation = []
#     for is_training, is_validation in kfolds:

#         model_global = Ridge(1e-2)
#         model_global.fit(
#             XY.loc[is_training, FEATURES_GLOBAL_TREND_LEVEL_MODEL],
#             XY.loc[is_training, 'num_sold_log']
#             )
        
#         predictions = (
#             XY
#             .copy()
#             .assign(yhat_trend_log = lambda df_: model_global.predict(df_[FEATURES_GLOBAL_TREND_LEVEL_MODEL]))
#             .assign(num_sold_log_detrend = lambda df_: df_['num_sold_log'] - df_['yhat_trend_log'])
#             )

#         dtrain = xgb.DMatrix(
#             predictions.loc[is_training, FEATURES_GLOBAL_MODEL], 
#             label=predictions.loc[is_training, 'num_sold_log_detrend']
#             )
#         dtest = xgb.DMatrix(predictions[FEATURES_GLOBAL_MODEL])

#         model_global = xgb.train(param, dtrain, ROUNDS_COUNT)
        
#         yhat_remainder = model_global.predict(dtest)
#         predictions = (
#             predictions
#             .assign(yhat_remainder = yhat_remainder)
#             .assign(yhat = lambda df_: np.exp(df_['yhat_trend_log'] + df_['yhat_remainder']))
#             )

#         scores = {
#             'validation': mean_absolute_percentage_error( 
#                 predictions.loc[is_validation, 'num_sold'],
#                 predictions.loc[is_validation, 'yhat']
#                 ),
#             'train': mean_absolute_percentage_error( 
#                 predictions.loc[is_training, 'num_sold'],
#                 predictions.loc[is_training, 'yhat']
#                 )
#             }
        
#         kfolds_evaluation.append(scores)

#     score_overall = np.mean([ 
#         kfolds_evaluation[0]['validation'], 
#         kfolds_evaluation[1]['validation'] 
#         ])
    
#     return score_overall

# study = optuna.create_study()

# study.optimize(
#     objective,
#     n_trials=50,
#     catch=(ValueError,),
#     n_jobs=-1,
#     timeout=12 * 60 * 60,
# )

In [None]:
# Refit the trend + remainder pipeline on all of XY (no hold-out), on the log target.
MODEL_GLOBAL = TrendRemainderModelPipeline(trend_model_ridge_alpha=1e-2)
MODEL_GLOBAL.fit(XY[FEATURES_GLOBAL_MODEL], XY['num_sold_log'])

## Local Models

In [46]:
# even when split into many dataframes, *index-based* subsets.
# indexes maintained when dataframe splits, so the XY-wide boolean masks
# below still select the right rows of each per-series frame via .loc.
kfolds = [

    # validation set 2014-16 matches ultimate test set's length, 2017-19
    ( 
        ( (XY['date'] >= pd.to_datetime('2010-01-01')) & (XY['date'] < pd.to_datetime("2014-01-01")) ),
        XY['date'] >= pd.to_datetime("2014-01-01")
    ),

    ( 
        ( (XY['date'] >= pd.to_datetime('2010-01-01')) & (XY['date'] < pd.to_datetime("2016-01-01")) ),
        XY['date'] >= pd.to_datetime("2016-01-01")
    )

    ]

kfolds_evaluation = []
for is_training, is_validation in kfolds:

    # one fitted pipeline per series for this fold
    segments_models = {}
    segments_predictions = []

    for grp, XY_grp in segments_XY.items():

        feature_transform_pipeline_local_model = ColumnTransformer([
            ('transformer_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), FEATURES_LOCAL_MODEL_TO_ONEHOT),
            ('transformer_std', StandardScaler(), FEATURES_UNIVERSE_NUMERIC_CONTINUOUS)
            ],
            verbose_feature_names_out=False,
            remainder='passthrough'
            )
        feature_transform_pipeline_local_model.set_output(transform='pandas')

        pipeline_e2e = Pipeline([
            ('transform_features', feature_transform_pipeline_local_model), 
            ('model', Ridge(1e-1))
            ])

        pipeline_e2e.fit(
            XY_grp.loc[is_training, FEATURES_LOCAL_MODEL],
            XY_grp.loc[is_training, 'num_sold_log']
            )
        
        segments_models[grp] = pipeline_e2e

        predictions = (
            XY_grp
            .copy()
            # Predict from the same feature columns used at fit time, as the global cells do.
            # Passing the whole frame would hand the target columns to a passthrough
            # transformer and rely on it ignoring them.
            .assign(yhat = lambda df_: np.exp(pipeline_e2e.predict(df_[FEATURES_LOCAL_MODEL])))
            )
        segments_predictions.append(predictions)

    # stack the per-series predictions; index labels still match the XY-wide masks
    predictions = pd.concat(segments_predictions, axis=0)

    scores = {
        'validation': {
            'nobs': predictions.loc[is_validation].shape[0],
            'score': mean_absolute_percentage_error( 
                predictions.loc[is_validation, 'num_sold'],
                predictions.loc[is_validation, 'yhat']
                )
            },
        'train': {
            'nobs': predictions.loc[is_training].shape[0],
            'score': mean_absolute_percentage_error( 
                predictions.loc[is_training, 'num_sold'],
                predictions.loc[is_training, 'yhat']
                )
            }
        }
    
    kfolds_evaluation.append(scores)

In [None]:
# Per-fold row counts and MAPE for the validation and training splits.
kfolds_evaluation

In [None]:
# Actual vs predicted sales for one series.
# `predictions` is whatever the most recently executed fold loop left behind (its last fold).
predictions_sample = predictions.query("series_id == 'Norway|Premium Sticker Mart|Kaggle'")

predictions_sample.set_index('date')[['num_sold', 'yhat']].plot.line()

In [None]:
def objective(trial):
    """
    Optuna objective: validation MAPE of per-series Ridge models for a sampled `alpha`.

    For every scored fold, one pipeline (one-hot encoding + standardisation -> Ridge)
    is fitted per entry of `segments_XY` on that fold's training rows. Predictions are
    mapped back from the log target with `np.exp`, pooled across series, and scored
    with MAPE on the fold's validation rows.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies `alpha`, sampled log-uniformly from [0.1, 50].

    Returns
    -------
    float
        Mean validation MAPE over the folds listed in `folds_scored`.

    Raises
    ------
    IndexError
        If `kfolds` has no entry at a position listed in `folds_scored`.

    Notes
    -----
    Reads the notebook-level `kfolds`, `segments_XY`, `FEATURES_LOCAL_MODEL`,
    `FEATURES_LOCAL_MODEL_TO_ONEHOT` and `FEATURES_UNIVERSE_NUMERIC_CONTINUOUS`.
    """

    parameters = {'alpha': trial.suggest_float("alpha", 1e-1, 50, log=True)}

    # Positions in `kfolds` that drive the tuning score. Only fold 1 is scored,
    # so only fold 1 is fitted; fitting fold 0 and scoring the training split
    # would be discarded work. Add 0 here to average over both folds.
    folds_scored = (1,)

    validation_scores = []
    for fold_idx in folds_scored:

        is_training, is_validation = kfolds[fold_idx]

        segments_predictions = []
        for grp, XY_grp in segments_XY.items():

            feature_transform_pipeline_local_model = ColumnTransformer([
                ('transformer_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), FEATURES_LOCAL_MODEL_TO_ONEHOT),
                ('transformer_std', StandardScaler(), FEATURES_UNIVERSE_NUMERIC_CONTINUOUS)
                ],
                verbose_feature_names_out=False,
                remainder='passthrough'
                )
            feature_transform_pipeline_local_model.set_output(transform='pandas')

            pipeline_e2e = Pipeline([
                ('transform_features', feature_transform_pipeline_local_model),
                ('model', Ridge(**parameters))
                ])

            # The fold mask is applied as a boolean `.loc` selector on this
            # series' frame; pandas aligns it on the index.
            pipeline_e2e.fit(
                XY_grp.loc[is_training, FEATURES_LOCAL_MODEL],
                XY_grp.loc[is_training, 'num_sold_log']
                )

            # `assign` returns a new frame, so no explicit copy is needed. Only the
            # fitted feature columns are handed to `predict`, so the target columns
            # can never reach the model.
            segments_predictions.append(
                XY_grp.assign(
                    yhat = lambda df_: np.exp(pipeline_e2e.predict(df_[FEATURES_LOCAL_MODEL]))
                    )
                )

        predictions = pd.concat(segments_predictions, axis=0)

        validation_scores.append(
            mean_absolute_percentage_error(
                predictions.loc[is_validation, 'num_sold'],
                predictions.loc[is_validation, 'yhat']
                )
            )

    score_overall = np.mean(validation_scores)

    return score_overall

# Default study direction is minimisation, which is what a MAPE objective needs.
study = optuna.create_study()

# Budget: at most 50 trials or 12 hours, whichever is reached first.
# Trials run in parallel (n_jobs=-1), and a trial raising ValueError is caught
# so the study carries on instead of aborting.
study.optimize(
    func=objective,
    n_trials=50,
    timeout=12 * 60 * 60,
    n_jobs=-1,
    catch=(ValueError,),
)

In [None]:
# Best trial found by the study, its sampled parameters, and its objective value.
study.best_trial, study.best_params, study.best_value

In [51]:
# INCOMPLETE: the search stalled after ~20 minutes of runtime, and its score was no better than 0.1.

# def objective(trial):

#     ROUNDS_COUNT = 100

#     param = {
#         "objective": "reg:squarederror",
#         "booster": trial.suggest_categorical(
#             "booster", ["gbtree", "gblinear", "dart"]
#         ),
#         "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
#         "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
#         "subsample": trial.suggest_float("subsample", 0.4, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
#     }

#     if param["booster"] in ["gbtree", "dart"]:
#         param["max_depth"] = trial.suggest_int("max_depth", 1, 9, step=2)

#         param["min_child_weight"] = trial.suggest_int("min_child_weight", 1, 10)
#         param["eta"] = trial.suggest_float("eta", 1e-5, 0.01, log=True)

#         param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
#         param["grow_policy"] = trial.suggest_categorical(
#             "grow_policy", ["depthwise", "lossguide"]
#         )

#     if param["booster"] == "dart":
#         param["sample_type"] = trial.suggest_categorical(
#             "sample_type", ["uniform", "weighted"]
#         )
#         param["normalize_type"] = trial.suggest_categorical(
#             "normalize_type", ["tree", "forest"]
#         )
#         param["rate_drop"] = trial.suggest_float(
#             "rate_drop", 1e-8, 1.0, log=True
#         )
#         param["skip_drop"] = trial.suggest_float(
#             "skip_drop", 1e-8, 1.0, log=True
#         )

#     kfolds_evaluation = []
#     for is_training, is_validation in kfolds:

#         segments_models = {}
#         segments_predictions = []
#         for grp, XY in segments_XY.items():

#             dtrain = xgb.DMatrix(
#                 XY.loc[is_training, FEATURES_LOCAL_MODEL], 
#                 label=XY.loc[is_training, 'num_sold_log']
#                 )
#             dtest = xgb.DMatrix(XY[FEATURES_LOCAL_MODEL])

#             model_global = xgb.train(param, dtrain, ROUNDS_COUNT)
#             segments_models[grp] = model_global
        
#             yhat = model_global.predict(dtest)
#             predictions = (
#                 XY
#                 .copy()
#                 .assign(yhat = lambda df_: np.exp(yhat))
#                 )
#             segments_predictions.append(predictions)

#         predictions = pd.concat(segments_predictions, axis=0)

#         scores = {
#             'validation': mean_absolute_percentage_error( 
#                 predictions.loc[is_validation, 'num_sold'],
#                 predictions.loc[is_validation, 'yhat']
#                 ),
#             'train': mean_absolute_percentage_error( 
#                 predictions.loc[is_training, 'num_sold'],
#                 predictions.loc[is_training, 'yhat']
#                 )
#             }
        
#         kfolds_evaluation.append(scores)

#     score_overall = np.mean([ 
#         kfolds_evaluation[0]['validation'], 
#         kfolds_evaluation[1]['validation'] 
#         ])
    
#     return score_overall

# study = optuna.create_study()

# study.optimize(
#     objective,
#     n_trials=25,
#     catch=(ValueError,),
#     n_jobs=-1,
#     timeout=60 * 10,
# )

# study.best_trial, study.best_params, study.best_value

In [None]:
# Ridge penalty selected by the Optuna study.
RIDGE_ALPHA_TUNED = study.best_params['alpha']

# Each fold is a (training mask, validation mask) pair of boolean Series built on
# XY's index. Because the masks are index-based, they remain usable via `.loc` on
# any split of XY that keeps the original index.
# Training covers 2010-01-01 up to (excluding) the cutoff; validation is every row
# from the cutoff onwards. With the 2014 cutoff the validation set (2014-16)
# matches the ultimate test set's length (2017-19).
kfolds = [
    (
        (XY['date'] >= pd.to_datetime('2010-01-01')) & (XY['date'] < fold_cutoff),
        XY['date'] >= fold_cutoff
    )
    for fold_cutoff in (pd.to_datetime("2014-01-01"), pd.to_datetime("2016-01-01"))
    ]

# Re-run the time-based cross-validation with the tuned Ridge penalty.
# NOTE: after the loop, `predictions` and `segments_models` hold the values of
# the LAST fold; later cells read `predictions`.
kfolds_evaluation = []
for is_training, is_validation in kfolds:

    segments_models = {}
    segments_predictions = []

    for grp, XY_grp in segments_XY.items():

        feature_transform_pipeline_local_model = ColumnTransformer([
            ('transformer_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), FEATURES_LOCAL_MODEL_TO_ONEHOT),
            ('transformer_std', StandardScaler(), FEATURES_UNIVERSE_NUMERIC_CONTINUOUS)
            ],
            verbose_feature_names_out=False,
            remainder='passthrough'
            )
        feature_transform_pipeline_local_model.set_output(transform='pandas')

        pipeline_e2e = Pipeline([
            ('transform_features', feature_transform_pipeline_local_model),
            ('model', Ridge(alpha=RIDGE_ALPHA_TUNED))
            ])

        # The fold mask is applied as a boolean `.loc` selector on this series'
        # frame; pandas aligns it on the index.
        pipeline_e2e.fit(
            XY_grp.loc[is_training, FEATURES_LOCAL_MODEL],
            XY_grp.loc[is_training, 'num_sold_log']
            )

        segments_models[grp] = pipeline_e2e

        # `assign` already returns a new frame, so the extra `.copy()` is dropped.
        # Predicting on the fitted feature columns only (as the deployment cell
        # does) keeps the target columns away from the pipeline by construction.
        predictions = XY_grp.assign(
            yhat = lambda df_: np.exp(pipeline_e2e.predict(df_[FEATURES_LOCAL_MODEL]))
            )
        segments_predictions.append(predictions)

    predictions = pd.concat(segments_predictions, axis=0)

    # Same summary for both splits: number of rows and MAPE on the original scale.
    scores = {
        split_name: {
            'nobs': predictions.loc[split_mask].shape[0],
            'score': mean_absolute_percentage_error(
                predictions.loc[split_mask, 'num_sold'],
                predictions.loc[split_mask, 'yhat']
                )
            }
        for split_name, split_mask in (('validation', is_validation), ('train', is_training))
        }

    kfolds_evaluation.append(scores)

kfolds_evaluation

In [53]:
# The pooled per-series predictions must have exactly as many rows as `sales_daily_complete`.
assert len(predictions) == len(sales_daily_complete)

In [None]:
# Visual sanity check on a single series: actual vs. predicted sales over time,
# using the predictions produced with the tuned Ridge penalty.
predictions_sample = predictions.query("series_id == 'Norway|Premium Sticker Mart|Kaggle'")

(
    predictions_sample
    .set_index('date')
    .loc[:, ['num_sold', 'yhat']]
    .plot.line()
)

In [55]:
# model = segments_models['Norway|Premium Sticker Mart|Kaggle'].named_steps.model
# coefs = pd.Series(model.coef_, index=model.feature_names_in_)
# coefs.to_csv("coefs.csv")
# coefs.sort_values(ascending=False).to_frame().head(25)
# coefs.sort_values(ascending=True).to_frame().head(25)

In [None]:
# Residual diagnostics for the sampled series: add the residual and several of
# its lags (in days, after sorting by date) to look for leftover autocorrelation.
predictions_sample = (
    predictions_sample
    .assign(residual = lambda df_: df_['num_sold'] - df_['yhat'])
    .sort_values('date')
    # One column per lag; built inside `pipe` so every shift sees the sorted frame.
    .pipe(lambda df_: df_.assign(**{
        f'residual_lag{lag}': df_['residual'].shift(lag)
        for lag in (1, 2, 3, 5, 7, 14)
        }))
    )

# Residual vs. its 1-day lag, with a linear fit.
# The filter uses a strict `<` so it matches the last fold's training window
# (date < 2016-01-01); `<=` would also pull in the first validation day.
(
    p9.ggplot((
        predictions_sample
        .loc[predictions_sample['date'] < pd.to_datetime("2016-01-01")]
        )) +
    p9.theme_bw() +
    p9.geom_point(p9.aes('residual_lag1', 'residual')) +
    p9.geom_smooth(p9.aes('residual_lag1', 'residual'), method='lm', se=False, color='lightblue')
)

In [58]:
# (
#     predictions
#     .assign(mape = lambda df_: np.abs(100 * (df_['num_sold']/df_['yhat'] - 1)) )
#     .sort_values('mape', ascending=False)
#     .head(100)
#     .to_csv("./data/processed/predictions_errors_local_model.csv", index=False)
# )

In [None]:
# Show the 50 rows where actual sales deviate most, in percent, from the prediction.
# NOTE(review): the denominator here is `yhat`, whereas sklearn's
# mean_absolute_percentage_error divides by the actual value — confirm this is intended.
(
    predictions
    .assign(mape = lambda df_: df_['num_sold'].div(df_['yhat']).sub(1).mul(100).abs())
    .sort_values(by='mape', ascending=False)
    .head(50)
)

In [60]:
# segments_XY = {grp: df for grp, df in XY.groupby('series_id')}

# # even when split into many dataframes, *index-based* subsets.
# # indexes maintained when dataframe splits.
# kfolds = [

#     # validation set 2014-16 matches ultimate test set's length, 2017-19
#     ( 
#         ( (XY['date'] >= pd.to_datetime('2010-01-01')) & (XY['date'] < pd.to_datetime("2014-01-01")) ),
#         XY['date'] >= pd.to_datetime("2014-01-01")
#     ),

#     ( 
#         ( (XY['date'] >= pd.to_datetime('2010-01-01')) & (XY['date'] < pd.to_datetime("2016-01-01")) ),
#         XY['date'] >= pd.to_datetime("2016-01-01")
#     )

#     ]

# kfolds_evaluation = []
# FEATURES_LOCAL_TREND_MODEL = FEATURES_LOCAL_MODEL
# for is_training, is_validation in kfolds:
    
#     segments_models = {
#         grp: (
#             TrendRemainderModelPipeline(FEATURES_LOCAL_TREND_MODEL, RIDGE_ALPHA_TUNED, ['day_of_year'])
#             .fit(XY.loc[is_training, FEATURES_LOCAL_MODEL], XY.loc[is_training, 'num_sold_log'])
#             )
#         for grp, XY in segments_XY.items()
#         }

#     segments_predictions = [
#         df.assign(
#             yhat = lambda df_: np.exp(segments_models[grp].predict(df_[FEATURES_LOCAL_MODEL])) 
#             )
#         for grp, df in segments_XY.items()
#         ]

#     predictions = pd.concat(segments_predictions, axis=0)

#     scores = {
#         'validation': mean_absolute_percentage_error( 
#             predictions.loc[is_validation, 'num_sold'],
#             predictions.loc[is_validation, 'yhat']
#             ),
#         'train': mean_absolute_percentage_error( 
#             predictions.loc[is_training, 'num_sold'],
#             predictions.loc[is_training, 'yhat']
#             )
#         }
    
#     kfolds_evaluation.append(scores)

# kfolds_evaluation

In [61]:
# predictions_sample = predictions.query("series_id == 'Norway|Premium Sticker Mart|Kaggle'")

# predictions_sample.set_index('date')[['num_sold', 'yhat']].plot.line()

In [62]:
# Final fit: one pipeline per series, trained on every row of that series
# (no fold mask) with the tuned Ridge penalty. The deployment cell looks these
# models up by series id.
SEGMENTS_MODELS = {}

for grp, XY_grp in segments_XY.items():

    # One-hot the categorical features, standardise the continuous ones, and
    # pass the remaining feature columns through untouched.
    feature_transform_pipeline_local_model = ColumnTransformer(
        transformers=[
            ('transformer_onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), FEATURES_LOCAL_MODEL_TO_ONEHOT),
            ('transformer_std', StandardScaler(), FEATURES_UNIVERSE_NUMERIC_CONTINUOUS),
        ],
        remainder='passthrough',
        verbose_feature_names_out=False,
    ).set_output(transform='pandas')

    pipeline_e2e = Pipeline(steps=[
        ('transform_features', feature_transform_pipeline_local_model),
        ('model', Ridge(alpha=RIDGE_ALPHA_TUNED)),
    ])

    # `fit` returns the fitted pipeline itself, so it can be stored directly.
    SEGMENTS_MODELS[grp] = pipeline_e2e.fit(
        XY_grp[FEATURES_LOCAL_MODEL],
        XY_grp['num_sold_log'],
    )

# Deployment

In [63]:
# Load the test set, parse dates, and derive the '|'-joined composite key columns.
# `series_id` (country|store|product) is the key used to look up per-series models.
sales_test_daily = (
    pd.read_csv("./data/external/test.csv")
    .assign(
        date = lambda df_: pd.to_datetime(df_['date']),
        country_store = lambda df_: df_['country'].str.cat(df_['store'], sep='|'),
        country_product = lambda df_: df_['country'].str.cat(df_['product'], sep='|'),
        store_product = lambda df_: df_['store'].str.cat(df_['product'], sep='|'),
        country_store_product = lambda df_: df_['country'].str.cat([df_['store'], df_['product']], sep='|')
    )
    .assign(series_id = lambda df_: df_['country_store_product'])
)

In [None]:
# Preview the first rows of the loaded test set, including the derived key columns.
sales_test_daily.head()

In [None]:
# Summary statistics of the test set's `date` column.
sales_test_daily['date'].describe()

In [66]:
# Run the test set through the calendar and external feature steps, then add
# empty placeholders for the target columns, which are unknown at test time.
sales_test_daily = (
    sales_test_daily
    .pipe(transform_calendar_features)
    .pipe(integrate_external_features)
    .assign(
        num_sold = None,
        num_sold_log = None
    )
)

In [67]:
# Split the test frame by series so each group can be scored by its own model.
segments_X_test = {
    series_key: series_frame
    for series_key, series_frame in sales_test_daily.groupby('series_id')
    }

segments_predictions_test = []
for grp, df in segments_X_test.items():

    # A series with a fitted local model uses it; any other series falls back
    # to the global model and its own feature list.
    if grp in SEGMENTS_MODELS:
        yhat_log = SEGMENTS_MODELS[grp].predict(df[FEATURES_LOCAL_MODEL])
    else:
        yhat_log = MODEL_GLOBAL.predict(df[FEATURES_GLOBAL_MODEL])

    # Models predict the log target; exponentiate to get back to sales units.
    df = df.assign(yhat = np.exp(yhat_log))

    segments_predictions_test.append(df)

predictions_test = pd.concat(segments_predictions_test, axis=0)

In [68]:
# Keep only the two submission columns and expose the prediction under the
# target's name, `num_sold`.
predictions_test_submit = (
    predictions_test
    .loc[:, ['id', 'yhat']]
    .rename(columns={'yhat': 'num_sold'})
    )

In [69]:
# Guard rails before writing the file: expected row count and no missing values.
assert len(predictions_test_submit) == 98_550
assert predictions_test_submit.notna().to_numpy().all()
predictions_test_submit.to_csv("./data/processed/submission_pipelines_fix-gamma.csv", index=False)

## Differential Testing

In [70]:
# Compare the new submission against an earlier one, row by row.
# `diff` is the percentage change of the new prediction relative to the old one.
predictions_compare = pd.read_csv("./data/processed/submission5.csv").rename(columns={'num_sold': 'num_sold_prev'})
predictions_compare = (
    # Join key stated explicitly; `validate` raises if an id is duplicated on
    # either side instead of silently multiplying rows.
    pd.merge(predictions_compare, predictions_test_submit, how='left', on='id', validate='one_to_one')
    .assign(diff = lambda df_: 100 * (df_['num_sold'] / df_['num_sold_prev'] - 1))
    )
assert predictions_compare.shape[0] == 98_550

# Attach the descriptive columns used to break the differences down by group.
predictions_compare = pd.merge(
    predictions_compare,
    sales_test_daily[['id', 'country', 'product', 'store', 'year', 'series_id']],
    on='id',
    validate='one_to_one'
    )
# This is an inner join, so re-check that no submission row was dropped.
assert predictions_compare.shape[0] == 98_550

In [None]:
# Overall distribution of the percentage change versus the previous submission.
predictions_compare['diff'].describe()

In [None]:
# Same distribution, broken down by country.
predictions_compare.groupby('country')['diff'].describe()

In [None]:
# Same distribution, broken down by year.
predictions_compare.groupby('year')['diff'].describe()