Analysis of Store Time-Series Forcasting on Kaggle 
- [Found here](https://www.kaggle.com/competitions/store-sales-time-series-forecasting/data?select=holidays_events.csv)

In [1]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


def load_file(name):
    dir = "datasets/store_forecasting/"
    return pd.read_csv(f'{dir+name}')

train = load_file("train.csv")
test = load_file("test.csv")
oil = load_file("oil.csv")
transactions = load_file("transactions.csv")
holidays = load_file("holidays_events.csv")
stores = load_file("stores.csv")

In [2]:
train.dtypes

id               int64
date            object
store_nbr        int64
family          object
sales          float64
onpromotion      int64
dtype: object

In [3]:
train

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [4]:
stores

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4
5,6,Quito,Pichincha,D,13
6,7,Quito,Pichincha,D,8
7,8,Quito,Pichincha,D,8
8,9,Quito,Pichincha,B,6
9,10,Quito,Pichincha,C,15


In [43]:
o = oil.interpolate(method="nearest", order=2)

o.isna().sum()

date          0
dcoilwtico    1
dtype: int64

In [155]:
holidays_no = holidays[holidays.transferred == False].melt(id_vars=['locale', 'locale_name'], value_vars=['date'])

national = holidays_no.query('locale == "National"').drop_duplicates(subset=['value'])
regional = holidays_no.query('locale == "Regional"').drop_duplicates(subset=['value'])
local = holidays_no.query('locale == "Local"').drop_duplicates(subset=['value'])

regional

Unnamed: 0,locale,locale_name,variable,value
1,Regional,Cotopaxi,date,2012-04-01
7,Regional,Imbabura,date,2012-06-25
22,Regional,Santo Domingo de los Tsachilas,date,2012-11-06
23,Regional,Santa Elena,date,2012-11-07
46,Regional,Cotopaxi,date,2013-04-01
57,Regional,Imbabura,date,2013-06-25
74,Regional,Santo Domingo de los Tsachilas,date,2013-11-06
75,Regional,Santa Elena,date,2013-11-07
94,Regional,Cotopaxi,date,2014-04-01
110,Regional,Imbabura,date,2014-06-25


In [59]:
def merge_everything(df):
    holidays_no = holidays[holidays.transferred == False].melt(id_vars=['locale', 'locale_name'], value_vars=['date'])

    national = holidays_no.query('locale == "National"').drop_duplicates(subset=['value'])
    regional = holidays_no.query('locale == "Regional"').drop_duplicates(subset=['value'])
    local = holidays_no.query('locale == "Local"').drop_duplicates(subset=['value'])

    dates = pd.concat({
        f'oil_lag{i}' : oil.dcoilwtico.shift(i) 
        for i in [-7,-30]
    },axis=1)
    
    oil_new = pd.concat([oil, dates], axis=1)
    
    data = (
        df.merge(stores, on=['store_nbr'], validate="m:1")
        .merge(oil_new, on=["date"], validate="m:1", how="left")
        .merge(national, left_on=['date'], right_on=['value'], how="left", validate="m:1")
        .merge(regional, left_on=['date', 'state'], right_on=['value', 'locale_name'], how="left", validate="m:1")
        .merge(local,    left_on=['date', 'city'], right_on=['value', 'locale_name'], how="left", validate="m:1")
    )

    data['holiday'] = data.value.notnull() | data.value_y.notnull() | data.value_x.notnull()
    #data['holiday_type'] = data[['locale_x', 'locale_y', 'locale']].bfill(axis=1).iloc[:, 0]
    
    data['is_weekend'] = pd.to_datetime(df.date).dt.day_name().isin(['Saturday', 'Sunday'])
    
    data['dcoilwtico'] = data['dcoilwtico'].interpolate(method="nearest", order=2)
    data['dcoilwtico'] = data['dcoilwtico'].bfill()
    
    columns = ['date', 'store_nbr', 'family', 'onpromotion', 'city','state', 'type', 'cluster', 'dcoilwtico', 'holiday', 'is_weekend']
    for x in dates.keys():
        columns.append(x)
        
    if 'sales' in df.columns:
        data['sales'] = data.sales.shift(-30)
        columns.append('sales')
    
    # final list of features
    data = data[columns]
    
    categorical = ['family', 'city', 'cluster', 'holiday', 'store_nbr', 'family', 'city', 'state', 'type', 'is_weekend']
    for x in  categorical:
        data[x] = data[x].astype('category')
        
        
    data.date = pd.to_datetime(data.date)

    return data.set_index(['date', 'family'])


Joining the stores to the item data
- note the "many to one" join

In [60]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import FunctionTransformer

import sklearn

sklearn.set_config(transform_output="pandas")

pipeline = make_pipeline(FunctionTransformer(merge_everything))

colmuns = make_column_transformer(
    (pipeline, make_column_selector(dtype_include=[np.generic])),remainder="drop", verbose_feature_names_out=False
)

final_pipeline = make_pipeline(colmuns)
                              
new_train = final_pipeline.fit_transform(train)

def create_pipeline(estimator):
    pipeline = make_pipeline(FunctionTransformer(merge_everything))

    colmuns = make_column_transformer(
        (pipeline, make_column_selector(dtype_include=[np.generic])), remainder="drop", verbose_feature_names_out=False
    )

    final_pipeline = make_pipeline(colmuns, estimator)
            
    return final_pipeline
    
new_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,store_nbr,onpromotion,city,state,type,cluster,dcoilwtico,holiday,is_weekend,oil_lag-7,oil_lag-30,sales
date,family,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2013-01-01,AUTOMOTIVE,1,0,Quito,Pichincha,D,13,93.14,True,False,93.81,97.48,0.0
2013-01-01,BABY CARE,1,0,Quito,Pichincha,D,13,93.14,True,False,93.81,97.48,0.0
2013-01-01,BEAUTY,1,0,Quito,Pichincha,D,13,93.14,True,False,93.81,97.48,0.0
2013-01-01,BEVERAGES,1,0,Quito,Pichincha,D,13,93.14,True,False,93.81,97.48,0.0
2013-01-01,BOOKS,1,0,Quito,Pichincha,D,13,93.14,True,False,93.81,97.48,0.0


Let's do some EDA

In [276]:
new_train

Unnamed: 0,date,store_nbr,family,onpromotion,city,state,type,cluster,dcoilwtico,holiday,is_weekend,oil_lag7,oil_lag30,oil_lag90
0,2013-01-01,1,AUTOMOTIVE,0,Quito,Pichincha,D,13,93.14,True,False,,,
1,2013-01-01,1,BABY CARE,0,Quito,Pichincha,D,13,93.14,True,False,,,
2,2013-01-01,1,BEAUTY,0,Quito,Pichincha,D,13,93.14,True,False,,,
3,2013-01-01,1,BEVERAGES,0,Quito,Pichincha,D,13,93.14,True,False,,,
4,2013-01-01,1,BOOKS,0,Quito,Pichincha,D,13,93.14,True,False,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3000883,2017-08-15,9,POULTRY,0,Quito,Pichincha,B,6,47.57,False,False,49.57,,53.38
3000884,2017-08-15,9,PREPARED FOODS,1,Quito,Pichincha,B,6,47.57,False,False,49.57,,53.38
3000885,2017-08-15,9,PRODUCE,148,Quito,Pichincha,B,6,47.57,False,False,49.57,,53.38
3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,8,Quito,Pichincha,B,6,47.57,False,False,49.57,,53.38


Join on Oil

Collecting skforecast
  Downloading skforecast-0.12.1-py3-none-any.whl.metadata (22 kB)
Collecting optuna<3.7,>=2.10 (from skforecast)
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna<3.7,>=2.10->skforecast)
  Downloading alembic-1.13.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna<3.7,>=2.10->skforecast)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)


Collecting sqlalchemy>=1.3.0 (from optuna<3.7,>=2.10->skforecast)
  Downloading SQLAlchemy-2.0.30-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.6 kB)
Collecting Mako (from alembic>=1.5.0->optuna<3.7,>=2.10->skforecast)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading skforecast-0.12.1-py3-none-any.whl (560 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m560.6/560.6 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading SQLAlchemy-2.0.30-cp311-cp311-macosx_11_0_arm64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m54.9 MB/s[0m et

# Holidays 
- look at transferred holidays
- be note of "bridge" holidays, additional holidays given out
- take into consideration if the holiday is for thet region/locale

In [10]:
from sklearn.metrics import root_mean_squared_log_error


def calculate_error(y, y_true):
    error = root_mean_squared_log_error(y, y_true)
    print(f'The Root Mean Squared Log Error is {error:.3f}')
    return error
    
    
    

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Create splits


X = new_train.drop("sales", axis=1)
y = new_train.sales



Join the city holidays on eachother

There are 3 types of holidays:
- national holidays (all of Ecuador) (National)
- state holidys (Regional)
- city holidays (Local)

In [61]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import OrdinalEncoder
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

def date_conversion(df):
    df.date = pd.to_datetime(df.date)
    df.date = pd.to_numeric(df.date)
    df.date = df.date.astype('Float32')
    return df

dates_column = make_column_transformer(
    (OneHotEncoder(sparse_output=False, handle_unknown="ignore"), make_column_selector(dtype_include=['category'])),
    remainder="passthrough", verbose_feature_names_out=False
)
from sklearn.ensemble import RandomForestRegressor

#pipeline = make_pipeline(final_pipeline, LGBMRegressor())
pipeline = make_pipeline(final_pipeline, dates_column, SimpleImputer(strategy="most_frequent"),Ridge())

def create_pipe():
    return make_pipeline(final_pipeline, dates_column)

params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 200,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 4
}

train_y = train.sales
train_x = train
from sklearn.model_selection import TimeSeriesSplit
tss = TimeSeriesSplit(n_splits=7)

errors = []
for i, (train_idx, test_idx) in enumerate(tss.split(train)):
    y = train.iloc[train_idx,:].sales
    X = train.iloc[train_idx,:].drop('sales', axis=1)
    
    #pipeline.fit(X,y, lgbmregressor__categorical_feature=cat_features)
    
    pipeline.fit(X,y)
    
    errors.append(calculate_error(train.iloc[test_idx,:].sales, np.abs(pipeline.predict(train.iloc[test_idx,:]))))
print(f'Average error: {np.mean(errors):.3f}')


The Root Mean Squared Log Error is 3.725
The Root Mean Squared Log Error is 3.639
The Root Mean Squared Log Error is 3.593
The Root Mean Squared Log Error is 3.432
The Root Mean Squared Log Error is 3.428
The Root Mean Squared Log Error is 3.377
The Root Mean Squared Log Error is 3.141
Average error: 3.476


You were merging on national and then reusing that but losing the columns in the process

Make a holiday column after all this

In [301]:
from sklearn.model_selection import TimeSeriesSplit
tss = TimeSeriesSplit(n_splits=7)


errors = []
for i, (train_idx, test_idx) in enumerate(tss.split(train)):
    y = train.iloc[train_idx,:].sales
    X = train.iloc[train_idx,:]
    
    pipeline.fit(X,y)
    
    errors.append(calculate_error(train.iloc[test_idx,:].sales, np.abs(pipeline.predict(train.iloc[test_idx,:]))))
print(f'Average error: {np.mean(errors):.3f}')

The Root Mean Squared Log Error is 1.119
The Root Mean Squared Log Error is 1.394
The Root Mean Squared Log Error is 1.629
The Root Mean Squared Log Error is 1.202
The Root Mean Squared Log Error is 1.094
The Root Mean Squared Log Error is 1.147
The Root Mean Squared Log Error is 0.972
Average error: 1.222


In [28]:
y_pred = pipeline.predict(test)

In [29]:



submission = pd.DataFrame()

submission['id'] = test['id']
submission['sales'] = y_pred

submission.to_csv("submissions/Ecuador_Stores5.csv", header=True, index=False)



In [2]:
test

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0
...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,1
28508,3029396,2017-08-31,9,PREPARED FOODS,0
28509,3029397,2017-08-31,9,PRODUCE,1
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9


In [42]:
cat_features =new_train.select_dtypes(include="category").columns.tolist()

In [44]:
cat_features

['city', 'state', 'type', 'cluster', 'holiday', 'is_weekend']