Analysis of Store Time-Series Forcasting on Kaggle 
- [Found here](https://www.kaggle.com/competitions/store-sales-time-series-forecasting/data?select=holidays_events.csv)

In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


def load_file(name):
    dir = "datasets/store_forecasting/"
    return pd.read_csv(f'{dir+name}')

train = load_file("train.csv")
test = load_file("test.csv")
oil = load_file("oil.csv")
transactions = load_file("transactions.csv")
holidays = load_file("holidays_events.csv")
stores = load_file("stores.csv")

In [None]:
train.dtypes

In [None]:
train

In [None]:
len(train.store_nbr.unique())

In [None]:
stores

Joining the stores to the item data
- note the "many to one" join

In [None]:
joined_store_train  = train.join(stores.set_index("store_nbr"), on="store_nbr", validate="m:1")

joined_store_train

In [None]:
oil

Join on Oil

In [None]:
joined_oil_store_train = joined_store_train.join(oil.set_index("date"), on="date", validate="m:1")


print(joined_oil_store_train.state.unique())

(joined_oil_store_train)

# Holidays 
- look at transferred holidays
- be note of "bridge" holidays, additional holidays given out
- take into consideration if the holiday is for thet region/locale

In [None]:
holidays_no_transfer = holidays[holidays.transferred == False]

holidays.type.unique()

np.sort(holidays_no_transfer.locale.unique())

holidays_no_transfer

Join the city holidays on eachother

There are 3 types of holidays:
- national holidays (all of Ecuador) (National)
- state holidys (Regional)
- city holidays (Local)

In [None]:
national_holidays = holidays_no_transfer[holidays_no_transfer['locale'] == 'National']
holidays_no_transfer.drop_duplicates(subset="date", inplace=True)

#holidays_no_transfer.columns
#pivot_holidays = holidays_no_transfer.pivot(index=["date", "locale"], columns="type", values="locale_name")

table = holidays_no_transfer.set_index(['locale', 'date'])
table = table[['description', 'locale_name']]

regional = table.loc['Regional']

local = table.loc['Local']

national = table.loc['National']

national


You were merging on national and then reusing that but losing the columns in the process

In [117]:

merged = joined_oil_store_train.merge(national, on="date",suffixes=('_merged', "_a"), how="left", validate="m:1")
merged_regional = merged.merge(regional, left_on=['date', 'state'], right_on=['date' , 'locale_name'], how='left', suffixes=("_regional", "_national"))
merged_al = merged_regional.merge(local,left_on=['date', 'city'], right_on=['date', 'locale_name'], how='left', suffixes=("_city","_regional", "_national"))

merged_al

MergeError: Merge keys are not unique in left dataset; not a one-to-many merge

Make a holiday column after all this

In [None]:
merged_al['locale_name_regional'].isna() |  (merged_al['locale_name_national']).isna() | (merged_al['locale_name']).isna()

merged_al['Holiday'] =  merged_al['locale_name_regional'].notna() | (merged_al['locale_name_national']).notna() | (merged_al['locale_name']).notna()

merged_al

In [118]:

from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# Preprocessing step
def merge_holiday_store_oil(dataset):
    dataset  = dataset.join(stores.set_index("store_nbr"), on="store_nbr", validate="m:1")
    dataset = dataset.join(oil.set_index("date"), on="date", validate="m:1")
    
    dataset = dataset.merge(transactions, on=['date', 'store_nbr'], how="left")

    holidays_no_transfer = holidays[holidays.transferred == False]
    holidays_no_transfer.drop_duplicates(subset="date", inplace=True)
    table = holidays_no_transfer.set_index(['locale', 'date'])
    table = table[['description', 'locale_name']]
    regional = table.loc['Regional']
    local = table.loc['Local']
    national = table.loc['National']
    
    merged = dataset.merge(national, on="date",suffixes=('_merged', "_a"), how="left")
    merged_regional = merged.merge(regional, left_on=['date', 'state'], right_on=['date' , 'locale_name'], how='left', suffixes=("_regional", "_national"))
    merged_all = merged_regional.merge(local,left_on=['date', 'city'], right_on=['date', 'locale_name'], how='left', suffixes=("_city","_regional", "_national"))
    merged_all['Holiday'] =  merged_all['locale_name_regional'].notna() | (merged_all['locale_name_national']).notna() | (merged_all['locale_name']).notna()
    
    merged_all['date'] = pd.to_datetime(dataset['date'])
    merged_all['date'] = pd.to_numeric(merged_all['date'])
    
    return merged_all
     



In [None]:

train_data = merge_holiday_store_oil(train) 
y = train_data['sales']
X = train_data.drop(["sales"], axis=1)

categories = X.select_dtypes(include=[object]).columns.to_list()

X[categories] = X[categories].astype("category")

X['Holiday'] = X['Holiday'].astype("object")

X.dtypes

columns_to_select = ['date', 'store_nbr', 'family', 'state', 'type', 'cluster', 'Holiday', 'dcoilwtico', 'id', "transactions"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  holidays_no_transfer.drop_duplicates(subset="date", inplace=True)


In [120]:

import sklearn 

from sklearn.impute import SimpleImputer
sklearn.set_config(transform_output="pandas")

numerical = make_pipeline(
    StandardScaler()
)


categorical = make_pipeline(
    SimpleImputer()
)


columns = make_column_transformer(
    (categorical, ['family', 'type','Holiday', 'dcoilwtico'])
     , remainder="passthrough", verbose_feature_names_out=False
)





In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate

model = HistGradientBoostingRegressor(random_state=42, categorical_features="from_dtype")

pipe = make_pipeline( 
     make_column_transformer(("passthrough", columns_to_select),  verbose_feature_names_out=False),
                     model)

pipe.fit(X,y)


In [124]:


test_data = merge_holiday_store_oil(test)

y_pred = pipe.predict(test_data)

submission = pd.DataFrame()

submission['id'] = test['id']
submission['sales'] = y_pred

submission.to_csv("submissions/Ecuador_Stores2.csv", header=True, index=False)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  holidays_no_transfer.drop_duplicates(subset="date", inplace=True)


array([nan])

In [None]:
test_data