In [3]:
import pandas as pd
import numpy as np
from statsforecast import StatsForecast
import matplotlib.pyplot as plt
#import seaborn as sns
import datetime as dt
import os
import warnings
import matplotlib.dates as mpl_dates
from statsmodels.tsa.seasonal import seasonal_decompose
import operator
from mlforecast import MLForecast
from mlforecast.lag_transforms import Combine, RollingMean
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestRegressor
from statsforecast.models import AutoARIMA, AutoETS, Naive, RandomWalkWithDrift, SeasonalNaive, SeasonalWindowAverage, WindowAverage
import utilsforecast.losses as ufl
from utilsforecast.evaluation import evaluate



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
#Loading data
future_values = pd.read_csv('future_values.csv',parse_dates = ['date']).rename(columns={'date':'ds','store_id':'unique_id'})
metadata = pd.read_csv('metadata.csv').rename(columns={'store_id':'unique_id'})
sales_data = pd.read_csv('sales_data.csv',parse_dates = ['date']).rename(columns={'date':'ds','store_id':'unique_id','sales':'y'})
sales_data.dtypes

  sales_data = pd.read_csv('sales_data.csv',parse_dates = ['date']).rename(columns={'date':'ds','store_id':'unique_id','sales':'y'})


unique_id                 object
ds                datetime64[ns]
y                          int64
customers                  int64
open                       int64
promo                      int64
state_holiday             object
school_holiday             int64
dtype: object

In [5]:
# Grouping data by each time series
grouped = sales_data.groupby('unique_id')
summary = grouped.agg(
    count_observed=('ds', 'count'),
    start_date=('ds', 'min'),
    end_date=('ds', 'max')
).reset_index()
print(summary)

     unique_id  count_observed start_date   end_date
0      store_1             924 2013-01-07 2015-07-19
1     store_10             924 2013-01-07 2015-07-19
2    store_100             924 2013-01-07 2015-07-19
3    store_101             924 2013-01-07 2015-07-19
4    store_102             924 2013-01-07 2015-07-19
..         ...             ...        ...        ...
671   store_95             924 2013-01-07 2015-07-19
672   store_96             924 2013-01-07 2015-07-19
673   store_97             924 2013-01-07 2015-07-19
674   store_98             924 2013-01-07 2015-07-19
675   store_99             924 2013-01-07 2015-07-19

[676 rows x 4 columns]


In [6]:
# Grouping data by each time series
grouped = future_values.groupby('unique_id')
summary = grouped.agg(
    count_observed=('ds', 'count'),
    start_date=('ds', 'min'),
    end_date=('ds', 'max')
).reset_index()
print(summary)

     unique_id  count_observed start_date   end_date
0      store_1              60 2015-07-20 2015-09-17
1     store_10              60 2015-07-20 2015-09-17
2    store_100              60 2015-07-20 2015-09-17
3    store_101              60 2015-07-20 2015-09-17
4    store_102              60 2015-07-20 2015-09-17
..         ...             ...        ...        ...
671   store_95              60 2015-07-20 2015-09-17
672   store_96              60 2015-07-20 2015-09-17
673   store_97              60 2015-07-20 2015-09-17
674   store_98              60 2015-07-20 2015-09-17
675   store_99              60 2015-07-20 2015-09-17

[676 rows x 4 columns]


In [7]:
#Checking na 
future_values.isna ().sum ()
metadata.isna ().sum ()
sales_data.isna().sum()

unique_id         0
ds                0
y                 0
customers         0
open              0
promo             0
state_holiday     0
school_holiday    0
dtype: int64

In [8]:
sales_merged = pd.merge(sales_data, metadata, on='unique_id', how='left')
sales_merged.head()

Unnamed: 0,unique_id,ds,y,customers,open,promo,state_holiday,school_holiday,store_type,assortment,competition_distance
0,store_1,2015-07-19,0,0,0,0,0,0,c,a,1270.0
1,store_2,2015-07-19,0,0,0,0,0,0,a,a,14130.0
2,store_3,2015-07-19,0,0,0,0,0,0,a,c,24000.0
3,store_4,2015-07-19,0,0,0,0,0,0,a,a,7520.0
4,store_5,2015-07-19,0,0,0,0,0,0,a,c,2030.0


In [12]:
sales_merged['ds'] = pd.to_datetime(sales_merged['ds'])

# Create a new column "week"
sales_merged['week'] = sales_merged['ds'].dt.to_period('W-MON').dt.start_time

# Aggregate by store_id and week and take the sum
weekly_data = sales_merged.groupby(['unique_id', 'week'], as_index=False).agg({
    'y': 'sum',
    'customers': 'sum',
    'promo': 'sum',
    'open': 'sum',
    'school_holiday': 'sum',
    'store_type': 'first',
    'assortment': 'first',
    'competition_distance': 'first'
})

In [15]:
# 确保 state_holiday 是字符串类型
sales_merged['state_holiday'] = sales_merged['state_holiday'].astype(str)

# 创建 one-hot 编码列，例如：state_holiday_a, state_holiday_b, ...
state_holiday_dummies = pd.get_dummies(sales_merged['state_holiday'], prefix='state_holiday')

# 拼接到原始数据
sales_merged = pd.concat([sales_merged, state_holiday_dummies], axis=1)

# 按 store_id 和 week 对 one-hot 编码列求和
holiday_weekly = sales_merged.groupby(['unique_id', 'week'], as_index=False)[
    [col for col in state_holiday_dummies.columns]
].sum()

# 合并计数后的 holiday 列
weekly_data = pd.merge(weekly_data, holiday_weekly, on=['unique_id', 'week'], how='left')

weekly_data

Unnamed: 0,unique_id,week,y,customers,promo,open,school_holiday,store_type,assortment,competition_distance,...,state_holiday_0_y,state_holiday_a_y,state_holiday_a_y.1,state_holiday_a_y.2,state_holiday_b_y,state_holiday_b_y.1,state_holiday_b_y.2,state_holiday_c_y,state_holiday_c_y.1,state_holiday_c_y.2
0,store_1,2013-01-01,7176,785,1,1,1,c,a,1270.0,...,1,0,0,0,0,0,0,0,0,0
1,store_1,2013-01-08,30493,3749,4,6,4,c,a,1270.0,...,7,0,0,0,0,0,0,0,0,0
2,store_1,2013-01-15,26655,3408,1,6,0,c,a,1270.0,...,7,0,0,0,0,0,0,0,0,0
3,store_1,2013-01-22,31732,3804,4,6,0,c,a,1270.0,...,7,0,0,0,0,0,0,0,0,0
4,store_1,2013-01-29,31670,3774,1,6,0,c,a,1270.0,...,7,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89903,store_99,2015-06-16,51761,5071,4,6,0,a,a,2640.0,...,7,0,0,0,0,0,0,0,0,0
89904,store_99,2015-06-23,48696,4871,1,6,1,a,a,2640.0,...,7,0,0,0,0,0,0,0,0,0
89905,store_99,2015-06-30,55631,5181,4,6,5,a,a,2640.0,...,7,0,0,0,0,0,0,0,0,0
89906,store_99,2015-07-07,44007,4350,1,6,5,a,a,2640.0,...,7,0,0,0,0,0,0,0,0,0
