In [6]:
import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose, STL

In [17]:
df = pd.read_csv('proj1_exampleinput.csv', parse_dates=['Month'])
df = df.rename(columns={'product_class': 'unique_id', 'Month': 'ds', 'sales_volume': 'y'})

### Filter the time series which are less than 2 seasonality periods as df_short
As the data is monthly data, so we consider 1 seasonality period is 12 months, 2 periods should have 24 data points

In [21]:
# 每个商品的非缺失y值数量（即有效月度销售记录）
counts = df.groupby('unique_id')['y'].count()

# 筛选出数据点少于24（即少于两个季节性周期）的商品
short_series_ids = counts[counts < 24].index

# 筛选出这些商品的完整记录
df_short = df[df['unique_id'].isin(short_series_ids)]

df_short

Unnamed: 0,unique_id,ds,y
550,C1018,2015-09-01,236.0
551,C1018,2015-10-01,287.0
552,C1018,2015-11-01,218.0
553,C1018,2015-12-01,203.0
554,C1018,2016-01-01,239.0
...,...,...,...
14244,C6936,2017-05-01,6692.0
14245,C6936,2017-06-01,5904.0
14246,C6936,2017-07-01,6332.0
14292,C6960,2017-02-01,8103.0


### Calculate the strength of seasonality (as'fs')
Define: if fs>= 0.5, obvious or high seasonality
if fs<0.5, no or low seasonality

In [23]:
def quantify_fs_rv(ts: pd.DataFrame, seasonality_period: int, lo_frac: float):
    # Make the input a numeric time series (keep only the 'y' column, and sort by 'ds')
    ts_series = ts.set_index('ds')['y'].sort_index()

    # STL decomposition
    stl = STL(ts_series, period=seasonality_period).fit()

    # Calculate the strength of seasonality（fs）and residual variability（rv）
    fs = max(0, 1 - np.var(stl.resid) / np.var(stl.resid + stl.seasonal))
    rv = np.std(stl.resid) / ts_series.mean()

    return pd.Series({'fs': fs, 'rv': rv})

# Calculate fs & rv
seasonality_results = df.groupby('unique_id').apply(
    lambda x: quantify_fs_rv(x, seasonality_period=12, lo_frac=0.4)
).reset_index()

# filter fs >= 0.5
high_seasonal_ids = seasonality_results[seasonality_results['fs'] >= 0.5]['unique_id']
low_seasonal_ids = seasonality_results[seasonality_results['fs'] < 0.5]['unique_id']

# Create df_seasonal with all the time series with high seasonality
df_seasonal = df[df['unique_id'].isin(high_seasonal_ids)].copy()
df_low_seasonal = df[df['unique_id'].isin(low_seasonal_ids)].copy()

df_seasonal


  seasonality_results = df.groupby('unique_id').apply(


Unnamed: 0,unique_id,ds,y
0,C1002,2013-01-01,43660.0
1,C1002,2013-02-01,42340.0
2,C1002,2013-03-01,49176.0
3,C1002,2013-04-01,47835.0
4,C1002,2013-05-01,50915.0
...,...,...,...
14487,C7780,2017-03-01,19647.0
14488,C7780,2017-04-01,19616.0
14489,C7780,2017-05-01,21431.0
14490,C7780,2017-06-01,19333.0


In [25]:
df_low_seasonal

Unnamed: 0,unique_id,ds,y
55,C1003,2013-01-01,10203.0
56,C1003,2013-02-01,9169.0
57,C1003,2013-03-01,10428.0
58,C1003,2013-04-01,9714.0
59,C1003,2013-05-01,10489.0
...,...,...,...
14160,C6918,2017-03-01,2847.0
14161,C6918,2017-04-01,2940.0
14162,C6918,2017-05-01,2980.0
14163,C6918,2017-06-01,3120.0


### Category 4: fs<0.5, contain 0
models: Drift, Mean, ARIMA

In [27]:
# Filter the time series which has at least 1 value is 0
zero_ids = df_low_seasonal[df_low_seasonal['y'] == 0]['unique_id'].unique()

# Create df_4 as category 4
df_4 = df_low_seasonal[df_low_seasonal['unique_id'].isin(zero_ids)].copy()

if df_4.empty:
    print("No low or no seasonality time series with 0")

df_4

No low or no seasonality time series with 0


Unnamed: 0,unique_id,ds,y


### Category 5: with negative values

models: ARIMA, TSLM, Mean

In [28]:
# Filter the time series which has at least 1 negative value
negative_ids = df[df['y'] < 0]['unique_id'].unique()

# Create df_5 as category 5
df_5 = df[df['unique_id'].isin(negative_ids)].copy()

if df_5.empty:
    print("No time series with negative values")

df_5

No time series with negative values


Unnamed: 0,unique_id,ds,y
