# Time Series EDA

In [1]:
import pandas as pd
import numpy as np
#%matplotlib notebook
import random
import matplotlib.pyplot as plt
%matplotlib inline
import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Notebook-wide matplotlib defaults: 18pt font and 12x8 inch figures.
plt.rcParams.update({'font.size': 18, 'figure.figsize': (12, 8)})

In [3]:
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import LabelEncoder

## Load data

In [4]:
# Directory that holds the M5 CSV files. Set the M5_DATA_DIR environment variable
# to point somewhere else; when it is unset (or empty) the original local path is
# used, so existing runs behave exactly as before.
import os

FilePath = os.path.join(
    os.environ.get("M5_DATA_DIR")
    or "/Users/yabindong/Program_Dataset/M5-Forcasting/m5-forecasting-accuracy/",
    "",  # guarantees a trailing separator: later cells build paths as FilePath + name
)

In [5]:
# Raw M5 inputs, read with pandas defaults (no date parsing, no dtype overrides).
df_cal0 = pd.read_csv(FilePath+'calendar.csv')  # one row per calendar day
df_train0 = pd.read_csv(FilePath+"sales_train_validation.csv")  # one row per item/store id, one d_* column per day
df_price0 = pd.read_csv(FilePath+"sell_prices.csv")  # sell_price per store, item and wm_yr_wk week

In [6]:
# Submission template shipped with the data set; loaded here for reference only
# (no other cell visible in this notebook reads it).
df_sample_submission = pd.read_csv(FilePath+'sample_submission.csv')

In [7]:
df_train0.head()
#df_train0.shape

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [8]:
# Take an explicit copy of the six identifier columns so the derived keys are added
# to an independent frame instead of to a slice of df_train0. Assigning to the bare
# slice raises SettingWithCopyWarning, which the global warnings filter would hide.
df_train_ids = df_train0.iloc[:, 0:6].copy()
# Composite keys used to aggregate sales per category/store and per department/store.
df_train_ids["cat_store_id"] = df_train0["cat_id"] + "_" + df_train0["store_id"]
df_train_ids["dept_store_id"] = df_train0["dept_id"] + "_" + df_train0["store_id"]

In [9]:
# Rebuild the training frame as [identifier columns + derived keys | daily sales].
# The sales columns are selected by name (d_1, d_2, ...) rather than by position, so
# re-running this cell does not append the derived key columns a second time.
df_train0 = pd.concat((df_train_ids, df_train0.filter(regex=r"^d_\d+$")), axis=1)

In [10]:
df_train0.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,cat_store_id,dept_store_id,d_1,d_2,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,HOBBIES_CA_1,HOBBIES_1_CA_1,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,HOBBIES_CA_1,HOBBIES_1_CA_1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,HOBBIES_CA_1,HOBBIES_1_CA_1,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,HOBBIES_CA_1,HOBBIES_1_CA_1,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,HOBBIES_CA_1,HOBBIES_1_CA_1,0,0,...,2,1,1,0,1,1,2,2,2,4


## Define time series index
- The full timeline runs from 2011-01-29 to 2016-06-19, i.e. 1969 days in total
- The given training data is from 2011-01-29 to 2016-04-24
- The validation period is 2016-04-25 to 2016-05-22 and the evaluation period is 2016-05-23 to 2016-06-19 (28 days each)

In [11]:
# Daily DatetimeIndex for the 1913 training days, beginning on 2011-01-29
idx_train = pd.date_range('2011-01-29', freq='D', periods=1913)
#idx_val = pd.date_range(start='2016-04-25', periods=28, freq='D') #For validation time period
#idx_eval = pd.date_range(start='2016-05-23', periods=28, freq='D') #For evaluation time period

## Time series level
- The M5 hierarchy has 12 aggregation levels in total, formed by combining the following dimensions: Total - State - Store - Category - Department - Item
- Each row of the original training data refers to an item

In [None]:
def ReadTimeSeries(df, level, n_id_cols=8):
    """
    Return the daily sales time series aggregated at the requested level.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide sales frame whose first `n_id_cols` columns are identifiers and
        whose remaining columns hold one value per day.
    level : str
        Either 'all' or the name of an identifier column to aggregate by, e.g.
        one of [state_id, store_id, cat_store_id, dept_store_id].
    n_id_cols : int, default 8
        Number of leading identifier columns to skip. The default matches the
        layout used in this notebook.

    Returns
    -------
    pandas.DataFrame
        For 'all', the per-row sales columns of `df` unchanged. Otherwise one
        row per distinct value of `df[level]` (in order of first appearance,
        used as the index) holding the column-wise sum of the matching rows.
        Rows whose `level` value is missing are not aggregated.

    Raises
    ------
    KeyError
        If `level` is neither 'all' nor a column of `df`.
    """
    sales = df.iloc[:, n_id_cols:]
    if level == 'all':
        return sales

    # One groupby pass replaces a boolean-mask scan per distinct key.
    # Grouping by a plain array keeps the result index unnamed, and sort=False
    # keeps the keys in order of first appearance.
    keys = df[level].to_numpy()
    return sales.groupby(keys, sort=False).sum()

In [None]:
# pass the second argument one of the following: [state_id, store_id, cat_store_id, dept_store_id, all]
df_train1 = ReadTimeSeries(df_train0, "dept_store_id")

In [None]:
df_train1.head()

### Time series of total items

In [12]:
# Column-wise sum over every row gives total units sold per day, kept as a one-column frame.
df_total = df_train0.iloc[:, 8:].sum().to_frame(name='Sales_Total')

In [13]:
# set datetime as the index
df_total.set_index(idx_train,inplace=True)

In [None]:
#df_total.index
df_total.head()

In [None]:
df_total.plot(kind='line', legend=False)
plt.xlabel("Date");
plt.ylabel("Sold items");

In [None]:
# Additive seasonal decomposition (trend / seasonal / residual) of the last 165 days
# of total sales. No period is passed, so statsmodels must take it from the
# frequency of the DatetimeIndex.
total_tsa = seasonal_decompose(df_total[-165:], model="additive")

In [None]:
total_tsa.plot();

### Time series at the level of each item

In [15]:
# Pick one item/store series uniformly at random. The upper bound is taken from the
# frame itself rather than a hard-coded row count, so it stays valid if the training
# file changes size.
i = random.randrange(len(df_train0))
df_item = pd.DataFrame(df_train0.iloc[i, 8:])
df_item.set_index(idx_train, inplace=True)
# Positional lookup keeps the printed id in sync with the row selected by .iloc above,
# whatever the frame's index labels are.
print("The selected item is: {}".format(df_train0['id'].iloc[i]))

The selected item is: FOODS_3_287_WI_3_validation


In [None]:
df_item.plot(kind='line')
plt.xlabel("Date");
plt.ylabel("Sold items");

In [None]:
# Additive seasonal decomposition of the selected item's last 730 days.
# No period is passed, so statsmodels must take it from the index frequency.
item_tsa = seasonal_decompose(df_item[-730:], model="additive")

In [None]:
item_tsa.plot();

In [20]:
# All series side by side: one row per training day, one column per row of df_train0
df_all = df_train0.iloc[:, 8:].T
df_all.set_index(idx_train, inplace=True)

In [37]:
print(df_all.shape)
df_all.head()

(1913, 30490)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30480,30481,30482,30483,30484,30485,30486,30487,30488,30489
2011-01-29,0,0,0,0,0,0,0,12,2,0,...,0,14,1,0,4,0,0,0,0,0
2011-01-30,0,0,0,0,0,0,0,15,0,0,...,0,11,1,0,4,0,0,6,0,0
2011-01-31,0,0,0,0,0,0,0,0,7,1,...,0,5,1,0,2,2,0,0,0,0
2011-02-01,0,0,0,0,0,0,0,0,3,0,...,0,6,1,0,5,2,0,2,0,0
2011-02-02,0,0,0,0,0,0,0,0,0,0,...,0,5,1,0,2,0,0,2,0,0


In [38]:
print(df_cal0.shape)
df_cal0.head()

(1969, 14)


Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [87]:
# Calendar columns kept as features
cal_features = ["wday", "month", 
                "event_name_1", "event_type_1", "event_name_2","event_type_2",
               "snap_CA", "snap_TX", "snap_WI"]
# .copy() gives df_cal1 its own data: it is encoded in place further down, and
# without the copy those writes would target a slice of df_cal0
# (SettingWithCopyWarning, hidden by the global warnings filter).
df_cal1 = df_cal0[cal_features].copy()
df_cal1.head(10)

Unnamed: 0,wday,month,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,1,1,,,,,0,0,0
1,2,1,,,,,0,0,0
2,3,1,,,,,0,0,0
3,4,2,,,,,1,1,0
4,5,2,,,,,1,0,1
5,6,2,,,,,1,1,1
6,7,2,,,,,1,0,0
7,1,2,,,,,1,1,1
8,2,2,SuperBowl,Sporting,,,1,1,1
9,3,2,,,,,1,1,0


In [88]:
def transform(data):
    """
    Fill missing event fields and label-encode the categorical calendar columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns event_name_1, event_type_1, event_name_2,
        event_type_2, snap_CA, snap_TX and snap_WI.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. The listed columns are overwritten
        in place, so the caller's frame is modified as well.
    """
    # Missing event values get an explicit 'unknown' label before encoding.
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        # Assign the filled column back: fillna(inplace=True) on data[feature] acts
        # on an intermediate object and is not guaranteed to update `data`.
        data[feature] = data[feature].fillna('unknown')

    cat = ['event_name_1','event_type_1','event_name_2','event_type_2','snap_CA','snap_TX','snap_WI']
    for feature in cat:
        # A fresh encoder per column, so integer codes are independent across columns.
        encoder = LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])

    return data

In [91]:
# transform() writes into the frame it receives and returns that same object,
# so df_cal2 and df_cal1 refer to one frame after this line.
df_cal2 = transform(df_cal1)
# Index the encoded features by the calendar's date column (values as read from calendar.csv).
df_cal2.set_index(df_cal0["date"], inplace=True)

In [99]:
df_price0.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


In [103]:
# Attach prices to calendar days through the shared wm_yr_wk week key. Each day is
# paired with every price row of its week, giving one row per (date, store, item)
# that has a price. Because this is a left join, days whose week has no price row
# are kept with NaN in the price columns.
price_feature = df_cal0[['date','wm_yr_wk']].merge(df_price0, on=['wm_yr_wk'], how='left')

In [146]:
# Build the same "<item_id>_<store_id>_validation" key that the sales table uses as its id.
price_feature['id'] = price_feature['item_id']+'_'+price_feature['store_id']+'_validation'
# Wide price table: one row per id, one column per date. The arguments are passed by
# keyword because DataFrame.pivot no longer accepts them positionally in pandas 2.0+.
df = price_feature.pivot(index='id', columns='date', values='sell_price')

In [None]:
df.head()

In [134]:
#df_PriceFeature = df_train0.merge(df,on=['id'],how= 'left')
#df_PriceFeature.set_index(df_PriceFeature["id"], inplace=True);

In [137]:
#df_PriceFeature.head(10)
#df_PriceFeature.shape

### Time series at the level of Department

In [None]:
df_dept = df_train1

In [None]:
df_dept.head()
#df_dept.shape

In [None]:
# Take the department/store series at row position 1 (a fixed choice, not a random one).
# NOTE: this overwrites the df_item created in the item-level section above.
#i = random.randint(0,69)
df_item = pd.DataFrame(df_dept.iloc[1,:].transpose())
df_item.set_index(idx_train, inplace=True)
#print("The selected item is: {}".format(df_train0['id'][i]))

In [None]:
df_item.plot(kind='line')
plt.xlabel("Date");
plt.ylabel("Sold items");

In [None]:
# Additive seasonal decomposition over the full length of the selected series,
# followed by the standard statsmodels component plot.
item_tsa = seasonal_decompose(df_item[:], model="additive")
item_tsa.plot();

### Time series at the level of Category

### Time series at the level of Store

### Time series at the level of State