# Purpose
- The purpose of this notebook is to:
    - Perform EDA to see whether the time series show trend, seasonal, and/or cyclical patterns
    - The findings will inform how we engineer features

In [1]:
# autoreload
%load_ext autoreload
%autoreload 2

# change current working directory to the root of the project
import os
os.chdir(os.path.dirname(os.getcwd()))
import pandas as pd
import numpy as np

from IPython.display import display
from src import plot

# Import Pre-processed Data

In [2]:
# Read every pre-processed CSV into its own DataFrame.
# The paths are relative to the current working directory; the order of the
# paths matches the order of the names on the left-hand side.
df_train, df_test, df_holidays, df_stores, df_oil, df_transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/preprocessed/train_preprocessed.csv',
        'data/preprocessed/test_preprocessed.csv',
        'data/preprocessed/holidays_preprocessed.csv',
        'data/preprocessed/stores_preprocessed.csv',
        'data/preprocessed/oil_preprocessed.csv',
        'data/preprocessed/transactions_preprocessed.csv',
    )
)

In [3]:
# combine store and family columns to create unique IDs
df_train_preprocessed = df_train.copy()
df_train_preprocessed['store_family'] = df_train['store_nbr'].astype(str) + '_' + df_train['family'].astype(str)
df_train_preprocessed.drop(['family', 'store_nbr'], axis=1, inplace=True)
df_train_preprocessed

Unnamed: 0,date,sales,onpromotion,store_family
0,2013-01-01,0.0,0,1_AUTOMOTIVE
1,2013-01-02,2.0,0,1_AUTOMOTIVE
2,2013-01-03,3.0,0,1_AUTOMOTIVE
3,2013-01-04,3.0,0,1_AUTOMOTIVE
4,2013-01-05,5.0,0,1_AUTOMOTIVE
...,...,...,...,...
3008011,2017-08-11,0.0,0,54_SEAFOOD
3008012,2017-08-12,1.0,1,54_SEAFOOD
3008013,2017-08-13,2.0,0,54_SEAFOOD
3008014,2017-08-14,0.0,0,54_SEAFOOD


In [4]:
# combine store and family columns to create unique IDs
df_test_preprocessed = df_test.copy()
df_test_preprocessed['store_family'] = df_test['store_nbr'].astype(str) + '_' + df_test['family'].astype(str)
df_test_preprocessed.drop(['family', 'store_nbr', 'id'], axis=1, inplace=True)
df_test_preprocessed

Unnamed: 0,date,onpromotion,store_family
0,2017-08-16,0,1_AUTOMOTIVE
1,2017-08-16,0,1_BABY CARE
2,2017-08-16,2,1_BEAUTY
3,2017-08-16,20,1_BEVERAGES
4,2017-08-16,0,1_BOOKS
...,...,...,...
28507,2017-08-31,1,9_POULTRY
28508,2017-08-31,0,9_PREPARED FOODS
28509,2017-08-31,1,9_PRODUCE
28510,2017-08-31,9,9_SCHOOL AND OFFICE SUPPLIES


# EDA

In [5]:
# unique IDs values
# Number of distinct series IDs in the train frame, then in the test frame.
# dropna=False keeps the count equal to len(<column>.unique()).
for frame in (df_train_preprocessed, df_test_preprocessed):
    display(frame['store_family'].nunique(dropna=False))


1782

1782

- Plot time series for one store and one family

In [6]:
# Show the distinct product families, then the distinct store numbers,
# from the training data.
for column_name in ('family', 'store_nbr'):
    display(df_train[column_name].unique())

array(['AUTOMOTIVE', 'BABY CARE', 'BEAUTY', 'BEVERAGES', 'BOOKS',
       'BREAD/BAKERY', 'CELEBRATION', 'CLEANING', 'DAIRY', 'DELI', 'EGGS',
       'FROZEN FOODS', 'GROCERY I', 'GROCERY II', 'HARDWARE',
       'HOME AND KITCHEN I', 'HOME AND KITCHEN II', 'HOME APPLIANCES',
       'HOME CARE', 'LADIESWEAR', 'LAWN AND GARDEN', 'LINGERIE',
       'LIQUOR,WINE,BEER', 'MAGAZINES', 'MEATS', 'PERSONAL CARE',
       'PET SUPPLIES', 'PLAYERS AND ELECTRONICS', 'POULTRY',
       'PREPARED FOODS', 'PRODUCE', 'SCHOOL AND OFFICE SUPPLIES',
       'SEAFOOD'], dtype=object)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54])

In [7]:
# Plot a single series, selected by its '<store_nbr>_<family>' ID
# (here store 1, AUTOMOTIVE). The drawing itself is done by the project's
# src.plot helper, which is not shown in this notebook.
plot.plot_one_sample(df_train_preprocessed, '1_AUTOMOTIVE')

# Trend & Seasonality

- Trend

In [17]:
# Trend view for the store 10 / POULTRY series (note: a different series from
# the '1_AUTOMOTIVE' one used in the other plotting cells).
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; re-run with Restart Kernel -> Run All before sharing.
plot.plot_trend(df_train_preprocessed, '10_POULTRY')

- Seasonality

In [9]:
# Seasonality view of the '1_AUTOMOTIVE' series at the 'monthly' setting.
# NOTE(review): the meaning of the last argument (5) is defined in
# src.plot.plot_seasonality, which is not shown here - TODO confirm and name it.
# NOTE(review): this call emits a pandas deprecation warning about
# Series.dt.weekofyear / Series.dt.week; the helper should switch to
# Series.dt.isocalendar().week.
plot.plot_seasonality(df_train_preprocessed, '1_AUTOMOTIVE', 'monthly', 5)



Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead.



In [10]:
# Seasonality view of the '1_AUTOMOTIVE' series at the 'weekly' setting.
# NOTE(review): the meaning of the last argument (25) is defined in
# src.plot.plot_seasonality, which is not shown here - TODO confirm and name it.
# NOTE(review): this call emits a pandas deprecation warning about
# Series.dt.weekofyear / Series.dt.week; the helper should switch to
# Series.dt.isocalendar().week.
plot.plot_seasonality(df_train_preprocessed, '1_AUTOMOTIVE', 'weekly', 25)


Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead.

