### Download Data 

In [None]:
# Download the M5 Forecasting Accuracy archive (via a shortened URL) and unpack
# it into the current working directory.
# NOTE(review): the tinyurl hides the real source of the data; confirm it still resolves.
!wget -O m5-forecasting-accuracy.zip https://tinyurl.com/ybqbco7j
!unzip m5-forecasting-accuracy.zip

### Import Necessary Packages

In [None]:
import gc
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from sklearn import preprocessing, metrics
# Let DataFrame displays show up to 500 columns and 500 rows before truncating.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)


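### Define Utility Functions
* `reduce_mem_usage`: downcast numeric columns to reduce memory usage
* `read_data`: read the calendar, sell-price and sales CSV files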

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    The frame is modified in place and also returned, so both
    ``df = reduce_mem_usage(df)`` and a bare ``reduce_mem_usage(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64 and float16/float32/float64 columns are
        candidates for downcasting. Columns of any other dtype are left as is.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    Float columns may be narrowed to float16, which keeps only about three
    significant decimal digits, so values are rounded accordingly.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate targets, narrowest first; the first one whose range fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose max is exactly 127 still fits int8.
            # An empty column has NaN min/max, fails every comparison and is left alone.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Outside the float32 range, or min/max is NaN/inf.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
def read_data(data_dir='.'):
    """Load the three M5 CSV files and report their shapes.

    Parameters
    ----------
    data_dir : str, default '.'
        Directory containing ``calendar.csv``, ``sell_prices.csv`` and
        ``sales_train_validation.csv``. The default keeps the original
        behaviour of reading from the current working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(calendar, sell_prices, item_demands)``. The first two are passed
        through ``reduce_mem_usage``; the sales table is returned with the
        dtypes it was loaded with.
    """
    print('Reading files...')
    calendar = pd.read_csv(os.path.join(data_dir, 'calendar.csv'))
    calendar = reduce_mem_usage(calendar)
    print('Calendar has {} rows and {} columns'.format(calendar.shape[0], calendar.shape[1]))
    sell_prices = pd.read_csv(os.path.join(data_dir, 'sell_prices.csv'))
    sell_prices = reduce_mem_usage(sell_prices)
    print('Sell prices has {} rows and {} columns'.format(sell_prices.shape[0], sell_prices.shape[1]))
    item_demands = pd.read_csv(os.path.join(data_dir, 'sales_train_validation.csv'))
    print('Sales train validation has {} rows and {} columns'.format(item_demands.shape[0], item_demands.shape[1]))
    return calendar, sell_prices, item_demands

### Show 3 Tables 
* calendar 
* sell prices 
* item demands per day 

In [None]:
# Load the three raw tables; `read_data` prints each table's shape as it goes.
calendar, sell_prices, item_demands  = read_data()

In [None]:
# Preview the first rows of the calendar table.
calendar.head()

In [None]:
# Preview the first rows of the sell-price table.
sell_prices.head()

In [None]:
# Preview the first rows of the (still wide) sales table.
item_demands.head()

In [None]:
# Keep only the first 300 distinct FOODS item ids (in order of appearance) and
# restrict both the sales and the price tables to them.
# NOTE(review): presumably done to keep the melted table small enough for memory.
foodproduct = item_demands.loc[item_demands['cat_id'].eq('FOODS')]
target_product = foodproduct['item_id'].unique()[:300]
item_demands = item_demands.loc[item_demands['item_id'].isin(target_product)]
sell_prices = sell_prices.loc[sell_prices['item_id'].isin(target_product)]

In [None]:
def melt_item_demands(item_demands):
    """Reshape the wide sales table to long format.

    Parameters
    ----------
    item_demands : pandas.DataFrame
        Wide table holding the six identifier columns listed in ``id_vars``
        plus one column per day.

    Returns
    -------
    pandas.DataFrame
        One row per (identifier combination, day column) with the former
        column name in ``day`` and its value in ``demand``, passed through
        ``reduce_mem_usage``.
    """
    id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    # Select day columns by name rather than by position so the result does not
    # depend on the identifier columns being the first six.
    value_vars = [col for col in item_demands.columns if col not in id_vars]
    item_demands = pd.melt(item_demands, id_vars=id_vars, value_vars=value_vars, var_name='day', value_name='demand')
    return reduce_mem_usage(item_demands)

In [None]:
# Reshape the filtered sales table to long format (one row per id and day
# column) and preview the result.
item_demands_melt = melt_item_demands(item_demands)
item_demands_melt.head()

In [None]:
# Attach the calendar attributes to every demand row by matching the melted
# `day` label against the calendar's `d` column, then drop both join keys.
data = (
    item_demands_melt
    .merge(calendar, how='left', left_on='day', right_on='d')
    .drop(columns=['d', 'day'])
)
data.head()

In [None]:
# Attach the weekly sell price for each (store, item, week) combination.
# NOTE: this cell overwrites `data`; running it twice merges the prices again
# and yields suffixed duplicate price columns.
data = pd.merge(data, sell_prices, how='left', on=['store_id', 'item_id', 'wm_yr_wk'])
data.head()

In [None]:
def transform(data):
    """Fill missing event labels and label-encode the categorical columns.

    The frame is modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the identifier columns (``item_id``, ``dept_id``,
        ``cat_id``, ``store_id``, ``state_id``) and the four event columns.

    Returns
    -------
    pandas.DataFrame
        The same object, with missing event values replaced by ``'unknown'``
        and every listed categorical column replaced by integer codes.

    Notes
    -----
    Columns that are already numeric are skipped, so calling the function a
    second time on the same frame leaves the codes unchanged.
    """
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        # Assign the result back: a chained `data[feature].fillna(..., inplace=True)`
        # acts on a temporary under pandas Copy-on-Write and changes nothing.
        data[feature] = data[feature].fillna('unknown')

    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in cat:
        if pd.api.types.is_numeric_dtype(data[feature]):
            # Already encoded by an earlier call. Encoding again would sort the
            # integer codes as strings ('10' < '2') and silently remap them.
            continue
        encoder = preprocessing.LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature].astype("str"))

    return data



In [None]:
# `transform` encodes the columns of `data` in place and returns the same
# object, so `transformed` and `data` refer to one frame from here on.
transformed = transform(data)
transformed.head()

In [None]:
# Pairwise Pearson correlation of the numeric columns. Non-numeric columns
# (such as `id`) are excluded explicitly, because `DataFrame.corr()` raises on
# them in pandas >= 2.0 instead of silently dropping them.
data.select_dtypes(include=['number', 'bool']).corr()

In [None]:
import seaborn as sns
import math 

# Heatmap of log(1 + correlation) between the numeric columns.
# Non-numeric columns are excluded explicitly (pandas >= 2.0 raises on them),
# and undefined correlations (e.g. constant columns) are shown as 0.
# A correlation of exactly -1 maps to -inf on this scale.
corr_matrix = data.select_dtypes(include=['number', 'bool']).corr().fillna(0)
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(np.log1p(corr_matrix), ax=ax)
ax.set_title('log(1 + Pearson correlation) between numeric columns');


In [None]:
import plotly.express as px
import plotly.graph_objects as go
# One demand-over-time figure per selected (label-encoded) item id, limited to
# dates before 2011-09-01.
state_color = {1:'red', 100:'blue'}
for item_id in [1, 100]:
    in_scope = (data['item_id'] == item_id) & (data['date'] < '2011-09-01')
    # Sum only `demand` per date; summing the whole frame would also aggregate
    # every other column, including any string columns.
    to_plot = data.loc[in_scope, ['date', 'demand']]
    to_plot = to_plot.groupby('date', as_index=False)['demand'].sum()
    fig_c = go.Figure()
    fig_c.add_trace(go.Scatter(x=to_plot['date'], y=to_plot['demand'], line_color=state_color[item_id]))
    fig_c.update_layout(
        title='Total demand per date for item_id={}'.format(item_id),
        xaxis_title='date',
        yaxis_title='demand',
    )
    fig_c.show()

In [None]:
# Model input columns, in the order they are written to the training files.
features = [
    # identifiers (label-encoded)
    'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
    # calendar parts derived from `date`
    'year', 'month', 'week', 'day', 'dayofweek',
    # events
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    # SNAP flags per state
    'snap_CA', 'snap_TX', 'snap_WI',
    # price level
    'sell_price',
    # demand lags
    'lag_t28', 'lag_t29', 'lag_t30',
    # rolling demand statistics
    'rolling_mean_t7', 'rolling_std_t7', 'rolling_mean_t30',
    'rolling_mean_t90', 'rolling_mean_t180', 'rolling_std_t30',
    # price dynamics
    'price_change_t1', 'price_change_t365',
    'rolling_price_std_t7', 'rolling_price_std_t30',
    # higher moments of rolling demand
    'rolling_skew_t30', 'rolling_kurt_t30',
]

In [None]:
def aggregate_temporal_features(data):
    """Add lagged-demand, price-change and calendar features to ``data``.

    The frame is modified in place and also returned. All group-wise features
    are computed per ``id``, in the row order the frame already has.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the columns ``id``, ``demand``, ``sell_price``
        and ``date`` (anything ``pandas.to_datetime`` accepts).

    Returns
    -------
    pandas.DataFrame
        The same object with these columns added:

        * ``lag_t28/29/30`` - demand shifted by 28, 29 and 30 rows;
        * ``rolling_{mean,std}_t7``, ``rolling_{mean,std,skew,kurt}_t30``,
          ``rolling_mean_t90``, ``rolling_mean_t180`` - rolling statistics of
          the demand shifted by 28 rows;
        * ``price_change_t1`` - relative change versus the previous row's price;
        * ``price_change_t365`` - relative change versus the maximum price of
          the preceding 365 rows;
        * ``rolling_price_std_t7/t30`` - rolling standard deviation of the price;
        * ``year``, ``month``, ``week`` (ISO week), ``day``, ``dayofweek``.

        ``date`` is converted to datetime.
    """
    # Build each groupby once instead of once per feature.
    demand_by_id = data.groupby(['id'])['demand']
    price_by_id = data.groupby(['id'])['sell_price']

    # lagged demand
    for lag in (28, 29, 30):
        data['lag_t{}'.format(lag)] = demand_by_id.shift(lag)

    # rolling demand statistics, all computed on demand shifted by 28 rows
    # NOTE(review): the 28-row shift presumably matches the forecast horizon - confirm.
    rolling_specs = [
        ('rolling_mean_t7', 7, 'mean'),
        ('rolling_std_t7', 7, 'std'),
        ('rolling_mean_t30', 30, 'mean'),
        ('rolling_mean_t90', 90, 'mean'),
        ('rolling_mean_t180', 180, 'mean'),
        ('rolling_std_t30', 30, 'std'),
        ('rolling_skew_t30', 30, 'skew'),
        ('rolling_kurt_t30', 30, 'kurt'),
    ]
    for name, window, stat in rolling_specs:
        # Bind window/stat as defaults so each lambda keeps its own values.
        data[name] = demand_by_id.transform(
            lambda x, window=window, stat=stat: getattr(x.shift(28).rolling(window), stat)()
        )

    # price features (intermediate series are kept local rather than added to
    # the frame and dropped afterwards)
    lag_price = price_by_id.shift(1)
    data['price_change_t1'] = (lag_price - data['sell_price']) / lag_price
    rolling_price_max = price_by_id.transform(lambda x: x.shift(1).rolling(365).max())
    data['price_change_t365'] = (rolling_price_max - data['sell_price']) / rolling_price_max
    data['rolling_price_std_t7'] = price_by_id.transform(lambda x: x.rolling(7).std())
    data['rolling_price_std_t30'] = price_by_id.transform(lambda x: x.rolling(30).std())

    # time features
    data['date'] = pd.to_datetime(data['date'])
    data['year'] = data['date'].dt.year
    data['month'] = data['date'].dt.month
    # `Series.dt.week` was removed in pandas 2.0; `isocalendar().week` is the
    # replacement. Cast away its nullable UInt32 dtype so the column is a plain
    # integer (or float with NaN when dates are missing), as `.dt.week` returned.
    iso_week = data['date'].dt.isocalendar().week
    data['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    data['day'] = data['date'].dt.day
    data['dayofweek'] = data['date'].dt.dayofweek

    return data

In [None]:
def transform_train_and_eval(data, train_end='2016-03-27', val_end='2016-04-24'):
    """Run the feature pipeline and split the result into train and validation sets.

    Parameters
    ----------
    data : pandas.DataFrame
        Merged demand/calendar/price frame; it is passed through ``transform``,
        ``aggregate_temporal_features`` and ``reduce_mem_usage`` in that order.
    train_end : str, default '2016-03-27'
        Last date (inclusive) of the training period.
    val_end : str, default '2016-04-24'
        Last date (inclusive) of the validation period, which starts the day
        after ``train_end``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x_train, x_val)``, each holding ``demand`` followed by the columns
        named in the module-level ``features`` list.
    """
    data = transform(data)
    data = aggregate_temporal_features(data)
    # reduce memory for new features so we can train
    data = reduce_mem_usage(data)

    columns = ['demand'] + features
    in_train = data['date'] <= train_end
    in_val = (data['date'] > train_end) & (data['date'] <= val_end)
    x_train = data.loc[in_train, columns]
    x_val = data.loc[in_val, columns]

    return x_train, x_val


train, validation = transform_train_and_eval(data)

In [None]:
# Column dtypes, non-null counts and memory footprint of the training frame.
train.info()

In [None]:
# Last rows of the training frame.
train.tail()

In [None]:
import sagemaker
import boto3
import os 

local_train_file = "train.csv"
local_test_file = "test.csv"
local_autopilot_file = "autopilot_train.csv"
# The Autopilot copy keeps the header row; the train/validation copies are
# written without header or index.
# NOTE(review): header-less CSV is presumably what the downstream SageMaker
# training job expects - confirm.
train.to_csv(local_autopilot_file, index=False)
train.to_csv(local_train_file, header=False, index=False)
validation.to_csv(local_test_file, header=False, index=False)

bucket = sagemaker.Session().default_bucket()
prefix = 'demand-prediction'

# Create the bucket handle once and reuse it for all three uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
uploads = [
    ('autopilot/input', local_autopilot_file),
    ('train', local_train_file),
    ('validation', local_test_file),
]
for key_dir, local_file in uploads:
    # S3 keys always use '/', so join explicitly rather than with os.path.join,
    # which would insert backslashes on Windows.
    s3_bucket.Object('/'.join([prefix, key_dir, local_file])).upload_file(local_file)


In [None]:
!pip install xgboost 

In [None]:
import xgboost
import shap

# Split the training frame into the feature matrix and the regression target.
X = train.drop(columns=['demand'])
y = train['demand']


In [None]:
# Fit an XGBoost regressor with the library's default hyperparameters on the
# full training frame.
model = xgboost.XGBRegressor().fit(X, y)

In [None]:
# Build a SHAP explainer for the fitted model and compute SHAP values for
# every row of X.
# NOTE(review): explaining the whole training matrix may be slow; a sample of
# rows might be enough - confirm.
explainer = shap.Explainer(model)
shap_values = explainer(X)

In [None]:
# Waterfall plot of the per-feature SHAP contributions for one row.
# NOTE(review): the position 5654999 is hard-coded and assumes X has at least
# that many rows - confirm against the current training set size.
shap.plots.waterfall(shap_values[5654999])

In [None]:

# Initialise SHAP's JavaScript support, then draw the interactive force plot
# for the same hard-coded row used in the waterfall plot.
shap.initjs()

shap.plots.force(shap_values[5654999])
