In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
from statsmodels.tsa.arima_model import ARIMA
import seaborn as sns
import matplotlib.pyplot as plt
import arima
import datetime
%matplotlib inline


In [None]:
# Load the features table; later cells read its Store, Date, MarkDown3,
# Temperature, Fuel_Price, CPI and Unemployment columns.
features = pd.read_csv('raw_data/features.csv')
features.head()

In [None]:
# Preview only the rows where MarkDown3 is populated (not NaN).
features[features.MarkDown3.notna()].head()

In [None]:
# Load the stores table; it is merged onto the training rows by Store later on.
stores = pd.read_csv('raw_data/stores.csv')
stores.head()

In [None]:
# Load the training rows (Store, Dept, Date, Weekly_Sales, IsHoliday are used
# below) and print the column/dtype summary.
train_data = pd.read_csv('raw_data/train.csv')
train_data.info()

Look up the historical sales for Store 1, Dept 1

In [None]:
# Number of leading rows used for fitting; the remaining rows are held out.
TRAIN_WEEKS = 105

store1_dept1 = train_data[(train_data.Store == 1) & (train_data.Dept == 1)]

# Re-index the series by date and fix the dtypes ONCE on the parent frame.
# Building from `.values` yields object columns, so the cast is required;
# doing it before the split means the slices inherit the right dtypes.
dept_data = pd.DataFrame(
    store1_dept1[['Weekly_Sales', 'IsHoliday', 'Date']].values,
    index=pd.DatetimeIndex(store1_dept1['Date']),
    columns=['Weekly_Sales', 'IsHoliday', 'Date'],
).astype({'Weekly_Sales': 'float64', 'IsHoliday': 'int64'})

# Explicit copies: later cells add columns to these frames, which would
# otherwise raise SettingWithCopyWarning on an `iloc` slice.
dept_train = dept_data.iloc[:TRAIN_WEEKS].copy()
dept_test = dept_data.iloc[TRAIN_WEEKS:].copy()
dept_train.info()

In [None]:
# First difference of the sales series, plotted against the date index.
dept_data['Weekly_Sales'].diff().plot()

In [None]:
# ACF (top panel) and PACF (bottom panel) of the undifferenced sales series.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
raw_sales = dept_data.Weekly_Sales.dropna()
fig = sm.graphics.tsa.plot_acf(raw_sales, ax=ax1)
fig = sm.graphics.tsa.plot_pacf(raw_sales, ax=ax2)

In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the undifferenced sales series.
dftest = adfuller(dept_data.Weekly_Sales)

# Collect the headline numbers, then one entry per critical-value level.
adf_summary = dict(zip(
    ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'],
    dftest[0:4],
))
adf_summary.update(
    ('Critical Value (%s)' % level, threshold)
    for level, threshold in dftest[4].items()
)
dfoutput = pd.Series(adf_summary)
dfoutput

In [None]:
# ACF (top panel) and PACF (bottom panel) of the first-differenced sales series.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
sales_diff = dept_data.Weekly_Sales.diff().dropna()
fig = sm.graphics.tsa.plot_acf(sales_diff, ax=ax1)
fig = sm.graphics.tsa.plot_pacf(sales_diff, ax=ax2)

In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the first-differenced sales series.
dftest = adfuller(dept_data.Weekly_Sales.diff().dropna())

# Collect the headline numbers, then one entry per critical-value level.
adf_summary = dict(zip(
    ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'],
    dftest[0:4],
))
adf_summary.update(
    ('Critical Value (%s)' % level, threshold)
    for level, threshold in dftest[4].items()
)
dfoutput = pd.Series(adf_summary)
dfoutput

In [None]:
def score(predict_data,validate_data):
    """Return the holiday-weighted mean absolute error of a forecast.

    Rows whose ``IsHoliday`` value is truthy get weight 5, all other rows
    weight 1. The result is ``sum(|pred - actual| * weight) / sum(weight)``.

    Parameters
    ----------
    predict_data : DataFrame with a ``Weekly_Sales`` column (the forecast).
    validate_data : DataFrame with ``Weekly_Sales`` and ``IsHoliday`` columns
        (the ground truth). It is NOT modified; the weights are kept local
        instead of being written back as a ``Weight`` column.

    Returns
    -------
    The weighted mean absolute error as a scalar.
    """
    # Keep the weights on the validation index so the arithmetic below stays
    # label-aligned, exactly as a column of `validate_data` would be.
    weights = pd.Series(
        np.where(validate_data['IsHoliday'].astype(bool), 5, 1),
        index=validate_data.index,
    )
    errors = predict_data.Weekly_Sales - validate_data.Weekly_Sales
    return np.sum(np.abs(errors * weights)) / np.sum(weights)

In [None]:
import statsmodels.api as sm
import traceback

def select_arima_model(sales, orders, season, factors=None,speedup=False):
    """Fit one SARIMAX model per candidate and keep the best one.

    ``orders`` and ``season`` are paired positionally with ``zip``, so only as
    many candidates as the shorter of the two are tried. A candidate replaces
    the current best only when BOTH its AIC and its BIC are strictly lower.

    Parameters
    ----------
    sales : endogenous series passed to ``SARIMAX``.
    orders : iterable of non-seasonal orders, passed as ``order``.
    season : iterable of seasonal orders, passed as ``seasonal_order``.
    factors : optional exogenous regressors, passed as ``exog``.
    speedup : when true, the model is built with ``simple_differencing=True``,
        ``enforce_stationarity=False`` and ``enforce_invertibility=False``.

    Returns
    -------
    ``(best_result, best_order)`` where ``best_order`` is ``order + seasonal``
    of the winning candidate, or ``(None, None)`` if no candidate could be
    fitted.
    """
    best_r = None
    best_o = None
    # Start from +inf so the first successfully fitted candidate is always
    # accepted, however large its information criteria are.
    min_aic = float('inf')
    min_bic = float('inf')

    # Extra constructor arguments that only apply in speed-up mode.
    model_options = {}
    if speedup:
        model_options = dict(simple_differencing=True,
                             enforce_stationarity=False,
                             enforce_invertibility=False)

    for o, s in zip(orders, season):
        try:
            m = sm.tsa.statespace.SARIMAX(sales, order=o, seasonal_order=s,
                                          exog=factors, **model_options)
            r = m.fit(disp=False)
            if r.aic < min_aic and r.bic < min_bic:
                # Build the combined order first: if `o + s` fails, none of
                # the bookkeeping below is left half-updated.
                combined = o + s
                best_r, best_o = r, combined
                min_aic, min_bic = r.aic, r.bic
                print(best_o,r.aic,r.bic)
        except Exception as e:
            # Best-effort search: report the failure and move on to the next
            # candidate rather than aborting the whole grid.
            print(e)

    return best_r, best_o

def make_orders(range_num, seq_num):
    """Enumerate every length-``seq_num`` sequence over ``range(range_num)``.

    Sequences are returned as lists, in lexicographic order with the last
    position varying fastest, e.g. ``make_orders(2, 2)`` gives
    ``[[0, 0], [0, 1], [1, 0], [1, 1]]``. ``seq_num == 0`` yields ``[[]]``.

    Raises
    ------
    ValueError
        If ``seq_num`` is negative (raised by ``itertools.product``).
    """
    # Local import keeps this cell self-contained.
    import itertools

    return [list(combo)
            for combo in itertools.product(range(range_num), repeat=seq_num)]


In [None]:
class ArimaModel():
    """Thin wrapper that resamples sales to weekly totals, picks a SARIMAX
    model with ``select_arima_model`` and forecasts a held-out period.

    Attributes
    ----------
    m : fitted results object chosen by ``select_arima_model`` (``None``
        until a fit succeeds).
    o : the combined order of the chosen model (``None`` until a fit succeeds).
    """

    def __init__(self):
        self.m = None
        self.o = None

    @staticmethod
    def _weekly_totals(frame):
        """Sum ``frame`` into ``'1W'`` bins and re-anchor the index so its
        first label equals the first timestamp of ``frame``."""
        weekly = frame.resample('1W').sum()
        # fillna returns a new frame; keep the result.
        weekly = weekly.fillna(0)
        weekly.index = weekly.index - (weekly.index[0] - frame.index[0])
        return weekly

    def fit(self, train_data, orders, seasonal_orders, factors=None):
        """Select and store the best model for ``train_data``.

        Parameters
        ----------
        train_data : DataFrame with a DatetimeIndex and a ``Weekly_Sales``
            column (plus the ``factors`` columns, if given).
        orders, seasonal_orders : candidate orders, forwarded to
            ``select_arima_model``.
        factors : optional list of column names used as exogenous regressors.
        """
        train_resample = self._weekly_totals(train_data)
        exog = None if factors is None else train_resample[factors]
        self.m, self.o = select_arima_model(
            train_resample.Weekly_Sales, orders, seasonal_orders, exog)

    def predict(self, test_data,factors=None):
        """Forecast the period covered by ``test_data``.

        Returns the weekly-resampled ``test_data`` with ``Weekly_Sales``
        replaced by the forecast; dates without exactly one forecast value
        are left at 0.

        Raises
        ------
        RuntimeError
            If no fitted model is available.
        """
        if getattr(self, 'm', None) is None:
            raise RuntimeError('ArimaModel.predict called without a fitted model')

        predict_data = self._weekly_totals(test_data)
        # Float placeholder so forecast values can be stored without a dtype
        # change.
        predict_data['Weekly_Sales'] = 0.0

        # Forecast one step per resampled row; this also keeps the horizon in
        # step with the exogenous rows when factors are supplied.
        horizon = len(predict_data)
        if factors is None:
            predicts = self.m.forecast(horizon)
        else:
            predicts = self.m.forecast(horizon, exog=predict_data[factors])

        # Only dates with exactly one forecast value are used.
        predicts = predicts[~predicts.index.duplicated(keep=False)]
        predict_data['Weekly_Sales'] = (
            predicts.reindex(predict_data.index).fillna(0.0))
        return predict_data
    
# Grid-search the non-seasonal orders: every 3-term combination with each term
# in {0, 1, 2}, all paired with a null seasonal component.
arima_m = ArimaModel()
orders = make_orders(3, 3)
arma_orders = [tuple(o[:3]) for o in orders]
seasonal_orders = [(0, 0, 0, 0)] * len(orders)
arima_m.fit(dept_train, arma_orders, seasonal_orders)
dept_predict = arima_m.predict(dept_test)

# Chosen order, its AIC/BIC, and the holiday-weighted error on the hold-out.
arima_m.o, arima_m.m.aic, arima_m.m.bic, score(dept_predict, dept_test)

In [None]:
# arima_f_m = ArimaModel()
# orders=make_orders(3,3)
# arma_orders=[(o[0],o[1],o[2]) for o in orders]
# seasonal_orders=[(0,0,0,0) for o in orders]
# arima_f_m.fit(dept_train,arma_orders,seasonal_orders,['IsHoliday'])
# dept_predict=arima_f_m.predict(dept_test,['IsHoliday'])
# arima_f_m.o, arima_f_m.m.aic, arima_f_m.m.bic, score(dept_predict,dept_test)

In [None]:
# sarima_m = ArimaModel()
# orders=make_orders(3,6)
# arma_orders=[(o[0],o[1],o[2]) for o in orders]
# seasonal_orders=[(o[3],o[4],o[5],52) for o in orders]
# sarima_m.fit(dept_train,arma_orders,seasonal_orders)
# dept_predict=sarima_m.predict(dept_test)
# sarima_m.o, sarima_m.m.aic, sarima_m.m.bic, score(dept_predict,dept_test)

In [None]:
# sarima_f_m = ArimaModel()
# orders=make_orders(3,6)
# arma_orders=[(o[0],o[1],o[2]) for o in orders]
# seasonal_orders=[(o[3],o[4],o[5],52) for o in orders]
# sarima_f_m.fit(dept_train,arma_orders,seasonal_orders,['IsHoliday'])
# dept_predict=sarima_f_m.predict(dept_test, ['IsHoliday'])
# sarima_f_m.o, sarima_f_m.m.aic, sarima_f_m.m.bic, score(dept_predict,dept_test)

In [None]:
# Single hand-picked candidate with a 52-period seasonal AR term.
# Alternatives tried previously: make_orders(5,3) and [(0,0,0)].
sarima_cust_m = ArimaModel()
orders = [(2, 1, 1)]  # best choice by AIC
arma_orders = [tuple(o[:3]) for o in orders]
seasonal_orders = [(1, 0, 0, 52)] * len(orders)
sarima_cust_m.fit(dept_train, arma_orders, seasonal_orders)
dept_predict = sarima_cust_m.predict(dept_test)

# Chosen order, its AIC/BIC, and the holiday-weighted error on the hold-out.
sarima_cust_m.o, sarima_cust_m.m.aic, sarima_cust_m.m.bic, score(dept_predict, dept_test)

In [None]:
# Store 1 covariates, re-indexed by date so they can be joined to the sales.
feature_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
store1_rows = features[features.Store == 1]
store_features = pd.DataFrame(
    store1_rows[feature_cols].values,
    index=pd.DatetimeIndex(store1_rows['Date']),
    columns=feature_cols,
)

# Inner join on the date index, fixing the dtypes once on the parent frame so
# the train/test slices below inherit them.
dept_f_data = pd.concat([dept_data, store_features], axis=1, join='inner').astype(
    {'Weekly_Sales': 'float64', 'IsHoliday': 'int64'})

# Explicit copies: a later cell adds a column to the training frame, which
# would otherwise raise SettingWithCopyWarning on an `iloc` slice.
dept_f_train = dept_f_data.iloc[:105].copy()
dept_f_test = dept_f_data.iloc[105:].copy()
    

In [None]:
# Same hand-picked seasonal candidate as before, now with IsHoliday as an
# exogenous regressor. Alternative tried previously: make_orders(5,3).
sarima_cust_m = ArimaModel()
orders = [(2, 1, 1)]  # best choice by AIC
arma_orders = [tuple(o[:3]) for o in orders]
seasonal_orders = [(1, 0, 0, 52)] * len(orders)
sarima_cust_m.fit(dept_f_train, arma_orders, seasonal_orders, ['IsHoliday'])
dept_predict = sarima_cust_m.predict(dept_f_test, ['IsHoliday'])

# NOTE(review): scored against dept_test rather than dept_f_test; presumably
# both cover the same dates after the inner join — confirm.
sarima_cust_m.o, sarima_cust_m.m.aic, sarima_cust_m.m.bic, score(dept_predict, dept_test)

In [None]:
# Plot log-sales next to the holiday flag. The log column lives only on a
# temporary frame, so the training data used by the modelling cells is not
# mutated (and no SettingWithCopyWarning is raised on the train slice).
dept_f_train.assign(Sales_Log=np.log(dept_f_train.Weekly_Sales))[['Sales_Log','IsHoliday']].plot()

In [None]:
# tmp_1_2=train_data[np.logical_and(train_data.Store==1,train_data.Dept == 2)]
# id_1_2=pd.DataFrame(tmp_1_2['Weekly_Sales'].values,index=pd.DatetimeIndex(tmp_1_2['Date']),columns=['Weekly_Sales'])
# id_1_2.plot(figsize=(20,5))

In [None]:
# from statsmodels.tsa.stattools import adfuller
# dftest = adfuller(id_1_1.Weekly_Sales.diff().dropna(), autolag='AIC')
# dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
# for key,value in dftest[4].items():
#     dfoutput['Critical Value (%s)'%key] = value
# dfoutput

In [None]:
# mod = sm.tsa.statespace.SARIMAX(id_1_1.Weekly_Sales, order=(1,1,1), seasonal_order=(1,1,0,52))
# res = mod.fit()
# res.summary()

In [None]:
# res.aic,res.bic

In [None]:
# r,q,p = sm.tsa.acf(res.resid.values.squeeze(), qstat=True)
# data = np.c_[range(1,41), r[1:], q, p]
# table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
# table.set_index('lag')

Use a linear regression (LR) model

In [None]:
# Attach store attributes (by Store) and the features table (by Store and
# Date) to every training row. Columns present on both sides of the second
# merge get the default _x/_y suffixes.
full_train = (
    train_data
    .merge(stores, on='Store')
    .merge(features, on=['Store', 'Date'])
)
full_train.head()

In [None]:
# Spot-check the merged rows for Store 1, Dept 1.
full_train[(full_train.Store == 1) & (full_train.Dept == 1)].head()

In [None]:
# ISO week number of each row's Date. pd.to_datetime is used for parsing
# because casting to the unit-less np.datetime64 dtype is rejected by
# pandas 2.x.
full_train['Week_In_Year'] = pd.to_datetime(full_train.Date).map(lambda d: d.isocalendar()[1])
full_train.head()

In [None]:
# Collapse the two merge-suffixed holiday columns into one integer flag taken
# from the left-hand (IsHoliday_x) side of the merge.
full_train['IsHoliday'] = full_train['IsHoliday_x'].astype('int')
full_train.drop(columns=['IsHoliday_x', 'IsHoliday_y'], inplace=True)
full_train.head()

In [None]:
# Dept 1 rows across all stores, restricted to the modelling columns.
dept1_columns = [
    'Store', 'Type', 'Weekly_Sales', 'Size', 'Temperature', 'Fuel_Price',
    'CPI', 'Unemployment', 'IsHoliday', 'Week_In_Year',
]
depts1 = full_train.loc[full_train.Dept == 1, dept1_columns]
depts1.head()

In [None]:
import sys
sys.path.append('../')
from pipeline import *
from onehot import *
from util import *

# Feature pipeline with a single step applied to the 'Type' column.
# NOTE(review): DataFramePipeline, FeaturePipeline, Pipeline and
# LabelBinarizerEx come from the wildcard imports above; presumably this
# one-hot encodes 'Type' — confirm against those modules.
type_onehot = Pipeline([('onehot', LabelBinarizerEx(['Type']))])
full_pipeline = DataFramePipeline([
    FeaturePipeline('Type', '', type_onehot),
])

depts1p = full_pipeline.fit_transform(depts1)
depts1p.head()

In [None]:
# test_data = pd.read_csv('raw_data/test.csv')
# test_data.head()

In [None]:
# result.to_csv('result.csv',header=True,index=False)