In [34]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
from statsmodels.tsa.arima_model import ARIMA
import seaborn as sns
import matplotlib.pyplot as plt
import arima
import datetime
%matplotlib inline
from sklearn.decomposition import PCA
import stl

import sys
sys.path.append('../')
from pipeline import *
from onehot import *
from util import *

# Load the raw train/test CSVs. The paths are relative, so they resolve against
# the kernel's current working directory.
# The cells below read the Dept, Date, Store and Weekly_Sales columns of
# train_data; test_data is not referenced in the cells visible here.
train_data = pd.read_csv('raw_data/train.csv')
test_data = pd.read_csv('raw_data/test.csv')

In [53]:
def select_arima_model(sales, orders, factors=None,speedup=False):
    """Fit one SARIMAX model per candidate order and keep the lowest-BIC fit.

    Parameters
    ----------
    sales : array-like
        Endogenous series, passed unchanged to ``SARIMAX``.
    orders : iterable
        Candidate ``(p, d, q)`` orders, tried in iteration order.
    factors : array-like, optional
        Exogenous regressors, forwarded as ``exog``.
    speedup : bool, optional
        When true, the model is built with ``simple_differencing=True`` and
        with stationarity / invertibility enforcement switched off.

    Returns
    -------
    tuple
        ``(best_result, best_order)``. Both are ``None`` when ``orders`` is
        empty or when no candidate produced a usable BIC. On ties the first
        order encountered wins.

    Notes
    -----
    Every candidate prints either ``order bic`` or ``order error`` so the
    search can be followed in the cell output.
    """
    best_r = None
    best_o = None
    # Start from +inf instead of a large finite sentinel: with a finite
    # sentinel any fit whose BIC exceeds it is silently discarded and the
    # function reports "no model" even though models were fitted.
    min_bic = float('inf')

    # Extra constructor options only used in the fast path.
    extra_kwargs = {}
    if speedup:
        extra_kwargs = dict(simple_differencing=True,
                            enforce_stationarity=False,
                            enforce_invertibility=False)

    for o in orders:
        try:
            m = sm.tsa.statespace.SARIMAX(sales, order=o, exog=factors, **extra_kwargs)
            r = m.fit(disp=False)
            # A NaN BIC compares False here and is therefore never selected.
            if r.bic < min_bic:
                best_r = r
                best_o = o
                min_bic = r.bic
            print(o,r.bic)

        except Exception as e:
            # Deliberate best-effort: some orders cannot be built or fitted
            # (e.g. p == q == 0 yields no states); report and keep searching.
            print('%s %s'% (o,e))

    return best_r,best_o

def make_orders(range_num, seq_num):
    """Enumerate every length-``seq_num`` list over ``range(range_num)``.

    The result is ordered with the last position varying fastest, e.g.
    ``make_orders(2, 2) == [[0, 0], [0, 1], [1, 0], [1, 1]]``.
    ``seq_num == 0`` yields a single empty list.
    """
    if seq_num != 0:
        # Extend each shorter prefix by one more digit.
        prefixes = make_orders(range_num, seq_num - 1)
        return [prefix + [digit]
                for prefix in prefixes
                for digit in range(range_num)]
    return [[]]
    
depts=train_data.Dept.unique()
# The candidate (p, d, q) grid is identical for every series, so build it once.
orders = make_orders(3,3)
# iterate every dept
for dept in depts:
    print('dept %d' % dept)
    # Reshape this dept into a Date x Store matrix of weekly sales; store/date
    # combinations with no row are filled with 0.
    dept_data = train_data[train_data.Dept == dept][['Date','Store','Weekly_Sales']]
    dept_stores = pd.pivot_table(dept_data, values='Weekly_Sales', index='Date', columns='Store', fill_value=0.)

    # PCA round trip across stores to filter noise, keeping up to 12 components.
    idxs=dept_stores.index
    cols=dept_stores.columns
    # PCA rejects n_components larger than min(n_dates, n_stores), so clamp it
    # for departments that are sold in fewer than 12 stores.
    pca = PCA(n_components=min(12, *dept_stores.shape))
    dept_pca = pca.fit_transform(dept_stores)
    dept_restore = pca.inverse_transform(dept_pca)
    dept_stores = pd.DataFrame(dept_restore,index=pd.DatetimeIndex(idxs),columns=cols)

    # iterate every store
    for store in dept_stores.columns:
        store_data = dept_stores.loc[:,store]
        # stl the ts
        # NOTE(review): `de` is not used below; the ARIMA search runs on the
        # denoised series itself, not on a trend component. Confirm intent.
        de = stl.stl_decompose(store_data)
        # select arima order by BIC
        r,o=select_arima_model(store_data.values,orders,speedup=True)
        if r is None:
            # Every candidate order failed to fit; there is no BIC to report.
            print('no arima model could be fitted for dept %s store %s' % (dept, store))
        else:
            print('best %s %s' % (o, r.bic))
        # TODO: forecast until test end date
        # TODO: add last season comp to forecasts
        # Work in progress: stop after the first store of the first dept.
        break
    break


dept 1
[0, 0, 0] Number of states in statespace model must be a positive number.
[0, 0, 1] 3154.70115585
[0, 0, 2] 3119.75162681
[0, 1, 0] Number of states in statespace model must be a positive number.
[0, 1, 1] 2963.5258377
[0, 1, 2] 2912.25018302
[0, 2, 0] Number of states in statespace model must be a positive number.
[0, 2, 1] 2960.47553358
[0, 2, 2] 2937.30601857
[1, 0, 0] 3004.58413137
[1, 0, 1] 2982.83576536
[1, 0, 2] 2928.89450564
[1, 1, 0] 2984.38212874
[1, 1, 1] 2958.31172166
[1, 1, 2] 2917.60031697
[1, 2, 0] 3045.1416999
[1, 2, 1] 2961.44323238
[1, 2, 2] 2928.3903694
[2, 0, 0] 2982.97695981
[2, 0, 1] 3173.66229006
[2, 0, 2] 2933.43364036
[2, 1, 0] 2950.93353122
[2, 1, 1] 2941.50659312
[2, 1, 2] 2919.85535492
[2, 2, 0] 2997.9387122
[2, 2, 1] 2946.13155026
[2, 2, 2] 2921.75173224
best %s %s [0, 1, 2] 2912.25018302


In [44]:
# Scratch lookup: print the signature and docstring of statsmodels' LOWESS smoother.
help(sm.nonparametric.lowess)

Help on function lowess in module statsmodels.nonparametric.smoothers_lowess:

lowess(endog, exog, frac=0.6666666666666666, it=3, delta=0.0, is_sorted=False, missing='drop', return_sorted=True)
    LOWESS (Locally Weighted Scatterplot Smoothing)
    
    A lowess function that outs smoothed estimates of endog
    at the given exog values from points (exog, endog)
    
    Parameters
    ----------
    endog: 1-D numpy array
        The y-values of the observed points
    exog: 1-D numpy array
        The x-values of the observed points
    frac: float
        Between 0 and 1. The fraction of the data used
        when estimating each y-value.
    it: int
        The number of residual-based reweightings
        to perform.
    delta: float
        Distance within which to use linear-interpolation
        instead of weighted regression.
    is_sorted : bool
        If False (default), then the data will be sorted by exog before
        calculating lowess. If True, then it is assumed tha