In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import numpy as np
from tqdm import tqdm
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
%matplotlib inline
test_path = 'https://media.githubusercontent.com/media/cchopade/gahack2/master/test.csv'
train_path = 'https://media.githubusercontent.com/media/cchopade/gahack2/master/train.csv'

In [7]:
df = pd.read_csv(train_path)
df.Datetime = pd.to_datetime(df.Datetime)
df = df.set_index(['Item_ID','Datetime'])
df = df.sort_index(ascending=False)
df.sort_values(['Item_ID','Datetime'],ascending=[False,True],inplace=True)
df.reset_index(inplace=True)
df.drop(labels='ID', axis = 1, inplace=True)
df['year'] = df["Datetime"].apply(lambda x: x.year)
df.head()

Unnamed: 0,Item_ID,Datetime,Category_3,Category_2,Category_1,Price,Number_Of_Sales
0,31375,2015-05-27,0,5.0,235,0.882,133
1,31375,2015-05-28,0,5.0,235,0.787,113
2,31375,2015-05-29,0,5.0,235,0.942,59
3,31375,2015-05-30,0,5.0,235,1.049,56
4,31375,2015-05-31,0,5.0,235,1.381,37


In [19]:
x_train = df[df["year"] != 2016] # train data
x_valid = df[df["year"] == 2016]

items = list(df.Item_ID.unique())

In [20]:
def evaluate_arima_model(X, arima_order):
    # prepare training dataset
    train_size = int(len(X) * 0.8)
    train, test = X[0:train_size], X[train_size:]
    history = [x for x in train]
    # make predictions
    predictions = list()
    for t in range(len(test)):
        model = ARIMA(history, order=arima_order)
        model_fit = model.fit(disp=0)
        yhat = model_fit.forecast()[0]
        predictions.append(yhat)
        history.append(test[t])
    # calculate out of sample error
    error = mean_squared_error(test, predictions)
    return error

In [21]:
def evaluate_models(dataset, p_values, d_values, q_values):
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in tqdm(p_values):
        for d in d_values:
            for q in q_values:
                order = (p,d,q)
                try:
                    mse = evaluate_arima_model(dataset, order)
                    if mse < best_score:
                        best_score, best_cfg = mse, order
                    print('ARIMA%s MSE=%.3f' % (order,mse))
                except:
                    continue
    print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))

In [None]:
p = [1,2,4,6,8,10]
q = [1,2]
d = [1,2]

item1 = df[df.Item_ID == 30258]
item1.set_index(item1.Datetime,inplace=True)
#item1.drop(labels=['index'], axis = 1,inplace=True)

evaluate_models(item1['Price'],p,d,q)

  0%|          | 0/6 [00:00<?, ?it/s]

ARIMA(1, 1, 1) MSE=0.168


  R_mat, T_mat)
  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  (1+np.exp(-params))).copy()
  (1+np.exp(-params))).copy()
 17%|█▋        | 1/6 [02:01<10:07, 121.50s/it]

ARIMA(2, 1, 1) MSE=0.168




ARIMA(4, 1, 1) MSE=0.167




ARIMA(6, 1, 1) MSE=0.167




 67%|██████▋   | 4/6 [22:34<11:17, 338.56s/it]

ARIMA(8, 1, 1) MSE=0.167


 83%|████████▎ | 5/6 [31:14<06:14, 374.94s/it]