In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [2]:
# Load the training set: discard the 'id' column and promote the
# parsed 'date' column to a DatetimeIndex
df = (
    pd.read_csv('data/train.csv')
    .drop(columns='id')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

df

Unnamed: 0_level_0,country,store,product,num_sold
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,Canada,Discount Stickers,Holographic Goose,
2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
2010-01-01,Canada,Discount Stickers,Kerneler,423.0
2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0
...,...,...,...,...
2016-12-31,Singapore,Premium Sticker Mart,Holographic Goose,466.0
2016-12-31,Singapore,Premium Sticker Mart,Kaggle,2907.0
2016-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers,2299.0
2016-12-31,Singapore,Premium Sticker Mart,Kerneler,1242.0


In [57]:
# Remove every row that contains a missing value, reporting the
# row count before and after so the effect of the drop is visible
rows_before = df.shape[0]
print(f'Size df pre-drop = {rows_before}')
df = df.dropna()
rows_after = df.shape[0]
print(f'Size df post-drop = {rows_after}')

Size df pre-drop = 221259
Size df post-drop = 221259


In [5]:
# Check for stationarity with the Augmented Dickey-Fuller test
from statsmodels.tsa.stattools import adfuller

# One group per (country, store, product) combination
grouped = df.groupby(['country', 'store', 'product'])

# Maps each group key to 1 when the ADF p-value is above 0.05 (the unit-root
# null is not rejected, so the series is treated as non-stationary) and to 0
# otherwise. Despite the name, 1 therefore means "NOT stationary"; the value
# is used as the differencing order d when the ARIMA models are fitted.
stationary = {}

# Run the ADF test on each group's sales and record the flag
for group_key, group in grouped:
    adf_p_value = adfuller(group["num_sold"])[1]
    stationary[group_key] = int(adf_p_value > 0.05)

In [None]:
import itertools
from tqdm import tqdm
import warnings

# Silence the warnings statsmodels emits during the grid search.
# NOTE: this filter is process-wide and stays active for all later cells.
warnings.filterwarnings('ignore')

# Fit ARIMA for each country, store & product.
# Only groups with at least one successful fit get an entry, so
# `key in models` always implies a usable fitted model.
models = {}

# Candidate AR (p) and MA (q) orders searched for every series
p = range(0, 3)
q = range(0, 3)


for (country, store, product), group in tqdm(grouped):
    key = (country, store, product)

    # Differencing order chosen by the ADF cell (1 = difference once).
    # Looked up outside the try block so a missing key is not silently hidden.
    d = stationary[key]

    best_aic = float("inf")
    best_model = None

    # Put the series on a regular daily grid; dates absent from the group
    # become NaN observations.
    data = group['num_sold'].asfreq('D')

    for ar_order, ma_order in itertools.product(p, q):
        try:
            model = sm.tsa.ARIMA(data, order=(ar_order, d, ma_order)).fit()
        except Exception:
            # A failed fit just disqualifies this (p, d, q) candidate.
            # `Exception` (not a bare except) keeps the loop interruptible
            # with Ctrl-C / "Interrupt Kernel".
            continue

        # Keep the candidate with the lowest AIC
        if model.aic < best_aic:
            best_aic = model.aic
            best_model = model

    # Skip groups where every candidate failed instead of storing None,
    # which would crash later on `models[key].predict(...)`.
    if best_model is not None:
        models[key] = best_model
    

In [49]:

# Read the test set and index it by the parsed 'date' column
test = (
    pd.read_csv('data/test.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)
test




Unnamed: 0_level_0,id,country,store,product
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-01,230130,Canada,Discount Stickers,Holographic Goose
2017-01-01,230131,Canada,Discount Stickers,Kaggle
2017-01-01,230132,Canada,Discount Stickers,Kaggle Tiers
2017-01-01,230133,Canada,Discount Stickers,Kerneler
2017-01-01,230134,Canada,Discount Stickers,Kerneler Dark Mode
...,...,...,...,...
2019-12-31,328675,Singapore,Premium Sticker Mart,Holographic Goose
2019-12-31,328676,Singapore,Premium Sticker Mart,Kaggle
2019-12-31,328677,Singapore,Premium Sticker Mart,Kaggle Tiers
2019-12-31,328678,Singapore,Premium Sticker Mart,Kerneler


In [50]:
# Generate predictions
groups_test = test.groupby(['country','store', 'product'])
forecasts = {}

# Pass 1: forecast every test series that has a fitted model, over the
# date range that series covers in the test set.
for (country, store, product), group in groups_test:
    locator = (country, store, product)

    # The key may be missing, or may map to None when no ARIMA fit succeeded;
    # both cases are handled by the fallback pass below.
    model = models.get(locator)
    if model is None:
        continue

    min_date, max_date = group.index.min(), group.index.max()
    forecasts[locator] = model.predict(start=min_date, end=max_date)

# Pass 2: if no model was built, take the average of the model-based
# predictions for that same country/store on the same day.
model_forecasts = dict(forecasts)  # snapshot: only average real model output

for (country, store, product), group in groups_test:
    locator = (country, store, product)
    if locator in forecasts:
        continue

    store_predictions = [
        forecast
        for (f_country, f_store, _), forecast in model_forecasts.items()
        if f_country == country and f_store == store
    ]

    if store_predictions:
        # Align the sibling forecasts on their dates and average row-wise
        # (missing sibling values are ignored), then restrict to the dates
        # this group needs; dates no sibling covers come out as NaN.
        fallback = (
            pd.concat(store_predictions, axis=1)
            .mean(axis=1)
            .reindex(group.index)
        )
    else:
        # No sibling product in this store has a model: nothing to average
        fallback = pd.Series(np.nan, index=group.index)

    forecasts[locator] = fallback

In [53]:
# Create predictions data frame: one forecast value per test row, in test order
predictions = []

# Iterate the needed columns in lockstep rather than calling test.iloc[i]
# per row, which builds a new Series for every one of the test rows.
row_keys = zip(test.index, test['country'], test['store'], test['product'])

for date, country, store, product in row_keys:
    series = forecasts.get((country, store, product))
    try:
        # NaN marks rows with no forecast series or no value for this date
        prediction = np.nan if series is None else series.loc[date]
    except KeyError:
        prediction = np.nan
    predictions.append(prediction)

result = pd.DataFrame({'id': test['id'], 'num_sold' : predictions})
print(result)

                id     num_sold
date                           
2017-01-01  230130   508.467796
2017-01-01  230131   722.042230
2017-01-01  230132   557.852058
2017-01-01  230133   353.914264
2017-01-01  230134   400.062632
...            ...          ...
2019-12-31  328675   354.762301
2019-12-31  328676  2188.972691
2019-12-31  328677  1682.215402
2019-12-31  328678  1046.355072
2019-12-31  328679  1232.240782

[98550 rows x 2 columns]


In [56]:
# Write the id / num_sold table to disk, leaving out the date index
output_path = 'data/results.csv'
result.to_csv(output_path, index=False)