In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt        
import seaborn as sns
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
import copy

In [3]:
#Get the dataset
store_sales = pd.read_csv(
    '/kaggle/input/store-sales-time-series-forecasting/train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)

#Change datetime period
store_sales['date'] = store_sales.date.dt.to_period('D')
store_sales = store_sales.set_index(['store_nbr', 'family', 'date']).sort_index()

#Get target dataFrame having a new level of column labels whose inner-most level
#consists of the pivoted index labels.
y = store_sales.unstack(['store_nbr', 'family']).loc["2017"]

fourier = CalendarFourier(freq='M', order=3)

'''Deterministic Process generates supports constants, time trends, and either seasonal dummies or 
fourier terms for a single cycle.'''
dp = DeterministicProcess(
    index=y.index,
    constant=False,
    order=5,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)

X = dp.in_sample()
X['NewYear'] = (X.index.dayofyear == 1)

In [6]:
#independent Variables (with Deterministic Process variables)
X.head()

Unnamed: 0_level_0,trend,trend_squared,trend_cubed,trend**4,trend**5,"s(1,7)","s(2,7)","s(3,7)","s(4,7)","s(5,7)","s(6,7)","s(7,7)","sin(1,freq=M)","cos(1,freq=M)","sin(2,freq=M)","cos(2,freq=M)","sin(3,freq=M)","cos(3,freq=M)",NewYear
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-01-01,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,True
2017-01-02,2.0,4.0,8.0,16.0,32.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.201299,0.97953,0.394356,0.918958,0.571268,0.820763,False
2017-01-03,3.0,9.0,27.0,81.0,243.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.394356,0.918958,0.724793,0.688967,0.937752,0.347305,False
2017-01-04,4.0,16.0,64.0,256.0,1024.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.571268,0.820763,0.937752,0.347305,0.968077,-0.250653,False
2017-01-05,5.0,25.0,125.0,625.0,3125.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.724793,0.688967,0.998717,-0.050649,0.651372,-0.758758,False


In [8]:
y.head()

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales
store_nbr,1,1,1,1,1,1,1,1,1,1,...,9,9,9,9,9,9,9,9,9,9
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2017-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-02,5.0,0.0,0.0,1434.0,0.0,166.819,0.0,332.0,376.0,44.98,...,5.0,659.570007,1243.0,11.0,41.0,843.596008,115.188995,3136.895996,1.0,23.0
2017-01-03,4.0,0.0,4.0,3081.0,2.0,519.348022,15.0,952.0,1045.0,209.300003,...,2.0,547.364014,876.0,6.0,15.0,714.659973,133.039001,3229.558105,1.0,14.0
2017-01-04,1.0,0.0,4.0,3039.0,2.0,543.250977,17.0,1055.0,1029.0,135.944,...,3.0,395.287994,677.0,6.0,13.0,536.830017,75.201004,1491.416992,7.0,0.0
2017-01-05,2.0,0.0,3.0,2617.0,0.0,533.47998,40.0,918.0,853.0,137.005997,...,2.0,470.768005,604.0,7.0,10.0,414.100006,113.698997,1566.821045,1.0,17.0


## Models

In [9]:
from sklearn.model_selection import train_test_split

#Train test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.001, random_state=42)

from sklearn.preprocessing import RobustScaler

#remove the median and scale the data according to the quantile range 
#which is robust to outliers.
transformerL = RobustScaler().fit(X_train)
X_val = transformerL.transform(X_val)
X_train = transformerL.transform(X_train)

In [10]:
X_train.shape

(226, 19)

In [11]:
#Import Linear models
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn import metrics

### Train Lasso Model

In [12]:
# fit train
modelL = Lasso(alpha=1, fit_intercept = True, max_iter=7000).fit(X_train, y_train)
#print(model1.score(X_train, y_train))
y_predL = modelL.predict(X_val)
#print(model1.score(X_val, y_val))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_val, y_predL))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_val, y_predL)))

y_predL[y_predL < 0] = 0

print(metrics.mean_absolute_error(y_val, y_predL/1.03))

print('======')


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Mean Absolute Error: 46.80523848557713
Root Mean Squared Error: 193.46421859972216
42.252206257649874


### Train Ridge Model

In [13]:
#fit train
modelR = Ridge(alpha=0.4, fit_intercept = True, max_iter=7000).fit(X_train, y_train)
#print(model1.score(X_train, y_train))
y_predR = modelR.predict(X_val)
#print(model1.score(X_val, y_val))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_val, y_predR))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_val, y_predR)))

y_predL[y_predL < 0] = 0

print(metrics.mean_absolute_error(y_val, y_predL/1.03))

print('======')


print(metrics.mean_absolute_error(y_val, 0.5*(y_predL + y_predR)/1.03))

Mean Absolute Error: 44.66347480824516
Root Mean Squared Error: 191.5615790420504
42.252206257649874
42.56897045330052


### Train Keras Neural Network Model

In [14]:
# Train test split
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

from sklearn.preprocessing import RobustScaler

#remove the median and scale the data according to the quantile range 
#which is robust to outliers.
transformerKERAS = RobustScaler().fit(X_train)
X_val = transformerKERAS.transform(X_val)
X_train = transformerKERAS.transform(X_train)

In [15]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

from tensorflow.keras.callbacks import ReduceLROnPlateau
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=20, min_lr=0.00005, verbose=1, mode='min')

# Keras Model architecture
def create_model():

    model = Sequential()
    model.add(Dense(units=500, activation='swish', input_dim=19))
    model.add(Dense(units=500, activation='swish'))
    model.add(Dense(units=1500, activation='swish'))
    model.add(Dense(units=1782, activation='swish'))

    model.compile(loss='mae', optimizer='adam')
    
    return model

# Keras train
model = create_model()
model.fit(X_train, y_train, epochs=1500, batch_size=2000, validation_data=(X_val, y_val),callbacks=[reduce_lr])

y_pred1 = model.predict(X_val)
#print(model1.score(X_val, y_val))

y_pred1[y_pred1 < 0] = 0

print('Mean Absolute Error:', metrics.mean_absolute_error(y_val, y_pred1))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_val, y_pred1)))

print('======')

2022-03-13 21:04:56.304053: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-03-13 21:04:56.304144: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-03-13 21:05:00.971264: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-03-13 21:05:00.974926: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-03-13 21:05:00.974962: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-03-13 21:05:00.974989: I tensorflow/stream_executor/cuda/cud

Epoch 1/1500
Epoch 2/1500
Epoch 3/1500
Epoch 4/1500
Epoch 5/1500
Epoch 6/1500
Epoch 7/1500
Epoch 8/1500
Epoch 9/1500
Epoch 10/1500
Epoch 11/1500
Epoch 12/1500
Epoch 13/1500
Epoch 14/1500
Epoch 15/1500
Epoch 16/1500
Epoch 17/1500
Epoch 18/1500
Epoch 19/1500
Epoch 20/1500
Epoch 21/1500
Epoch 22/1500
Epoch 23/1500
Epoch 24/1500
Epoch 25/1500
Epoch 26/1500
Epoch 27/1500
Epoch 28/1500
Epoch 29/1500
Epoch 30/1500
Epoch 31/1500
Epoch 32/1500
Epoch 33/1500
Epoch 34/1500
Epoch 35/1500
Epoch 36/1500
Epoch 37/1500
Epoch 38/1500
Epoch 39/1500
Epoch 40/1500
Epoch 41/1500
Epoch 42/1500
Epoch 43/1500
Epoch 44/1500
Epoch 45/1500
Epoch 46/1500
Epoch 47/1500
Epoch 48/1500
Epoch 49/1500
Epoch 50/1500
Epoch 51/1500
Epoch 52/1500
Epoch 53/1500
Epoch 54/1500
Epoch 55/1500
Epoch 56/1500
Epoch 57/1500
Epoch 58/1500
Epoch 59/1500
Epoch 60/1500
Epoch 61/1500
Epoch 62/1500
Epoch 63/1500
Epoch 64/1500
Epoch 65/1500
Epoch 66/1500
Epoch 67/1500
Epoch 68/1500
Epoch 69/1500
Epoch 70/1500
Epoch 71/1500
Epoch 72/1500
E

### Load and process test data

In [16]:
df_test = pd.read_csv(
    '/kaggle/input/store-sales-time-series-forecasting/test.csv',
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
df_test['date'] = df_test.date.dt.to_period('D')
df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()

X_test = dp.out_of_sample(steps=16)
X_test.index.name = 'date'
X_test['NewYear'] = (X_test.index.dayofyear == 1)

X_test_copy = copy.deepcopy(X_test)

X_test = transformerL.transform(X_test)


### Ensemble & Make Predictions

In [19]:
# Make predicctions using Lasso Model
predictionL = modelL.predict(X_test)
predictionL[predictionL < 0] = 0
predictionL = predictionL/1.03

# Make predicctions using Ridge Model
predictionR = modelR.predict(X_test)
predictionR[predictionR < 0] = 0
predictionR = predictionR/1.03

X_test = copy.deepcopy(X_test_copy)

X_test = transformerKERAS.transform(X_test)
predictionKERAS = model.predict(X_test)
predictionKERAS[predictionKERAS < 0] = 0
predictionKERAS = predictionKERAS/1.03

#Ensemble based score ratios
prediction = 0.76*(predictionL*0.7 + predictionR*0.3) + 0.24*predictionKERAS

#Convert predictions to dataframe
prediction_df = pd.DataFrame(prediction, index=X_test_copy.index, columns=y.columns)

prediction_df = prediction_df.stack(['store_nbr', 'family'])

prediction_df = prediction_df.join(df_test.id).reindex(columns=['id', 'sales'])

prediction_df.to_csv('predictions.csv', index=False)

In [20]:
prediction_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,sales
date,store_nbr,family,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-08-16,1,AUTOMOTIVE,3000888,3.06543
2017-08-16,1,BABY CARE,3000889,0.0
2017-08-16,1,BEAUTY,3000890,2.454494
2017-08-16,1,BEVERAGES,3000891,2171.142922
2017-08-16,1,BOOKS,3000892,0.260844
