In [1]:
import pandas as pd
import os
# Keras
# ==============================================================================
os.environ["KERAS_BACKEND"] = "tensorflow" # 'tensorflow', 'jax´ or 'torch'
import keras
from keras.optimizers import Adam
from keras.losses import MeanSquaredError
from keras.callbacks import EarlyStopping
import skforecast
from skforecast.ForecasterRnn import ForecasterRnn
from skforecast.ForecasterRnn.utils import create_and_compile_model
from skforecast.ForecasterSarimax import ForecasterSarimax
from skforecast.Sarimax import Sarimax
from skforecast.model_selection_sarimax import backtesting_sarimax, grid_search_sarimax
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries

# Warning configuration
# ==============================================================================
import warnings
warnings.filterwarnings('once')

In [2]:
# Load the raw competition tables.
# The data directory is named once so that relocating the dataset only
# requires editing a single line instead of six path literals.
DATA_DIR = 'RNNs/Tensorflow-Keras/Predictions/Store Sales - Time Series Forecasting'

holiday_events_df = pd.read_csv(f'{DATA_DIR}/holidays_events.csv')
stores_df = pd.read_csv(f'{DATA_DIR}/stores.csv')
transactions_df = pd.read_csv(f'{DATA_DIR}/transactions.csv')
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
oil_df = pd.read_csv(f'{DATA_DIR}/oil.csv')

In [3]:
# Keep a single holiday/event row per date so the later merge on 'date'
# cannot multiply sales rows. Sorting 'transferred' in descending order puts
# True rows first, so a transferred entry wins when a date has several events.
# kind='stable' preserves the file order among ties, which makes the row that
# survives drop_duplicates reproducible from run to run.
holiday_events_df = (
    holiday_events_df
    .sort_values(by='transferred', ascending=False, kind='stable')
    .drop_duplicates(subset='date', keep='first')
)

# Only the last expression of a cell is rendered automatically; the preview
# of transferred holidays must be displayed explicitly or it is thrown away.
display(holiday_events_df[holiday_events_df['transferred'].eq(True)].head())
holiday_events_df.head()

In [4]:
# Preview the first rows of the store metadata table.
stores_df.head(n=5)

In [5]:
# Preview the first rows of the transactions table.
transactions_df.head(n=5)

In [6]:
# Preview the first rows of the raw training table.
train_df.head(n=5)

In [7]:
# Back-fill gaps in the oil price column: every missing value takes the next
# available quote further down the table.
oil_prices_filled = oil_df['dcoilwtico'].bfill()
oil_df['dcoilwtico'] = oil_prices_filled
oil_df.head(n=5)

In [8]:
# Enrich every training row with store metadata, daily store transactions,
# the oil price and the holiday/event record of its date. All joins are left
# joins, so no training row is dropped when a lookup has no match.
# Later cells read the store type as 'type_x' and the holiday type as
# 'type_y', i.e. the suffixes pandas gives to the overlapping 'type' columns.
full_train_df = (
    train_df
    .merge(stores_df, how='left', on='store_nbr')
    .merge(transactions_df, how='left', on=['date', 'store_nbr'])
    .merge(oil_df, how='left', on='date')
    .merge(holiday_events_df, how='left', on='date')
)
full_train_df.head(n=5)

In [9]:
# Apply the same chain of left joins to the test rows: store metadata,
# daily store transactions, oil price and the holiday/event record by date.
full_test_df = (
    test_df
    .merge(stores_df, how='left', on='store_nbr')
    .merge(transactions_df, how='left', on=['date', 'store_nbr'])
    .merge(oil_df, how='left', on='date')
    .merge(holiday_events_df, how='left', on='date')
)
full_test_df.head(n=5)

In [10]:
# Parse the 'YYYY-MM-DD' strings of the 'date' column into datetime values in
# both frames. The column is only converted here; it is not made the index
# in this cell.
for frame in (full_train_df, full_test_df):
    frame['date'] = pd.to_datetime(frame['date'], format='%Y-%m-%d')

In [11]:
# Calendar feature: day of the week as returned by pandas' .dt.dayofweek
# (Monday=0 ... Sunday=6).
for frame in (full_train_df, full_test_df):
    frame['day_of_week'] = frame['date'].dt.dayofweek

In [12]:
# Calendar feature: month number (1-12) taken from the parsed date.
for frame in (full_train_df, full_test_df):
    frame['month'] = frame['date'].dt.month

In [13]:
# Make 'date' the index of both frames.
# 'date' is still an ordinary column at this point, so no reset_index() is
# needed beforehand; calling it only copied the old positional index into a
# stray 'index' column that no later cell selects.
# The guards keep the cell re-runnable once 'date' is already the index.
if 'date' in full_train_df.columns:
    full_train_df = full_train_df.set_index('date')
if 'date' in full_test_df.columns:
    full_test_df = full_test_df.set_index('date')

In [14]:
# No daily frequency is assigned to the combined frame in this cell (the
# earlier asfreq('D', method='bfill') experiment is disabled).
# Preview the date-indexed training frame.
full_train_df.head(n=5)

In [15]:
# Feature encoding and scaling.
# Every encoder/scaler is fitted on the training frame (or on train + test
# where the test frame may contain the full label set) and then applied to
# both frames. Column assignments are used instead of
# `df[col].fillna(..., inplace=True)`: the in-place form operates on a
# temporary selection, is deprecated in recent pandas and does nothing under
# copy-on-write.

# Product family
family_label_encoder = LabelEncoder()
full_train_df['family_encoded'] = family_label_encoder.fit_transform(full_train_df['family'])
full_test_df['family_encoded'] = family_label_encoder.transform(full_test_df['family'])

# City and state of the store
city_label_encoder = LabelEncoder()
full_train_df['city_encoded'] = city_label_encoder.fit_transform(full_train_df['city'])
full_test_df['city_encoded'] = city_label_encoder.transform(full_test_df['city'])

state_label_encoder = LabelEncoder()
full_train_df['state_encoded'] = state_label_encoder.fit_transform(full_train_df['state'])
full_test_df['state_encoded'] = state_label_encoder.transform(full_test_df['state'])

# Store type ('type_x' after the merges). This must use its own encoder:
# re-fitting state_label_encoder here overwrote the state classes and left
# type_store_label_encoder unfitted.
type_store_label_encoder = LabelEncoder()
full_train_df['type_store_encoded'] = type_store_label_encoder.fit_transform(full_train_df['type_x'])
full_test_df['type_store_encoded'] = type_store_label_encoder.transform(full_test_df['type_x'])

# Transactions: rows without a transactions record count as 0, then min-max
# scale with the range observed in training.
full_train_df['transactions'] = full_train_df['transactions'].fillna(0)
full_test_df['transactions'] = full_test_df['transactions'].fillna(0)
transactions_scaler = MinMaxScaler()
full_train_df['transactions_scaled'] = transactions_scaler.fit_transform(full_train_df[['transactions']])
full_test_df['transactions_scaled'] = transactions_scaler.transform(full_test_df[['transactions']])

# Holiday type ('type_y' after the merges); dates without an event get the
# explicit label 'no_holiday'. Fitted on train + test so that both frames
# share one label set.
full_train_df['type_y'] = full_train_df['type_y'].fillna('no_holiday')
full_test_df['type_y'] = full_test_df['type_y'].fillna('no_holiday')
type_holiday_label_encoder = LabelEncoder()
type_holiday_label_encoder.fit(pd.concat([full_train_df['type_y'], full_test_df['type_y']]))
full_train_df['type_holiday_encoded'] = type_holiday_label_encoder.transform(full_train_df['type_y'])
full_test_df['type_holiday_encoded'] = type_holiday_label_encoder.transform(full_test_df['type_y'])

# Holiday locale, same treatment as the holiday type.
full_train_df['locale'] = full_train_df['locale'].fillna('no_holiday')
full_test_df['locale'] = full_test_df['locale'].fillna('no_holiday')
locale_label_encoder = LabelEncoder()
locale_label_encoder.fit(pd.concat([full_train_df['locale'], full_test_df['locale']]))
full_train_df['locale_encoded'] = locale_label_encoder.transform(full_train_df['locale'])
full_test_df['locale_encoded'] = locale_label_encoder.transform(full_test_df['locale'])

# Holiday 'transferred' flag: turn True/False/missing into three string
# labels before encoding. fillna + replace leaves already-converted strings
# untouched, so re-running the cell is harmless.
transferred_labels = {True: 'transferred', False: 'not_transferred'}
full_train_df['transferred'] = full_train_df['transferred'].fillna('no_holiday').replace(transferred_labels)
full_test_df['transferred'] = full_test_df['transferred'].fillna('no_holiday').replace(transferred_labels)
transferred_label_encoder = LabelEncoder()
transferred_label_encoder.fit(pd.concat([full_train_df['transferred'], full_test_df['transferred']]))
full_train_df['transferred_encoded'] = transferred_label_encoder.transform(full_train_df['transferred'])
full_test_df['transferred_encoded'] = transferred_label_encoder.transform(full_test_df['transferred'])

# Oil price: dates without a quote take the next available one; the trailing
# ffill only matters if the last rows have no later quote to borrow from.
oil_scaler = MinMaxScaler()
full_train_df['dcoilwtico'] = full_train_df['dcoilwtico'].bfill().ffill()
full_test_df['dcoilwtico'] = full_test_df['dcoilwtico'].bfill().ffill()
full_train_df['dcoilwtico_scaled'] = oil_scaler.fit_transform(full_train_df[['dcoilwtico']])
full_test_df['dcoilwtico_scaled'] = oil_scaler.transform(full_test_df[['dcoilwtico']])

# Target: scaled on the training frame only (the scaling of a test 'sales'
# column is intentionally not performed).
sales_scaler = MinMaxScaler()
full_train_df['sales_scaled'] = sales_scaler.fit_transform(full_train_df[['sales']])



In [16]:
# Columns used as model inputs.
feature_columns = [
    'store_nbr', 'onpromotion', 'cluster', 'day_of_week', 'month',
    'family_encoded', 'city_encoded', 'state_encoded', 'type_store_encoded',
    'type_holiday_encoded', 'locale_encoded', 'transferred_encoded', 'dcoilwtico',
]

# The training selection carries the features plus the 'sales' target; the
# test selection carries the features only.
train_data = full_train_df[feature_columns + ['sales']]
test_data = full_test_df[feature_columns]

# Date bounds of the training period and of the period to forecast.
train_data_start = pd.Timestamp('2013-01-01')
train_data_end = pd.Timestamp('2017-08-15')

test_data_start = pd.Timestamp('2017-08-16')
test_data_end = pd.Timestamp('2017-08-31')

train_data.head(n=5)



In [17]:
# Split both data sets into one frame per (store, encoded family) pair.
series_keys = ['store_nbr', 'family_encoded']
grouped_train = train_data.groupby(series_keys)
grouped_test = test_data.groupby(series_keys)

# Iterating a GroupBy yields (key, frame) pairs; only the frames are kept,
# in the order the groups are produced.
dataframes_array_train = [series_frame for _key, series_frame in grouped_train]
dataframes_array_test = [series_frame for _key, series_frame in grouped_test]




In [18]:
def _encoded_or_default(encoder, label, default):
    """Return ``encoder``'s integer code for ``label``, or ``default`` if that label was never fitted."""
    if label in encoder.classes_:
        return int(encoder.transform([label])[0])
    return default


# Add one synthetic 25 December row (zero sales, no promotion) per training
# year to every store/family frame.
#
# The categorical codes are looked up in the fitted encoders instead of being
# guessed: LabelEncoder numbers its classes in sorted order, so a hard-coded
# 1 or 0 does not reliably mean "holiday" or "not transferred".
# NOTE(review): 'Holiday' and 'National' are assumed to be the labels used by
# holidays_events.csv for Christmas - confirm against the data. If a label is
# absent the previous hard-coded code is kept.
christmas_type_code = _encoded_or_default(type_holiday_label_encoder, 'Holiday', 1)
christmas_locale_code = _encoded_or_default(locale_label_encoder, 'National', 1)
not_transferred_code = _encoded_or_default(transferred_label_encoder, 'not_transferred', 0)

for idx, df in enumerate(dataframes_array_train):
    christmas_rows = []
    for year in range(train_data_start.year, train_data_end.year + 1):
        christmas = pd.Timestamp(year=year, month=12, day=25)
        # Skip years whose Christmas lies outside the training period, and
        # dates that are already present (keeps the cell safe to re-run).
        if not (train_data_start <= christmas <= train_data_end) or christmas in df.index:
            continue
        christmas_eve = christmas - pd.Timedelta(days=1)
        christmas_rows.append({
            'date': christmas,
            'store_nbr': df['store_nbr'].iloc[0],
            'onpromotion': 0,
            'cluster': df['cluster'].iloc[0],
            # Taken from the date itself, on the same 0-6 scale as
            # .dt.dayofweek; "previous day + 1, wrap at 8" produced 7 when
            # 24 December was a Sunday.
            'day_of_week': christmas.dayofweek,
            'month': 12,
            'family_encoded': df['family_encoded'].iloc[0],
            'city_encoded': df['city_encoded'].iloc[0],
            'state_encoded': df['state_encoded'].iloc[0],
            'type_store_encoded': df['type_store_encoded'].iloc[0],
            'type_holiday_encoded': christmas_type_code,
            'locale_encoded': christmas_locale_code,
            'transferred_encoded': not_transferred_code,
            # Carry the oil price over from 24 December.
            'dcoilwtico': df.loc[christmas_eve, 'dcoilwtico'],
            'sales': 0,
        })
    if christmas_rows:
        # One concat per frame instead of one per year.
        christmas_df = pd.DataFrame(christmas_rows).set_index('date')
        df = pd.concat([df, christmas_df]).sort_index()
    dataframes_array_train[idx] = df



In [314]:
# Preview the first store/family training frame.
first_series_frame = dataframes_array_train[0]
first_series_frame.head(n=5)

In [19]:
# Every calendar day the training period should contain.
complete_date_range = pd.date_range(start=train_data_start, end=train_data_end)

# Days of that calendar that are absent from the first store/family frame.
first_series_dates = dataframes_array_train[0].loc[train_data_start:train_data_end].index
missing_dates = complete_date_range.difference(first_series_dates)

print(missing_dates)

In [20]:
# Keep one row per date and put every training frame on a daily frequency.
#
# * Duplicates are detected on the date index. DataFrame.drop_duplicates()
#   compares column values only, so it would also delete two different dates
#   that happen to share identical feature values.
# * asfreq() returns a new frame; its result must be stored, otherwise the
#   call has no effect. Missing days are back-filled from the next row.
dataframes_array_train = [
    df[~df.index.duplicated(keep='first')].sort_index().asfreq('D', method='bfill')
    for df in dataframes_array_train
]

# Rebuild the per-series test frames from the grouped test data with the same
# daily frequency.
dataframes_array_test = [group.asfreq('D', method='bfill') for _, group in grouped_test]

# Preview the first few frames of each list.
for df in dataframes_array_train[:3]:
    display(df.head())

for df in dataframes_array_test[:3]:
    display(df.head())
    



In [21]:
# Target series per store/family, keyed by the frame's position in
# dataframes_array_train. Each entry is a one-column ('sales') DataFrame
# restricted to the training period.
# .copy() makes each entry independent of the frame it was sliced from, so
# rows can be added to it later without SettingWithCopy warnings.
target = {
    idx: df.loc[train_data_start:train_data_end, ['sales']].copy()
    for idx, df in enumerate(dataframes_array_train)
}

target[0].head()


In [22]:
# Extend every target series with a 0 placeholder for each forecast-period
# date it does not contain yet, then put it on a daily frequency.
# The forecast calendar is the same for all series, so it is built once.
complete_date_range = pd.date_range(start=test_data_start, end=test_data_end)

for idx, df in target.items():
    missing_dates = complete_date_range.difference(df.index)
    if len(missing_dates) > 0:
        # Add all placeholder rows in one concat instead of enlarging the
        # frame one date at a time.
        placeholder = pd.DataFrame(
            0,
            index=missing_dates.rename(df.index.name),
            columns=df.columns,
        )
        df = pd.concat([df, placeholder])
    # asfreq() returns a new frame, so the result is stored back; replacing
    # the value of an existing key while iterating a dict is safe.
    target[idx] = df.sort_index().asfreq('D', method='bfill')


In [23]:
# Remove the target column from the feature frames.
# A non-mutating drop with errors='ignore' keeps the cell re-runnable: the
# in-place version raised KeyError on a second execution because 'sales' was
# already gone.
dataframes_array_train = [
    df.drop(columns=['sales'], errors='ignore') for df in dataframes_array_train
]

In [24]:
# Append each series' forecast-period features to its training features.
# Both lists are paired purely by position, so they must have the same length.
assert len(dataframes_array_train) == len(dataframes_array_test), (
    'train and test must contain the same number of store/family series'
)

for c, test_frame in enumerate(dataframes_array_test):
    combined = pd.concat([dataframes_array_train[c], test_frame])
    # Dropping repeated dates keeps the cell idempotent: running it a second
    # time would otherwise append the test rows again.
    dataframes_array_train[c] = combined[~combined.index.duplicated(keep='first')]
    

In [25]:
# Every calendar day the training period should contain.
complete_date_range = pd.date_range(start=train_data_start, end=train_data_end)

# Days of that calendar that are absent from the first target series.
first_target_dates = target[0].loc[train_data_start:train_data_end].index
missing_dates = complete_date_range.difference(first_target_dates)

missing_dates


In [26]:
# Show both ends of the first target series over the training + forecast span.
first_target_sales = target[0].loc[train_data_start:test_data_end, 'sales']
print(first_target_sales.head())
print(first_target_sales.tail())

In [27]:

# Give every target series a datetime index with an explicit daily frequency.
# asfreq() returns a new frame, so it has to be stored back into the dict;
# calling it without keeping the result changes nothing.
for idx, df in target.items():
    df.index = pd.to_datetime(df.index)
    target[idx] = df.asfreq('D', method='bfill')

target[0].head()


In [28]:
# Exogenous features and target of the first series over the training
# period, both on a daily frequency (no filling is applied here).
exog_columns = [
    'store_nbr', 'onpromotion', 'cluster', 'day_of_week', 'month',
    'family_encoded', 'city_encoded', 'state_encoded', 'type_store_encoded',
    'type_holiday_encoded', 'locale_encoded', 'transferred_encoded', 'dcoilwtico',
]
first_series_features = dataframes_array_train[0]
exog = first_series_features.loc[train_data_start:train_data_end, exog_columns].asfreq('D')

target_test = target[0].loc[train_data_start:train_data_end, 'sales'].asfreq('D')


In [29]:
# SARIMAX forecaster with fixed orders, backtested on the first series.
sarimax_regressor = Sarimax(
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
    maxiter=200,
)
forecaster = ForecasterSarimax(regressor=sarimax_regressor)

# Row counts of the first series in the training and forecast periods.
first_series_features = dataframes_array_train[0]
n_train_rows = len(first_series_features.loc[train_data_start:train_data_end])
n_forecast_rows = len(first_series_features.loc[test_data_start:test_data_end])

# All but the last training observation form the initial training window;
# the forecast horizon equals the number of rows in the forecast period.
metric, predictions = backtesting_sarimax(
    forecaster=forecaster,
    y=target_test,
    initial_train_size=n_train_rows - 1,
    fixed_train_size=False,
    steps=n_forecast_rows,
    metric='mean_absolute_error',
    refit=True,
    n_jobs="auto",
    suppress_warnings_fit=True,
    verbose=True,
    show_progress=True,
    exog=exog,
)
display(metric)
predictions.head(4)



In [32]:
# Grid search over SARIMAX orders on the first series. The order given to the
# regressor is only a placeholder that the search replaces; with
# return_best=True the call is asked to return the best configuration.
placeholder_regressor = Sarimax(order=(1, 1, 1), maxiter=500)
forecaster = ForecasterSarimax(regressor=placeholder_regressor)

param_grid = {
    'order': [(0, 1, 0), (0, 1, 1), (1, 1, 0), (1, 1, 1), (2, 1, 1)],
    'seasonal_order': [(0, 0, 0, 0), (0, 1, 0, 12), (1, 1, 1, 12)],
    'trend': [None, 'n'],
}

# Row counts of the first series in the training and forecast periods.
first_series_features = dataframes_array_train[0]
n_train_rows = len(first_series_features.loc[train_data_start:train_data_end])
n_forecast_rows = len(first_series_features.loc[test_data_start:test_data_end])

results_grid = grid_search_sarimax(
    forecaster=forecaster,
    y=target_test,
    param_grid=param_grid,
    steps=n_forecast_rows,
    refit=True,
    metric='mean_absolute_error',
    initial_train_size=n_train_rows - 1,
    fixed_train_size=False,
    return_best=True,
    n_jobs='auto',
    suppress_warnings_fit=True,
    verbose=False,
    show_progress=True,
    exog=exog,
)
results_grid.head(5)


In [353]:

# Training-period sales of the second series (key 1) on a daily frequency.
second_series_sales = target[1].loc[train_data_start:train_data_end, 'sales']
target_test = second_series_sales.asfreq('D')
target_test.head(n=5)


In [38]:


# Fit the forecaster on every store/family series in turn and collect its
# forecast for the test period.
exog_columns = [
    'store_nbr', 'onpromotion', 'cluster', 'day_of_week', 'month',
    'family_encoded', 'city_encoded', 'state_encoded', 'type_store_encoded',
    'type_holiday_encoded', 'locale_encoded', 'transferred_encoded', 'dcoilwtico',
]

# Per-series results are gathered in a list and concatenated once at the
# end; concatenating onto a growing frame inside the loop is quadratic.
forecast_frames = []
n_series = len(dataframes_array_train)

for i, series_features in enumerate(dataframes_array_train):
    # Training inputs: daily frequency, gaps forward-filled.
    exog = series_features.loc[train_data_start:train_data_end, exog_columns].asfreq('D').ffill()
    target_test = target[i].loc[train_data_start:train_data_end, 'sales'].asfreq('D').ffill()
    forecaster.fit(y=target_test, exog=exog)

    # Forecast-period exogenous features, prepared the same way.
    exog = series_features.loc[test_data_start:test_data_end, exog_columns].copy()
    exog.index = pd.to_datetime(exog.index)
    exog = exog.asfreq('D').ffill()

    # One step per row of the prepared exogenous frame.
    predicted_values = forecaster.predict(steps=len(exog), exog=exog)

    forecast_frames.append(
        pd.DataFrame(
            {
                # Scalars are broadcast over the index; wrapping them in
                # one-element lists relied on a pandas length-1 quirk.
                'store_nbr': series_features['store_nbr'].iloc[0],
                'family_encoded': series_features['family_encoded'].iloc[0],
                # Sales cannot be negative, but the model output is not
                # bounded below, so negative forecasts are clipped to 0.
                # NOTE(review): assumes predict() returns a date-indexed
                # Series - confirm for the installed skforecast version.
                'sales': predicted_values.clip(lower=0),
            },
            index=exog.index,
        )
    )
    print(f'{i + 1} of {n_series}')

final_df = pd.concat(forecast_frames) if forecast_frames else pd.DataFrame()




In [36]:
# Inspect the last forecast rows.
final_df.tail(n=5)

In [360]:
def _with_date_column(frame):
    """Return ``frame`` with 'date' as an ordinary column, resetting the index only if needed."""
    return frame if 'date' in frame.columns else frame.reset_index()


# Attach the forecasts to the test rows (and thereby to their 'id').
# The frames are not modified in place, so the cell can be re-run; the
# in-place reset_index() calls failed or added junk columns on a second run.
merged_df = pd.merge(
    _with_date_column(final_df),
    _with_date_column(full_test_df),
    on=['family_encoded', 'store_nbr', 'date'],
    how='inner',
)

# An inner join silently drops test rows that have no forecast; fail loudly
# instead of producing an incomplete submission.
assert len(merged_df) == len(full_test_df), (
    f'expected {len(full_test_df)} merged rows, got {len(merged_df)}'
)

merged_df.head()

In [362]:
# Keep only the two columns written to the submission file.
submission_columns = ['id', 'sales']
submission_df = merged_df[submission_columns]
submission_df.head(n=5)

In [363]:
# Debug aid: print the kernel's working directory, against which the relative
# data paths used in this notebook are resolved.
!pwd

In [364]:
# Write the submission file without the positional index column.
submission_path = 'RNNs/Tensorflow-Keras/Predictions/Store Sales - Time Series Forecasting/submission_3.csv'
submission_df.to_csv(submission_path, index=False)