#### Final Modeling with Entire Data and Future Prediction:

In [1]:
# import all libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from prophet import Prophet
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

In [2]:
# load the training data; Date arrives as day-first text and is parsed to datetimes
df = pd.read_csv("2. train_sales_forecast.csv").assign(
    Date=lambda frame: pd.to_datetime(frame['Date'], format='%d-%m-%Y')
)
df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,25037.08,False
1,1,1,2010-02-12,46247.44,True
2,1,1,2010-02-19,41783.43,False
3,1,1,2010-02-26,19491.18,False
4,1,1,2010-03-05,21926.49,False


In [3]:
def impute_missing_rows(df):
    """
    This function imputes missing rows to the dataframe to make every
    (Store, Dept) series evenly spaced.

    The weekly grid runs from the earliest to the latest Date of the whole
    dataframe in 7-day steps. For each (Store, Dept) pair, every grid date
    without a row gets one with Weekly_Sales = 0 and the IsHoliday value of
    the first existing row carrying that date (False, with a printed notice,
    when no row in the dataframe has that date).

    The input dataframe is not modified. Imputed rows are placed after the
    original rows; when rows are added the result is re-indexed 0..n-1.

    Parameters:
    df: The pandas DataFrame object with columns Store, Dept, Date (datetime),
        Weekly_Sales and IsHoliday

    Returns:
    df: A new pandas DataFrame object after imputation
    """
    print(f"Shape of dataframe before imputation: {df.shape}")
    if df.empty:
        # nothing to span: there is no minimum/maximum date to build a grid from
        print(f"Shape of dataframe after imputation: {df.shape}")
        return df.copy()

    imputed_cols = ['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday']
    # weekly grid shared by all series: global minimum to global maximum date
    expected_dates = pd.date_range(start=df.Date.min(), end=df.Date.max(), freq='7D')
    # holiday flag per date, taken once from the first row carrying that date
    holiday_by_date = df.drop_duplicates(subset='Date').set_index('Date')['IsHoliday']

    new_rows = []
    # get all the unique stores in the dataframe
    for store_no in df.Store.unique():
        df_store = df[df.Store == store_no]
        # get all the departments from that store
        for dept_no in df_store.Dept.unique():
            present_dates = pd.DatetimeIndex(df_store.loc[df_store.Dept == dept_no, 'Date'])
            # grid dates this series lacks, in chronological order
            for missing_date in expected_dates.difference(present_dates):
                if missing_date in holiday_by_date.index:
                    is_holiday = holiday_by_date.loc[missing_date]
                else:
                    print("Date not present in the entire dataframe, considering False")
                    is_holiday = False
                new_rows.append([store_no, dept_no, missing_date, 0, is_holiday])

    if new_rows:
        # build all imputed rows at once and align their dtypes with the input,
        # so the concatenation does not widen any column
        additions = pd.DataFrame(new_rows, columns=imputed_cols).astype(
            {col: df[col].dtype for col in imputed_cols}
        )
        imputed = pd.concat([df, additions], ignore_index=True)
    else:
        imputed = df.copy()

    print(f"Shape of dataframe after imputation: {imputed.shape}")
    return imputed

In [4]:
def preprocess_dataframe(df):
    """
    This function prepares the data for training.

    It fills in the missing rows through impute_missing_rows and then
    converts the IsHoliday column to integers.

    Parameters:
    df: The pandas DataFrame object

    Returns:
    df: The pandas DataFrame object after preprocessing
    """
    # make every series evenly spaced first
    imputed = impute_missing_rows(df)
    # the holiday flag is stored as an integer column from here on
    imputed['IsHoliday'] = imputed['IsHoliday'].astype(int)
    return imputed

## Final Modeling with Entire Data for Future Prediction:

#### ARIMA Model with Exogenous Variable (Holiday)
We need to forecast the next 6 months; this model was found to be the best for that, based on the validation and test MAPE scores.

In [5]:
def fit_arima_model_exogenous(df, order):
    """
    This function fits an ARIMA model on Weekly_Sales with IsHoliday as the
    exogenous variable.

    Parameters:
    df: dataframe object with Date, Weekly_Sales and IsHoliday columns
    order: (p, d, q)

    Returns:
    fitted model object
    """
    # chronological order, Date as index, Weekly-Friday frequency (dates are Fridays)
    weekly = (
        df.sort_values(by='Date')
        .set_index('Date')
        .asfreq('W-FRI')
    )
    sales = weekly['Weekly_Sales']
    holiday_flags = weekly['IsHoliday']

    # build and fit the ARIMA model in one go
    return ARIMA(sales, exog=holiday_flags, order=order).fit()

In [6]:
def train_arima_model_exogenous(df, order):
    """
    This function fits one ARIMA model per (store, dept) series of the dataframe.

    A series whose fit raises is reported on stdout and left out of the result.

    Parameters:
    df: dataframe object
    order: (p, d, q)

    Returns:
    Dictionary mapping the tuple (store, dept) to its fitted model object
    """
    fitted_by_series = {}
    for store in df.Store.unique():
        in_store = df.Store == store
        # departments are taken per store, in order of first appearance
        for dept in df.loc[in_store, 'Dept'].unique():
            series_df = df[in_store & (df.Dept == dept)]
            try:
                fitted_by_series[(store, dept)] = fit_arima_model_exogenous(series_df, order)
            except Exception as e:
                print(f'Failed to fit ARIMA model for store {store}, category {dept}: {e}')
    return fitted_by_series

In [17]:
def future_predict_arima_model_exogenous(df_train, df_test, models_dict, forecast_steps):
    """
    This function forecasts future weekly sales for every (store, dept) series
    of the training data and attaches the forecasts to the scoring rows.

    The holiday flags of the scoring dates (one per distinct date, in
    chronological order) are passed to each model as the exogenous variable,
    so their count must equal forecast_steps.

    The caller's df_test is not modified. A (store, dept) pair without an
    entry in models_dict (e.g. because its fit failed during training) is
    reported on stdout and skipped; its scoring rows keep a missing
    Weekly_Sales after the left merge.

    Parameters:
    df_train: training dataframe; only its Store and Dept columns are read
    df_test: scoring dataframe with Store, Dept, Date and IsHoliday columns;
             an existing Weekly_Sales column is replaced by the forecast.
             Date may be datetime or 'dd-mm-YYYY' text
    models_dict: dictionary with key tuple(store, dept) and value fitted model object
    forecast_steps: number of weekly steps to forecast

    Returns:
    A new dataframe with the df_test rows (IsHoliday as int) and the
    forecast in the Weekly_Sales column
    """
    # work on a copy so the caller's scoring frame keeps its original columns and dtypes
    test_frame = df_test.copy()
    if not pd.api.types.is_datetime64_any_dtype(test_frame['Date']):
        # the merge below needs real datetimes; the csv files in this notebook are day-first
        test_frame['Date'] = pd.to_datetime(test_frame['Date'], format='%d-%m-%Y')
    test_frame['IsHoliday'] = test_frame['IsHoliday'].astype(int)

    # one holiday flag per forecast date, in chronological order
    val_holidays = test_frame[['Date', 'IsHoliday']].drop_duplicates().sort_values(by='Date')

    forecast_frames = []
    for store_no in df_train.Store.unique():
        df_store = df_train[df_train.Store == store_no]
        for dept_no in df_store.Dept.unique():
            model_fit = models_dict.get((store_no, dept_no))
            if model_fit is None:
                print(f'No fitted ARIMA model for store {store_no}, category {dept_no}; forecast left empty')
                continue
            forecast = model_fit.get_forecast(steps=forecast_steps, exog=val_holidays.IsHoliday)
            # keep only the point forecast and tag it with its series
            forecast_frames.append(
                forecast.summary_frame()[['mean']].assign(Store=store_no, Dept=dept_no)
            )

    # drop the empty Weekly_Sales column of the scoring template, if there is one
    test_frame = test_frame.drop(columns=['Weekly_Sales'], errors='ignore')
    if not forecast_frames:
        # no model produced a forecast: every scoring row stays unfilled
        return test_frame.assign(Weekly_Sales=float('nan'))

    # concatenate once; the forecast dates live in the index, so name it before resetting
    predicted_arima_df = (
        pd.concat(forecast_frames)
        .rename_axis('Date')
        .reset_index()
        .rename(columns={'mean': 'Weekly_Sales'})
    )

    # merge the predicted forecast with the scoring rows
    return pd.merge(test_frame, predicted_arima_df, on=['Store', 'Dept', 'Date'], how='left')

In [8]:
import warnings
# NOTE(review): this silences every Python warning for all cells executed afterwards,
# which presumably includes any warnings raised while the models are fitted — confirm
# that a blanket filter (rather than one limited to specific categories) is intended
warnings.filterwarnings('ignore')

#### Preprocess the data before training:

In [9]:
# fill the missing weekly rows with zero sales and cast IsHoliday to int
df_train = preprocess_dataframe(df)
df_train.head()

Shape of dataframe before imputation: (84056, 5)
Shape of dataframe after imputation: (90909, 5)


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,25037.08,0
1,1,1,2010-02-12,46247.44,1
2,1,1,2010-02-19,41783.43,0
3,1,1,2010-02-26,19491.18,0
4,1,1,2010-03-05,21926.49,0


#### Train ARIMA model with holiday as exogenous variable:

In [10]:
# considering p, d, q = (1, 1, 1)
# one model is fitted per (Store, Dept) series; the result is a dict keyed by (store, dept)
arima_trained_models_dict_exogen = train_arima_model_exogenous(df_train, order=(1,1,1))

#### Load test data:

In [12]:
# read the scoring template csv file and store it into a dataframe
# Date is not parsed here; compute_forecast_steps converts it to datetime later
df_test = pd.read_csv("3. Scoring_Template.csv")
df_test.head()

Unnamed: 0,Store,Dept,Date,IsHoliday,Weekly_Sales
0,1,1,04-05-2012,False,
1,1,1,11-05-2012,False,
2,1,1,18-05-2012,False,
3,1,1,25-05-2012,False,
4,1,1,01-06-2012,False,


In [13]:
# (rows, columns) of the scoring template
df_test.shape

(18589, 5)

#### Compute Forecast span:

In [14]:
def compute_forecast_steps(df):
    """
    This function counts the weekly steps from the earliest to the latest
    Date of the dataframe, both ends included.

    Side effect: df['Date'] is parsed in place with the 'dd-mm-YYYY' format,
    so the caller's dataframe holds datetimes afterwards.

    Parameters:
    df: The pandas DataFrame object with a Date column

    Returns:
    Number of 7-day steps needed to walk from the first to the last date
    (0 when the dataframe has no dates)
    """
    df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')
    first_date, last_date = df.Date.min(), df.Date.max()

    if pd.isna(first_date):
        # no dates at all, hence no steps
        return 0

    # whole weeks between both ends, plus one for the first date itself
    whole_weeks = (last_date - first_date) // pd.Timedelta(days=7)
    return whole_weeks + 1

In [15]:
# number of weekly steps spanned by the scoring dates
# note: this call also converts df_test['Date'] to datetime in place
forecast_steps = compute_forecast_steps(df_test)
print(f"Forecast Steps = {forecast_steps}")

Forecast Steps = 26


#### Get Future Prediction:

In [18]:
# forecast every (Store, Dept) series and merge the predictions onto the scoring rows
test_forecast_arima_exogen_df = future_predict_arima_model_exogenous(df_train, df_test, arima_trained_models_dict_exogen, forecast_steps)
test_forecast_arima_exogen_df.head()

Unnamed: 0,Store,Dept,Date,IsHoliday,Weekly_Sales
0,1,1,2012-05-04,0,19381.993184
1,1,1,2012-05-11,0,21038.239385
2,1,1,2012-05-18,0,21964.806586
3,1,1,2012-05-25,0,22483.163565
4,1,1,2012-06-01,0,22773.152201


In [28]:
# convert IsHoliday back to boolean, as it was in the scoring template
# (the prediction step cast it to int)
test_forecast_arima_exogen_df.IsHoliday = test_forecast_arima_exogen_df.IsHoliday.astype(bool)
test_forecast_arima_exogen_df.head()

Unnamed: 0,Store,Dept,Date,IsHoliday,Weekly_Sales
0,1,1,2012-05-04,False,19381.993184
1,1,1,2012-05-11,False,21038.239385
2,1,1,2012-05-18,False,21964.806586
3,1,1,2012-05-25,False,22483.163565
4,1,1,2012-06-01,False,22773.152201


In [29]:
# (rows, columns) of the forecast frame, for comparison with the scoring template shape
test_forecast_arima_exogen_df.shape

(18589, 5)

In [30]:
# check if there is any null value; a null Weekly_Sales would mean a scoring row
# found no matching forecast in the left merge
test_forecast_arima_exogen_df.isna().sum()

Store           0
Dept            0
Date            0
IsHoliday       0
Weekly_Sales    0
dtype: int64

#### Store final prediction file locally:

In [31]:
# Store this final forecast locally for further use (the dataframe index is not written)
test_forecast_arima_exogen_df.to_csv('test_sales_forecast.csv', index=False)