### Forecasting Algorithm reusable code


#### Loading Libraries

In [29]:
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from statsmodels.tsa.statespace.sarimax import SARIMAX
import xgboost as xgb
from lightgbm import LGBMRegressor
from statsmodels.tsa.holtwinters import ExponentialSmoothing, SimpleExpSmoothing, Holt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
import pandas as pd
from prophet import Prophet
# Suppress warnings
warnings.filterwarnings("ignore")

#### Defining functions to create the holiday flag, discount flag, rolling-mean and lag features in the input dataset

In [30]:
def create_features(df, rolling_windows, lag_periods, holiday_data, discount_data):
    """Return a copy of ``df`` enriched with calendar flags and sales history features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date`` and ``sales`` columns. Rolling means and lags are
        computed over the rows in their existing order. The frame passed in is
        left untouched.
    rolling_windows : iterable of int
        One ``rolling_mean_<window>`` column is added per window size.
    lag_periods : iterable of int
        One ``lag_<lag>`` column is added per lag (``sales`` shifted by ``lag`` rows).
    holiday_data : pandas.DataFrame
        Must contain ``date`` and ``flag`` columns.
    discount_data : pandas.DataFrame
        Must contain ``date`` and ``discount_flag`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with ``is_holiday`` (1/0), ``discount_flag`` and the
        rolling/lag columns added, with every row that holds a NaN in any
        column removed.

    Notes
    -----
    Dates are matched by exact value, so ``df['date']`` and the ``date``
    columns of the two lookup tables must hold the same type (e.g. all
    datetimes); otherwise no row is flagged.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    features = df.copy()

    def first_value_by_date(table, column):
        # Map each date to the value of its FIRST row in ``table``.
        lookup = {}
        for key, value in zip(table['date'], table[column]):
            lookup.setdefault(key, value)
        return lookup

    # Build each lookup once instead of filtering the whole table for every row.
    holiday_lookup = first_value_by_date(holiday_data, 'flag')
    discount_lookup = first_value_by_date(discount_data, 'discount_flag')

    # Dates absent from a lookup table count as "no holiday" / "no discount".
    features['is_holiday'] = features['date'].apply(
        lambda day: 1 if bool(holiday_lookup.get(day, False)) else 0
    )
    features['discount_flag'] = features['date'].apply(
        lambda day: discount_lookup.get(day, 0)
    )

    # Rolling mean features
    for window in rolling_windows:
        features[f'rolling_mean_{window}'] = features['sales'].rolling(window).mean()

    # Lag features
    for lag in lag_periods:
        features[f'lag_{lag}'] = features['sales'].shift(lag)

    # The leading rows have incomplete rolling/lag history; dropna removes them
    # together with any other row containing a NaN.
    return features.dropna()


def is_holiday(date, holiday_data):
    """Tell whether ``holiday_data`` marks ``date`` as a holiday.

    The rows of ``holiday_data`` whose ``date`` column equals ``date`` are
    selected and the ``flag`` of the first such row is converted with
    ``bool``. When no row matches, the result is ``False``.
    """
    same_day = holiday_data['date'] == date
    flags = holiday_data.loc[same_day, 'flag'].values

    # No entry for this date: not a holiday.
    if len(flags) == 0:
        return False
    return bool(flags[0])


def get_discount_flag(date, discount_data):
    """Look up the discount flag recorded for ``date``.

    The rows of ``discount_data`` whose ``date`` column equals ``date`` are
    selected and the ``discount_flag`` of the first such row is returned
    unchanged. When no row matches, ``0`` is returned.
    """
    same_day = discount_data['date'] == date
    flags = discount_data.loc[same_day, 'discount_flag'].values

    # No entry for this date: report "no discount".
    if len(flags) == 0:
        return 0
    return flags[0]


#### Reading Input data

In [31]:
# Read the three input tables: store sales, holiday calendar and discount
# calendar. The filenames are placeholders; replace them with the actual
# filename and path of each dataset.
# NOTE(review): in the cells visible here only data['date'] is parsed to
# datetime; holiday_data['date'] and discount_data['date'] stay exactly as
# read_csv returns them. Confirm they compare equal to the parsed sales dates,
# otherwise the holiday/discount lookups never match.
data, holiday_data, discount_data = (
    pd.read_csv(csv_path)
    for csv_path in ('store_sales.csv', 'holiday_data.csv', 'discount_data.csv')
)

#### Operations

In [32]:
# Hold-out size: the last 8 weeks are reserved for testing
num_test_weeks = 8

# Feature-engineering settings: lag offsets and rolling-mean window sizes (in rows)
lag_periods = [2, 3, 4, 5, 6, 7, 8]
rolling_windows = [2, 3, 6, 9, 12, 18]

# Parse the day-month-year strings in 'date' into datetimes
data['date'] = pd.to_datetime(data['date'], format='%d-%m-%Y', dayfirst=True)

# Train on every distinct week except the held-out ones
# (counted after parsing so each calendar date is counted once)
num_train_weeks = len(data['date'].unique()) - num_test_weeks

predictions_list = []

#### Creating the Store dataframe with all the features

In [33]:
# Build one feature frame per store and combine them once at the end
# (growing store_df with pd.concat inside the loop re-copies it every iteration).
store_frames = []

for store in data['store'].unique():
    store_data = data[data['store'] == store].copy()

    # Sort the data by date
    store_data = store_data.sort_values('date')

    # Insert a row for every Friday between the store's first and last date
    # that has no observation. Skipped when nothing is missing, so an empty
    # frame is never concatenated onto the store data.
    # NOTE(review): the inserted rows carry only 'date' (and, after the fill
    # below, 'sales'); every other column, including 'store', is NaN, so the
    # dropna inside create_features removes them again. They only influence
    # the rolling/lag values of neighbouring rows. Confirm this is intended.
    min_date = store_data['date'].min()
    max_date = store_data['date'].max()
    date_range = pd.date_range(min_date, max_date, freq='W-FRI')
    missing_dates = sorted(set(date_range) - set(store_data['date']))
    if missing_dates:
        missing_data = pd.DataFrame({'date': missing_dates})
        store_data = pd.concat([store_data, missing_data]).sort_values('date')

    # Fill missing sales values with 0. Assign the result back: calling
    # fillna(inplace=True) on the column selection is a chained in-place
    # update, which pandas deprecates and which has no effect on store_data
    # under Copy-on-Write.
    store_data['sales'] = store_data['sales'].fillna(0)

    # Create additional features
    store_frames.append(
        create_features(store_data, rolling_windows, lag_periods, holiday_data, discount_data)
    )

# An input without any store yields an empty frame, as before.
store_df = pd.concat(store_frames) if store_frames else pd.DataFrame()
    
#store_df.set_index('date', inplace=True)

#### Model Training & Prediction 

In [34]:
# Keep only the date, store and sales columns, as an independent copy of store_df
df = store_df.loc[:, ['date', 'store', 'sales']].copy()

### Model Training to determine the best parameters

In [35]:
# Convert the 'date' column to datetime format and order the rows chronologically
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')

# Number of final weeks held out for evaluation
holdout_weeks = 8

# Everything up to the cutoff is training data; the weeks after it are the test set.
# The cutoff is relative to the latest date across ALL stores.
cutoff_date = df['date'].max() - pd.DateOffset(weeks=holdout_weeks)
train_df = df[df['date'] <= cutoff_date]
test_df = df[df['date'] > cutoff_date]

# Define the hyperparameter values to try
hyperparameters = {
    'changepoint_prior_scale': [0.01, 0.1, 0.5],
    'seasonality_prior_scale': [0.01, 0.1, 1.0],
    'seasonality_mode': ['additive', 'multiplicative']
}

# store -> {'best_params': ..., 'best_mape': ...}
best_mape_per_store = {}
# store -> forecast-vs-actual frame of that store's best model
merged_dfs = {}

# NOTE(review): the same hold-out window is used both to pick the
# hyperparameters and to report the MAPE, so the reported MAPE is optimistic.
for store in train_df['store'].dropna().unique():
    column_names = {'date': 'ds', 'sales': 'y'}
    store_train_df = train_df.loc[train_df['store'] == store, ['date', 'sales']].rename(columns=column_names)
    store_test_df = test_df.loc[test_df['store'] == store, ['date', 'sales']].rename(columns=column_names)

    best_mape = float('inf')
    best_params = {}
    # Reset for every store: otherwise a store without any valid MAPE would
    # raise a NameError or silently reuse the previous store's frame.
    best_merged_df = None

    # Iterate over the hyperparameter combinations
    for cp_scale in hyperparameters['changepoint_prior_scale']:
        for sp_scale in hyperparameters['seasonality_prior_scale']:
            for season_mode in hyperparameters['seasonality_mode']:
                # Create and fit the Prophet model
                model = Prophet(changepoint_prior_scale=cp_scale,
                                seasonality_prior_scale=sp_scale,
                                seasonality_mode=season_mode)
                model.fit(store_train_df)

                # Predict only the hold-out weeks; the in-sample history is
                # never matched by the merge below, so it is not predicted.
                future = model.make_future_dataframe(periods=holdout_weeks, freq='W-FRI',
                                                     include_history=False)
                forecast = model.predict(future)

                # Evaluate with MAPE. The denominator is |y|, and rows whose
                # actual value is 0 are left out of the mean (their
                # percentage error is undefined).
                merged_df = forecast.merge(store_test_df, on='ds')
                nonzero_actual = merged_df['y'].abs().where(merged_df['y'] != 0)
                merged_df['error'] = (merged_df['yhat'] - merged_df['y']).abs() / nonzero_actual
                mape = merged_df['error'].mean()

                # Keep this combination if it beats the best one so far
                # (a NaN MAPE, e.g. from an empty merge, never qualifies).
                if mape < best_mape:
                    best_mape = mape
                    best_params = {'changepoint_prior_scale': cp_scale,
                                   'seasonality_prior_scale': sp_scale,
                                   'seasonality_mode': season_mode}
                    best_merged_df = merged_df.copy()

    # Store the best hyperparameters and MAPE for the store
    best_mape_per_store[store] = {'best_params': best_params, 'best_mape': best_mape}
    if best_merged_df is not None:
        merged_dfs[store] = best_merged_df.assign(store=store)

# Print the best hyperparameters and MAPE for each store
for store, params in best_mape_per_store.items():
    print(f"Store {store}:")
    print('Best Hyperparameters:', params['best_params'])
    print('Best MAPE:', params['best_mape'])
    print()

# Hold-out predictions of every store's best model (previously only the last
# store processed ended up in df_prophet).
prophet_columns = ['ds', 'error', 'store', 'y', 'yhat']
if merged_dfs:
    df_prophet = pd.concat(list(merged_dfs.values()), ignore_index=True)[prophet_columns]
else:
    df_prophet = pd.DataFrame(columns=prophet_columns)

02:07:34 - cmdstanpy - INFO - Chain [1] start processing
02:07:34 - cmdstanpy - INFO - Chain [1] done processing
02:07:34 - cmdstanpy - INFO - Chain [1] start processing
02:07:34 - cmdstanpy - INFO - Chain [1] done processing
02:07:34 - cmdstanpy - INFO - Chain [1] start processing
02:07:34 - cmdstanpy - INFO - Chain [1] done processing
02:07:34 - cmdstanpy - INFO - Chain [1] start processing
02:07:34 - cmdstanpy - INFO - Chain [1] done processing
02:07:34 - cmdstanpy - INFO - Chain [1] start processing
02:07:34 - cmdstanpy - INFO - Chain [1] done processing
02:07:35 - cmdstanpy - INFO - Chain [1] start processing
02:07:35 - cmdstanpy - INFO - Chain [1] done processing
02:07:35 - cmdstanpy - INFO - Chain [1] start processing
02:07:35 - cmdstanpy - INFO - Chain [1] done processing
02:07:35 - cmdstanpy - INFO - Chain [1] start processing
02:07:35 - cmdstanpy - INFO - Chain [1] done processing
02:07:35 - cmdstanpy - INFO - Chain [1] start processing
02:07:35 - cmdstanpy - INFO - Chain [1]

Store 100.0:
Best Hyperparameters: {'changepoint_prior_scale': 0.1, 'seasonality_prior_scale': 0.1, 'seasonality_mode': 'additive'}
Best MAPE: 0.016885582886475375



### Predicting with best parameters

In [36]:
# Convert the 'date' column to datetime format (no-op when it already is)
df['date'] = pd.to_datetime(df['date'])

# Find the maximum week in the dataset
max_week = df['date'].max()

# The next 8 Fridays after the last observed date; identical for every store,
# so computed once outside the loop.
future_dates = pd.date_range(start=max_week + pd.DateOffset(days=1), periods=8, freq='W-FRI')

# Per-store forecasts are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0.
store_forecasts = []

# Iterate over the unique stores
for store, params in best_mape_per_store.items():
    # Extract the best hyperparameters for the store
    best_params = params['best_params']

    # Filter the dataset for the specific store. Deliberately NOT named
    # store_df: that name holds the full feature frame built earlier, and
    # overwriting it breaks re-running the cell that derives df from it.
    store_history = df.loc[df['store'] == store, ['date', 'sales']].rename(
        columns={'date': 'ds', 'sales': 'y'}
    )

    # Create and fit the Prophet model with the best hyperparameters
    model = Prophet(**best_params)
    model.fit(store_history)

    # Make predictions for the next 8 weeks
    future = pd.DataFrame({'ds': future_dates})
    forecast = model.predict(future)

    # Tag the forecast with its store
    forecast['store'] = store
    store_forecasts.append(forecast)

if store_forecasts:
    forecast_df = pd.concat(store_forecasts)
else:
    # No tuned store: keep the expected columns so the selection below works
    forecast_df = pd.DataFrame(columns=['ds', 'store', 'yhat'])

# Rename the columns and select the necessary columns
forecast_df = forecast_df.rename(columns={'ds': 'date', 'yhat': 'sales'})
forecast_df = forecast_df[['date', 'store', 'sales']]

# Print the forecast for the next 8 weeks
print(forecast_df)


02:07:37 - cmdstanpy - INFO - Chain [1] start processing
02:07:37 - cmdstanpy - INFO - Chain [1] done processing


        date  store         sales
0 2012-11-02  100.0  4.669403e+07
1 2012-11-09  100.0  4.960658e+07
2 2012-11-16  100.0  5.217169e+07
3 2012-11-23  100.0  5.379849e+07
4 2012-11-30  100.0  5.593464e+07
5 2012-12-07  100.0  5.992546e+07
6 2012-12-14  100.0  6.425177e+07
7 2012-12-21  100.0  6.489349e+07
