# **Sales and Demand Forecasting**

## **Setup**

In [5]:
import pandas as pd
import numpy as np
from random import gauss, shuffle
import os, warnings, gc, copy, itertools, subprocess
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import het_arch, acorr_ljungbox

from sklearn.metrics import mean_absolute_percentage_error as mape, mean_squared_error as mse
from scipy.stats import shapiro, probplot
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.cluster import KMeans

import lightgbm as lgb
from gensim.models import Word2Vec # requires scipy<1.3 due to deprecation of lin. alg. funcs.

# Apply the 'fivethirtyeight' matplotlib style sheet to all figures in the notebook.
plt.style.use('fivethirtyeight')
# Silence FutureWarning messages so they do not clutter cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
# Setup
# Base directory for data files. Note that the download cells further down
# spell out 'data/sales-forecasting/...' literally instead of building on
# this variable.
data = 'data/'

class CFG:
    """Display settings that are pushed into matplotlib's rcParams."""

    # Passed together as the 'figure.figsize' tuple.
    img_dim1, img_dim2 = 12, 7
    # Default text size ('font.size').
    fontsize = 8
    # Marker size and line width for line plots.
    marker, lines = 2, 1.5

# Push the CFG values into matplotlib's defaults for displayed figures
# (plt.rcParams.keys() lists every available parameter).
plt.rcParams['figure.figsize'] = (CFG.img_dim1, CFG.img_dim2)
plt.rcParams['font.size'] = CFG.fontsize
plt.rcParams['lines.markersize'] = CFG.marker
plt.rcParams['lines.linewidth'] = CFG.lines

In [9]:
# Get data from kaggle
from pathlib import Path
import zipfile
# Expanded path to ~/.kaggle/kaggle.json — presumably where the kaggle CLI
# looks for the API token (TODO confirm). It is not referenced by any other
# cell visible in this part of the notebook.
cred_path = Path('~/.kaggle/kaggle.json').expanduser()

def download_data(dataset, path, comp_check=True):
    """Download a Kaggle competition or dataset into ``path`` and unpack it.

    Parameters
    ----------
    dataset : str
        Competition name (passed to ``kaggle competitions download -c``) when
        ``comp_check`` is truthy, otherwise a dataset slug (passed to
        ``kaggle datasets download -d``).
    path : str or os.PathLike
        Target directory; created if it does not exist.
    comp_check : bool, default True
        Truthy selects the competitions command; falsy selects the datasets
        command, which is invoked with ``--unzip``.

    Raises
    ------
    subprocess.CalledProcessError
        If the kaggle CLI exits with a non-zero status.
    FileNotFoundError
        If the ``kaggle`` executable cannot be found on PATH.

    Notes
    -----
    Every ``*.zip`` directly inside ``path`` after the download is extracted
    in place and then deleted, including archives that were already there.
    """
    os.makedirs(path, exist_ok=True)

    # Build the command as an argument list and run it without a shell so that
    # spaces or shell metacharacters in `dataset` / `path` are passed through
    # verbatim instead of being split or interpreted.
    if comp_check:
        kaggle_call = ["kaggle", "competitions", "download", "-c", dataset, "-p", str(path)]
    else:
        kaggle_call = ["kaggle", "datasets", "download", "-d", dataset, "-p", str(path), "--unzip"]
    subprocess.run(kaggle_call, check=True)

    # Snapshot the archive list first: files are deleted inside the loop.
    for file in sorted(Path(path).glob("*.zip")):
        with zipfile.ZipFile(file, 'r') as zip_ref:
            zip_ref.extractall(path)
        print(f"Extracted {file}")
        file.unlink()
    print(f"Dataset '{dataset}' has been downloaded to '{path}'")

In [10]:
# Fetch the M5 Forecasting Accuracy competition files
dataset = "m5-forecasting-accuracy"
download_path = "data/sales-forecasting/m5/"

download_data(dataset=dataset, path=download_path, comp_check=True)

Downloading m5-forecasting-accuracy.zip to data/sales-forecasting/m5


100%|██████████| 45.8M/45.8M [00:17<00:00, 2.77MB/s]



Extracted data/sales-forecasting/m5/m5-forecasting-accuracy.zip
Dataset 'm5-forecasting-accuracy' has been downloaded to 'data/sales-forecasting/m5/'


In [12]:
# Fetch the partial Visuelle dataset (a Kaggle dataset, not a competition)
dataset = "konradb/partial-visuelle"
download_path = "data/sales-forecasting/partial-visuelle/"

download_data(dataset=dataset, path=download_path, comp_check=False)

Dataset URL: https://www.kaggle.com/datasets/konradb/partial-visuelle
License(s): unknown
Downloading partial-visuelle.zip to data/sales-forecasting/partial-visuelle


  0%|          | 0.00/247k [00:00<?, ?B/s]


Dataset 'konradb/partial-visuelle' has been downloaded to 'data/sales-forecasting/partial-visuelle/'


100%|██████████| 247k/247k [00:00<00:00, 284kB/s]


## **Data**

We will be working with data from the M5 competition titled ["M5 Forecasting Accuracy"](https://www.kaggle.com/c/m5-forecasting-accuracy).

>  The contest used hierarchical sales data from Walmart to forecast daily sales for the next 28 days. From the competition description page: "The data covers stores in three US States (California, Texas, and Wisconsin) and includes item level, department, product categories, and store details. In addition, it has explanatory variables such as price, promotions, day of the week, and special events".

In [None]:
# Load and format the data
# NOTE(review): 'data/m5' has no file extension and does not match the
# 'data/sales-forecasting/m5/' directory the download cell writes to —
# confirm the intended CSV path before running this cell.
xdat = pd.read_csv('data/m5')