## Install ForecastFlowML

In [0]:
%pip install git+https://github.com/canerturkseven/forecastflowml

## Some Spark settings

In [0]:
# `spark` is the session object supplied by the notebook environment; it is
# not created anywhere in this notebook.
# The demo Databricks cluster has 2 cores, so use 2 shuffle partitions.
spark.conf.set("spark.sql.shuffle.partitions", 2)
# Enable PyArrow for faster conversion to pandas DataFrames.
# The older key "spark.sql.execution.arrow.enabled" has been deprecated since
# Spark 3.0 in favour of the PySpark-specific key used here.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
# Turn adaptive query execution off so that small tasks are not combined:
# we want all models to run in parallel even though each one is small.
spark.conf.set("spark.sql.adaptive.enabled", "false")

## Import packages

In [0]:
import mlflow
from forecastflowml.meta_model import MetaModel
from forecastflowml.preprocessing import FeatureExtractor
from forecastflowml.data.loader import load_walmart_m5
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

## Load sample dataset (Walmart M5 Kaggle)

In [0]:
# Load the sample dataset through the library's loader, using the notebook's
# Spark session.
df = load_walmart_m5(spark)
# Preview the first 10 rows without truncating long cell values.
df.show(n=10, truncate=False)
# Row count; as the last expression of the cell it becomes the cell output.
df.count()

# Feature Engineering

## Initialize feature extractor model

In [0]:
# Column pair used as `partition_cols` in both target-encoding specs and as
# one of the `history_lengths` entries.
series_keys = ["item_id", "store_id"]
# Lags at whole-week distances (in days); reused by several feature specs.
weekly_lags = [7, 14, 21, 28]

preprocessor = FeatureExtractor(
    id_col="id",
    date_col="date",
    # NOTE(review): MetaModel further down is given date_frequency="days";
    # confirm against the library which spelling each class expects.
    date_frequency="day",
    target_col="sales",
    target_encodings=[
        # Mean over 7/14/28-wide windows at each weekly lag.
        dict(
            partition_cols=list(series_keys),
            windows=[7, 14, 28],
            lags=list(weekly_lags),
            functions=["mean"],
        ),
        # Window of 1 at each weekly lag and the two lags after it:
        # 7, 8, 9, 14, 15, 16, 21, 22, 23, 28, 29, 30.
        dict(
            partition_cols=list(series_keys),
            windows=[1],
            lags=[anchor + offset for anchor in weekly_lags for offset in range(3)],
            functions=["mean"],
        ),
    ],
    date_features=[
        "day_of_month",
        "day_of_week",
        "week_of_year",
        "quarter",
        "month",
        "year",
    ],
    history_lengths=["item_id", list(series_keys)],
    encode_events=dict(cols=["christmas"], window=15),
    count_consecutive_values=dict(value=0, lags=list(weekly_lags)),
)

## Create features and save dataframe as intermediate step

In [0]:
# Run the feature extractor and keep a locally checkpointed copy of the
# result to use as the intermediate dataset for the following cells.
df_preprocessed = (
    preprocessor
    .transform(df)
    .localCheckpoint()
)
# Show five rows dated 2014-01-01 or later, untruncated and in vertical
# (one column per line) layout.
(
    df_preprocessed
    .filter(F.col("date") >= "2014-01-01")
    .show(n=5, truncate=False, vertical=True)
)

# Training

## Split dataset into train and test

In [0]:
# Last date (inclusive) that belongs to the training set. It is defined once
# so the two complementary filters below cannot drift out of sync.
TRAIN_END_DATE = "2016-05-22"

# Train: everything up to and including the cut-off date.
df_train = df_preprocessed.filter(F.col("date") <= TRAIN_END_DATE)
# Test: everything strictly after the cut-off date.
df_test = df_preprocessed.filter(F.col("date") > TRAIN_END_DATE)

## Initialize meta model

In [0]:
def hyperparam_space(trial):
    """Return the hyperparameter set for one Optuna trial.

    Every range below is degenerate (a single category, or equal lower and
    upper bounds), so each trial yields the same fixed values.

    Args:
        trial: Object exposing ``suggest_categorical`` and ``suggest_int``.

    Returns:
        dict with the keys "objective", "num_leaves" and "n_estimators".
    """
    return {
        "objective": trial.suggest_categorical("objective", ["tweedie"]),
        "num_leaves": trial.suggest_int("num_leaves", 50, 50),
        "n_estimators": trial.suggest_int("n_estimators", 30, 30),
    }


model = MetaModel(
    # --- dataset parameters ---
    # column to slice dataframe
    group_col="state_id",
    # columns to use as time series identifier
    id_cols=["id"],
    # date column
    date_col="date",
    # target column
    target_col="sales",
    # date frequency (days, weeks, months, years) of dataset
    date_frequency="days",
    # --- model parameters ---
    # horizon per model
    model_horizon=7,
    # total forecast horizon
    max_forecast_horizon=28,
    # extra lags to include as features based on model horizon
    lag_feature_range=2,
    # --- cross validation and optimisation parameters ---
    # number of time-based cv splits
    n_cv_splits=1,
    # number of dates between each cv folds
    cv_step_length=28,
    # total number of optuna trials
    max_hyperparam_evals=1,
    # sklearn scoring metric
    scoring="neg_mean_squared_error",
    # --- optuna hyperparameter space ---
    hyperparam_space_fn=hyperparam_space,
    # --- mlflow parameters ---
    # MLflow tracking URI. Provide "./mlruns" in case of running locally.
    tracking_uri="databricks",
)

## Train the model

In [0]:
# Train the meta model on the training split only; the test split is held
# back for the prediction step at the end of the notebook.
model.train(df_train)

In [0]:
# Display the cross-validation forecast graph stored under the "TX" key
# (presumably a `state_id` value, since that is the model's group column).
model.cv_forecast_graph["TX"]

In [0]:
# Display the cross-validation forecast graph stored under the "CA" key
# (presumably a `state_id` value, since that is the model's group column).
model.cv_forecast_graph["CA"]

In [0]:
# Display the feature-importance graph stored under the "TX" key for the
# entry named "horizon_1".
model.feature_importance_graphs["TX"]["horizon_1"]

## All artifacts/metrics can be found in the MLflow UI
<img src="/files/tables/1-2.PNG" width=50% height=50%>

## Models and feature schemas
<img src="/files/tables/3.PNG" width=50% height=50%>

## Forecasts on time-based cross-validation folds
<img src="/files/tables/4.PNG" width=70% height=70%>

## Feature importance
<img src="/files/tables/5.PNG" width=70% height=70%>

## Bayesian optimisation visualisations
<img src="/files/tables/6.PNG" width=70% height=70%>

# Prediction

## Load meta model

In [0]:
# URI of the "meta_model" artifact under the run recorded on `model`.
meta_model_uri = f"runs:/{model.run_id}/meta_model"
# Load it back through MLflow's generic pyfunc interface.
loaded_model = mlflow.pyfunc.load_model(meta_model_uri)

## Make predictions

In [0]:
# Predict on the held-out test split with the reloaded model, then print the
# result without truncating cell values.
df_forecast = loaded_model.predict(df_test)
df_forecast.show(truncate=False)