# Feature Importance

This quick guide shows how to retrieve feature importances from `ForecastFlowML`.

## Import packages

In [1]:
from forecastflowml import ForecastFlowML
from forecastflowml import FeatureExtractor
from forecastflowml.data.loader import load_walmart_m5
from lightgbm import LGBMRegressor
from pyspark.sql import SparkSession

## Initialize Spark

In [2]:
# Local Spark session for this guide: 4 local worker threads, 4 GB of driver
# memory and 4 shuffle partitions.
spark = (
    SparkSession.builder.master("local[4]")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "4")
    # Arrow-based columnar transfer between Spark and pandas. The option name is
    # "spark.sql.execution.arrow.pyspark.enabled" (Spark >= 3.0); the previously
    # used "spark.sql.execution.pyarrow.enabled" is not a Spark option, so it
    # was accepted silently and had no effect.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

## Sample Dataset

In [3]:
# Load the Walmart M5 sample through the package's loader and preview the
# first ten rows of the resulting Spark DataFrame.
df = load_walmart_m5(spark)
df.show(n=10)

+--------------------+-----------+-------+------+--------+--------+----------+-----+
|                  id|    item_id|dept_id|cat_id|store_id|state_id|      date|sales|
+--------------------+-----------+-------+------+--------+--------+----------+-----+
|FOODS_1_013_TX_2_...|FOODS_1_013|FOODS_1| FOODS|    TX_2|      TX|2015-01-01|  0.0|
|FOODS_1_013_TX_2_...|FOODS_1_013|FOODS_1| FOODS|    TX_2|      TX|2015-01-02|  0.0|
|FOODS_1_013_TX_2_...|FOODS_1_013|FOODS_1| FOODS|    TX_2|      TX|2015-01-03|  0.0|
|FOODS_1_013_TX_2_...|FOODS_1_013|FOODS_1| FOODS|    TX_2|      TX|2015-01-04|  0.0|
|FOODS_1_013_TX_2_...|FOODS_1_013|FOODS_1| FOODS|    TX_2|      TX|2015-01-05|  0.0|
|FOODS_1_013_TX_2_...|FOODS_1_013|FOODS_1| FOODS|    TX_2|      TX|2015-01-06|  0.0|
|FOODS_1_013_TX_2_...|FOODS_1_013|FOODS_1| FOODS|    TX_2|      TX|2015-01-07|  0.0|
|FOODS_1_013_TX_2_...|FOODS_1_013|FOODS_1| FOODS|    TX_2|      TX|2015-01-08|  0.0|
|FOODS_1_013_TX_2_...|FOODS_1_013|FOODS_1| FOODS|    TX_2|      T

## Feature Engineering

In [4]:
# Build the training features: lags of the target column plus calendar
# features derived from the date column.
feature_extractor = FeatureExtractor(
    id_col="id",
    date_col="date",
    target_col="sales",
    # Four weekly lags: 7, 14, 21 and 28.
    lag_window_features={"lag": list(range(7, 29, 7))},
    date_features=[
        "day_of_month", "day_of_week", "week_of_year", "week_of_month",
        "weekend", "quarter", "month", "year",
    ],
)

# Checkpoint the engineered frame locally so the later training cells reuse
# it, then preview the first ten rows.
df_train = feature_extractor.transform(df).localCheckpoint()
df_train.show(n=10)

+--------------------+-----------+-------+------+--------+--------+----------+-----+-----+------+------+------+------------+-----------+------------+-------------+-------+-------+-----+----+
|                  id|    item_id|dept_id|cat_id|store_id|state_id|      date|sales|lag_7|lag_14|lag_21|lag_28|day_of_month|day_of_week|week_of_year|week_of_month|weekend|quarter|month|year|
+--------------------+-----------+-------+------+--------+--------+----------+-----+-----+------+------+------+------------+-----------+------------+-------------+-------+-------+-----+----+
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-01|  0.0| null|  null|  null|  null|           1|          5|           1|            1|      0|      1|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-02|  0.0| null|  null|  null|  null|           2|          6|           1|            1|      0|      1|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FO

## Feature Importance

In [5]:
# Configure the forecaster used by every training variant below.
forecast_flow = ForecastFlowML(
    # Column roles: one group per store, one series per id.
    group_col="store_id",
    id_col="id",
    date_col="date",
    target_col="sales",
    # Daily data; each model covers 7 horizon steps, up to 28 steps in total
    # (the importance tables below list horizons [1..7] through [22..28]).
    date_frequency="days",
    model_horizon=7,
    max_forecast_horizon=28,
    # Base estimator with LightGBM's default hyper-parameters.
    model=LGBMRegressor(),
)

### PySpark DataFrame with Distributed Results

In [6]:
# Train on the Spark DataFrame and keep the returned result as a (locally
# checkpointed) Spark DataFrame, then hand it back to the forecaster to
# read the feature importances.
trained_models = (
    forecast_flow
    .train(df_train)
    .localCheckpoint()
)
forecast_flow.get_feature_importance(trained_models)

Unnamed: 0,group,forecast_horizon,feature,importance
0,CA_1,"[1, 2, 3, 4, 5, 6, 7]",day_of_week,331.0
1,CA_1,"[1, 2, 3, 4, 5, 6, 7]",day_of_month,711.0
2,CA_1,"[1, 2, 3, 4, 5, 6, 7]",week_of_year,653.0
3,CA_1,"[1, 2, 3, 4, 5, 6, 7]",month,53.0
4,CA_1,"[1, 2, 3, 4, 5, 6, 7]",week_of_month,0.0
...,...,...,...,...
139,WI_1,"[22, 23, 24, 25, 26, 27, 28]",week_of_month,0.0
140,WI_1,"[22, 23, 24, 25, 26, 27, 28]",weekend,74.0
141,WI_1,"[22, 23, 24, 25, 26, 27, 28]",year,73.0
142,WI_1,"[22, 23, 24, 25, 26, 27, 28]",quarter,0.0


### PySpark DataFrame with Local Results

In [7]:
# Same Spark input, but with local_result=True the return value of train() is
# not kept here; presumably the fitted models are stored on `forecast_flow`
# itself (TODO confirm against the ForecastFlowML docs).
forecast_flow.train(df_train, local_result=True)
# No argument is passed this time, unlike the distributed variant above.
forecast_flow.get_feature_importance()

Unnamed: 0,group,forecast_horizon,feature,importance
0,CA_1,"[1, 2, 3, 4, 5, 6, 7]",day_of_week,331
1,CA_1,"[1, 2, 3, 4, 5, 6, 7]",day_of_month,711
2,CA_1,"[1, 2, 3, 4, 5, 6, 7]",week_of_year,653
3,CA_1,"[1, 2, 3, 4, 5, 6, 7]",month,53
4,CA_1,"[1, 2, 3, 4, 5, 6, 7]",week_of_month,0
...,...,...,...,...
139,WI_1,"[22, 23, 24, 25, 26, 27, 28]",week_of_month,0
140,WI_1,"[22, 23, 24, 25, 26, 27, 28]",weekend,74
141,WI_1,"[22, 23, 24, 25, 26, 27, 28]",year,73
142,WI_1,"[22, 23, 24, 25, 26, 27, 28]",quarter,0


### Pandas DataFrame

In [8]:
# Train from a pandas DataFrame: toPandas() collects the engineered features
# to the driver, and the Spark session is passed explicitly via `spark=`
# (presumably because training still runs on Spark -- TODO confirm).
forecast_flow.train(df_train.toPandas(), spark=spark)
# As in the local-result variant, importances are read without an argument.
forecast_flow.get_feature_importance()

Unnamed: 0,group,forecast_horizon,feature,importance
0,CA_1,"[1, 2, 3, 4, 5, 6, 7]",day_of_week,331
1,CA_1,"[1, 2, 3, 4, 5, 6, 7]",day_of_month,711
2,CA_1,"[1, 2, 3, 4, 5, 6, 7]",week_of_year,653
3,CA_1,"[1, 2, 3, 4, 5, 6, 7]",month,53
4,CA_1,"[1, 2, 3, 4, 5, 6, 7]",week_of_month,0
...,...,...,...,...
139,WI_1,"[22, 23, 24, 25, 26, 27, 28]",week_of_month,0
140,WI_1,"[22, 23, 24, 25, 26, 27, 28]",weekend,74
141,WI_1,"[22, 23, 24, 25, 26, 27, 28]",year,73
142,WI_1,"[22, 23, 24, 25, 26, 27, 28]",quarter,0
