Start Spark & Load Clean Data

In [6]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("RetailDemandFeatureEngineering") \
    .getOrCreate()

df = spark.read.parquet("../data/processed/retail_sales_clean")
df.show(5)


+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-------+
|invoiceno|stockcode|         description|quantity|        invoicedate|unitprice|customerid|       country|revenue|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-------+
|   550519|    22698|PINK REGENCY TEAC...|       6|2011-04-19 09:31:00|     2.95|   13141.0|United Kingdom|   17.7|
|   550519|    22699|ROSES REGENCY TEA...|       6|2011-04-19 09:31:00|     2.95|   13141.0|United Kingdom|   17.7|
|   550519|    22960|JAM MAKING SET WI...|       6|2011-04-19 09:31:00|     4.25|   13141.0|United Kingdom|   25.5|
|   550519|    20725|LUNCH BAG RED RET...|      10|2011-04-19 09:31:00|     1.65|   13141.0|United Kingdom|   16.5|
|   550519|   85099B|JUMBO BAG RED RET...|      10|2011-04-19 09:31:00|     2.08|   13141.0|United Kingdom|   20.8|
+---------+---------+--------------------+--------+-------------------+-

Create Sales Date Column

In [7]:
from pyspark.sql.functions import to_date

df = df.withColumn(
    "date",
    to_date("invoicedate")
)

df.select("invoicedate", "date").show(5)


+-------------------+----------+
|        invoicedate|      date|
+-------------------+----------+
|2011-04-19 09:31:00|2011-04-19|
|2011-04-19 09:31:00|2011-04-19|
|2011-04-19 09:31:00|2011-04-19|
|2011-04-19 09:31:00|2011-04-19|
|2011-04-19 09:31:00|2011-04-19|
+-------------------+----------+
only showing top 5 rows



Daily Demand Aggregation

In [8]:
from pyspark.sql.functions import sum as spark_sum

daily_demand = df.groupBy("date").agg(
    spark_sum("quantity").alias("daily_quantity")
)

daily_demand.orderBy("date").show(10)


+----------+--------------+
|      date|daily_quantity|
+----------+--------------+
|2010-12-01|         26919|
|2010-12-02|         31329|
|2010-12-03|         16199|
|2010-12-05|         16450|
|2010-12-06|         21795|
|2010-12-07|         25220|
|2010-12-08|         23117|
|2010-12-09|         19930|
|2010-12-10|         21097|
|2010-12-12|         10603|
+----------+--------------+
only showing top 10 rows



Calendar Features

In [9]:
from pyspark.sql.functions import dayofweek, weekofyear, month, year

daily_features = daily_demand \
    .withColumn("day_of_week", dayofweek("date")) \
    .withColumn("week_of_year", weekofyear("date")) \
    .withColumn("month", month("date")) \
    .withColumn("year", year("date"))

daily_features.show(5)


+----------+--------------+-----------+------------+-----+----+
|      date|daily_quantity|day_of_week|week_of_year|month|year|
+----------+--------------+-----------+------------+-----+----+
|2011-05-06|         19947|          6|          18|    5|2011|
|2011-04-27|         18249|          4|          17|    4|2011|
|2011-05-22|         12749|          1|          20|    5|2011|
|2011-06-05|         13501|          1|          22|    6|2011|
|2011-05-11|         18913|          4|          19|    5|2011|
+----------+--------------+-----------+------------+-----+----+
only showing top 5 rows



Lag Features (Time Dependency)

In [10]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag

window_spec = Window.orderBy("date")

daily_features = daily_features \
    .withColumn("lag_1", lag("daily_quantity", 1).over(window_spec)) \
    .withColumn("lag_7", lag("daily_quantity", 7).over(window_spec))

daily_features.select(
    "date", "daily_quantity", "lag_1", "lag_7"
).show(10)


+----------+--------------+-----+-----+
|      date|daily_quantity|lag_1|lag_7|
+----------+--------------+-----+-----+
|2010-12-01|         26919| NULL| NULL|
|2010-12-02|         31329|26919| NULL|
|2010-12-03|         16199|31329| NULL|
|2010-12-05|         16450|16199| NULL|
|2010-12-06|         21795|16450| NULL|
|2010-12-07|         25220|21795| NULL|
|2010-12-08|         23117|25220| NULL|
|2010-12-09|         19930|23117|26919|
|2010-12-10|         21097|19930|31329|
|2010-12-12|         10603|21097|16199|
+----------+--------------+-----+-----+
only showing top 10 rows



26/01/19 10:16:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:16:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:16:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:16:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:16:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:16:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 1

Rolling Averages (Smoothing)

In [11]:
from pyspark.sql.functions import avg

rolling_7 = window_spec.rowsBetween(-7, -1)
rolling_14 = window_spec.rowsBetween(-14, -1)

daily_features = daily_features \
    .withColumn("rolling_7", avg("daily_quantity").over(rolling_7)) \
    .withColumn("rolling_14", avg("daily_quantity").over(rolling_14))

daily_features.select(
    "date", "daily_quantity", "rolling_7", "rolling_14"
).show(10)


+----------+--------------+------------------+------------------+
|      date|daily_quantity|         rolling_7|        rolling_14|
+----------+--------------+------------------+------------------+
|2010-12-01|         26919|              NULL|              NULL|
|2010-12-02|         31329|           26919.0|           26919.0|
|2010-12-03|         16199|           29124.0|           29124.0|
|2010-12-05|         16450|24815.666666666668|24815.666666666668|
|2010-12-06|         21795|          22724.25|          22724.25|
|2010-12-07|         25220|           22538.4|           22538.4|
|2010-12-08|         23117|22985.333333333332|22985.333333333332|
|2010-12-09|         19930| 23004.14285714286| 23004.14285714286|
|2010-12-10|         21097|22005.714285714286|         22619.875|
|2010-12-12|         10603|           20544.0|22450.666666666668|
+----------+--------------+------------------+------------------+
only showing top 10 rows



26/01/19 10:17:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:17:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:17:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:17:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:17:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:17:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 1

Drop Null Rows (Lag Safety)

In [12]:
daily_features_clean = daily_features.dropna()
daily_features_clean.count()


26/01/19 10:17:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:17:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:17:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:17:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:17:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:17:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


298

Save Feature Dataset

In [13]:
daily_features_clean.write \
    .mode("overwrite") \
    .parquet("../data/processed/daily_features")


26/01/19 10:18:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:18:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:18:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:18:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:18:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 10:18:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 1