In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, sum as _sum, to_date,
    lag, avg
)
from pyspark.sql.window import Window

# Obtain the Spark session for this notebook. getOrCreate() reuses a session
# that is already active in the kernel instead of starting a second one, so
# re-running this cell is harmless.
spark = (
    SparkSession.builder
    .appName("RetailDemandFeatureEngineering")
    .getOrCreate()
)


26/01/16 14:39:42 WARN Utils: Your hostname, MacBook-Air-3.local resolves to a loopback address: 127.0.0.1; using 10.0.0.22 instead (on interface en0)
26/01/16 14:39:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/16 14:39:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/16 14:39:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Load the cleaned retail sales dataset (Parquet) and print its schema so the
# columns used further down (invoicedate, quantity) can be checked by eye.
df = spark.read.format("parquet").load("../data/processed/retail_sales_clean")
df.printSchema()


root
 |-- invoiceno: string (nullable = true)
 |-- stockcode: string (nullable = true)
 |-- description: string (nullable = true)
 |-- quantity: long (nullable = true)
 |-- invoicedate: timestamp (nullable = true)
 |-- unitprice: double (nullable = true)
 |-- customerid: double (nullable = true)
 |-- country: string (nullable = true)
 |-- revenue: double (nullable = true)



In [3]:
# Collapse the sales rows to one row per calendar day: the invoice timestamp is
# truncated to a date, which is used directly as the grouping key, and the
# quantities sold on that date are summed into `daily_quantity`.
daily_demand = (
    df
    .groupBy(to_date(col("invoicedate")).alias("date"))
    .agg(_sum(col("quantity")).alias("daily_quantity"))
    .sort("date")  # chronological order, for display
)

# Preview the first ten days of the aggregated series.
daily_demand.show(n=10)


+----------+--------------+
|      date|daily_quantity|
+----------+--------------+
|2010-12-01|         26919|
|2010-12-02|         31329|
|2010-12-03|         16199|
|2010-12-05|         16450|
|2010-12-06|         21795|
|2010-12-07|         25220|
|2010-12-08|         23117|
|2010-12-09|         19930|
|2010-12-10|         21097|
|2010-12-12|         10603|
+----------+--------------+
only showing top 10 rows



In [4]:
# One global window over the whole daily series, ordered chronologically.
# There is no partitionBy, so Spark moves every row into a single partition to
# evaluate it (this is what the WindowExec warning in the output refers to).
window_spec = Window.orderBy("date")

# Lag and trailing-mean features, appended after the existing columns.
# NOTE(review): every offset below counts ROWS, not calendar days. The daily
# series has missing dates (e.g. 2010-12-04 and 2010-12-11 are absent from the
# preview above), so `lag_7` is "7 observed days ago", not "same weekday last
# week" -- confirm that this is the intended definition.
features = daily_demand.select(
    "*",
    lag("daily_quantity", 1).over(window_spec).alias("lag_1"),
    lag("daily_quantity", 7).over(window_spec).alias("lag_7"),
    # Both frames end at -1, so the current row's own value is excluded from
    # its rolling mean. Near the start of the series the frame holds fewer
    # rows and the mean is taken over whatever rows exist.
    avg("daily_quantity").over(window_spec.rowsBetween(-7, -1)).alias("rolling_7"),
    avg("daily_quantity").over(window_spec.rowsBetween(-14, -1)).alias("rolling_14"),
)

# Preview the first fifteen rows, including the NULL warm-up rows.
features.show(n=15)


26/01/16 14:40:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/16 14:40:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/16 14:40:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/16 14:40:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/16 14:40:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/16 14:40:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/16 1

+----------+--------------+-----+-----+------------------+------------------+
|      date|daily_quantity|lag_1|lag_7|         rolling_7|        rolling_14|
+----------+--------------+-----+-----+------------------+------------------+
|2010-12-01|         26919| NULL| NULL|              NULL|              NULL|
|2010-12-02|         31329|26919| NULL|           26919.0|           26919.0|
|2010-12-03|         16199|31329| NULL|           29124.0|           29124.0|
|2010-12-05|         16450|16199| NULL|24815.666666666668|24815.666666666668|
|2010-12-06|         21795|16450| NULL|          22724.25|          22724.25|
|2010-12-07|         25220|21795| NULL|           22538.4|           22538.4|
|2010-12-08|         23117|25220| NULL|22985.333333333332|22985.333333333332|
|2010-12-09|         19930|23117|26919| 23004.14285714286| 23004.14285714286|
|2010-12-10|         21097|19930|31329|22005.714285714286|         22619.875|
|2010-12-12|         10603|21097|16199|           20544.0|22450.

In [5]:
# Persist the engineered features as Parquet. Overwrite mode replaces whatever
# an earlier run left at this path, so the cell can be re-run safely.
features.write.parquet("../data/processed/daily_features", mode="overwrite")


26/01/16 14:40:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/16 14:40:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/16 14:40:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/16 14:40:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/16 14:40:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/16 14:40:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/16 1