In [9]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder \
    .appName("RetailDemandBaselineModel") \
    .getOrCreate()


In [10]:
df = spark.read.parquet("../data/processed/daily_features")
df.printSchema()
df.show(5)


root
 |-- date: date (nullable = true)
 |-- daily_quantity: long (nullable = true)
 |-- lag_1: long (nullable = true)
 |-- lag_7: long (nullable = true)
 |-- rolling_7: double (nullable = true)
 |-- rolling_14: double (nullable = true)

+----------+--------------+-----+-----+------------------+------------------+
|      date|daily_quantity|lag_1|lag_7|         rolling_7|        rolling_14|
+----------+--------------+-----+-----+------------------+------------------+
|2010-12-01|         26919| NULL| NULL|              NULL|              NULL|
|2010-12-02|         31329|26919| NULL|           26919.0|           26919.0|
|2010-12-03|         16199|31329| NULL|           29124.0|           29124.0|
|2010-12-05|         16450|16199| NULL|24815.666666666668|24815.666666666668|
|2010-12-06|         21795|16450| NULL|          22724.25|          22724.25|
+----------+--------------+-----+-----+------------------+------------------+
only showing top 5 rows



In [11]:
df_model = df.dropna()
df_model.count()


298

In [13]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

assembler = VectorAssembler(
    inputCols=["lag_1", "lag_7", "rolling_7", "rolling_14"],
    outputCol="features"
)

df_final = assembler.transform(df_model) \
    .select(
        "features",
        col("daily_quantity").cast("double").alias("label")
    ) \
    .dropna()


In [14]:
df_final.printSchema()
df_final.show(5)
df_final.count()


root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = true)

+--------------------+-------+
|            features|  label|
+--------------------+-------+
|[23117.0,26919.0,...|19930.0|
|[19930.0,31329.0,...|21097.0|
|[21097.0,16199.0,...|10603.0|
|[10603.0,16450.0,...|17727.0|
|[17727.0,21795.0,...|20284.0|
+--------------------+-------+
only showing top 5 rows



298

In [15]:
train_df, test_df = df_final.randomSplit([0.8, 0.2], seed=42)

train_df.count(), test_df.count()


(252, 46)

In [19]:
train_df.count()
test_df.count()


46

In [20]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


In [21]:
lr = LinearRegression(
    featuresCol="features",
    labelCol="label"
)

lr_model = lr.fit(train_df)


26/01/16 14:58:48 WARN Instrumentation: [130839e5] regParam is zero, which might cause numerical instability and overfitting.
26/01/16 14:58:48 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
26/01/16 14:58:48 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
26/01/16 14:58:48 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [22]:
predictions = lr_model.transform(test_df)
predictions.select("label", "prediction").show(10)


+-------+------------------+
|  label|        prediction|
+-------+------------------+
|13595.0|  9421.20239955242|
|13415.0|13530.032797115377|
|14940.0|17081.160018870676|
|12263.0|11863.839144614058|
|21589.0| 12260.06785286125|
|10244.0|15296.259006135893|
|13501.0|  11659.9542576585|
|19771.0|11177.299374375534|
| 3431.0| 13662.50158078117|
|12145.0|11381.979559463389|
+-------+------------------+
only showing top 10 rows



In [23]:
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse"
)

rmse = evaluator.evaluate(predictions)
print("RMSE:", rmse)


RMSE: 12557.856535944016
