# Step 4: Advanced Modeling – RandomForest Regressor

This notebook trains a RandomForest regression model using Spark MLlib
to capture non-linear demand patterns and compares its performance
against the baseline Linear Regression model.


Start Spark & Load Features

In [1]:
from pyspark.sql import SparkSession

# getOrCreate() reuses an already-active session when there is one,
# otherwise it starts a new session under this application name.
spark = (
    SparkSession.builder
    .appName("RetailDemandAdvancedModel")
    .getOrCreate()
)

# Parquet dataset holding one row of engineered features per day.
features_path = "../data/processed/daily_features"
df_model = spark.read.parquet(features_path)
df_model.show(5)


26/01/19 10:31:00 WARN Utils: Your hostname, MacBook-Air-3.local resolves to a loopback address: 127.0.0.1; using 10.0.0.22 instead (on interface en0)
26/01/19 10:31:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/19 10:31:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/19 10:31:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
26/01/19 10:31:01 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
26/01/19 10:31:01 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


+----------+--------------+-----------+------------+-----+----+-----+-----+------------------+------------------+
|      date|daily_quantity|day_of_week|week_of_year|month|year|lag_1|lag_7|         rolling_7|        rolling_14|
+----------+--------------+-----------+------------+-----+----+-----+-----+------------------+------------------+
|2010-12-09|         19930|          5|          49|   12|2010|23117|26919| 23004.14285714286| 23004.14285714286|
|2010-12-10|         21097|          6|          49|   12|2010|19930|31329|22005.714285714286|         22619.875|
|2010-12-12|         10603|          1|          49|   12|2010|21097|16199|           20544.0|22450.666666666668|
|2010-12-13|         17727|          2|          50|   12|2010|10603|16450|19744.571428571428|           21265.9|
|2010-12-14|         20284|          3|          50|   12|2010|17727|21795|           19927.0| 20944.18181818182|
+----------+--------------+-----------+------------+-----+----+-----+-----+------------------+------------------+

Verify Columns (Sanity Check)

In [2]:
# Print each column's name, type and nullability for the loaded feature table,
# so the inputs expected by the later cells can be checked by eye.
df_model.printSchema()


root
 |-- date: date (nullable = true)
 |-- daily_quantity: long (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- week_of_year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- lag_1: long (nullable = true)
 |-- lag_7: long (nullable = true)
 |-- rolling_7: double (nullable = true)
 |-- rolling_14: double (nullable = true)



Assemble Feature Vector

In [3]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

# Model inputs, listed once so the null filter and the assembler cannot drift apart.
feature_cols = [
    "lag_1",
    "lag_7",
    "rolling_7",
    "rolling_14",
    "day_of_week",
    "week_of_year",
    "month",
]
label_col = "daily_quantity"

# Remove incomplete rows BEFORE assembling. VectorAssembler defaults to
# handleInvalid="error" and fails on null/NaN inputs, and a dropna() applied
# after the transform cannot detect missing feature values because they would
# sit inside the assembled vector rather than in a nullable column.
df_complete = df_model.dropna(subset=feature_cols + [label_col])

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Keep only what MLlib needs: the feature vector and a double-typed label.
df_final = assembler.transform(df_complete).select(
    "features",
    col(label_col).cast("double").alias("label"),
)

df_final.show(5)


+--------------------+-------+
|            features|  label|
+--------------------+-------+
|[23117.0,26919.0,...|19930.0|
|[19930.0,31329.0,...|21097.0|
|[21097.0,16199.0,...|10603.0|
|[10603.0,16450.0,...|17727.0|
|[17727.0,21795.0,...|20284.0|
+--------------------+-------+
only showing top 5 rows



Train / Test Split

In [4]:
# 80/20 random split; the seed is fixed so reruns use the same sampling seed.
# NOTE(review): the rows are daily observations with lag/rolling features, so a
# random split interleaves past and future days between train and test. A
# chronological hold-out may give a more honest error estimate — confirm.
splits = df_final.randomSplit([0.8, 0.2], seed=42)
train_df, test_df = splits

# Report the size of each partition (each count triggers a Spark job).
for caption, split_df in (("Train rows:", train_df), ("Test rows:", test_df)):
    print(caption, split_df.count())


Train rows: 252
Test rows: 46


Train RandomForest Regressor

In [5]:
from pyspark.ml.regression import RandomForestRegressor

# Hyperparameters gathered in one place: 50 trees, each at most 5 levels deep,
# with a fixed seed passed to the estimator.
rf_params = {
    "featuresCol": "features",
    "labelCol": "label",
    "numTrees": 50,
    "maxDepth": 5,
    "seed": 42,
}
rf = RandomForestRegressor(**rf_params)

# Fit on the training partition only; the test partition stays untouched.
rf_model = rf.fit(train_df)


Generate Predictions

In [6]:
# Score the held-out rows; transform() appends a "prediction" column.
rf_predictions = rf_model.transform(test_df)

# Preview actual vs. predicted values for a handful of rows.
label_vs_prediction = rf_predictions.select("label", "prediction")
label_vs_prediction.show(5)


+-------+------------------+
|  label|        prediction|
+-------+------------------+
|13595.0|13517.511094932686|
|13415.0|13850.896159152366|
|14940.0|16557.181667683344|
|12263.0| 16219.34044287519|
|21589.0|13373.532894876222|
+-------+------------------+
only showing top 5 rows



Evaluate RandomForest (RMSE & MAE)

In [7]:
from pyspark.ml.evaluation import RegressionEvaluator

# Both evaluators read the same columns and differ only in the metric,
# so they are built from one shared configuration.
rmse_eval, mae_eval = (
    RegressionEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric,
    )
    for metric in ("rmse", "mae")
)

# Score the RandomForest predictions on the test partition.
rmse_rf = rmse_eval.evaluate(rf_predictions)
mae_rf = mae_eval.evaluate(rf_predictions)

# Bare tuple as the last expression so the notebook displays both metrics.
rmse_rf, mae_rf


(12348.630176079307, 6313.760869699056)

Save Predictions

In [8]:
# Collect only the two columns needed into a pandas frame on the driver,
# then write them out as CSV without the pandas index.
predictions_pdf = rf_predictions.select("label", "prediction").toPandas()
predictions_pdf.to_csv("../data/processed/rf_predictions.csv", index=False)
