
## Baseline Modeling

This notebook trains a baseline Linear Regression model using Spark MLlib.
To validate results, a Pandas + scikit-learn implementation is also used
to compare performance on an in-memory dataset.



In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create (or reuse) the Spark session used by every Spark cell in this notebook.
spark = (
    SparkSession.builder
    .appName("RetailDemandBaselineModel")
    .getOrCreate()
)


26/01/16 17:47:13 WARN Utils: Your hostname, MacBook-Air-3.local resolves to a loopback address: 127.0.0.1; using 10.0.0.22 instead (on interface en0)
26/01/16 17:47:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/16 17:47:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/16 17:47:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
26/01/16 17:47:13 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
# Read the daily feature table from Parquet, then show its schema and first five rows.
df = spark.read.parquet("../data/processed/daily_features")
df.printSchema()
df.show(5)


root
 |-- sales_date: date (nullable = true)
 |-- daily_quantity: long (nullable = true)
 |-- daily_revenue: double (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- week_of_year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- lag_1_day: long (nullable = true)
 |-- lag_7_day: long (nullable = true)
 |-- rolling_7_day_avg: double (nullable = true)

+----------+--------------+------------------+-----------+------------+-----+----+---------+---------+------------------+
|sales_date|daily_quantity|     daily_revenue|day_of_week|week_of_year|month|year|lag_1_day|lag_7_day| rolling_7_day_avg|
+----------+--------------+------------------+-----------+------------+-----+----+---------+---------+------------------+
|2010-12-09|         19930| 53586.18000000004|          5|          49|   12|2010|    23117|    26919| 23004.14285714286|
|2010-12-10|         21097| 59182.92000000025|          6|          49|   12|2010|    

In [3]:
# Drop every row that contains a null in any column and report how many rows remain.
# NOTE(review): presumably this removes the earliest dates, where the lag and
# rolling-window features are undefined — confirm against the feature pipeline.
df_model = df.dropna()
df_model.count()


298

In [4]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

# Pack the six predictor columns into the single "features" vector column that
# the regression model is configured to read.
assembler = VectorAssembler(
    inputCols=[
        "lag_1_day", "lag_7_day", "rolling_7_day_avg",
        "day_of_week", "week_of_year", "month",
    ],
    outputCol="features",
)

# Keep only the feature vector and the target (cast to double, exposed as "label").
df_final = (
    assembler.transform(df_model)
    .select("features", col("daily_quantity").cast("double").alias("label"))
    .dropna()
)

df_final.printSchema()
df_final.show(5)


root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = true)

+--------------------+-------+
|            features|  label|
+--------------------+-------+
|[23117.0,26919.0,...|19930.0|
|[19930.0,31329.0,...|21097.0|
|[21097.0,16199.0,...|10603.0|
|[10603.0,16450.0,...|17727.0|
|[17727.0,21795.0,...|20284.0|
+--------------------+-------+
only showing top 5 rows



In [5]:
# Show the raw predictor columns so they can be cross-checked against the
# assembled feature vectors.
df_model.select(
    "lag_1_day", "lag_7_day", "rolling_7_day_avg",
    "day_of_week", "week_of_year", "month",
).show(5)


+---------+---------+------------------+-----------+------------+-----+
|lag_1_day|lag_7_day| rolling_7_day_avg|day_of_week|week_of_year|month|
+---------+---------+------------------+-----------+------------+-----+
|    23117|    26919| 23004.14285714286|          5|          49|   12|
|    19930|    31329|22005.714285714286|          6|          49|   12|
|    21097|    16199|           20544.0|          1|          49|   12|
|    10603|    16450|19744.571428571428|          2|          50|   12|
|    17727|    21795|           19927.0|          3|          50|   12|
+---------+---------+------------------+-----------+------------+-----+
only showing top 5 rows



In [6]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

# NOTE(review): this cell repeats the feature assembly already performed in the
# earlier assembly cell (same inputs, same outputs); it rebuilds `assembler` and
# `df_final` with identical contents and is a candidate for removal.
assembler = VectorAssembler(
    inputCols=[
        "lag_1_day", "lag_7_day", "rolling_7_day_avg",
        "day_of_week", "week_of_year", "month",
    ],
    outputCol="features",
)

# Feature vector plus the target cast to double and renamed to "label".
df_final = (
    assembler.transform(df_model)
    .select("features", col("daily_quantity").cast("double").alias("label"))
    .dropna()
)

df_final.show(5)
df_final.printSchema()


+--------------------+-------+
|            features|  label|
+--------------------+-------+
|[23117.0,26919.0,...|19930.0|
|[19930.0,31329.0,...|21097.0|
|[21097.0,16199.0,...|10603.0|
|[10603.0,16450.0,...|17727.0|
|[17727.0,21795.0,...|20284.0|
+--------------------+-------+
only showing top 5 rows

root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = true)



In [7]:
# Random 80/20 row-level split with a fixed seed, followed by the partition sizes.
# NOTE(review): the scikit-learn section further down splits chronologically
# (first 80% of rows ordered by sales_date), so the two models are scored on
# different test sets. Because the features are lagged values, a random split
# also lets training rows come from dates later than some test rows.
train_df, test_df = df_final.randomSplit([0.8, 0.2], seed=42)

train_df.count(), test_df.count()


(252, 46)

In [8]:
# NOTE(review): a notebook cell only displays its last expression, so the train
# count computed on the first line is discarded and only the test count is shown.
# Both counts are already displayed by the split cell directly above.
train_df.count()
test_df.count()


46

In [9]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


In [10]:
# Baseline estimator: linear regression on the assembled "features" vector with
# no regularisation configured (Spark logs that regParam is zero when fitting).
lr = LinearRegression(featuresCol="features", labelCol="label")

# Fit on the training partition only.
lr_model = lr.fit(train_df)


26/01/16 17:47:16 WARN Instrumentation: [08f39427] regParam is zero, which might cause numerical instability and overfitting.
26/01/16 17:47:16 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
26/01/16 17:47:16 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
26/01/16 17:47:16 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [11]:
# Score the held-out rows; the result carries a "prediction" column, shown here
# next to the true label for the first ten rows.
predictions = lr_model.transform(test_df)
predictions.select("label", "prediction").show(10)


+-------+------------------+
|  label|        prediction|
+-------+------------------+
|13595.0| 7498.243742078078|
|13415.0| 7706.829347421407|
|14940.0|19145.421937825115|
|12263.0|  9322.12482314435|
|21589.0|11194.177199416943|
|10244.0|17908.696812911323|
|13501.0| 7107.303057185462|
|19771.0| 8602.501111363217|
| 3431.0| 3978.763829643108|
|12145.0| 9648.245210404344|
+-------+------------------+
only showing top 10 rows



In [12]:
from pyspark.ml.evaluation import RegressionEvaluator

# Build one evaluator per metric; both compare "label" against "prediction".
rmse_evaluator, mae_evaluator = (
    RegressionEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric,
    )
    for metric in ("rmse", "mae")
)

# Evaluate the Spark model on its held-out partition.
rmse_spark = rmse_evaluator.evaluate(predictions)
mae_spark = mae_evaluator.evaluate(predictions)

(rmse_spark, mae_spark)


(13222.980446660433, 7630.990098285122)

## scikit-learn Baseline (Validation)

To validate Spark MLlib results, the same feature set is used to train a
Linear Regression model using Pandas and scikit-learn on an in-memory dataset.


In [13]:
# Collect the cleaned Spark DataFrame into pandas, sorted by date so that the
# positional slicing used for the train/test split later on is chronological.
pdf = df_model.orderBy("sales_date").toPandas()
pdf.head()


Unnamed: 0,sales_date,daily_quantity,daily_revenue,day_of_week,week_of_year,month,year,lag_1_day,lag_7_day,rolling_7_day_avg
0,2010-12-09,19930,53586.18,5,49,12,2010,23117,26919,23004.142857
1,2010-12-10,21097,59182.92,6,49,12,2010,19930,31329,22005.714286
2,2010-12-12,10603,17329.07,1,49,12,2010,21097,16199,20544.0
3,2010-12-13,17727,38006.71,2,50,12,2010,10603,16450,19744.571429
4,2010-12-14,20284,45254.73,3,50,12,2010,17727,21795,19927.0


In [14]:
# Predictor columns, listed in the same order as the Spark VectorAssembler inputs.
feature_cols = [
    "lag_1_day", "lag_7_day", "rolling_7_day_avg",
    "day_of_week", "week_of_year", "month",
]

# Design matrix and target series taken from the pandas copy of the data.
X, y = pdf[feature_cols], pdf["daily_quantity"]


In [15]:
# Chronological hold-out: `pdf` is sorted by sales_date, so the first 80% of the
# rows form the training period and the remaining rows form the test period.
split_index = int(0.8 * len(pdf))

X_train, y_train = X.iloc[:split_index], y.iloc[:split_index]
X_test, y_test = X.iloc[split_index:], y.iloc[split_index:]


In [16]:
# Import under an alias: a plain `LinearRegression` import would rebind the name
# that the Spark section uses for pyspark.ml.regression.LinearRegression, so
# re-running the Spark training cell after this one would pick up the wrong class.
from sklearn.linear_model import LinearRegression as SkLinearRegression

# Fit ordinary least squares on the training slice (scikit-learn defaults).
sk_model = SkLinearRegression()
sk_model.fit(X_train, y_train)

# Predict on the held-out slice; these predictions feed the metrics cell below.
y_pred_sk = sk_model.predict(X_test)


In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Score the scikit-learn predictions on its held-out slice.
# RMSE is taken as the square root of the mean squared error.
rmse_sk = np.sqrt(mean_squared_error(y_test, y_pred_sk))
mae_sk = mean_absolute_error(y_test, y_pred_sk)

(rmse_sk, mae_sk)


(np.float64(14363.866894630732), 10220.03884824338)

In [18]:
import pandas as pd

# Side-by-side metric table, one row per model.
# NOTE(review): the Spark metrics come from a random split and the scikit-learn
# metrics from a chronological split, so the rows are scored on different data.
comparison = pd.DataFrame(
    [
        ("Spark Linear Regression", rmse_spark, mae_spark),
        ("scikit-learn Linear Regression", rmse_sk, mae_sk),
    ],
    columns=["Model", "RMSE", "MAE"],
)

comparison


Unnamed: 0,Model,RMSE,MAE
0,Spark Linear Regression,13222.980447,7630.990098
1,scikit-learn Linear Regression,14363.866895,10220.038848


In [19]:
# Row counts of the chronological train and test partitions used by scikit-learn.
len(X_train), len(X_test)


(238, 60)

### Comparison Summary

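The two models are not evaluated on the same data, so their metrics are only
loosely comparable. The Spark model uses a random 80/20 split (252 train / 46
test rows), while the scikit-learn model uses a chronological split (238 train /
60 test rows). On these different test sets, Spark reports RMSE ≈ 13,223 and
MAE ≈ 7,631, versus RMSE ≈ 14,364 and MAE ≈ 10,220 for scikit-learn. The gap
mainly reflects the different splits rather than distributed execution (Spark)
versus in-memory computation (scikit-learn): fitted on identical rows, both
ordinary least squares implementations should produce near-identical results.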


### Interpretation

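Spark Linear Regression reported lower error than the scikit-learn
implementation, but this is not a like-for-like result. The Spark test set is a
random sample of days spread across the whole period, whereas the scikit-learn
test set is the final 20% of the timeline, so the two models are scored on
different data. Variations in numerical optimization, feature vectorization,
and execution environment (distributed Spark vs in-memory scikit-learn) are
expected to contribute only small differences by comparison.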

The scikit-learn model is included as a validation baseline to confirm
trend consistency rather than exact metric parity.
