### Model Testing 
Author: Anne Tumlin

Date: 04/10/25 

Let's compare this problem across various types of models and evaluate the performance. 

In [1]:
import os
import subprocess
import time
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col, explode, input_file_name, expr, sum as spark_sum, avg, count

spark = SparkSession.builder \
    .appName("app_name") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/11 05:05:16 INFO SparkEnv: Registering MapOutputTracker
25/04/11 05:05:16 INFO SparkEnv: Registering BlockManagerMaster
25/04/11 05:05:16 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/04/11 05:05:16 INFO SparkEnv: Registering OutputCommitCoordinator


Here we are going to load in our saved parquet file of the final dataset we are using for the prediction task. If you do not have this saved, go to Mileston3 and run the notebook to aquire this dataset. 

*Important note: make sure to change to your bucket name here.* 

In [2]:
# Set the bucket name (user can change this line)
bucket_name = "ds5460-tumlinam-fp-bucket"

data_path = f"gs://{bucket_name}/final_datasets/combined_features_df/"

In [3]:
df = spark.read.parquet(data_path)
df.show(5)

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+------------------+------------------+------------------+-------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-----------------+-------------------+------------------+
|            filename|        total_cost|            avg_pd|       pd_per_load|     reactive_ratio|             std_pd|  load_variability|          pd_range|        load_range|            max_pd|  load_concentration|        load_skew|  generation_margin| load_to_gen_ratio|
+--------------------+------------------+------------------+------------------+-------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-----------------+-------------------+------------------+
|gs://ds5460-tumli...| 457017.6915453811|0.6337165224374126|0.6337165224374126|0.25578066583740294| 0.3907887552662506| 0.616661774515822| 2.895855111560307| 2.895855111560307| 2

                                                                                

### Simple Model Comparisons 

Let's set up the code to test various simple models. 

In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import col
import time

In [None]:
feature_cols = [c for c in df.columns if c != "filename" and c != "total_cost"]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_vector = assembler.transform(df).select("features", col("total_cost").alias("label"))

In [None]:
train_data, test_data = df_vector.randomSplit([0.8, 0.2], seed=42)

In [None]:
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")

# Models to test
models = [
    (LinearRegression(), "Linear Regression"),
    (DecisionTreeRegressor(), "Decision Tree"),
    (RandomForestRegressor(), "Random Forest"),
    (GBTRegressor(), "Gradient Boosted Trees")
]

In [None]:
def train_and_evaluate_model(regressor, name):
    start_time = time.time()
    
    # Train the model
    model = regressor.fit(train_data)
    
    # Make predictions
    predictions = model.transform(test_data)
    
    # Evaluate metrics
    rmse = evaluator.setMetricName("rmse").evaluate(predictions)
    mae = evaluator.setMetricName("mae").evaluate(predictions)
    r2 = evaluator.setMetricName("r2").evaluate(predictions)
    
    end_time = time.time()
    train_time = end_time - start_time

    return (name, rmse, mae, r2, train_time)

In [None]:
results = [train_and_evaluate_model(m, n) for m, n in models]

columns = ["Model", "RMSE", "MAE", "R2", "TrainingTime"]
results_df = spark.createDataFrame(results, columns)

results_df.show(truncate=False)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

data = {
    "Model": ["Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosted Trees"],
    "RMSE": [287.77412801849573, 699.8999269546724, 593.2220765597973, 464.04424844225764],
    "MAE": [225.16385899537582, 429.013258415274, 382.12375761585423, 344.3683654447017],
    "R2": [0.998229821839609, 0.9895290669824389, 0.9924777442561689, 0.995397092223656],
    "TrainingTime": [101.727698802948, 77.06891965866089, 78.0661551952362, 71.09090065956116]
}

df_plot = pd.DataFrame(data)

fig, ax1 = plt.subplots(figsize=(10, 6))

# Bar plot for RMSE
color = 'tab:blue'
ax1.set_xlabel('Model')
ax1.set_ylabel('RMSE', color=color)
ax1.bar(df_plot["Model"], df_plot["RMSE"], color=color, alpha=0.6, label="RMSE")
ax1.tick_params(axis='y', labelcolor=color)
ax1.set_ylim(0, max(df_plot["RMSE"]) * 1.2)

# Line plot for Training Time on second y-axis
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Training Time (s)', color=color)
ax2.plot(df_plot["Model"], df_plot["TrainingTime"], color=color, marker='o', linewidth=2, label="Training Time")
ax2.tick_params(axis='y', labelcolor=color)
ax2.set_ylim(0, max(df_plot["TrainingTime"]) * 1.5)

# Title and layout
fig.suptitle("Model Comparison: RMSE vs Training Time")
plt.show()


Let's check if scaling features has any impact.

In [None]:
from pyspark.ml.feature import StandardScaler

assembler = VectorAssembler(inputCols=feature_cols, outputCol="raw_features")
assembled_df = assembler.transform(df).select("raw_features", col("total_cost").alias("label"))

scaler = StandardScaler(inputCol="raw_features", outputCol="features", withMean=True, withStd=True)
scaler_model = scaler.fit(assembled_df)
scaled_df = scaler_model.transform(assembled_df).select("features", "label")

train_data, test_data = scaled_df.randomSplit([0.8, 0.2], seed=42)

In [None]:
results = [train_and_evaluate_model(m, n) for m, n in models]

columns = ["Model", "RMSE", "MAE", "R2", "TrainingTime"]
results_df = spark.createDataFrame(results, columns)

results_df.show(truncate=False)

### More Complex Models 

Now that we have our results for the simple models, let's see what tuning we can do to improve results. GBT had the lowest training time and second best RMSE. Therefore, let's see if we can improve the outcome via hyperparameter tuning. 

In [4]:
!pip install xgboost==1.7.6

[0m

In [10]:
from xgboost.spark import SparkXGBRegressor
import time

In [9]:
feature_cols = [c for c in df.columns if c != "filename" and c != "total_cost"]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
assembled_df = assembler.transform(df).select("features", col("total_cost").alias("label"))

train_data, test_data = assembled_df.randomSplit([0.8, 0.2], seed=42)

In [11]:
xgb = SparkXGBRegressor(
    objective='reg:squarederror',
    num_round=50,
    max_depth=6,
    eta=0.1,
    features_col="features",
    label_col="label",
    prediction_col="prediction"
)

start_time = time.time()
xgb_model = xgb.fit(train_data)
train_time = time.time() - start_time

print(f"XGBoost training completed in {train_time:.2f} seconds.")

25/04/11 05:08:24 WARN DAGScheduler: Creating new stage failed due to exception - job: 3
org.apache.spark.scheduler.BarrierJobRunWithDynamicAllocationException: [SPARK-24942]: Barrier execution mode does not support dynamic resource allocation for now. You can disable dynamic resource allocation by setting Spark conf "spark.dynamicAllocation.enabled" to "false".
	at org.apache.spark.errors.SparkCoreErrors$.barrierStageWithDynamicAllocationError(SparkCoreErrors.scala:177) ~[spark-core_2.12-3.3.2.jar:3.3.2]
	at org.apache.spark.scheduler.DAGScheduler.checkBarrierStageWithDynamicAllocation(DAGScheduler.scala:520) ~[spark-core_2.12-3.3.2.jar:3.3.2]
	at org.apache.spark.scheduler.DAGScheduler.createResultStage(DAGScheduler.scala:609) ~[spark-core_2.12-3.3.2.jar:3.3.2]
	at org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:1225) ~[spark-core_2.12-3.3.2.jar:3.3.2]
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2863) ~[spar

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.scheduler.BarrierJobRunWithDynamicAllocationException: [SPARK-24942]: Barrier execution mode does not support dynamic resource allocation for now. You can disable dynamic resource allocation by setting Spark conf "spark.dynamicAllocation.enabled" to "false".
	at org.apache.spark.errors.SparkCoreErrors$.barrierStageWithDynamicAllocationError(SparkCoreErrors.scala:177)
	at org.apache.spark.scheduler.DAGScheduler.checkBarrierStageWithDynamicAllocation(DAGScheduler.scala:520)
	at org.apache.spark.scheduler.DAGScheduler.createResultStage(DAGScheduler.scala:609)
	at org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:1225)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2863)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2855)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2844)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:959)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2293)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2314)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2333)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2358)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1021)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1020)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

predictions = xgb_model.transform(test_data)

evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator_mae = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")
evaluator_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse = evaluator_rmse.evaluate(predictions)
mae = evaluator_mae.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")
print(f"R²:   {r2:.4f}")