### Decision Tree Regression

In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkSess').getOrCreate()

In [3]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

In [4]:
pp_df = spark.read.csv("power_plant.csv", header=True, inferSchema=True)

In [5]:
pp_df.take(1)

[Row(AT=8.34, V=40.77, AP=1010.84, RH=90.01, PE=480.48)]

In [6]:
vectorAssembler = VectorAssembler(inputCols=["AT","V", "AP","RH"], outputCol = "features")

In [7]:
vpp_df = vectorAssembler.transform(pp_df)

In [8]:
vpp_df.take(1)

[Row(AT=8.34, V=40.77, AP=1010.84, RH=90.01, PE=480.48, features=DenseVector([8.34, 40.77, 1010.84, 90.01]))]

In [11]:
splits = vpp_df.randomSplit([0.7,0.3])
train_df = splits[0]
test_df = splits[1]

In [14]:
train_df.count()
test_df.count()
vpp_df.count()

9568

In [15]:
dt = DecisionTreeRegressor(featuresCol="features", labelCol="PE")
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)

In [16]:
dt_evaluator = RegressionEvaluator(labelCol="PE", predictionCol="prediction", metricName="rmse")

In [18]:
rmse = dt_evaluator.evaluate(dt_predictions)

In [19]:
rmse

4.373533575763438

### Gradient-boosted tree regression

In [20]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol="features", labelCol="PE")
gbt_model = gbt.fit(train_df)

In [21]:
gbt_predictions = gbt_model.transform(test_df)

In [23]:
gbt_evaluator = RegressionEvaluator(labelCol = "PE", predictionCol="prediction", metricName="rmse")

In [24]:
gbt_rmse = gbt_evaluator.evaluate(gbt_predictions)

In [25]:
gbt_rmse

3.890130847692605