In [None]:
%pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=6a55645a7b43a9f2be167f3f2179ffc51014bd3a6be9955b43f53856afebf96a
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
import os
import sys
import pyspark

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("decision_trees").config("spark.driver.memory", "14g").config("spark.executor.memory", "14g").getOrCreate()

In [None]:
#grant permission (for gdrive use only)
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
cd gdrive/MyDrive/

/content/gdrive/MyDrive


In [None]:
#load housing dataset from csv file
from pyspark.sql import SparkSession

ss = SparkSession.builder.appName('housing_data').getOrCreate()
df_without_header = ss.read.option('inferSchema', True).option('header', False).csv('202304.csv').limit(200000)
df_without_header = df_without_header.na.drop()
df_without_header.show()

24/04/04 18:55:29 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.

+--------------------+------+-------------------+--------+---+---+---+--------------------+----------------+--------------------+------------------+------------------+--------------------+--------------+----+----+
|                 _c0|   _c1|                _c2|     _c3|_c4|_c5|_c6|                 _c7|             _c8|                 _c9|              _c10|              _c11|                _c12|          _c13|_c14|_c15|
+--------------------+------+-------------------+--------+---+---+---+--------------------+----------------+--------------------+------------------+------------------+--------------------+--------------+----+----+
|{E104A9E7-1D6A-4D...| 36500|1995-06-09 00:00:00| FY4 1DL|  F|  N|  L|      CLARENCE COURT|              28|    RAWCLIFFE STREET|         BLACKPOOL|         BLACKPOOL|           BLACKPOOL|     BLACKPOOL|   A|   A|
|{748F870E-C337-40...| 74000|1995-01-12 00:00:00| RH2 9NF|  F|  Y|  L|SOMERS PLACE, 83 ...|         FLAT 21|        REIGATE HILL|           REIG

                                                                                

In [None]:
#add column names from kaggle dataset page
col_names=['Transaction_unique_identifier', 'price', 'Date_of_Transfer', 'postcode', 'Property_Type', 'Old/New',
'Duration', 'PAON', 'SAON', 'Street', 'Locality', 'Town/City', 'District', 'County', 'PPDCategory_Type',
'Record_Status - monthly_file_only']
housing_df = df_without_header.toDF(*col_names)

# use first 200000 entries due to limitations in compute power
housing_df = ss.createDataFrame(housing_df.head(200000), housing_df.schema)

housing_df.show()
print(housing_df.count())

                                                                                

+-----------------------------+------+-------------------+--------+-------------+-------+--------+--------------------+----------------+--------------------+------------------+------------------+--------------------+--------------+----------------+---------------------------------+
|Transaction_unique_identifier| price|   Date_of_Transfer|postcode|Property_Type|Old/New|Duration|                PAON|            SAON|              Street|          Locality|         Town/City|            District|        County|PPDCategory_Type|Record_Status - monthly_file_only|
+-----------------------------+------+-------------------+--------+-------------+-------+--------+--------------------+----------------+--------------------+------------------+------------------+--------------------+--------------+----------------+---------------------------------+
|         {E104A9E7-1D6A-4D...| 36500|1995-06-09 00:00:00| FY4 1DL|            F|      N|       L|      CLARENCE COURT|              28|    RAWCLIFFE S

In [None]:
#extract year from date of  transfer
from pyspark.sql.functions import udf
from pyspark.sql.functions import *



extract_year = udf(lambda x: x[0:4])

housing_df = housing_df.withColumn('Date_of_Transfer', to_timestamp('Date_of_Transfer').cast('string'))
housing_df = housing_df.withColumn('Date_of_Transfer', extract_year('Date_of_Transfer'))
housing_df.show()

+-----------------------------+------+----------------+--------+-------------+-------+--------+--------------------+----------------+--------------------+------------------+------------------+--------------------+--------------+----------------+---------------------------------+
|Transaction_unique_identifier| price|Date_of_Transfer|postcode|Property_Type|Old/New|Duration|                PAON|            SAON|              Street|          Locality|         Town/City|            District|        County|PPDCategory_Type|Record_Status - monthly_file_only|
+-----------------------------+------+----------------+--------+-------------+-------+--------+--------------------+----------------+--------------------+------------------+------------------+--------------------+--------------+----------------+---------------------------------+
|         {E104A9E7-1D6A-4D...| 36500|            1995| FY4 1DL|            F|      N|       L|      CLARENCE COURT|              28|    RAWCLIFFE STREET|      

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler,OneHotEncoder
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator


i=10
print('Depth ',i,':')
all_cols = housing_df.columns
all_cols.remove('price')


# Remove categorical features with high cardinality to reduce computation load and avoid overfitting
all_cols.remove('postcode')
all_cols.remove('PAON')
all_cols.remove('Street')

#all categorical varables that require one hot encoding
categorical_cols = ['postcode', 'Old/New', 'PAON', 'SAON',
                    'Locality', 'Town/City', 'District',
                    'County', 'PPDCategory_Type', 'Date_of_Transfer']


# List to store stages of the pipeline
stages = []

# Apply StringIndexer to each categorical column
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_indexed", handleInvalid="keep") for col in categorical_cols]
stages += indexers

# Apply OneHotEncoder to each indexed categorical column
encoders = [OneHotEncoder(inputCol=f"{col}_indexed", outputCol=f"{col}_encoded") for col in categorical_cols]
stages += encoders

# Assemble all features into a single vector
assembler_inputs = [f"{col}_encoded" for col in categorical_cols]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")
stages.append(assembler)

# Create pipeline
pipeline = Pipeline(stages=stages)

# Fit pipeline to data and transform DataFrame
pipeline_model = pipeline.fit(housing_df)
df_housing_final = pipeline_model.transform(housing_df)
df_housing_final.show(5)

# Split data into train and test sets
(train_data, test_data) = df_housing_final.randomSplit([0.99, 0.01], seed=123)


# Create a DecisionTreeRegressor model
dt = DecisionTreeRegressor(featuresCol="features", labelCol="price", maxBins=266, maxDepth=i, minInfoGain=100)

# Fit the model on the training data
dt_model = dt.fit(train_data)

# Make predictions on the training data
train_predictions = dt_model.transform(train_data)

# Evaluate the model on the training data
train_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
train_rmse = train_evaluator.evaluate(train_predictions)
train_r2 = train_evaluator.evaluate(train_predictions, {train_evaluator.metricName: "r2"})
print("Training - RMSE: {:.3f}, R2 Score: {:.3f}".format(train_rmse, train_r2))

# Make predictions on the test data
test_predictions = dt_model.transform(test_data)

# Evaluate the model on the test data
test_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
test_rmse = test_evaluator.evaluate(test_predictions)
test_r2 = test_evaluator.evaluate(test_predictions, {test_evaluator.metricName: "r2"})
print("Test - RMSE: {:.3f}, R2 Score: {:.3f}".format(test_rmse, test_r2))
print('---')



Depth  10 :


24/04/04 18:56:12 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-----------------------------+-----+----------------+--------+-------------+-------+--------+--------------------+-------+-----------------+-----------------+-----------------+--------------------+-----------+----------------+---------------------------------+----------------+---------------+------------+------------+----------------+-----------------+----------------+--------------+------------------------+------------------------+--------------------+---------------+-------------------+-----------------+------------------+-----------------+----------------+---------------+------------------------+------------------------+--------------------+
|Transaction_unique_identifier|price|Date_of_Transfer|postcode|Property_Type|Old/New|Duration|                PAON|   SAON|           Street|         Locality|        Town/City|            District|     County|PPDCategory_Type|Record_Status - monthly_file_only|postcode_indexed|Old/New_indexed|PAON_indexed|SAON_indexed|Locality_indexed|Town/City

24/04/04 18:56:13 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/04/04 18:56:13 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/04/04 18:56:14 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:18 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:19 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:21 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:22 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:24 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:26 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:28 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:31 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:33 WARN DAGScheduler: Broadcasting larg

Training - RMSE: 47986.740, R2 Score: 0.593


24/04/04 18:56:38 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/04/04 18:56:39 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/04/04 18:56:39 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB

Test - RMSE: 56165.052, R2 Score: 0.361
---


24/04/04 18:56:40 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
                                                                                

In [None]:
# Create a DecisionTreeRegressor model
dt = DecisionTreeRegressor(featuresCol="features", labelCol="price", maxBins=266, maxDepth=15, minInfoGain=100)

# Fit the model on the training data
dt_model = dt.fit(train_data)

# Make predictions on the training data
train_predictions = dt_model.transform(train_data)

# Evaluate the model on the training data
train_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
train_rmse = train_evaluator.evaluate(train_predictions)
train_r2 = train_evaluator.evaluate(train_predictions, {train_evaluator.metricName: "r2"})
print("Training - RMSE: {:.3f}, R2 Score: {:.3f}".format(train_rmse, train_r2))

# Make predictions on the test data
test_predictions = dt_model.transform(test_data)

# Evaluate the model on the test data
test_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
test_rmse = test_evaluator.evaluate(test_predictions)
test_r2 = test_evaluator.evaluate(test_predictions, {test_evaluator.metricName: "r2"})
print("Test - RMSE: {:.3f}, R2 Score: {:.3f}".format(test_rmse, test_r2))
print('---')


24/04/04 18:56:40 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/04/04 18:56:41 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/04/04 18:56:42 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:44 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:45 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:46 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:47 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:49 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:51 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:53 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:55 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:56:57 WARN DAGScheduler: Broadcasting larg

Training - RMSE: 43556.101, R2 Score: 0.664


24/04/04 18:57:15 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/04/04 18:57:16 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/04/04 18:57:16 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/04/04 18:57:17 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB


Test - RMSE: 56549.497, R2 Score: 0.352
---


                                                                                

In [None]:
# Create a DecisionTreeRegressor model
dt = DecisionTreeRegressor(featuresCol="features", labelCol="price", maxBins=300, maxDepth=8, minInfoGain=100)

# Fit the model on the training data
dt_model = dt.fit(train_data)

# Make predictions on the training data
train_predictions = dt_model.transform(train_data)

# Evaluate the model on the training data
train_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
train_rmse = train_evaluator.evaluate(train_predictions)
train_r2 = train_evaluator.evaluate(train_predictions, {train_evaluator.metricName: "r2"})
print("Training - RMSE: {:.3f}, R2 Score: {:.3f}".format(train_rmse, train_r2))

# Make predictions on the test data
test_predictions = dt_model.transform(test_data)

# Evaluate the model on the test data
test_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
test_rmse = test_evaluator.evaluate(test_predictions)
test_r2 = test_evaluator.evaluate(test_predictions, {test_evaluator.metricName: "r2"})
print("Test - RMSE: {:.3f}, R2 Score: {:.3f}".format(test_rmse, test_r2))
print('---')


24/04/04 18:57:25 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/04/04 18:57:26 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/04/04 18:57:26 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:57:29 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:57:30 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:57:31 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:57:32 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:57:34 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:57:36 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:57:38 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/04 18:57:40 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/04/04 18:57:41 WARN DAGScheduler: Broadcasting larg

Training - RMSE: 50176.114, R2 Score: 0.555


24/04/04 18:57:42 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/04/04 18:57:43 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/04/04 18:57:44 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB

Test - RMSE: 55913.392, R2 Score: 0.367
---


24/04/04 18:57:44 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
                                                                                