In [1]:
%pip install pyspark



In [2]:
import os
import sys
import pyspark

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [3]:
#grant permission (for gdrive use only)
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
cd gdrive/MyDrive/

/content/gdrive/MyDrive


In [5]:
#load housing dataset from csv file
from pyspark.sql import SparkSession

ss = SparkSession.builder.appName('housing_data').getOrCreate()
df_without_header = ss.read.option('inferSchema', True).option('header', False).csv('housing_data.csv')
df_without_header = df_without_header.na.drop()
df_without_header.show()

+--------------------+------+-------------------+--------+---+---+---+--------------------+----------------+--------------------+------------------+------------------+--------------------+--------------+----+----+
|                 _c0|   _c1|                _c2|     _c3|_c4|_c5|_c6|                 _c7|             _c8|                 _c9|              _c10|              _c11|                _c12|          _c13|_c14|_c15|
+--------------------+------+-------------------+--------+---+---+---+--------------------+----------------+--------------------+------------------+------------------+--------------------+--------------+----+----+
|{E104A9E7-1D6A-4D...| 36500|1995-06-09 00:00:00| FY4 1DL|  F|  N|  L|      CLARENCE COURT|              28|    RAWCLIFFE STREET|         BLACKPOOL|         BLACKPOOL|           BLACKPOOL|     BLACKPOOL|   A|   A|
|{748F870E-C337-40...| 74000|1995-01-12 00:00:00| RH2 9NF|  F|  Y|  L|SOMERS PLACE, 83 ...|         FLAT 21|        REIGATE HILL|           REIG

In [6]:
#add column names from kaggle dataset page
col_names=['Transaction_unique_identifier', 'price', 'Date_of_Transfer', 'postcode', 'Property_Type', 'Old/New',
'Duration', 'PAON', 'SAON', 'Street', 'Locality', 'Town/City', 'District', 'County', 'PPDCategory_Type',
'Record_Status - monthly_file_only']
housing_df = df_without_header.toDF(*col_names)

# use first 500000 entries due to limitations in compute power
housing_df = ss.createDataFrame(housing_df.head(200000), housing_df.schema)

housing_df.show()
print(housing_df.count())

+-----------------------------+------+-------------------+--------+-------------+-------+--------+--------------------+----------------+--------------------+------------------+------------------+--------------------+--------------+----------------+---------------------------------+
|Transaction_unique_identifier| price|   Date_of_Transfer|postcode|Property_Type|Old/New|Duration|                PAON|            SAON|              Street|          Locality|         Town/City|            District|        County|PPDCategory_Type|Record_Status - monthly_file_only|
+-----------------------------+------+-------------------+--------+-------------+-------+--------+--------------------+----------------+--------------------+------------------+------------------+--------------------+--------------+----------------+---------------------------------+
|         {E104A9E7-1D6A-4D...| 36500|1995-06-09 00:00:00| FY4 1DL|            F|      N|       L|      CLARENCE COURT|              28|    RAWCLIFFE S

In [7]:
#extract year from date of  transfer
from pyspark.sql.functions import udf
from pyspark.sql.functions import *



extract_year = udf(lambda x: x[0:4])

housing_df = housing_df.withColumn('Date_of_Transfer', to_timestamp('Date_of_Transfer').cast('string'))
housing_df = housing_df.withColumn('Date_of_Transfer', extract_year('Date_of_Transfer'))
housing_df.show()

+-----------------------------+------+----------------+--------+-------------+-------+--------+--------------------+----------------+--------------------+------------------+------------------+--------------------+--------------+----------------+---------------------------------+
|Transaction_unique_identifier| price|Date_of_Transfer|postcode|Property_Type|Old/New|Duration|                PAON|            SAON|              Street|          Locality|         Town/City|            District|        County|PPDCategory_Type|Record_Status - monthly_file_only|
+-----------------------------+------+----------------+--------+-------------+-------+--------+--------------------+----------------+--------------------+------------------+------------------+--------------------+--------------+----------------+---------------------------------+
|         {E104A9E7-1D6A-4D...| 36500|            1995| FY4 1DL|            F|      N|       L|      CLARENCE COURT|              28|    RAWCLIFFE STREET|      

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler,OneHotEncoder
from pyspark.ml.evaluation import RegressionEvaluator


all_cols = housing_df.columns
all_cols.remove('price')


# Remove categorical features with high cardinality to reduce computation load and avoid overfitting
all_cols.remove('postcode')
all_cols.remove('PAON')
all_cols.remove('Street')

#all categorical varables that require one hot encoding
categorical_cols = ['postcode', 'Old/New', 'PAON', 'SAON',
                    'Locality', 'Town/City', 'District',
                    'County', 'PPDCategory_Type', 'Date_of_Transfer']


# List to store stages of the pipeline
stages = []

# Apply StringIndexer to each categorical column
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_indexed", handleInvalid="keep") for col in categorical_cols]
stages += indexers

# Apply OneHotEncoder to each indexed categorical column
encoders = [OneHotEncoder(inputCol=f"{col}_indexed", outputCol=f"{col}_encoded") for col in categorical_cols]
stages += encoders

# Assemble all features into a single vector
assembler_inputs = [f"{col}_encoded" for col in categorical_cols]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")
stages.append(assembler)

# Create pipeline
pipeline = Pipeline(stages=stages)

# Fit pipeline to data and transform DataFrame
pipeline_model = pipeline.fit(housing_df)
df_housing_final = pipeline_model.transform(housing_df)
df_housing_final.show(5)


# Split data into train and test sets
(train_data, test_data) = df_housing_final.randomSplit([0.99, 0.01], seed=123)

#regression model
dt = DecisionTreeRegressor(featuresCol="features", labelCol="price")

#evaluator
dtevaluator = RegressionEvaluator(predictionCol="features", labelCol="price", metricName="rmse")

#paramgridbuilder for comparision
paramgrid = ParamGridBuilder().addGrid(dt.maxDepth, [5, 15]).addGrid(dt.maxBins, [100, 200]).build()

#crossvalidator
dtcv = CrossValidator(estimator = dt, estimatorParamMaps = paramgrid, evaluator = dtevaluator, numFolds = 5)

#fit to train data
dtcvModel = dtcv.fit(train_data)
print(dtcvModel)

#test predictions and score
dtpredictions = dtcvModel.transform(test_data)
print('RMSE:', dt.evaluate(dtpredictions))


+-----------------------------+-----+----------------+--------+-------------+-------+--------+--------------------+-------+-----------------+-----------------+-----------------+--------------------+-----------+----------------+---------------------------------+----------------+---------------+------------+------------+----------------+-----------------+----------------+--------------+------------------------+------------------------+--------------------+---------------+--------------------+------------------+------------------+------------------+-----------------+----------------+------------------------+------------------------+--------------------+
|Transaction_unique_identifier|price|Date_of_Transfer|postcode|Property_Type|Old/New|Duration|                PAON|   SAON|           Street|         Locality|        Town/City|            District|     County|PPDCategory_Type|Record_Status - monthly_file_only|postcode_indexed|Old/New_indexed|PAON_indexed|SAON_indexed|Locality_indexed|Town

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 57668)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [Errno 104] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
Traceback (most recent call last):
  File "/usr/lib/pytho

ConnectionRefusedError: [Errno 111] Connection refused