In [1]:
import os
import sys
import pyspark 

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
#load housing dataset from csv file
from pyspark.sql import SparkSession 

ss = SparkSession.builder.appName('housing_data').getOrCreate()
df_without_header = ss.read.option('inferSchema', True).option('header', False).csv('housing_data.csv')
df_without_header.show()



+--------------------+------+-------------------+--------+---+---+---+--------------+----+-----------------+-------------+-------------+-------------------+---------------+----+----+
|                 _c0|   _c1|                _c2|     _c3|_c4|_c5|_c6|           _c7| _c8|              _c9|         _c10|         _c11|               _c12|           _c13|_c14|_c15|
+--------------------+------+-------------------+--------+---+---+---+--------------+----+-----------------+-------------+-------------+-------------------+---------------+----+----+
|{F887F88E-7D15-44...| 70000|1995-07-07 00:00:00|MK15 9HP|  D|  N|  F|            31|null|    ALDRICH DRIVE|       WILLEN|MILTON KEYNES|      MILTON KEYNES|  MILTON KEYNES|   A|   A|
|{40FD4DF2-5362-40...| 44500|1995-02-03 00:00:00| SR6 0AQ|  T|  N|  F|            50|null|      HOWICK PARK|   SUNDERLAND|   SUNDERLAND|         SUNDERLAND|  TYNE AND WEAR|   A|   A|
|{7A99F89E-7D81-4E...| 56500|1995-01-13 00:00:00| CO6 1SQ|  T|  N|  F|            19|

In [3]:
#add column names from kaggle dataset page
col_names=['Transaction_unique_identifier', 'price', 'Date_of_Transfer', 'postcode', 'Property_Type', 'Old/New',
'Duration', 'PAON', 'SAON', 'Street', 'Locality', 'Town/City', 'District', 'County', 'PPDCategory_Type',
'Record_Status - monthly_file_only']
housing_df = df_without_header.toDF(*col_names)
                                    
#use first 500000 entries due to limitations in compute power 
housing_df = ss.createDataFrame(housing_df.head(500000), housing_df.schema)
housing_df.show()


+-----------------------------+------+-------------------+--------+-------------+-------+--------+--------------+----+-----------------+-------------+-------------+-------------------+---------------+----------------+---------------------------------+
|Transaction_unique_identifier| price|   Date_of_Transfer|postcode|Property_Type|Old/New|Duration|          PAON|SAON|           Street|     Locality|    Town/City|           District|         County|PPDCategory_Type|Record_Status - monthly_file_only|
+-----------------------------+------+-------------------+--------+-------------+-------+--------+--------------+----+-----------------+-------------+-------------+-------------------+---------------+----------------+---------------------------------+
|         {F887F88E-7D15-44...| 70000|1995-07-07 00:00:00|MK15 9HP|            D|      N|       F|            31|null|    ALDRICH DRIVE|       WILLEN|MILTON KEYNES|      MILTON KEYNES|  MILTON KEYNES|               A|                           

In [4]:
#data schema 
housing_df.printSchema()

root
 |-- Transaction_unique_identifier: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- Date_of_Transfer: timestamp (nullable = true)
 |-- postcode: string (nullable = true)
 |-- Property_Type: string (nullable = true)
 |-- Old/New: string (nullable = true)
 |-- Duration: string (nullable = true)
 |-- PAON: string (nullable = true)
 |-- SAON: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- Locality: string (nullable = true)
 |-- Town/City: string (nullable = true)
 |-- District: string (nullable = true)
 |-- County: string (nullable = true)
 |-- PPDCategory_Type: string (nullable = true)
 |-- Record_Status - monthly_file_only: string (nullable = true)



In [5]:
#get size stats and category statistics to decide which attributes are useful in computation
from pyspark.sql.functions import col

#total entries
print("Housing data contains " + str(housing_df.count()) + " entries")

#Date_of_Transfer
transfer_count =  housing_df.select('Date_of_Transfer').distinct().count()
print("There are " + str(transfer_count) + " entries based on unique Date_of_Transfer")


Housing data contains 500000 entries
There are 365 entries based on unique Date_of_Transfer


In [6]:
def count_cols(df_col):
    col_count = housing_df.select(df_col).distinct().count()
    print("There are " + str(col_count) + " entries based on unique " + str(df_col) + " values")
    
for i in col_names:
    count_cols(i)


There are 500000 entries based on unique Transaction_unique_identifier values
There are 10819 entries based on unique price values
There are 365 entries based on unique Date_of_Transfer values
There are 316022 entries based on unique postcode values
There are 5 entries based on unique Property_Type values
There are 2 entries based on unique Old/New values
There are 3 entries based on unique Duration values
There are 32683 entries based on unique PAON values
There are 1811 entries based on unique SAON values
There are 118030 entries based on unique Street values
There are 14206 entries based on unique Locality values
There are 1156 entries based on unique Town/City values
There are 459 entries based on unique District values
There are 130 entries based on unique County values
There are 2 entries based on unique PPDCategory_Type values
There are 1 entries based on unique Record_Status - monthly_file_only values


In [7]:
#drop Transaction_unique_identifier due to there not being enough unique column entries
housing_df = housing_df.drop('Transaction_unique_identifier')
housing_df = housing_df.drop('Record_Status - monthly_file_only')

In [8]:
#get list of string entries from schema 
string_type_columns = []
for col in housing_df.dtypes:
    #print(col[0]+" , "+col[1])
    if(col[1] == 'string'):
        string_type_columns.append(col[0])
print(string_type_columns)

['postcode', 'Property_Type', 'Old/New', 'Duration', 'PAON', 'SAON', 'Street', 'Locality', 'Town/City', 'District', 'County', 'PPDCategory_Type']


In [9]:
from pyspark.ml.feature import StringIndexer

def encode_values(column):
    global housing_df
    indexer = StringIndexer(inputCol= column, outputCol= column + "encoded")
    housing_df = indexer.fit(housing_df).transform(housing_df)
    housing_df = housing_df.drop(column).withColumnRenamed(column + "encoded", column)
    
"""for col_name in string_type_columns:
    encode_values(col_name)
housing_df.show()"""    
encode_values('postcode')
encode_values('Property_Type')
encode_values('Old/New')
housing_df.show()


+------+-------------------+--------+--------------+----+-----------------+-------------+-------------+-------------------+---------------+----------------+--------+-------------+-------+
| price|   Date_of_Transfer|Duration|          PAON|SAON|           Street|     Locality|    Town/City|           District|         County|PPDCategory_Type|postcode|Property_Type|Old/New|
+------+-------------------+--------+--------------+----+-----------------+-------------+-------------+-------------------+---------------+----------------+--------+-------------+-------+
| 70000|1995-07-07 00:00:00|       F|            31|null|    ALDRICH DRIVE|       WILLEN|MILTON KEYNES|      MILTON KEYNES|  MILTON KEYNES|               A|211233.0|          2.0|    0.0|
| 44500|1995-02-03 00:00:00|       F|            50|null|      HOWICK PARK|   SUNDERLAND|   SUNDERLAND|         SUNDERLAND|  TYNE AND WEAR|               A| 89227.0|          0.0|    0.0|
| 56500|1995-01-13 00:00:00|       F|            19|null| BR

In [10]:
encode_values('Duration')
housing_df.show()


+------+-------------------+--------------+----+-----------------+-------------+-------------+-------------------+---------------+----------------+--------+-------------+-------+--------+
| price|   Date_of_Transfer|          PAON|SAON|           Street|     Locality|    Town/City|           District|         County|PPDCategory_Type|postcode|Property_Type|Old/New|Duration|
+------+-------------------+--------------+----+-----------------+-------------+-------------+-------------------+---------------+----------------+--------+-------------+-------+--------+
| 70000|1995-07-07 00:00:00|            31|null|    ALDRICH DRIVE|       WILLEN|MILTON KEYNES|      MILTON KEYNES|  MILTON KEYNES|               A|211233.0|          2.0|    0.0|     0.0|
| 44500|1995-02-03 00:00:00|            50|null|      HOWICK PARK|   SUNDERLAND|   SUNDERLAND|         SUNDERLAND|  TYNE AND WEAR|               A| 89227.0|          0.0|    0.0|     0.0|
| 56500|1995-01-13 00:00:00|            19|null| BRICK KILN 

In [11]:
encode_values('PAON')
encode_values('SAON')
housing_df.show()


Py4JJavaError: An error occurred while calling o407.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 130.0 failed 1 times, most recent failure: Lost task 0.0 in stage 130.0 (TID 688) (lpcp-23 executor driver): org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (StringIndexerModel$$Lambda$4946/0x00007fa76918c000: (string) => double).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:217)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:888)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:888)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: org.apache.spark.SparkException: StringIndexer encountered NULL value. To handle or skip NULLS, try setting StringIndexer.handleInvalid.
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1(StringIndexer.scala:396)
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1$adapted(StringIndexer.scala:391)
	... 18 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4216)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3200)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4206)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4204)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4204)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3200)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3421)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:283)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:322)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (StringIndexerModel$$Lambda$4946/0x00007fa76918c000: (string) => double).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:217)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:888)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:888)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more
Caused by: org.apache.spark.SparkException: StringIndexer encountered NULL value. To handle or skip NULLS, try setting StringIndexer.handleInvalid.
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1(StringIndexer.scala:396)
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1$adapted(StringIndexer.scala:391)
	... 18 more
