In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')

In [2]:
# initiate spark

import findspark

findspark.init()

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext()
spark = SparkSession(sc)

In [3]:
# import spark libraries
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Problem definition

Apply regression models to predict the house pricing

# Load the data

In [4]:
# Load and parse the data file, converting it to a DataFrame.
data = spark.read.csv('data/kc_house_data.csv', header = True, inferSchema = True)
data.show

<bound method DataFrame.show of DataFrame[id: bigint, date: string, price: decimal(7,0), bedrooms: int, bathrooms: double, sqft_living: int, sqft_lot: int, floors: double, waterfront: int, view: int, condition: int, grade: int, sqft_above: int, sqft_basement: int, yr_built: int, yr_renovated: int, zipcode: int, lat: double, long: double, sqft_living15: int, sqft_lot15: int]>

# Feature Engineering 

In [5]:
# feature engineering
X_columns = ['bedrooms', 'bathrooms', 'grade', 'condition', 'waterfront', 'sqft_living15', 'sqft_lot15', 'lat', 'long']
y_column = "price"

In [6]:
# Create the features column
vecAssembler = VectorAssembler(inputCols=X_columns, outputCol="features")
data = vecAssembler.transform(data)

# Split the data into training and test sets (80% held out for testing)
(trainingData, testData) = data.randomSplit([0.8, 0.2])

# Model Training

In [7]:
# Train a RandomForest model.
rf = RandomForestRegressor(labelCol=y_column, numTrees=100)

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[rf])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", y_column, "features").show(5)

rfModel = model.stages
print(rfModel)  # summary only

+------------------+------+--------------------+
|        prediction| price|            features|
+------------------+------+--------------------+
|348821.05241557217|400000|[3.0,1.0,7.0,3.0,...|
| 871422.8317387827|872750|[3.0,2.5,10.0,3.0...|
| 997595.6252685287|795000|[3.0,3.5,10.0,3.0...|
| 303664.9780927223|130000|[3.0,1.0,7.0,4.0,...|
| 312307.3378118256|205000|[2.0,1.75,6.0,3.0...|
+------------------+------+--------------------+
only showing top 5 rows

[RandomForestRegressionModel (uid=RandomForestRegressor_42bc8f4229ecf52d691a) with 100 trees]


# Model Evaluation

In [8]:
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol=y_column, predictionCol="prediction", metricName="mae")
mae = evaluator.evaluate(predictions)
print("Mean Absolute Error (MAE) on test data = %g" % mae)

evaluator = RegressionEvaluator(labelCol=y_column, predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages
print(rfModel)  # summary only

Mean Absolute Error (MAE) on test data = 116454
Root Mean Squared Error (RMSE) on test data = 208939
[RandomForestRegressionModel (uid=RandomForestRegressor_42bc8f4229ecf52d691a) with 100 trees]
