Population vs. Median Home Prices
Linear Regression with Single Variable

Load and parse the data

In [1]:
import os

# Location of the combined housing CSV. The original hard-coded path remains the
# default; set the HOUSE_COMBINED_CSV environment variable to point the notebook
# at a copy of the file elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    "HOUSE_COMBINED_CSV",
    "/Users/cchavez/dev/Group_Project/HOUSE_COMBINED/HOUSE_COMBINED.csv",
)

# Use the Spark CSV datasource with options specifying:
#  - First line of file is a header
#  - Automatically infer the schema of the data
data = spark.read.csv(DATA_PATH, header=True, inferSchema=True)
data.cache()  # Cache data for faster reuse
data.count()  # Last expression: the row count is shown as the cell output

1716

In [2]:
# Show the freshly loaded DataFrame; the recorded output lists each column with its inferred type.
# NOTE(review): AvgSalesPrice is inferred as string in the recorded output — presumably it
# needs cleaning/casting before it can be used numerically; confirm against the CSV.
display(data)

DataFrame[ZIP Code: int, Year: int, Month: int, SalesCount: int, AvgSalesPrice: string]

In [3]:
# Discard every row that contains a null in any column, then report how many rows remain
data = data.na.drop(how="any")
data.count()

1716

In [4]:
from pyspark.sql.functions import col

# One select-expression per existing column: same column, aliased to its name with
# every space replaced by an underscore. Nothing is applied to `data` in this cell;
# the expressions are only used once they are passed to a select().
exprs = [
    col(name).alias(name.replace(' ', '_'))
    for name in data.columns
]


In [5]:
# Re-display the DataFrame. Building `exprs` did not modify `data`, so the recorded
# output still shows the original column names (e.g. "ZIP Code" with a space).
display(data)

DataFrame[ZIP Code: int, Year: int, Month: int, SalesCount: int, AvgSalesPrice: string]

In [None]:

# Apply the space-to-underscore aliases, then keep two columns under the names the
# modelling cells use: "population" (the feature) and "label" (the target).
# NOTE(review): the schema displayed earlier (ZIP Code, Year, Month, SalesCount,
# AvgSalesPrice) contains no 2014_Population_estimate / 2015_median_sales_price
# columns, so this selectExpr looks like it would fail on the file loaded above —
# confirm which dataset or which columns are intended.
vdata = data.select(*exprs).selectExpr("2014_Population_estimate as population", "2015_median_sales_price as label")
display(vdata)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler

# Single-stage pipeline: the assembler packs the "population" column into a
# vector column named "features".
assembler = VectorAssembler(inputCols=["population"], outputCol="features")
stages = [assembler]
pipeline = Pipeline(stages=stages)

# Fit on vdata, then transform that same frame to append the "features" column
pipelineModel = pipeline.fit(vdata)
dataset = pipelineModel.transform(vdata)

# Show only the two columns of interest
selectedcols = ["features", "label"]
display(dataset.select(*selectedcols))

Scatterplot of the data

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Bring the single feature value and the label of every row back to the driver
x = dataset.rdd.map(lambda row: row.features[0]).collect()
y = dataset.rdd.map(lambda row: row.label).collect()

# Global plot settings: classic style, then a default line width of 0 so that line
# plots (such as ax.loglog below) draw no visible connecting line. This is a
# process-wide rcParams change and therefore also affects plots in later cells.
plt.style.use('classic')
plt.rcParams['lines.linewidth'] = 0

fig, ax = plt.subplots()
ax.loglog(x, y)              # switches both axes to log scale
ax.set_xlim(1.0e5, 1.0e7)
ax.set_ylim(5.0e1, 1.0e3)
ax.scatter(x, y, c="blue")   # the visible points

display(fig)

Linear Regression

In [None]:
Goal
Predict y = 2015 Median Housing Price
Using feature x = 2014 Population Estimate

In [None]:
from pyspark.ml.regression import LinearRegression

# Unfitted estimator. The column names are spelled out for readability; they are
# the same values LinearRegression uses by default.
lr = LinearRegression(featuresCol="features", labelCol="label")

In [None]:
# Train the same estimator twice on `dataset`, overriding only regParam per fit:
# modelA is unregularized (0.0), modelB uses regParam = 100.0
modelA = lr.fit(dataset, params={lr.regParam: 0.0})
modelB = lr.fit(dataset, params={lr.regParam: 100.0})

# Report the intercept and the single coefficient of each fitted model
print(">>>> ModelA intercept: %r, coefficient: %r"
      % (modelA.intercept, modelA.coefficients[0]))
print(">>>> ModelB intercept: %r, coefficient: %r"
      % (modelB.intercept, modelB.coefficients[0]))
>>>> ModelA intercept: 191.29427575139394, coefficient: 3.779789682338248e-05
>>>> ModelB intercept: 199.85112564667153, coefficient: 2.1603499483717156e-05

Make predictions
Calling `transform()` on the data adds a new column of predictions.

In [None]:
# Make predictions
predictionsA = modelA.transform(dataset)
display(predictionsA)

In [None]:
# Same as for modelA, using the regularized model: `dataset` plus modelB's predictions
predictionsB = modelB.transform(dataset)
display(predictionsB)

Evaluate the Model
Predicted vs. True label

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-squared-error scorer, reused for both models.
evaluator = RegressionEvaluator(metricName="rmse")

# predictionsA was produced from the data modelA was trained on, so this is a
# training-set error, not a held-out estimate.
RMSE = evaluator.evaluate(predictionsA)
print("ModelA: Root Mean Squared Error = ", RMSE, sep="")

In [None]:
# Recompute modelB's predictions on `dataset` so this cell does not depend on the
# earlier display cell having been run, then score them with the shared evaluator.
predictionsB = modelB.transform(dataset)
RMSE = evaluator.evaluate(predictionsB)  # overwrites the ModelA value held in RMSE
print("ModelB: Root Mean Squared Error = ", RMSE, sep="")

Plot residuals versus fitted values


In [None]:
import matplotlib.pyplot as plt

# Residuals-vs-fitted plot for modelA, drawn explicitly with matplotlib.
# The two-argument display(model, data) form only renders this chart on Databricks;
# IPython's display() would just show the two objects' reprs, so the plot is built
# by hand here and works in either environment.
fit_rows = modelA.transform(dataset).select("prediction", "label").collect()
fitted = [r["prediction"] for r in fit_rows]
residuals = [r["label"] - r["prediction"] for r in fit_rows]

fig, ax = plt.subplots()
ax.scatter(fitted, residuals, c="blue")
# Width is given explicitly because the scatterplot cell sets the global default
# line width to 0, which would make this reference line invisible.
ax.axhline(0.0, color="gray", linewidth=1.0)
ax.set_xlabel("Fitted value (modelA prediction)")
ax.set_ylabel("Residual (label - prediction)")
ax.set_title("ModelA: residuals vs. fitted values")
display(fig)


Linear Regression Plots

In [None]:
import numpy as np
import pandas as pd
# Explicit import instead of `from pandas import *`: the star import injected every
# public pandas name into the notebook namespace, silently shadowing any same-named
# globals. The bare `DataFrame` name is kept available for code that relied on it.
from pandas import DataFrame

# Collect each series to the driver as a plain Python list.
# The four lists come from four separate collect() calls and are combined by
# position below, so this relies on every collect returning rows in the same order.
pop = dataset.rdd.map(lambda p: (p.features[0])).collect()
price = dataset.rdd.map(lambda p: (p.label)).collect()
predA = predictionsA.select("prediction").rdd.map(lambda r: r[0]).collect()
predB = predictionsB.select("prediction").rdd.map(lambda r: r[0]).collect()

# One row per observation: feature, true label, and each model's prediction
pydf = pd.DataFrame({'pop':pop,'price':price,'predA':predA, 'predB':predB})

View the pandas DataFrame (pydf)


In [None]:
# Bare last expression: the notebook renders the pandas DataFrame built in the previous cell
pydf

Display the scatterplot and the two regression models


In [None]:
# Overlay both models' predictions on the log-log scatter of the raw data.
# x, y, pop, predA and predB are the lists collected in earlier cells, and `plt`
# comes from the scatterplot cell. That cell also set the global default line width
# to 0, so the '-' in the format strings below draws no visible line and each model
# shows up as '.' markers only.
fig, ax = plt.subplots()
ax.loglog(x, y)     # switches both axes to log scale
ax.scatter(x, y)    # raw observations
ax.set_xlim(1.0e5, 1.0e7)
ax.set_ylim(5.0e1, 1.0e3)
# modelA predictions in red, modelB predictions in green
ax.plot(pop, predA, '.r-', pop, predB, '.g-')
display(fig)