In [1]:
# Obtain the SparkSession that every later cell uses as `spark`.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .getOrCreate()
)

In [2]:
# Read the housing CSV (first line is the header, column types are inferred)
# and print the first rows as a sanity check.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('housing_data.csv')
)
data.show(n=20, truncate=True)

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio|     b|lstat|medv|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311|   15.2| 395.6|12.43|22.9|
|0.14455|12.5| 7.87|   0|0.524|6.172| 96.1|5.9505|  5|311|   15.2| 396.9|19.15|27.1|
|0.21124|12.5| 7.87|   0|0.524|5.631|100.0|6.0821|  5|311|   15.2

In [3]:
from pyspark.ml.feature import VectorAssembler

# Every column except the label is a model input. The label is excluded by
# name rather than by position, so the selection still holds if the column
# order of the CSV ever changes.
label_column = "medv"
feature_columns = [name for name in data.columns if name != label_column]

# Packs the input columns into a single vector column called "features".
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [4]:
# Run the assembler over the loaded frame; the result carries an extra
# "features" column alongside the original columns.
data_2 = assembler.transform(dataset=data)

In [5]:
# The assembled "features" vector now appears as the right-most column.
data_2.show(n=20, truncate=True)

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio|     b|lstat|medv|            features|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|[0.00632,18.0,2.3...|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|[0.02731,0.0,7.07...|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|[0.02729,0.0,7.07...|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|[0.03237,0.0,2.18...|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|[0.06905,0.0,2.18...|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|[0.02985,0.0,2.18...|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5

In [6]:
# Divide the data into training (~70%) and test (~30%) sets.
# A fixed seed keeps the split, and therefore every metric reported below,
# stable across re-runs (given the same input data and partitioning).
train, test = data_2.randomSplit([0.7, 0.3], seed=42)

In [7]:
#Train using Linear Regression
from pyspark.ml.regression import LinearRegression

In [8]:
# Linear regression that reads inputs from "features" and the target from "medv".
algo = LinearRegression(
    labelCol="medv",
    featuresCol="features",
)

In [9]:
# Fit the linear regression on the training split; `model` is the fitted result.
model = algo.fit(dataset=train)

In [10]:
# Report the fitted parameters: one coefficient per input feature, then the intercept.
print("Coefficients: %s" % (model.coefficients,))
print("Intercept: %s" % (model.intercept,))

Coefficients: [-0.15863767534813358,0.040593569129485564,0.009411207605170939,4.306718889090229,-16.20445285077019,4.6994198905113755,-0.0220284113317002,-1.3187644546592525,0.3606117334165998,-0.014940459853213392,-0.8769699322670893,0.010477901375876252,-0.38191946360392404]
Intercept: 28.05052029206885


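Based on the coefficients and intercept printed above, the fitted model is the following linear equation, where a through m are the thirteen coefficients in the order printed:
medv = intercept + a*crim + b*zn + c*indus + d*chas + e*nox + f*rm + g*age + h*dis + i*rad + j*tax + k*ptratio + l*b + m*lstat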

In [11]:
# Training-set diagnostics for the fitted linear model.
trainingSummary = model.summary

# Solver information.
print("numIterations: %d" % (trainingSummary.totalIterations,))
print("objectiveHistory: %s" % (trainingSummary.objectiveHistory,))

# Per-row residuals on the training data (first rows only).
trainingSummary.residuals.show(n=20, truncate=True)

# Goodness-of-fit on the training data.
print("RMSE: %f" % (trainingSummary.rootMeanSquaredError,))
print("r2: %f" % (trainingSummary.r2,))

numIterations: 1
objectiveHistory: [0.0]
+--------------------+
|           residuals|
+--------------------+
|  -4.929948724390787|
| -0.0860742847362701|
|  -5.438587575279584|
|  3.0619096055285837|
|   8.683356682971208|
| -1.3648837052966272|
| -2.3269654371723547|
| -3.4474032585604455|
|   6.374676034480373|
|   3.782710215998449|
|  1.8869397417503144|
| -2.4239549309798285|
|   8.471783508015875|
|    6.31418961702073|
|-0.08283831727123925|
|   4.876152252627207|
|  -1.369336951346419|
|  -5.219257907827881|
|  -3.494290101187591|
| -2.9628378348236417|
+--------------------+
only showing top 20 rows

RMSE: 4.502752
r2: 0.746963


In [12]:
# Score the fitted model against the held-out test split.
evaluation_summary = model.evaluate(dataset=test)

In [13]:
# Test-set metrics, labelled so each number can be identified when reading the
# output (the training-set cell labels its metrics the same way).
print("MAE: %s" % (evaluation_summary.meanAbsoluteError,))
print("RMSE: %s" % (evaluation_summary.rootMeanSquaredError,))
print("r2: %s" % (evaluation_summary.r2,))


3.4751342901886098
5.25964440891253
0.6995906016079638


In [14]:
# Apply the fitted linear model to the test split; the result gains a
# "prediction" column.
predictions = model.transform(dataset=test)

In [15]:
# Show the actual value next to the model's prediction. Columns are selected
# by name instead of by position (columns[13:]), so the cell keeps working if
# input columns are added, removed or reordered.
predictions.select("medv", "features", "prediction").show()

+----+--------------------+------------------+
|medv|            features|        prediction|
+----+--------------------+------------------+
|32.7|[0.01301,35.0,1.5...|30.546130722949663|
|18.9|[0.0136,75.0,4.0,...|15.532336376177499|
|50.0|[0.01501,90.0,1.2...|46.860915246021094|
|20.1|[0.01965,80.0,1.7...|21.109855636725854|
|34.7|[0.02729,0.0,7.07...|30.454266926048334|
|18.5|[0.03041,0.0,5.19...|19.579885330689486|
|31.2|[0.03049,55.0,3.7...| 28.77430356332234|
|17.5|[0.03113,0.0,4.39...|16.905255920813108|
|34.9|[0.0315,95.0,1.47...|30.318147207337315|
|34.9|[0.03359,75.0,2.9...| 34.37722065656257|
|28.5|[0.03502,80.0,4.9...| 33.39050960689039|
|22.0|[0.03537,34.0,6.0...|29.310896800817957|
|22.9|[0.03551,25.0,4.8...|24.839333539948008|
|20.7|[0.03738,0.0,5.19...|22.280020128581253|
|22.0|[0.03932,0.0,3.41...| 26.61560366067733|
|28.0|[0.04113,25.0,4.8...| 28.67434503629106|
|22.9|[0.04203,28.0,15....| 28.53171560429838|
|19.4|[0.04379,80.0,3.3...|25.200970538055348|
|22.3|[0.0459

In [16]:
# Second model for comparison: a decision-tree regressor trained and scored on
# the same train/test split as the linear regression.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import DecisionTreeRegressor

dt = DecisionTreeRegressor(labelCol='medv', featuresCol='features')
dt_model = dt.fit(dataset=train)
dt_predictions = dt_model.transform(dataset=test)

# RMSE of the tree's predictions against the true "medv" values.
dt_evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="medv",
    predictionCol="prediction",
)
rmse = dt_evaluator.evaluate(dataset=dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % (rmse,))

Root Mean Squared Error (RMSE) on test data = 5.37561


In [17]:
# Peek at a handful of decision-tree predictions beside the true values.
dt_preview_columns = ["prediction", "medv", "features"]
dt_predictions.select(dt_preview_columns).show(n=5)

+------------------+----+--------------------+
|        prediction|medv|            features|
+------------------+----+--------------------+
|32.682352941176475|32.7|[0.01301,35.0,1.5...|
|20.395689655172415|18.9|[0.0136,75.0,4.0,...|
|48.599999999999994|50.0|[0.01501,90.0,1.2...|
|20.395689655172415|20.1|[0.01965,80.0,1.7...|
|23.600000000000023|34.7|[0.02729,0.0,7.07...|
+------------------+----+--------------------+
only showing top 5 rows



In [18]:
# Importance the fitted decision tree assigns to each input feature.
# A pasted copy of the linear-regression coefficients used to follow this
# expression; as the cell's last expression it would have been displayed
# instead of the importances. Those values are already printed from
# model.coefficients, so the stray literal is removed.
dt_model.featureImportances

SparseVector(13, {0: 0.0382, 2: 0.0113, 4: 0.0365, 5: 0.1896, 6: 0.0039, 7: 0.0339, 9: 0.0028, 10: 0.0069, 11: 0.0002, 12: 0.6767})