In [1]:
# findspark.init() runs before the pyspark import below — presumably so that the
# local Spark installation becomes importable; confirm against the findspark docs.
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [3]:
# Obtain the SparkSession for this notebook: getOrCreate() hands back the
# already-active session when one exists, otherwise it starts a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

# Leave the session as the cell's last expression so the notebook displays it.
spark

In [4]:
# Load the housing CSV: the first row supplies the column names and the
# column types are inferred from the values.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('boston_housing.csv')
)

In [5]:
# Show the DataFrame repr (column names with their inferred types).
data

DataFrame[crim: double, zn: double, indus: double, chas: int, nox: double, rm: double, age: double, dis: double, rad: int, tax: int, ptratio: double, b: double, lstat: double, medv: double]

In [6]:
# Print the leading rows as a text table to eyeball the raw values.
data.show()

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio|     b|lstat|medv|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311|   15.2| 395.6|12.43|22.9|
|0.14455|12.5| 7.87|   0|0.524|6.172| 96.1|5.9505|  5|311|   15.2| 396.9|19.15|27.1|
|0.21124|12.5| 7.87|   0|0.524|5.631|100.0|6.0821|  5|311|   15.2

In [7]:
# Set up the data as a single feature vector plus the label column.
from pyspark.ml.feature import VectorAssembler

# Every column except the target "medv" is a predictor. The label is excluded
# by name rather than by position (columns[:-1]) so that a change in the CSV
# column order can never leak the label into the feature vector.
feature_columns = [name for name in data.columns if name != "medv"]

# Pack the predictor columns into one vector column called "features".
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# transform() keeps the original columns and appends the "features" column.
df = assembler.transform(data)
df.show()

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio|     b|lstat|medv|            features|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|[0.00632,18.0,2.3...|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|[0.02731,0.0,7.07...|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|[0.02729,0.0,7.07...|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|[0.03237,0.0,2.18...|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|[0.06905,0.0,2.18...|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|[0.02985,0.0,2.18...|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5

In [8]:
# 70/30 train/test split. The seed is fixed so the split — and therefore every
# metric computed from it — is reproducible across kernel restarts.
train, test = df.randomSplit([0.7, 0.3], seed=42)

In [11]:
from pyspark.ml.regression import LinearRegression

# Linear regression estimator: predictors are read from the "features" vector
# column and the target from the "medv" column.
algo = LinearRegression(
    labelCol="medv",
    featuresCol="features",
)

In [12]:
# Fit the linear regression estimator on the training split.

model = algo.fit(train)

In [13]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the metrics read in the following cells (meanAbsoluteError,
# rootMeanSquaredError, r2).
evaluation_summary = model.evaluate(test)

Exception ignored in: <function JavaWrapper.__del__ at 0x00000000061B6BF8>
Traceback (most recent call last):
  File "C:\spark-2.4.4-bin-hadoop2.7\python\pyspark\ml\wrapper.py", line 40, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'LinearRegression' object has no attribute '_java_obj'


In [16]:
# Mean absolute error of the model on the test split.
evaluation_summary.meanAbsoluteError

3.409699162433982

In [17]:
# Root mean squared error of the model on the test split.
evaluation_summary.rootMeanSquaredError

4.640266647528652

In [18]:
# R^2 (coefficient of determination) of the model on the test split.
evaluation_summary.r2

0.6376343957107979

In [19]:
# Score the test split; the result carries the test columns plus a
# "prediction" column produced by the model.
predictions = model.transform(test)

In [20]:
# Compare actual vs. predicted values. The columns are selected by name
# instead of the positional slice columns[13:], which silently shows the wrong
# columns as soon as a column is added to or removed from the input data.
predictions.select("medv", "features", "prediction").show()

+----+--------------------+------------------+
|medv|            features|        prediction|
+----+--------------------+------------------+
|32.2|[0.00906,90.0,2.9...|31.754365568341804|
|50.0|[0.01381,80.0,0.4...| 41.56684332653541|
|31.6|[0.01432,100.0,1....| 34.14998885104967|
|24.5|[0.01501,80.0,2.0...| 28.34364782312287|
|23.1|[0.0187,85.0,4.15...|26.178150478939454|
|24.7|[0.02055,85.0,0.7...|25.935355620001385|
|42.3|[0.02177,82.5,2.0...|37.767075270484455|
|16.5|[0.02498,0.0,1.89...|22.667552583367556|
|23.9|[0.02543,55.0,3.7...| 28.52866798213852|
|31.2|[0.03049,55.0,3.7...|29.463165194775808|
|20.6|[0.03306,0.0,5.19...|22.419306389201083|
|24.1|[0.03445,82.5,2.0...| 29.59171962734775|
|28.5|[0.03502,80.0,4.9...|34.314052370024015|
|22.0|[0.03537,34.0,6.0...| 28.97424902947539|
|35.4|[0.03705,20.0,3.3...| 34.74613217457992|
|20.7|[0.03738,0.0,5.19...|22.283186489076932|
|23.2|[0.03871,52.5,5.3...|27.499199654179122|
|22.0|[0.03932,0.0,3.41...|27.816052900974825|
|21.1|[0.0396