In [1]:
from pyspark import SparkContext, SparkConf
from pyspark import SQLContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Spark configuration: application name plus a local master.
conf = SparkConf().setAppName("Analisis Housing").setMaster("local")
# getOrCreate() hands back the SparkContext that is already running in this
# kernel instead of building a second one, so re-executing this cell no longer
# fails with "Cannot run multiple SparkContexts at once". Note that `conf` is
# only applied when a new context actually has to be created.
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

# Read the CSV directly into a DataFrame, taking the column names from the
# header line. No schema and no inferSchema are given, so the reader leaves
# every column as a string.
df = sqlContext.read.csv("housing(1).csv", header=True)
# The next cell works on the row-level RDD, so keep exposing it as `rdd`.
rdd = df.rdd

df.show()

+-----+-----+-------+--------+
|   RM|LSTAT|PTRATIO|    MEDV|
+-----+-----+-------+--------+
|6.575| 4.98|   15.3|504000.0|
|6.421| 9.14|   17.8|453600.0|
|7.185| 4.03|   17.8|728700.0|
|6.998| 2.94|   18.7|701400.0|
|7.147| 5.33|   18.7|760200.0|
| 6.43| 5.21|   18.7|602700.0|
|6.012|12.43|   15.2|480900.0|
|6.172|19.15|   15.2|569100.0|
|5.631|29.93|   15.2|346500.0|
|6.004| 17.1|   15.2|396900.0|
|6.377|20.45|   15.2|315000.0|
|6.009|13.27|   15.2|396900.0|
|5.889|15.71|   15.2|455700.0|
|5.949| 8.26|   21.0|428400.0|
|6.096|10.26|   21.0|382200.0|
|5.834| 8.47|   21.0|417900.0|
|5.935| 6.58|   21.0|485100.0|
| 5.99|14.67|   21.0|367500.0|
|5.456|11.69|   21.0|424200.0|
|5.727|11.28|   21.0|382200.0|
+-----+-----+-------+--------+
only showing top 20 rows



In [2]:
# Convert the first four fields of every row to float so that the rebuilt
# DataFrame has numeric columns. The mapped rows are plain tuples, so the
# DataFrame created from them carries the positional column names _1 to _4.
rdd = rdd.map(lambda row: tuple(float(row[i]) for i in range(4)))
df = rdd.toDF()

In [3]:
# Pack the three predictor columns (_1, _2, _3) into a single vector column
# named "features".
assembler = VectorAssembler(inputCols=["_1", "_2", "_3"], outputCol="features")
# transform() refuses to run when its output column already exists, which made
# this cell fail when executed a second time (df already had "features").
# drop() is a no-op when the column is absent, so the first run is unchanged.
df = assembler.transform(df.drop("features"))
df.show()

+-----+-----+----+--------+------------------+
|   _1|   _2|  _3|      _4|          features|
+-----+-----+----+--------+------------------+
|6.575| 4.98|15.3|504000.0| [6.575,4.98,15.3]|
|6.421| 9.14|17.8|453600.0| [6.421,9.14,17.8]|
|7.185| 4.03|17.8|728700.0| [7.185,4.03,17.8]|
|6.998| 2.94|18.7|701400.0| [6.998,2.94,18.7]|
|7.147| 5.33|18.7|760200.0| [7.147,5.33,18.7]|
| 6.43| 5.21|18.7|602700.0|  [6.43,5.21,18.7]|
|6.012|12.43|15.2|480900.0|[6.012,12.43,15.2]|
|6.172|19.15|15.2|569100.0|[6.172,19.15,15.2]|
|5.631|29.93|15.2|346500.0|[5.631,29.93,15.2]|
|6.004| 17.1|15.2|396900.0| [6.004,17.1,15.2]|
|6.377|20.45|15.2|315000.0|[6.377,20.45,15.2]|
|6.009|13.27|15.2|396900.0|[6.009,13.27,15.2]|
|5.889|15.71|15.2|455700.0|[5.889,15.71,15.2]|
|5.949| 8.26|21.0|428400.0| [5.949,8.26,21.0]|
|6.096|10.26|21.0|382200.0|[6.096,10.26,21.0]|
|5.834| 8.47|21.0|417900.0| [5.834,8.47,21.0]|
|5.935| 6.58|21.0|485100.0| [5.935,6.58,21.0]|
| 5.99|14.67|21.0|367500.0| [5.99,14.67,21.0]|
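|5.456|11.69|21.0|424200.0|[5.456,11.69,21.0]|
|5.727|11.28|21.0|382200.0|[5.727,11.28,21.0]|
+-----+-----+----+--------+------------------+
only showing top 20 rows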

In [4]:
# Randomly divide the rows into two DataFrames using weights 0.8 (training)
# and 0.2 (test); a fixed seed is passed to the sampler.
dfTrain, dfTest = df.randomSplit(weights=[0.8, 0.2], seed=12345)

# Row count of the training split.
dfTrain.count()

399

In [5]:
# Row count of the test split produced by randomSplit.
dfTest.count()

90

In [6]:
# Linear regression estimator: reads the assembled "features" vector as input
# and column _4 as the label, with at most 100 iterations and a
# regularisation parameter of 0.001.
lr = LinearRegression(
    featuresCol="features",
    labelCol="_4",
    maxIter=100,
    regParam=0.001,
)
# Fit on the training split only; the test split is kept for evaluation.
lrModel = lr.fit(dfTrain)

In [7]:
# Fitted coefficients, one per entry of the "features" vector (_1, _2, _3 in that order).
lrModel.coefficients

DenseVector([79801.3016, -11021.8532, -19306.5604])

In [8]:
# Fitted intercept term of the linear model.
lrModel.intercept

456470.1768002261

In [12]:
# RegressionEvaluator is imported together with the other pyspark imports in
# the first cell, so all dependencies are declared in one place.

# Apply the fitted model to the test split; the evaluator below reads the
# model output from the "prediction" column.
dfPrediction = lrModel.transform(dfTest)
# Root-mean-squared error between "prediction" and the true label in _4.
evaluador = RegressionEvaluator(predictionCol="prediction",
                                labelCol="_4", metricName="rmse")
evaluador.evaluate(dfPrediction)

93937.27430755217