In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.{MinMaxScaler,StandardScaler}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import breeze.plot._
import convert.jfc.tohtml

## Load the dataset

In [2]:
// Load the Boston housing CSV into a DataFrame: the first row supplies the
// column names, column types are inferred, and fields are comma-separated.
val df = spark.read.
  format("csv").
  options(Map("header" -> "true", "inferschema" -> "true", "delimiter" -> ",")).
  load("../Datasets/Boston.csv")

df = [CRIM: double, ZN: double ... 12 more fields]


[CRIM: double, ZN: double ... 12 more fields]

## Explore the dataset

In [3]:
// Preview the first ten rows of the raw data.
df.show(numRows = 10)

+-----------+----+-----------+----+-----------+-----------+-----------+-----------+---+---+-----------+-----------+-----------+-----------+
|       CRIM|  ZN|      INDUS|CHAS|        NOX|         RM|        AGE|        DIS|RAD|TAX|    PTRATIO|          B|      LSTAT|       MEDV|
+-----------+----+-----------+----+-----------+-----------+-----------+-----------+---+---+-----------+-----------+-----------+-----------+
|    0.00632|18.0|2.309999943|   0|0.537999988|6.574999809|65.19999695|4.090000153|  1|296|15.30000019|396.8999939|4.980000019|       24.0|
|0.027310001| 0.0|7.070000172|   0|0.469000012|6.421000004|78.90000153|4.967100143|  2|242|17.79999924|396.8999939|9.140000343|21.60000038|
|    0.02729| 0.0|7.070000172|   0|0.469000012|7.184999943|61.09999847|4.967100143|  2|242|17.79999924|392.8299866| 4.03000021|34.70000076|
|0.032370001| 0.0|2.180000067|   0|0.458000004|6.998000145|45.79999924|6.062200069|  3|222|18.70000076|394.6300049|2.940000057|33.40000153|
|0.069049999| 0.0|2.

In [4]:
// Print the inferred column names and types.
df.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)



In [5]:
// Summary statistics for the columns of df.
df.
  describe().
  show()

+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+
|summary|              CRIM|                ZN|             INDUS|              CHAS|               NOX|                RM|               AGE|               DIS|              RAD|               TAX|          PTRATIO|                 B|             LSTAT|             MEDV|
+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+
|  count|               506|               506|               506|               506|               506|               506|               506|               506|              506|  

In [6]:
// List the column names of the dataset.
df.schema.fieldNames

[CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT, MEDV]

In [7]:
// Number of columns (predictors plus the MEDV target).
df.columns.size

14

In [8]:
// Number of rows in the dataset.
df.count()

506

In [9]:
// Correlation (as returned by df.stat.corr) of every column with the target
// MEDV; MEDV itself is included and therefore correlates at 1.0.
// Built with a single map over the column names instead of appending to a
// mutable `var` list inside a loop (each `:+` on a List copies the list).
val L = df.columns.toList.map(cn => (cn, df.stat.corr("MEDV", cn)))

// Expose the (column name, correlation) pairs as a DataFrame for display/sorting.
val df_corr = L.toDF("colname","correlation")
df_corr.show()

+-------+--------------------+
|colname|         correlation|
+-------+--------------------+
|   CRIM| -0.3883046116575088|
|     ZN| 0.36044534463752903|
|  INDUS|-0.48372517128143383|
|   CHAS| 0.17526017775291847|
|    NOX| -0.4273207763683772|
|     RM|   0.695359937127267|
|    AGE|-0.37695456714288667|
|    DIS| 0.24992873873512172|
|    RAD| -0.3816262315669168|
|    TAX|-0.46853593528654536|
|PTRATIO| -0.5077867038116085|
|      B|  0.3334608226834164|
|  LSTAT| -0.7376627294671615|
|   MEDV|                 1.0|
+-------+--------------------+



L = List((CRIM,-0.3883046116575088), (ZN,0.36044534463752903), (INDUS,-0.48372517128143383), (CHAS,0.17526017775291847), (NOX,-0.4273207763683772), (RM,0.695359937127267), (AGE,-0.37695456714288667), (DIS,0.24992873873512172), (RAD,-0.3816262315669168), (TAX,-0.46853593528654536), (PTRATIO,-0.5077867038116085), (B,0.3334608226834164), (LSTAT,-0.7376627294671615), (MEDV,1.0))
df_corr = [colname: string, correlation: double]


[colname: string, correlation: double]

In [10]:
// Rank the columns by the absolute value of their correlation with MEDV,
// strongest first. The derived column is aliased explicitly (to the same name
// Spark produced before) so the sort no longer depends on Spark's
// auto-generated column naming.
df_corr.
  select(col("colname"), abs(col("correlation")).as("abs(correlation)")).
  sort(col("abs(correlation)").desc).
  show()

+-------+-------------------+                                                   
|colname|   abs(correlation)|
+-------+-------------------+
|   MEDV|                1.0|
|  LSTAT| 0.7376627294671615|
|     RM|  0.695359937127267|
|PTRATIO| 0.5077867038116085|
|  INDUS|0.48372517128143383|
|    TAX|0.46853593528654536|
|    NOX| 0.4273207763683772|
|   CRIM| 0.3883046116575088|
|    RAD| 0.3816262315669168|
|    AGE|0.37695456714288667|
|     ZN|0.36044534463752903|
|      B| 0.3334608226834164|
|    DIS|0.24992873873512172|
|   CHAS|0.17526017775291847|
+-------+-------------------+



## Apply Vector Assembler

In [11]:
// Predictor columns: every column except the target MEDV. Selecting by name
// (rather than dropping the last position) keeps this correct even if the
// column order of the input file changes.
val features = df.columns.filterNot(_ == "MEDV")

features = Array(CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT)


[CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT]

In [12]:
// Assembler that packs all predictor columns listed in `features` into a
// single vector column called "features".
val assembler = new VectorAssembler().setInputCols(features).setOutputCol("features")

assembler = vecAssembler_8cddf5b9128f


vecAssembler_8cddf5b9128f

In [13]:
// Apply the assembler and keep only what the model needs: the assembled
// feature vector and the target MEDV, renamed to "label".
val df_v = assembler.
  transform(df).
  select(col("features"), col("MEDV").alias("label"))

df_v.show(numRows = 10)

+--------------------+-----------+
|            features|      label|
+--------------------+-----------+
|[0.00632,18.0,2.3...|       24.0|
|[0.027310001,0.0,...|21.60000038|
|[0.02729,0.0,7.07...|34.70000076|
|[0.032370001,0.0,...|33.40000153|
|[0.069049999,0.0,...|36.20000076|
|[0.029850001,0.0,...|28.70000076|
|[0.088289998,12.5...|22.89999962|
|[0.144549996,12.5...|27.10000038|
|[0.211239994,12.5...|       16.5|
|[0.170039997,12.5...|18.89999962|
+--------------------+-----------+
only showing top 10 rows



df_v = [features: vector, label: double]


[features: vector, label: double]

## Split into train and test sets

In [14]:
// 70/30 train/test split. The seed is fixed so that the split -- and every
// metric computed downstream from it -- is reproducible across runs; without
// it each execution of the notebook draws a different random partition.
val Array(trainingData, testData) = df_v.randomSplit(Array(0.7, 0.3), seed = 42L)

trainingData = [features: vector, label: double]
testData = [features: vector, label: double]


[features: vector, label: double]

## Scale the features

In [15]:
// Min-max scaler reading "features" and writing "scaledFeatures".
// (StandardScaler is imported as well and could be swapped in here.)
val scaler = new MinMaxScaler().setInputCol("features").setOutputCol("scaledFeatures")

scaler = minMaxScal_51017282211c


minMaxScal_51017282211c

In [16]:
// Fit the scaler on the training split only, so the test split does not
// influence the scaling parameters.
val s = scaler.fit(dataset = trainingData)

s = minMaxScal_51017282211c


minMaxScal_51017282211c

In [17]:
// Apply the fitted scaler to both splits; each result gains a "scaledFeatures" column.
val Seq(trainingData_s, testData_s) =
  Seq(trainingData, testData).map(split => s.transform(split))

trainingData_s = [features: vector, label: double ... 1 more field]
testData_s = [features: vector, label: double ... 1 more field]


[features: vector, label: double ... 1 more field]

## Hyperparameter optimization with cross-validation

In [18]:
// Linear regression trained on the scaled feature vector, capped at 100
// iterations. The label column is left at its default ("label"); the
// regularisation settings are supplied later by the parameter grid.
val lr = new LinearRegression().
  setFeaturesCol("scaledFeatures").
  setMaxIter(100)

lr = linReg_f33951b07d6d


linReg_f33951b07d6d

In [19]:
// Evaluator scoring predictions by R^2. Label and prediction column names are
// not set explicitly, so the evaluator's defaults are used.
val evaluator = new RegressionEvaluator().
  setMetricName("r2")

evaluator = regEval_89c487447288


regEval_89c487447288

In [20]:
// Search grid for cross-validation: 8 regularisation strengths x 9 elastic-net
// mixing values = 72 candidate parameter maps.
// NOTE(review): the elasticNetParam list skips 0.3 -- confirm whether that is intentional.
val paramGrid = new ParamGridBuilder().
  addGrid(lr.regParam, Array(1.0e-4, 1.0e-3, 1.0e-2, 0.1, 1.0, 10.0, 100.0, 1000.0)).
  addGrid(lr.elasticNetParam, Array(0.1, 0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)).
  build()

paramGrid = 


Array({
	linReg_f33951b07d6d-elasticNetParam: 0.1,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.2,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.4,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.5,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.6,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.7,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.8,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.9,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 1.0,
	li...


[{
	linReg_f33951b07d6d-elasticNetParam: 0.1,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.2,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.4,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.5,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.6,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.7,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.8,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.9,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 1.0,
	linReg_f33951b07d6d-regParam: 1.0E-4
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.1,
	linReg_f33951b07d6d-regParam: 0.001
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.2,
	linReg_f33951b07d6d-regParam: 0.001
}, {
	linReg_f33951b07d6d-elasticNetParam: 0.4,
	linReg_f3

In [21]:
// 5-fold cross-validator: tries every parameter map in `paramGrid` on the
// linear regression `lr` and scores each candidate with `evaluator`.
val cv = new CrossValidator().
  setNumFolds(5).
  setEstimatorParamMaps(paramGrid).
  setEvaluator(evaluator).
  setEstimator(lr)

cv = cv_53a90bb8a865


cv_53a90bb8a865

In [22]:
// Run the cross-validated grid search on the scaled training split.
val model = cv.fit(dataset = trainingData_s)

model = cv_53a90bb8a865


cv_53a90bb8a865

In [23]:
// Persist the fitted cross-validation model, replacing any previous save at this path.
model.write.overwrite().save("Model_Parameters")

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


## Make predictions

In [24]:
// Score the scaled test split with the fitted model; the result adds a
// "prediction" column next to the existing columns.
val predictions = model.transform(dataset = testData_s)
predictions.show(numRows = 10)

+--------------------+-----------+--------------------+------------------+
|            features|      label|      scaledFeatures|        prediction|
+--------------------+-----------+--------------------+------------------+
|[0.01381,80.0,0.4...|       50.0|[8.41857976241994...| 39.64421970338811|
|[0.01439,60.0,2.9...|29.10000038|[9.07048580543777...|31.482708332313393|
|[0.01501,80.0,2.0...|       24.5|[9.76735088590511...|27.359296491325253|
|[0.020090001,95.0...|       50.0|[1.54771497659682...|41.736599319570125|
|[0.02055,85.0,0.7...|24.70000076|[1.59941775726616...|25.029032856037944|
|[0.028750001,28.0...|       25.0|[2.52107813737862...| 28.69728902630573|
|[0.029850001,0.0,...|28.70000076|[2.64471549036476...|25.279214966624657|
|[0.031129999,0.0,...|       17.5|[2.78858418540799...|16.704947274506523|
|[0.032370001,0.0,...|33.40000153|[2.92795742629665...|28.201738089235008|
|[0.03466,35.0,6.0...|19.39999962|[3.18534780329747...|23.629315746257216|
+--------------------+---

predictions = [features: vector, label: double ... 2 more fields]


[features: vector, label: double ... 2 more fields]

## Evaluate the model

In [25]:
// R^2 of the model on the held-out test predictions.
val r2 = evaluator.evaluate(dataset = predictions)

r2 = 0.7301019836506952


0.7301019836506952

## Compare data with prediction

In [26]:
// Bring labels and predictions to the driver in ONE collect, then derive the
// three arrays locally. Separate collects would re-run the prediction plan
// three times and give no guarantee that element i of each array comes from
// the same row, which the residual plot below relies on.
val labelAndPrediction = predictions.
  select(col("label"), col("prediction")).
  as[(Double, Double)].
  collect

// y: observed labels; yp: model predictions; d: residuals (prediction - label).
val y = labelAndPrediction.map(_._1)
val yp = labelAndPrediction.map(_._2)
val d = labelAndPrediction.map { case (label, prediction) => prediction - label }

y = Array(50.0, 29.10000038, 24.5, 50.0, 24.70000076, 25.0, 28.70000076, 17.5, 33.40000153, 19.39999962, 45.40000153, 33.29999924, 18.20000076, 11.89999962, 25.0, 27.10000038, 24.60000038, 23.89999962, 33.09999847, 22.39999962, 18.89999962, 33.20000076, 36.20000076, 28.70000076, 22.20000076, 32.0, 37.29999924, 21.0, 43.79999924, 26.39999962, 37.90000153, 23.60000038, 23.29999924, 32.0, 20.0, 21.70000076, 22.0, 21.5, 20.10000038, 21.39999962, 21.20000076, 18.79999924, 29.79999924, 22.60000038, 20.29999924, 25.29999924, 18.5, 17.29999924, 15.19999981, 24.5, 23.79999924, 23.10000038, 20.5, 8.100000381, 19.29999924, 22.39999962, 18.70000076, 24.39999962, 21.20000076, 22.20000076, 13.30000019, 14.39999962, 19.39999962, 44.79999924, 17.79999924, 17.39999962, 48.29999924, 17.100...


[50.0, 29.10000038, 24.5, 50.0, 24.70000076, 25.0, 28.70000076, 17.5, 33.40000153, 19.39999962, 45.40000153, 33.29999924, 18.20000076, 11.89999962, 25.0, 27.10000038, 24.60000038, 23.89999962, 33.09999847, 22.39999962, 18.89999962, 33.20000076, 36.20000076, 28.70000076, 22.20000076, 32.0, 37.29999924, 21.0, 43.79999924, 26.39999962, 37.90000153, 23.60000038, 23.29999924, 32.0, 20.0, 21.70000076, 22.0, 21.5, 20.10000038, 21.39999962, 21.20000076, 18.79999924, 29.79999924, 22.60000038, 20.29999924, 25.29999924, 18.5, 17.29999924, 15.19999981, 24.5, 23.79999924, 23.10000038, 20.5, 8.100000381, 19.29999924, 22.39999962, 18.70000076, 24.39999962, 21.20000076, 22.20000076, 13.30000019, 14.39999962, 19.39999962, 44.79999924, 17.79999924, 17.39999962, 48.29999924, 17.10000038, 42.79999924, 21.70000076, 31.60000038, 31.5, 48.79999924, 24.29999924, 17.79999924, 18.10000038, 41.70000076, 50.0, 20.39999962, 18.20000076, 30.10000038, 16.60000038, 18.20000076, 15.60000038, 22.79999924, 13.89999962, 

In [27]:
// Create a breeze figure and take its plot at index 0; the series and axis
// settings are attached to `plt` in the next cell.
val fig = Figure()
val plt = fig.subplot(0)

fig = breeze.plot.Figure@2a88547e
plt = breeze.plot.Plot@321d63b0


breeze.plot.Plot@321d63b0

In [28]:
// Residual plot: for each test row, the residual d (prediction - label) is
// drawn against the observed label y.
plt += plot(y, d, '+', name = "y_pred-y", colorcode="blue")
// Horizontal reference line at residual 0 spanning x in [0, 50].
// NOTE(review): its legend name is "y", although it marks the zero-residual level -- confirm the intended label.
plt += plot(Array(0,50), Array(0,0), '-', name ="y", colorcode="red")

// Show the legend and label the axes; the x range is pinned to [0, 50].
plt.legend = true
//plt.title = "xxx"
plt.xlabel = "y"
plt.xlim(0,50)
plt.ylabel = "y_pred -y"
//plt.ylim(0,6)

plt.legend: Boolean = true
plt.xlabel: String = y
plt.ylabel: String = y_pred -y


In [29]:
// Render the plot inline: the plot's chart is passed through `tohtml`
// (imported from convert.jfc; presumably yields an HTML string -- verify)
// and handed to the kernel's html magic for display.
kernel.magics.html(tohtml(plt.chart))