In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.{MinMaxScaler,StandardScaler}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import breeze.plot._
import convert.jfc.tohtml

## Load the dataset

In [2]:
// Load the Boston housing CSV: the first row supplies the column names and
// Spark infers the column types from the data.
val df = spark.read.
  options(Map(
    "header" -> "true",
    "inferschema" -> "true",
    "delimiter" -> ",")).
  csv("../Datasets/Boston.csv")

df = [CRIM: double, ZN: double ... 12 more fields]


[CRIM: double, ZN: double ... 12 more fields]

## Explore the dataset

In [3]:
// Print the first 10 rows as a quick sanity check of the loaded data.
df.show(10)

+-----------+----+-----------+----+-----------+-----------+-----------+-----------+---+---+-----------+-----------+-----------+-----------+
|       CRIM|  ZN|      INDUS|CHAS|        NOX|         RM|        AGE|        DIS|RAD|TAX|    PTRATIO|          B|      LSTAT|       MEDV|
+-----------+----+-----------+----+-----------+-----------+-----------+-----------+---+---+-----------+-----------+-----------+-----------+
|    0.00632|18.0|2.309999943|   0|0.537999988|6.574999809|65.19999695|4.090000153|  1|296|15.30000019|396.8999939|4.980000019|       24.0|
|0.027310001| 0.0|7.070000172|   0|0.469000012|6.421000004|78.90000153|4.967100143|  2|242|17.79999924|396.8999939|9.140000343|21.60000038|
|    0.02729| 0.0|7.070000172|   0|0.469000012|7.184999943|61.09999847|4.967100143|  2|242|17.79999924|392.8299866| 4.03000021|34.70000076|
|0.032370001| 0.0|2.180000067|   0|0.458000004|6.998000145|45.79999924|6.062200069|  3|222|18.70000076|394.6300049|2.940000057|33.40000153|
|0.069049999| 0.0|2.

In [4]:
// Print the column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)



In [5]:
// Show per-column summary statistics for the dataset.
df.describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+
|summary|              CRIM|                ZN|             INDUS|              CHAS|               NOX|                RM|               AGE|               DIS|              RAD|               TAX|          PTRATIO|                 B|             LSTAT|             MEDV|
+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+
|  count|               506|               506|               506|               506|               506|               506|               506|               506|              506|  

In [6]:
// List the column names.
df.columns

[CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT, MEDV]

In [7]:
// Number of columns (features plus the MEDV target).
df.columns.length

14

In [8]:
// Number of rows in the dataset.
df.count()

506

In [9]:
// Correlation of every column (MEDV itself included) with the target MEDV,
// as (column name, correlation) pairs in the original column order.
// Built with a single `map` instead of appending to a `var` List inside a loop:
// `:+` on a List copies the list on every append and the mutation is unnecessary.
val L = df.columns.toList.map(cn => (cn, df.stat.corr("MEDV", cn)))

// Turn the pairs into a two-column DataFrame for display and sorting.
val df_corr = L.toDF("colname","correlation")
df_corr.show()

+-------+--------------------+
|colname|         correlation|
+-------+--------------------+
|   CRIM| -0.3883046116575088|
|     ZN| 0.36044534463752903|
|  INDUS|-0.48372517128143383|
|   CHAS| 0.17526017775291847|
|    NOX| -0.4273207763683772|
|     RM|   0.695359937127267|
|    AGE|-0.37695456714288667|
|    DIS| 0.24992873873512172|
|    RAD| -0.3816262315669168|
|    TAX|-0.46853593528654536|
|PTRATIO| -0.5077867038116085|
|      B|  0.3334608226834164|
|  LSTAT| -0.7376627294671615|
|   MEDV|                 1.0|
+-------+--------------------+



L = List((CRIM,-0.3883046116575088), (ZN,0.36044534463752903), (INDUS,-0.48372517128143383), (CHAS,0.17526017775291847), (NOX,-0.4273207763683772), (RM,0.695359937127267), (AGE,-0.37695456714288667), (DIS,0.24992873873512172), (RAD,-0.3816262315669168), (TAX,-0.46853593528654536), (PTRATIO,-0.5077867038116085), (B,0.3334608226834164), (LSTAT,-0.7376627294671615), (MEDV,1.0))
df_corr = [colname: string, correlation: double]


[colname: string, correlation: double]

In [10]:
// Rank the columns by the magnitude of their correlation with MEDV, strongest first.
// The absolute-value column is aliased explicitly so the sort does not rely on the
// name Spark auto-generates for an unnamed expression; the alias keeps the same
// header, so the printed table is unchanged.
df_corr.
  select(col("colname"), abs(col("correlation")).as("abs(correlation)")).
  sort(col("abs(correlation)").desc).
  show()

+-------+-------------------+
|colname|   abs(correlation)|
+-------+-------------------+
|   MEDV|                1.0|
|  LSTAT| 0.7376627294671615|
|     RM|  0.695359937127267|
|PTRATIO| 0.5077867038116085|
|  INDUS|0.48372517128143383|
|    TAX|0.46853593528654536|
|    NOX| 0.4273207763683772|
|   CRIM| 0.3883046116575088|
|    RAD| 0.3816262315669168|
|    AGE|0.37695456714288667|
|     ZN|0.36044534463752903|
|      B| 0.3334608226834164|
|    DIS|0.24992873873512172|
|   CHAS|0.17526017775291847|
+-------+-------------------+



## Apply Vector Assembler

In [11]:
// Feature columns: every column except the target MEDV. The target is excluded by
// name rather than by position, so this stays correct even if MEDV is not the last
// column of the input file.
val features = df.columns.filter(_ != "MEDV")

features = Array(CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT)


[CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT]

In [12]:
// Assembler that packs the individual feature columns into a single vector
// column named "features".
val assembler = new VectorAssembler().
  setOutputCol("features").
  setInputCols(features)

assembler = vecAssembler_387f9e3b7c82


vecAssembler_387f9e3b7c82

In [13]:
// Assemble the feature vector, then keep only the two columns the model needs:
// the "features" vector and the MEDV target renamed to "label".
val df_v = {
  val assembled = assembler.transform(df)
  assembled.select(col("features"), col("MEDV").alias("label"))
}

df_v.show(10)

+--------------------+-----------+
|            features|      label|
+--------------------+-----------+
|[0.00632,18.0,2.3...|       24.0|
|[0.027310001,0.0,...|21.60000038|
|[0.02729,0.0,7.07...|34.70000076|
|[0.032370001,0.0,...|33.40000153|
|[0.069049999,0.0,...|36.20000076|
|[0.029850001,0.0,...|28.70000076|
|[0.088289998,12.5...|22.89999962|
|[0.144549996,12.5...|27.10000038|
|[0.211239994,12.5...|       16.5|
|[0.170039997,12.5...|18.89999962|
+--------------------+-----------+
only showing top 10 rows



df_v = [features: vector, label: double]


[features: vector, label: double]

## Split into train and test sets

In [14]:
// 70% / 30% train/test split. A fixed seed makes the split, and therefore the fitted
// coefficients and the reported metric, reproducible from run to run; without it
// every execution of the notebook produces a different split.
val Array(trainingData, testData) = df_v.randomSplit(Array(0.7, 0.3), seed = 42L)

trainingData = [features: vector, label: double]
testData = [features: vector, label: double]


[features: vector, label: double]

## Scale the features

In [15]:
// Min-max scaler that reads the "features" vector and writes "scaledFeatures".
// (StandardScaler is imported as well and could be swapped in here.)
val scaler = new MinMaxScaler().
  setOutputCol("scaledFeatures").
  setInputCol("features")

scaler = minMaxScal_2df6595ca308


minMaxScal_2df6595ca308

In [16]:
// Fit the scaler on the training split only; the test split is not used for fitting.
val s = scaler.fit(trainingData)

s = minMaxScal_2df6595ca308


minMaxScal_2df6595ca308

In [17]:
// Apply the fitted scaler model `s` to both splits; each result carries one extra
// column holding the scaled feature vector.
val trainingData_s = s.transform(trainingData)
val testData_s = s.transform(testData)

trainingData_s = [features: vector, label: double ... 1 more field]
testData_s = [features: vector, label: double ... 1 more field]


[features: vector, label: double ... 1 more field]

## Train the model

In [18]:
// Linear regression trained on the scaled feature vectors, capped at 100 iterations.
// The label column is left at its default name ("label"), so only the features
// column needs to be pointed at "scaledFeatures".
// NOTE(review): regParam is never set here. If it keeps Spark's documented default
// of 0.0, the elastic-net mixing value of 1.0 has no penalty to act on — confirm
// whether L1 regularization was actually intended.
val lr = new LinearRegression().
  setFeaturesCol("scaledFeatures").
  setElasticNetParam(1.0).
  setMaxIter(100)

lr = linReg_63689319774f


linReg_63689319774f

In [19]:
// Fit the regression on the scaled training split.
val model = lr.fit(trainingData_s)

model = linReg_63689319774f


linReg_63689319774f

In [20]:
// Number of iterations recorded in the training summary (compare with the setMaxIter cap).
model.summary.totalIterations

24

In [21]:
// Fitted coefficients, one per entry of the scaled feature vector.
model.coefficients

[-6.805481948655786,3.924882568841738,0.5117191964512379,4.0993418284898295,-7.23981666511485,19.576016793199187,-1.0872222153354834,-13.94842721331006,5.947537195405698,-5.85449545417968,-9.20608641752296,2.2001311214286288,-16.749445074992824]

In [22]:
// Fitted intercept term.
model.intercept

27.314598301900453

## Make predictions

In [23]:
// Score the scaled test split; the result adds a "prediction" column next to "label".
val predictions = model.transform(testData_s)
predictions.show(10)

+--------------------+-----------+--------------------+------------------+
|            features|      label|      scaledFeatures|        prediction|
+--------------------+-----------+--------------------+------------------+
|[0.01311,90.0,1.2...|35.40000153|[9.99786801575190...|31.043582356437287|
|[0.01381,80.0,0.4...|       50.0|[1.10285760586129...|40.124494413770854|
|[0.01432,100.0,1....|31.60000038|[1.17795204898402...|32.768275659976844|
|[0.01501,80.0,2.0...|       24.5|[1.27955041320889...|27.669154484393538|
|[0.01501,90.0,1.2...|       50.0|[1.27955041320889...|45.488305664767125|
|[0.01538,90.0,3.7...|       44.0|[1.33403069547440...| 36.82859819933283|
|[0.01778,95.0,1.4...|32.90000153|[1.68741631016961...|30.739932791080328|
|[0.0187,85.0,4.15...|23.10000038|[1.82288079580277...|25.501380316418565|
|[0.02055,85.0,0.7...|24.70000076|[2.09528220713033...| 25.09628869750609|
|[0.025429999,55.0...|23.89999962|[2.81383280976658...|27.328221927808183|
+--------------------+---

predictions = [features: vector, label: double ... 2 more fields]


[features: vector, label: double ... 2 more fields]

## Evaluate the model

In [24]:
// Evaluator configured to report the "r2" metric. The label and prediction columns
// are not set explicitly; the commented setters below show the names that would be
// passed if they had to be overridden.
val evaluator = new RegressionEvaluator().setMetricName("r2")
  //setLabelCol("label").
  //setPredictionCol("prediction")

evaluator = regEval_83cb76be229b


regEval_83cb76be229b

In [25]:
// Compute the configured metric ("r2") on the test-set predictions.
val r2 = evaluator.evaluate(predictions)

r2 = 0.7346612755354833


0.7346612755354833

## Compare data with prediction

In [26]:
// Bring labels and predictions to the driver in a single Spark job, as pairs, so the
// two arrays are guaranteed to be aligned row by row. Collecting each column with a
// separate action triggers three jobs and relies on every job returning the rows in
// the same order.
val (y, yp) = predictions.
  select("label", "prediction").
  as[(Double, Double)].
  collect().
  unzip

// Residuals (prediction - label), computed locally from the aligned arrays.
val d = yp.zip(y).map { case (predicted, actual) => predicted - actual }

y = Array(35.40000153, 50.0, 31.60000038, 24.5, 50.0, 44.0, 32.90000153, 23.10000038, 24.70000076, 23.89999962, 34.70000076, 21.60000038, 26.60000038, 17.5, 34.90000153, 22.0, 23.5, 24.79999924, 20.70000076, 23.20000076, 22.0, 33.29999924, 24.79999924, 18.20000076, 22.60000038, 21.89999962, 28.20000076, 23.39999962, 17.10000038, 27.10000038, 19.0, 50.0, 37.20000076, 29.60000038, 23.89999962, 18.70000076, 22.89999962, 22.60000038, 33.20000076, 36.20000076, 22.0, 20.29999924, 32.0, 24.10000038, 29.10000038, 21.0, 21.70000076, 23.70000076, 20.29999924, 20.79999924, 24.20000076, 37.0, 26.39999962, 32.5, 19.60000038, 21.70000076, 22.20000076, 22.0, 26.5, 21.39999962, 18.89999962, 29.79999924, 25.0, 20.39999962, 20.39999962, 19.5, 24.39999962, 18.5, 27.10000038, 19.29999924, 27...


[35.40000153, 50.0, 31.60000038, 24.5, 50.0, 44.0, 32.90000153, 23.10000038, 24.70000076, 23.89999962, 34.70000076, 21.60000038, 26.60000038, 17.5, 34.90000153, 22.0, 23.5, 24.79999924, 20.70000076, 23.20000076, 22.0, 33.29999924, 24.79999924, 18.20000076, 22.60000038, 21.89999962, 28.20000076, 23.39999962, 17.10000038, 27.10000038, 19.0, 50.0, 37.20000076, 29.60000038, 23.89999962, 18.70000076, 22.89999962, 22.60000038, 33.20000076, 36.20000076, 22.0, 20.29999924, 32.0, 24.10000038, 29.10000038, 21.0, 21.70000076, 23.70000076, 20.29999924, 20.79999924, 24.20000076, 37.0, 26.39999962, 32.5, 19.60000038, 21.70000076, 22.20000076, 22.0, 26.5, 21.39999962, 18.89999962, 29.79999924, 25.0, 20.39999962, 20.39999962, 19.5, 24.39999962, 18.5, 27.10000038, 19.29999924, 27.5, 18.70000076, 24.70000076, 19.5, 17.5, 7.0, 20.0, 23.39999962, 20.5, 8.100000381, 35.09999847, 19.29999924, 24.79999924, 19.39999962, 35.20000076, 24.39999962, 14.39999962, 19.39999962, 18.5, 19.70000076, 46.70000076, 23.0, 

In [27]:
// Create a Breeze figure and take its subplot at index 0 as the drawing surface.
val fig = Figure()
val plt = fig.subplot(0)

fig = breeze.plot.Figure@5c2fc769
plt = breeze.plot.Plot@2dd4ea5f


breeze.plot.Plot@2dd4ea5f

In [28]:
// Residuals d (prediction - label) against the true label y, drawn as blue '+' markers.
plt += plot(y, d, '+', name = "y_pred-y", colorcode="blue")
// Red reference line at zero residual spanning x = 0..50.
plt += plot(Array(0,50), Array(0,0), '-', name ="y", colorcode="red")

plt.legend = true
//plt.title = "xxx"
plt.xlabel = "y"
// Restrict the x axis to the 0..50 range.
plt.xlim(0,50)
plt.ylabel = "y_pred -y"
//plt.ylim(0,6)

plt.legend: Boolean = true
plt.xlabel: String = y
plt.ylabel: String = y_pred -y


In [29]:
// Render the plot inline through the kernel's HTML magic.
// NOTE(review): `tohtml` comes from the convert.jfc import; presumably it converts the
// underlying chart object into an HTML snippet — confirm against that helper.
kernel.magics.html(tohtml(plt.chart))