In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.{MinMaxScaler,StandardScaler}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import breeze.plot._
import convert.jfc.tohtml

## Load the dataset

In [2]:
val df = spark.read.
  format("csv").
  option("header", "true").
  option("inferschema", "true").
  option("delimiter",",").
  load("../Datasets/Boston.csv")

df = [CRIM: double, ZN: double ... 12 more fields]


[CRIM: double, ZN: double ... 12 more fields]

## Explore the dataset

In [3]:
df.show(5)

+-----------+----+-----------+----+-----------+-----------+-----------+-----------+---+---+-----------+-----------+-----------+-----------+
|       CRIM|  ZN|      INDUS|CHAS|        NOX|         RM|        AGE|        DIS|RAD|TAX|    PTRATIO|          B|      LSTAT|       MEDV|
+-----------+----+-----------+----+-----------+-----------+-----------+-----------+---+---+-----------+-----------+-----------+-----------+
|    0.00632|18.0|2.309999943|   0|0.537999988|6.574999809|65.19999695|4.090000153|  1|296|15.30000019|396.8999939|4.980000019|       24.0|
|0.027310001| 0.0|7.070000172|   0|0.469000012|6.421000004|78.90000153|4.967100143|  2|242|17.79999924|396.8999939|9.140000343|21.60000038|
|    0.02729| 0.0|7.070000172|   0|0.469000012|7.184999943|61.09999847|4.967100143|  2|242|17.79999924|392.8299866| 4.03000021|34.70000076|
|0.032370001| 0.0|2.180000067|   0|0.458000004|6.998000145|45.79999924|6.062200069|  3|222|18.70000076|394.6300049|2.940000057|33.40000153|
|0.069049999| 0.0|2.

In [4]:
df.printSchema

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)



In [5]:
df.describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+
|summary|              CRIM|                ZN|             INDUS|              CHAS|               NOX|                RM|               AGE|               DIS|              RAD|               TAX|          PTRATIO|                 B|             LSTAT|             MEDV|
+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+
|  count|               506|               506|               506|               506|               506|               506|               506|               506|              506|  

In [6]:
df.columns

[CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT, MEDV]

In [7]:
df.columns.length

14

In [8]:
df.count

506

In [9]:
var L = List.empty[(String,Double)]

for(cn <- df.columns) L = L :+ (cn,df.stat.corr("MEDV", cn))

val df_corr = L.toDF("colname","correlation")
df_corr.show()

+-------+--------------------+
|colname|         correlation|
+-------+--------------------+
|   CRIM| -0.3883046116575088|
|     ZN| 0.36044534463752903|
|  INDUS|-0.48372517128143383|
|   CHAS| 0.17526017775291847|
|    NOX| -0.4273207763683772|
|     RM|   0.695359937127267|
|    AGE|-0.37695456714288667|
|    DIS| 0.24992873873512172|
|    RAD| -0.3816262315669168|
|    TAX|-0.46853593528654536|
|PTRATIO| -0.5077867038116085|
|      B|  0.3334608226834164|
|  LSTAT| -0.7376627294671615|
|   MEDV|                 1.0|
+-------+--------------------+



L = List((CRIM,-0.3883046116575088), (ZN,0.36044534463752903), (INDUS,-0.48372517128143383), (CHAS,0.17526017775291847), (NOX,-0.4273207763683772), (RM,0.695359937127267), (AGE,-0.37695456714288667), (DIS,0.24992873873512172), (RAD,-0.3816262315669168), (TAX,-0.46853593528654536), (PTRATIO,-0.5077867038116085), (B,0.3334608226834164), (LSTAT,-0.7376627294671615), (MEDV,1.0))
df_corr = [colname: string, correlation: double]


[colname: string, correlation: double]

In [10]:
df_corr.select(col("colname"),abs(col("correlation"))).sort(col("abs(correlation)").desc).show()

+-------+-------------------+
|colname|   abs(correlation)|
+-------+-------------------+
|   MEDV|                1.0|
|  LSTAT| 0.7376627294671615|
|     RM|  0.695359937127267|
|PTRATIO| 0.5077867038116085|
|  INDUS|0.48372517128143383|
|    TAX|0.46853593528654536|
|    NOX| 0.4273207763683772|
|   CRIM| 0.3883046116575088|
|    RAD| 0.3816262315669168|
|    AGE|0.37695456714288667|
|     ZN|0.36044534463752903|
|      B| 0.3334608226834164|
|    DIS|0.24992873873512172|
|   CHAS|0.17526017775291847|
+-------+-------------------+



## Apply Vector Assembler

In [11]:
val features = df.columns.slice(0,df.columns.length -1)

features = Array(CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT)


[CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT]

In [12]:
val assembler = new VectorAssembler().
  setInputCols(features).//setInputCols(Array("_c1", "_c2")).
  setOutputCol("features")

assembler = vecAssembler_faa6047ca478


vecAssembler_faa6047ca478

In [13]:
val df_v = assembler.transform(df).
    select(col("features"), col("MEDV").as("label"))

df_v.show(10)

+--------------------+-----------+
|            features|      label|
+--------------------+-----------+
|[0.00632,18.0,2.3...|       24.0|
|[0.027310001,0.0,...|21.60000038|
|[0.02729,0.0,7.07...|34.70000076|
|[0.032370001,0.0,...|33.40000153|
|[0.069049999,0.0,...|36.20000076|
|[0.029850001,0.0,...|28.70000076|
|[0.088289998,12.5...|22.89999962|
|[0.144549996,12.5...|27.10000038|
|[0.211239994,12.5...|       16.5|
|[0.170039997,12.5...|18.89999962|
+--------------------+-----------+
only showing top 10 rows



df_v = [features: vector, label: double]


[features: vector, label: double]

## Split into train and test sets

In [14]:
val Array(trainingData, testData) = df_v.randomSplit(Array(0.7, 0.3))

trainingData = [features: vector, label: double]
testData = [features: vector, label: double]


[features: vector, label: double]

## Train the model

In [15]:
val rf = new RandomForestRegressor().
    setNumTrees(100).
    setMaxDepth(10)

rf = rfr_ec46a627230a


rfr_ec46a627230a

In [16]:
val model = rf.fit(trainingData)

model = RandomForestRegressionModel (uid=rfr_a1880f8ddcfa) with 100 trees


RandomForestRegressionModel (uid=rfr_a1880f8ddcfa) with 100 trees

## Make predictions

In [17]:
val predictions = model.transform(testData)
predictions.show(10)

+--------------------+-----------+------------------+
|            features|      label|        prediction|
+--------------------+-----------+------------------+
|[0.01301,35.0,1.5...|32.70000076| 32.84828876901207|
|[0.01501,90.0,1.2...|       50.0|  43.2648749503625|
|[0.0187,85.0,4.15...|23.10000038|24.353056051354653|
|[0.019509999,17.5...|       33.0| 33.67808340940357|
|[0.020090001,95.0...|       50.0| 46.96720831319917|
|[0.02187,60.0,2.9...|31.10000038|28.593010767281235|
|[0.02729,0.0,7.07...|34.70000076| 34.41900432914125|
|[0.030409999,0.0,...|       18.5|20.198703715485657|
|[0.032370001,0.0,...|33.40000153|35.403669732271645|
|[0.03427,0.0,5.19...|       19.5| 20.00478709519469|
+--------------------+-----------+------------------+
only showing top 10 rows



predictions = [features: vector, label: double ... 1 more field]


[features: vector, label: double ... 1 more field]

## Evaluate the model

In [18]:
val evaluator = new RegressionEvaluator().setMetricName("r2")
  //setLabelCol("label").
  //setPredictionCol("prediction")

evaluator = regEval_5da1fd5734a9


regEval_5da1fd5734a9

In [19]:
val r2 = evaluator.evaluate(predictions)

r2 = 0.8126596138065366


0.8126596138065366

## Compare data with prediction

In [20]:
val y = predictions.select("label").as[Double].collect
val yp = predictions.select("prediction").as[Double].collect
val d = predictions.select(col("prediction")-col("label")).as[Double].collect

y = Array(32.70000076, 50.0, 23.10000038, 33.0, 50.0, 31.10000038, 34.70000076, 18.5, 33.40000153, 19.5, 28.5, 22.0, 20.89999962, 24.79999924, 21.10000038, 18.20000076, 20.5, 23.89999962, 20.60000038, 30.29999924, 22.60000038, 11.89999962, 28.70000076, 19.0, 23.60000038, 46.0, 17.20000076, 18.89999962, 36.20000076, 22.0, 36.20000076, 30.5, 28.70000076, 33.40000153, 32.0, 24.10000038, 24.10000038, 19.70000076, 37.0, 21.39999962, 18.79999924, 33.09999847, 22.79999924, 20.0, 33.20000076, 20.10000038, 21.70000076, 22.0, 20.10000038, 23.0, 24.39999962, 18.89999962, 28.39999962, 29.79999924, 25.0, 19.79999924, 20.89999962, 20.39999962, 24.39999962, 22.60000038, 23.10000038, 28.10000038, 27.10000038, 18.70000076, 15.19999981, 23.29999924, 21.70000076, 24.5, 18.29999924, 21.79999...


[32.70000076, 50.0, 23.10000038, 33.0, 50.0, 31.10000038, 34.70000076, 18.5, 33.40000153, 19.5, 28.5, 22.0, 20.89999962, 24.79999924, 21.10000038, 18.20000076, 20.5, 23.89999962, 20.60000038, 30.29999924, 22.60000038, 11.89999962, 28.70000076, 19.0, 23.60000038, 46.0, 17.20000076, 18.89999962, 36.20000076, 22.0, 36.20000076, 30.5, 28.70000076, 33.40000153, 32.0, 24.10000038, 24.10000038, 19.70000076, 37.0, 21.39999962, 18.79999924, 33.09999847, 22.79999924, 20.0, 33.20000076, 20.10000038, 21.70000076, 22.0, 20.10000038, 23.0, 24.39999962, 18.89999962, 28.39999962, 29.79999924, 25.0, 19.79999924, 20.89999962, 20.39999962, 24.39999962, 22.60000038, 23.10000038, 28.10000038, 27.10000038, 18.70000076, 15.19999981, 23.29999924, 21.70000076, 24.5, 18.29999924, 21.79999924, 23.10000038, 20.0, 26.20000076, 24.60000038, 35.09999847, 19.29999924, 24.79999924, 15.0, 18.60000038, 16.60000038, 22.5, 16.20000076, 14.39999962, 16.20000076, 21.10000038, 18.39999962, 19.20000076, 21.70000076, 22.799999

In [21]:
val fig = Figure()
val plt = fig.subplot(0)

fig = breeze.plot.Figure@2aba6761
plt = breeze.plot.Plot@7d122a27


breeze.plot.Plot@7d122a27

In [22]:
plt += plot(y, d, '+', name = "y_pred-y", colorcode="blue")
plt += plot(Array(0,50), Array(0,0), '-', name ="y", colorcode="red")

plt.legend = true
//plt.title = "xxx"
plt.xlabel = "y"
//plt.xlim(0,50)
plt.ylabel = "y_pred -y"
//plt.ylim(0,6)

plt.legend: Boolean = true
plt.xlabel: String = y
plt.ylabel: String = y_pred -y


In [23]:
kernel.magics.html(tohtml(plt.chart))

## Feature importance

In [24]:
val fi = List.range(0,df.columns.length -1).map(i => (df.columns(i), model.featureImportances.toArray(i)) ).toDF("feature", "importance")
fi.sort(col("importance").desc).show()

+-------+--------------------+
|feature|          importance|
+-------+--------------------+
|     RM| 0.32860023409181816|
|  LSTAT| 0.30095888047308755|
|PTRATIO| 0.07064303975103525|
|  INDUS| 0.05625528437935128|
|    NOX|0.048850547557134756|
|   CRIM| 0.04425768952897345|
|    DIS|0.043391039807007155|
|    TAX|0.034903738490761685|
|    AGE|0.032148749387693944|
|      B| 0.02005529074910807|
|    RAD|0.009953698284632837|
|   CHAS|0.006969063884170661|
|     ZN|0.003012743615225035|
+-------+--------------------+



fi = [feature: string, importance: double]


[feature: string, importance: double]