In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.evaluation.RegressionEvaluator
import breeze.plot._
import convert.jfc.tohtml

## Load the dataset

In [2]:
// Load the accelerating-particle CSV into a DataFrame. The first row is used
// as the header and column types are inferred from the data.
val df = {
  val csvOptions = Map(
    "header" -> "true",
    "inferschema" -> "true",
    "delimiter" -> ","
  )
  spark.read.format("csv").options(csvOptions).load("../Datasets/Accelerating_particle.csv")
}

df = [t: double, z: double]


[t: double, z: double]

## Explore the dataset

In [3]:
// Peek at the first five rows to sanity-check the load.
df.show(numRows = 5)

+-------------+--------------+
|            t|             z|
+-------------+--------------+
|          0.0|0.581492499944|
|0.01001001001|0.109579774655|
|0.02002002002|0.736651613589|
|0.03003003003| 1.48598356416|
|0.04004004004| 1.74318272131|
+-------------+--------------+
only showing top 5 rows



In [4]:
// Print the column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- t: double (nullable = true)
 |-- z: double (nullable = true)



In [5]:
// Print summary statistics (count, mean, stddev, min, max) for each column.
df.describe().show()

+-------+------------------+------------------+
|summary|                 t|                 z|
+-------+------------------+------------------+
|  count|              1000|              1000|
|   mean| 5.000000000000049| 17.64991205090991|
| stddev|2.8910854464052678|14.983189399049817|
|    min|               0.0|    -1.35666247476|
|    max|              10.0|      51.253265726|
+-------+------------------+------------------+



## Vector Assembler

In [6]:
// Spark ML estimators expect the inputs packed into a single vector column:
// wrap the lone predictor "t" into a one-element "features" vector.
val assembler = new VectorAssembler().setInputCols(Array("t")).setOutputCol("features")

assembler = vecAssembler_8bef74557120


vecAssembler_8bef74557120

In [7]:
// Build the modelling frame: the target "z" becomes "label" and is kept
// alongside the assembled "features" vector; every other column is dropped.
val df_v = assembler.
  transform(df).
  withColumnRenamed("z", "label").
  select("label", "features")
df_v.show(numRows = 10)

+---------------+-----------------+
|          label|         features|
+---------------+-----------------+
| 0.581492499944|            [0.0]|
| 0.109579774655|  [0.01001001001]|
| 0.736651613589|  [0.02002002002]|
|  1.48598356416|  [0.03003003003]|
|  1.74318272131|  [0.04004004004]|
|-0.180520976165|[0.0500500500501]|
| 0.592680533783|[0.0600600600601]|
|  1.16782126493|[0.0700700700701]|
| 0.556445117136|[0.0800800800801]|
|-0.408362632843|[0.0900900900901]|
+---------------+-----------------+
only showing top 10 rows



df_v = [label: double, features: vector]


[label: double, features: vector]

## Split into train and test sets

In [8]:
// Split the rows roughly 70/30 into training and test sets.
// A fixed seed makes the split, and therefore every metric and plot derived
// from it, reproducible across notebook runs; without it each execution
// draws a different random partition.
val Array(trainingData, testData) = df_v.randomSplit(Array(0.7, 0.3), seed = 42L)

trainingData = [label: double, features: vector]
testData = [label: double, features: vector]


[label: double, features: vector]

## Train the model

In [9]:
// Random-forest regressor: an ensemble of 100 trees, each grown to a depth of
// at most 10.
val rf = new RandomForestRegressor().setMaxDepth(10).setNumTrees(100)

rf = rfr_7a0d8ce28ce4


rfr_7a0d8ce28ce4

In [10]:
// Fit the random forest on the training split only; the test split stays
// unseen until prediction time.
val model = rf.fit(trainingData)

model = RandomForestRegressionModel (uid=rfr_0051f46da375) with 100 trees


RandomForestRegressionModel (uid=rfr_0051f46da375) with 100 trees

## Make predictions

In [11]:
// Score the held-out rows (adds a "prediction" column) and order them by the
// feature value so they can later be plotted as a continuous curve.
val predictions = model.transform(testData).orderBy("features")
predictions.show(numRows = 10)

+---------------+-----------------+------------------+
|          label|         features|        prediction|
+---------------+-----------------+------------------+
| 0.556445117136|[0.0800800800801]|0.5213743954122685|
|  1.46645821002|   [0.1001001001]|0.5213743954122685|
|  1.11205142444|  [0.15015015015]|0.5213743954122685|
|  1.23614103562|  [0.17017017017]|0.5213743954122685|
|-0.279015109822|  [0.18018018018]|0.5213743954122685|
| 0.376409689971|   [0.2002002002]|0.5213743954122685|
| 0.205323407272|  [0.24024024024]|0.5213743954122685|
| 0.742837121531|  [0.28028028028]|0.5213743954122685|
| 0.994512600596|  [0.31031031031]| 1.189886419345981|
|-0.691767857101|  [0.33033033033]| 1.189886419345981|
+---------------+-----------------+------------------+
only showing top 10 rows



predictions = [label: double, features: vector ... 1 more field]


[label: double, features: vector ... 1 more field]

## Evaluate the model

In [12]:
// Evaluator that scores a predictions DataFrame with the "r2" metric.
val evaluator = new RegressionEvaluator().setMetricName("r2")
  // The explicit column setters below are disabled, so the evaluator falls
  // back to its default label/prediction column names.
  //setLabelCol("label").
  //setPredictionCol("prediction")

evaluator = regEval_8594a624a93a


regEval_8594a624a93a

In [13]:
// Compute the r2 score of the model on the held-out predictions.
val r2 = evaluator.evaluate(predictions)



r2 = 0.996967689414005


0.996967689414005

## Compare data with prediction

In [14]:
// Pull the plotting data to the driver as three aligned Array[Double]s:
//   x  - the single feature value (t) of each test row
//   y  - the observed label (z)
//   yp - the model's prediction
//
// The rows are collected once and all three arrays are derived from that same
// snapshot: this runs a single Spark job instead of three and guarantees that
// x(i), y(i) and yp(i) always come from the same row.
val rows = predictions.select("features", "label", "prediction").collect

// Read the feature through the generic ml Vector interface rather than casting
// to DenseVector, so a sparse-encoded vector does not cause a ClassCastException.
val x = rows.map(row => row.getAs[org.apache.spark.ml.linalg.Vector](0).apply(0))
val y = rows.map(row => row.getDouble(1))
val yp = rows.map(row => row.getDouble(2))

x = Array(0.0800800800801, 0.1001001001, 0.15015015015, 0.17017017017, 0.18018018018, 0.2002002002, 0.24024024024, 0.28028028028, 0.31031031031, 0.33033033033, 0.36036036036, 0.37037037037, 0.38038038038, 0.39039039039, 0.47047047047, 0.580580580581, 0.650650650651, 0.680680680681, 0.760760760761, 0.820820820821, 0.830830830831, 0.850850850851, 0.860860860861, 0.980980980981, 1.02102102102, 1.03103103103, 1.1011011011, 1.12112112112, 1.13113113113, 1.15115115115, 1.19119119119, 1.25125125125, 1.3013013013, 1.33133133133, 1.35135135135, 1.38138138138, 1.39139139139, 1.4014014014, 1.42142142142, 1.45145145145, 1.48148148148, 1.51151151151, 1.52152152152, 1.56156156156, 1.61161161161, 1.69169169169, 1.7017017017, 1.74174174174, 1.79179179179, 1.83183183183, 1.85185185185, 1....


[0.0800800800801, 0.1001001001, 0.15015015015, 0.17017017017, 0.18018018018, 0.2002002002, 0.24024024024, 0.28028028028, 0.31031031031, 0.33033033033, 0.36036036036, 0.37037037037, 0.38038038038, 0.39039039039, 0.47047047047, 0.580580580581, 0.650650650651, 0.680680680681, 0.760760760761, 0.820820820821, 0.830830830831, 0.850850850851, 0.860860860861, 0.980980980981, 1.02102102102, 1.03103103103, 1.1011011011, 1.12112112112, 1.13113113113, 1.15115115115, 1.19119119119, 1.25125125125, 1.3013013013, 1.33133133133, 1.35135135135, 1.38138138138, 1.39139139139, 1.4014014014, 1.42142142142, 1.45145145145, 1.48148148148, 1.51151151151, 1.52152152152, 1.56156156156, 1.61161161161, 1.69169169169, 1.7017017017, 1.74174174174, 1.79179179179, 1.83183183183, 1.85185185185, 1.88188188188, 1.89189189189, 1.92192192192, 1.93193193193, 1.95195195195, 1.99199199199, 2.03203203203, 2.04204204204, 2.05205205205, 2.07207207207, 2.08208208208, 2.11211211211, 2.16216216216, 2.17217217217, 2.21221221221, 2.23

In [15]:
// Create a Breeze figure and take its subplot at index 0 as the drawing surface.
val fig = Figure()
val plt = fig.subplot(0)

fig = breeze.plot.Figure@35220a5e
plt = breeze.plot.Plot@226be7fd


breeze.plot.Plot@226be7fd

In [16]:
// Overlay the raw test observations (red '+' markers) with the random-forest
// predictions (blue line) on the same axes.
plt += plot(x, y, '+', colorcode = "red", name = "Data")
plt += plot(x, yp, '-', colorcode = "blue", name = "Fit")

// Axis labels and title.
plt.xlabel = "t"
plt.ylabel = "z"
plt.title = "RF Regression"
plt.legend = true

// Optional manual axis limits, currently disabled:
//plt.xlim(0,11)
//plt.ylim(0,6)

plt.legend: Boolean = true
plt.title: String = RF Regression
plt.xlabel: String = t
plt.ylabel: String = z


In [17]:
// Display the plot inline by passing its chart, converted with `tohtml`, to
// the kernel's html magic.
// NOTE(review): `tohtml` comes from the `convert.jfc` import; presumably it
// serialises the chart into an HTML snippet. Confirm against that helper.
kernel.magics.html(tohtml(plt.chart))