# Linear regression
Linear regression algorithms fit continuous data with a linear model. A linear model is a linear combination of the features, weighted by a set of coefficients. When our dataset has just one feature, the fitted linear model is:
$$h(x)=w_0+w_1x$$
The algorithm estimates the unknown coefficients, also known as model parameters, from training data. The fitted coefficients minimize the sum of the squared differences between the predicted and the observed labels in the training dataset:
$$J(w)=\frac{1}{2}\sum_{i=1}^n(y^{(i)}- h^{(i)})^2$$
Here, $h^{(i)}=h(x^{(i)})$ is the predicted value for the $i$-th instance and $y^{(i)}$ is the observed one.
The performance of linear regression is evaluated using the $R^2$ metric (coefficient of determination), where $\bar{y}$ is the mean of the observed labels:
$$R^2=1-\frac{\sum(y^{(i)}- h^{(i)})^2}{\sum(y^{(i)} - \bar{y})^2}=1-\frac{MSE}{VAR}$$

In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.evaluation.RegressionEvaluator
import breeze.plot._
import convert.jfc.tohtml

## Load the dataset

In [2]:
// Load the age / systolic-blood-pressure CSV into a DataFrame.
// The first line is used as the header and column types are inferred from the data.
val df = spark.read.
  options(Map(
    "header" -> "true",
    "inferschema" -> "true",
    "delimiter" -> ",")).
  csv("../Datasets/Age_blood_pressure.csv")

df = [Age: int, Systolic Blood Pressure: double]


[Age: int, Systolic Blood Pressure: double]

## Explore the dataset

In [3]:
// Peek at the first ten rows of the raw dataset.
df.show(numRows = 10)

+---+-----------------------+
|Age|Systolic Blood Pressure|
+---+-----------------------+
| 38|            127.6940083|
| 58|          158.773654276|
| 28|          116.338801548|
| 53|          160.259089497|
| 38|           138.17068045|
| 40|          116.669958755|
| 43|          156.425071568|
| 34|            134.0272229|
| 39|            127.2820664|
| 55|          150.250726648|
+---+-----------------------+
only showing top 10 rows



In [4]:
// Print the column names and types (explicit parentheses: the call has a side effect).
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Systolic Blood Pressure: double (nullable = true)



In [5]:
// Summary statistics (count, mean, stddev, min, max) for each column.
df.describe().show()

+-------+-----------------+-----------------------+
|summary|              Age|Systolic Blood Pressure|
+-------+-----------------+-----------------------+
|  count|              199|                    199|
|   mean|45.18090452261306|     140.93991071230258|
| stddev|17.48267359259972|     23.148276080351778|
|    min|               15|           85.346019671|
|    max|               79|          201.294613148|
+-------+-----------------+-----------------------+



## Apply Vector Assembler

In [6]:
// Use only the first column of the dataset as the model input.
val features = df.columns.take(1)

features = Array(Age)


[Age]

In [7]:
// Packs the selected input column(s) into a single vector column named "features".
val assembler = new VectorAssembler().
  setOutputCol("features").
  setInputCols(features)

assembler = vecAssembler_be75795946b3


vecAssembler_be75795946b3

In [8]:
// Build the modelling DataFrame: the assembled "features" vector plus the
// blood-pressure column renamed to "label".
val df_v = assembler.
  transform(df).
  withColumnRenamed("Systolic Blood Pressure", "label").
  select("features", "label")

df_v.show(numRows = 10)

+--------+-------------+
|features|        label|
+--------+-------------+
|  [38.0]|  127.6940083|
|  [58.0]|158.773654276|
|  [28.0]|116.338801548|
|  [53.0]|160.259089497|
|  [38.0]| 138.17068045|
|  [40.0]|116.669958755|
|  [43.0]|156.425071568|
|  [34.0]|  134.0272229|
|  [39.0]|  127.2820664|
|  [55.0]|150.250726648|
+--------+-------------+
only showing top 10 rows



df_v = [features: vector, label: double]


[features: vector, label: double]

## Split into train and test sets

In [9]:
// Hold out roughly 30% of the rows for testing.
// A fixed seed makes the split reproducible, so the fitted coefficients and the
// R² score do not change from one notebook run to the next.
val Array(trainingData, testData) = df_v.randomSplit(Array(0.7, 0.3), seed = 42L)

trainingData = [features: vector, label: double]
testData = [features: vector, label: double]


[features: vector, label: double]

## Train the model

In [10]:
// Configure the estimator: at most 100 iterations, elastic-net mixing parameter 1.0.
// The input columns are left at their defaults ("label" and "features");
// call setLabelCol / setFeaturesCol to point the estimator at other columns.
// NOTE(review): no regParam is set in this cell — confirm whether a regularization
// strength was intended to accompany the elastic-net parameter.
val lr = new LinearRegression().
  setElasticNetParam(1.0).
  setMaxIter(100)

lr = linReg_64a2632e3756


linReg_64a2632e3756

In [11]:
// Fit the linear regression on the training split only.
val model = lr.fit(trainingData)

model = linReg_64a2632e3756


linReg_64a2632e3756

In [12]:
// Iteration count reported by the training summary.
model.summary.totalIterations

3

In [13]:
// Fitted coefficient vector; with a single feature this is w_1 in h(x) = w_0 + w_1 x.
model.coefficients

[1.0157377688083284]

In [14]:
// Fitted intercept, i.e. w_0 in h(x) = w_0 + w_1 x.
model.intercept

95.5942555559709

## Make predictions

In [15]:
// Score the held-out rows; transform appends a "prediction" column to the test set.
val predictions = model.transform(testData)
predictions.show(numRows = 10)

+--------+-------------+------------------+
|features|        label|        prediction|
+--------+-------------+------------------+
|  [20.0]| 93.055037209|115.90901093213748|
|  [20.0]|116.052957535|115.90901093213748|
|  [20.0]|126.245581176|115.90901093213748|
|  [22.0]|113.268988171|117.94048646975413|
|  [22.0]|127.583169459|117.94048646975413|
|  [23.0]|124.755884462|118.95622423856246|
|  [24.0]|134.515674853|119.97196200737079|
|  [24.0]|140.227596469|119.97196200737079|
|  [26.0]| 97.513663344|122.00343754498743|
|  [26.0]|109.213183583|122.00343754498743|
+--------+-------------+------------------+
only showing top 10 rows



predictions = [features: vector, label: double ... 1 more field]


[features: vector, label: double ... 1 more field]

## Evaluate the model

In [16]:
// Evaluator that scores predictions with the R² metric.
// The label and prediction columns are left at the evaluator's defaults;
// call setLabelCol / setPredictionCol to override them.
val evaluator = new RegressionEvaluator().
  setMetricName("r2")

evaluator = regEval_74d8c165dd0f


regEval_74d8c165dd0f

In [17]:
// R² of the model on the test-set predictions.
val r2 = evaluator.evaluate(predictions)

r2 = 0.5321610829672769


0.5321610829672769

## Compare data with prediction

In [18]:
// Bring the test-set results to the driver as plain arrays for plotting.
// The feature column is read through the generic ml Vector interface instead of
// being cast to DenseVector, so a sparse-encoded vector would not cause a
// ClassCastException. Element 0 is the single feature.
val x = predictions.select("features").collect.
  map(row => row.getAs[org.apache.spark.ml.linalg.Vector](0).apply(0))
val y = predictions.select("label").as[Double].collect
val yp = predictions.select("prediction").as[Double].collect

x = Array(20.0, 20.0, 20.0, 22.0, 22.0, 23.0, 24.0, 24.0, 26.0, 26.0, 28.0, 28.0, 30.0, 31.0, 32.0, 32.0, 35.0, 36.0, 37.0, 37.0, 39.0, 40.0, 41.0, 41.0, 42.0, 45.0, 45.0, 47.0, 48.0, 48.0, 49.0, 49.0, 52.0, 55.0, 55.0, 58.0, 58.0, 59.0, 59.0, 64.0, 67.0, 69.0, 70.0, 72.0, 73.0, 73.0, 75.0, 77.0)
y = Array(93.055037209, 116.052957535, 126.245581176, 113.268988171, 127.583169459, 124.755884462, 134.515674853, 140.227596469, 97.513663344, 109.213183583, 104.735530425, 116.338801548, 116.200326534, 150.77480389, 133.907747685, 138.592291886, 120.765202892, 118.693921241, 146.674674341, 148.084745324, 127.2820664, 149.636593431, 117.346709581, 125.438574874, 131.581449229, 150.728251583, 152.605789255, 155.735417675, 124.774831374, 139.098802313, 129.436803594,...


[93.055037209, 116.052957535, 126.245581176, 113.268988171, 127.583169459, 124.755884462, 134.515674853, 140.227596469, 97.513663344, 109.213183583, 104.735530425, 116.338801548, 116.200326534, 150.77480389, 133.907747685, 138.592291886, 120.765202892, 118.693921241, 146.674674341, 148.084745324, 127.2820664, 149.636593431, 117.346709581, 125.438574874, 131.581449229, 150.728251583, 152.605789255, 155.735417675, 124.774831374, 139.098802313, 129.436803594, 134.114602528, 143.165926462, 136.771774869, 150.250726648, 157.624363795, 158.773654276, 156.031261959, 173.563942733, 159.706511678, 154.867998703, 177.962721271, 122.601165854, 171.716593368, 152.315027195, 183.735881788, 156.085539297, 166.069882861]

In [19]:
// Create a breeze-viz figure and take its first subplot as the drawing surface.
val fig = Figure()
val plt = fig.subplot(0)

fig = breeze.plot.Figure@75b4ecda
plt = breeze.plot.Plot@5eb5c01c


breeze.plot.Plot@5eb5c01c

In [20]:
// Series: observed test points as red '+' markers, model predictions as a blue line.
plt += plot(x, y, '+', name = "Data", colorcode="red")
plt += plot(x, yp, '-', name = "Fit", colorcode="blue")

// Titles and axis labels.
plt.title = "Linear Regression"
plt.xlabel = "Age"
plt.ylabel = "Systolic Blood Pressure"

// Show the legend for the two named series.
plt.legend = true

plt.legend: Boolean = true
plt.title: String = Linear Regression
plt.xlabel: String = Age
plt.ylabel: String = Systolic Blood Pressure


In [21]:
// Convert the plot's chart with tohtml and display the result inline via the kernel's html magic.
kernel.magics.html(tohtml(plt.chart))