# Linear regression sample for Spark
- Environment: Apache Toree kernel (Scala)

In [1]:
// `spark` is the session value supplied by the kernel; evaluating it waits for the session to start and echoes it
spark

Waiting for a Spark session to start...

org.apache.spark.sql.SparkSession@6aeee626

### 1. Create DataFrame

In [3]:
// Build the sample set as (fail, temperature) rows: the first group of
// temperatures is labelled fail = 0, the second group fail = 1.
val df = spark.createDataFrame(
  Seq(60, 56, 54, 62, 61, 53, 55, 62, 64).map(temperature => (0, temperature)) ++
    Seq(73, 78, 67, 68, 78).map(temperature => (1, temperature))
).toDF("fail", "temperature")
// Preview the first two rows without truncating cell contents
df.show(2, truncate = false)

+----+-----------+
|fail|temperature|
+----+-----------+
|0   |60         |
|0   |56         |
+----+-----------+
only showing top 2 rows



df = [fail: int, temperature: int]


[fail: int, temperature: int]

In [4]:
// Print the column names, types and nullability of df
df.printSchema()

root
 |-- fail: integer (nullable = false)
 |-- temperature: integer (nullable = false)



### 2. Create Pipelines

In [5]:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}

In [6]:
// Feature stage: packs the "temperature" column into the vector column "features"
val features = new VectorAssembler().setOutputCol("features").setInputCols(Array("temperature"))

features = vecAssembler_f89fa0469ae7


vecAssembler_f89fa0469ae7

In [8]:
// Estimator stage: linear regression that reads its label from the "fail" column
val lr = new LinearRegression()
  .setLabelCol("fail")

lr = linReg_c9b5b05778f4


linReg_c9b5b05778f4

In [22]:
// List the Params that can be configured on the estimator, with their defaults and current values
lr.explainParams()

aggregationDepth: suggested depth for treeAggregate (>= 2) (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty (default: 0.0)
featuresCol: features column name (default: features)
fitIntercept: whether to fit an intercept term (default: true)
labelCol: label column name (default: label, current: fail)
maxIter: maximum number of iterations (>= 0) (default: 100)
predictionCol: prediction column name (default: prediction)
regParam: regularization parameter (>= 0) (default: 0.0)
solver: the solver algorithm for optimization. If this is not set or empty, default value is 'auto' (default: auto)
standardization: whether to standardize the training features before fitting ...


In [24]:
// Chain the stages in execution order: feature assembly first, then the regressor
val pipeline = new Pipeline()
  .setStages(Array(features, lr))

pipeline = pipeline_8bd03406c210


pipeline_8bd03406c210

### 3. Fit & Check

In [25]:
// Fit the pipeline on df: its stages are the VectorAssembler followed by the LinearRegression estimator
val model = pipeline.fit(df)

model = pipeline_8bd03406c210


pipeline_8bd03406c210

In [26]:
// See the API documentation for the metrics exposed by the summary:
// https://spark.apache.org/docs/2.2.0/api/scala/index.html#org.apache.spark.ml.regression.LinearRegressionSummary
//
// Look the fitted regression stage up by type instead of by a hard-coded index
// plus an unchecked cast, so that adding or reordering pipeline stages fails
// with an explicit message rather than a ClassCastException or a bad index.
val linRegModel = model.stages
  .collectFirst { case fitted: LinearRegressionModel => fitted }
  .getOrElse(throw new NoSuchElementException("No LinearRegressionModel stage found in the fitted pipeline"))
println(s"RMSE:  ${linRegModel.summary.rootMeanSquaredError}")
println(s"r2:    ${linRegModel.summary.r2}")
// coefficients(0) is the weight of the only assembled input column, "temperature"
println(s"Model: Y = ${linRegModel.coefficients(0)} * X + ${linRegModel.intercept}")

RMSE:  0.24965353110553395
r2:    0.7285317871929219
Model: Y = 0.05114497726003437 * X + -2.8978696241921877


linRegModel = linReg_c9b5b05778f4


linReg_c9b5b05778f4

### 4. Prediction

In [27]:
// Score the same rows with the fitted pipeline and keep only the columns to compare
val result = model
  .transform(df)
  .select("temperature", "fail", "prediction")
result.show()

+-----------+----+--------------------+
|temperature|fail|          prediction|
+-----------+----+--------------------+
|         60|   0| 0.17082901140987428|
|         56|   0|-0.03375089763026...|
|         54|   0|-0.13604085215033157|
|         62|   0| 0.27311896592994334|
|         61|   0|  0.2219739886699088|
|         53|   0| -0.1871858294103661|
|         55|   0|-0.08489587489029748|
|         62|   0| 0.27311896592994334|
|         64|   0| 0.37540892045001195|
|         73|   1|  0.8357137157903214|
|         78|   1|  1.0914386020904931|
|         67|   1|  0.5288438522301151|
|         68|   1|  0.5799888294901496|
|         78|   1|  1.0914386020904931|
+-----------+----+--------------------+



result = [temperature: int, fail: int ... 1 more field]


[temperature: int, fail: int ... 1 more field]