In [5]:
# Download the regression dataset archive into the working directory.
# `urlretrieve` lives in `urllib` on Python 2 but in `urllib.request` on
# Python 3, so resolve it from whichever location exists.
import urllib
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve  # Python 2

# NOTE(review): only the .zip file is fetched here, while the loading cell
# reads "DatasetRegresi/weight_train.csv" -- presumably the archive has to be
# extracted before that cell runs; confirm and add an explicit unzip step.
# `f` holds the value returned by urlretrieve for the downloaded file.
f = urlretrieve("http://malifauzi.lecture.ub.ac.id/files/2019/05/DatasetRegresi.zip", "DatasetRegresi.zip")

### Membaca data training dan data testing dari file

In [22]:
# Read the training split: CSV with a header row, column types inferred.
dataset_train = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferschema", "true")
    .load("DatasetRegresi/weight_train.csv")
)

# Read the test split with the same reader settings.
dataset_test = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferschema", "true")
    .load("DatasetRegresi/weight_test.csv")
)

# Preview both DataFrames, training first.
for preview in (dataset_train, dataset_test):
    preview.show()

+-----+------------+-----------+
|Index|Brain Weight|Body Weight|
+-----+------------+-----------+
|    1|       3.385|       44.5|
|    2|        0.48|       15.5|
|    3|        1.35|        8.1|
|    4|       465.0|      423.0|
|    5|       36.33|      119.5|
|    6|       27.66|      115.0|
|    7|       14.83|       98.2|
|    8|        1.04|        5.5|
|    9|        4.19|       58.0|
|   10|       0.425|        6.4|
|   11|       0.101|        4.0|
|   12|        0.92|        5.7|
|   13|         1.0|        6.6|
|   14|       0.005|       0.14|
|   15|        0.06|        1.0|
|   16|         3.5|       10.8|
|   17|         2.0|       12.3|
|   18|         1.7|        6.3|
|   19|      2547.0|     4603.0|
|   20|       0.023|        0.3|
+-----+------------+-----------+
only showing top 20 rows

+-----+------------+-----------+
|Index|Brain Weight|Body Weight|
+-----+------------+-----------+
|   50|         3.6|       21.0|
|   51|       4.288|       39.2|
|   52|        0.

### Menentukan fitur Data Training

In [24]:
from pyspark.ml.feature import VectorAssembler

# Combine the predictor column "Body Weight" into a single vector column
# named "features", then preview the augmented training DataFrame.
vecAssembler = VectorAssembler(
    inputCols=["Body Weight"],
    outputCol="features",
)
new_dataset_train = vecAssembler.transform(dataset_train)
new_dataset_train.show()

+-----+------------+-----------+--------+
|Index|Brain Weight|Body Weight|features|
+-----+------------+-----------+--------+
|    1|       3.385|       44.5|  [44.5]|
|    2|        0.48|       15.5|  [15.5]|
|    3|        1.35|        8.1|   [8.1]|
|    4|       465.0|      423.0| [423.0]|
|    5|       36.33|      119.5| [119.5]|
|    6|       27.66|      115.0| [115.0]|
|    7|       14.83|       98.2|  [98.2]|
|    8|        1.04|        5.5|   [5.5]|
|    9|        4.19|       58.0|  [58.0]|
|   10|       0.425|        6.4|   [6.4]|
|   11|       0.101|        4.0|   [4.0]|
|   12|        0.92|        5.7|   [5.7]|
|   13|         1.0|        6.6|   [6.6]|
|   14|       0.005|       0.14|  [0.14]|
|   15|        0.06|        1.0|   [1.0]|
|   16|         3.5|       10.8|  [10.8]|
|   17|         2.0|       12.3|  [12.3]|
|   18|         1.7|        6.3|   [6.3]|
|   19|      2547.0|     4603.0|[4603.0]|
|   20|       0.023|        0.3|   [0.3]|
+-----+------------+-----------+--

### Melakukan training / membuat model

In [25]:
from pyspark.ml.regression import LinearRegression
# Configure a linear regression that predicts "Brain Weight" from the
# assembled "features" column, then fit it on the training data.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Brain Weight',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8
)
lr_model = lr.fit(new_dataset_train)

# Report the fitted parameters, one per line.
for label, value in (
    ("Coefficients: ", lr_model.coefficients),
    ("Intercept: ", lr_model.intercept),
):
    print(label + str(value))

Coefficients: [0.906398695346]
Intercept: -71.3567595068


### Mengevaluasi model

In [26]:
# Fetch the fit summary from the trained model and report its
# root-mean-squared error.
trainingSummary = lr_model.summary
print("RMSE: %f" % (trainingSummary.rootMeanSquaredError,))

RMSE: 356.693715


### Menentukan fitur Data Testing

In [27]:
from pyspark.ml.feature import VectorAssembler

# Reuse the assembler that was configured for the training data instead of
# building a second, copy-pasted instance, so the test features are
# guaranteed to be assembled exactly like the training features.
new_dataset_test = vecAssembler.transform(dataset_test)
new_dataset_test.show()

+-----+------------+-----------+--------+
|Index|Brain Weight|Body Weight|features|
+-----+------------+-----------+--------+
|   50|         3.6|       21.0|  [21.0]|
|   51|       4.288|       39.2|  [39.2]|
|   52|        0.28|        1.9|   [1.9]|
|   53|       0.075|        1.2|   [1.2]|
|   54|       0.122|        3.0|   [3.0]|
|   55|       0.048|       0.33|  [0.33]|
|   56|       192.0|      180.0| [180.0]|
|   57|         3.0|       25.0|  [25.0]|
|   58|       160.0|      169.0| [169.0]|
|   59|         0.9|        2.6|   [2.6]|
|   60|        1.62|       11.4|  [11.4]|
|   61|       0.104|        2.5|   [2.5]|
|   62|       4.235|       50.4|  [50.4]|
+-----+------------+-----------+--------+



### Prediksi pada data Testing

In [28]:
# Apply the fitted model to the assembled test rows; the displayed result
# carries an extra "prediction" column next to the original columns.
lr_predictions = lr_model.transform(new_dataset_test)
lr_predictions.show()

+-----+------------+-----------+--------+-------------------+
|Index|Brain Weight|Body Weight|features|         prediction|
+-----+------------+-----------+--------+-------------------+
|   50|         3.6|       21.0|  [21.0]| -52.32238690449849|
|   51|       4.288|       39.2|  [39.2]|-35.825930649207855|
|   52|        0.28|        1.9|   [1.9]| -69.63460198560018|
|   53|       0.075|        1.2|   [1.2]| -70.26908107234213|
|   54|       0.122|        3.0|   [3.0]| -68.63756342071999|
|   55|       0.048|       0.33|  [0.33]| -71.05764793729284|
|   56|       192.0|      180.0| [180.0]|  91.79500565545808|
|   57|         3.0|       25.0|  [25.0]| -48.69679212311593|
|   58|       160.0|      169.0| [169.0]|  81.82462000665606|
|   59|         0.9|        2.6|   [2.6]| -69.00012289885824|
|   60|        1.62|       11.4|  [11.4]| -61.02381437981662|
|   61|       0.104|        2.5|   [2.5]|  -69.0907627683928|
|   62|       4.235|       50.4|  [50.4]|  -25.6742652613367|
+-----+-

### Simpan hasil prediksi

In [29]:
# Keep only the row identifier, the observed values and the model output.
# The column is named "Index" in the data, so reference it with its exact
# case: the lowercase 'index' only resolves while Spark's case-insensitive
# column resolution is active.
saved_prediction = lr_predictions.select('Index', 'Brain Weight', 'Body Weight', 'prediction')

# Write the result as CSV with a header row. The "overwrite" mode lets this
# cell be re-run: with the default save mode the write raises as soon as the
# output directory already exists.
# NOTE(review): the destination is an absolute, machine-specific path --
# consider making it configurable.
saved_prediction.write.mode("overwrite").csv("/home/hduser/Documents/HasilRegresi", header = 'true')

### Mengevaluasi hasil prediksi pada Data Testing

In [30]:
from pyspark.ml.evaluation import RegressionEvaluator
# Score the test predictions against the observed "Brain Weight" values
# using the "rmse" metric, then report the result.
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="Brain Weight",
    metricName="rmse"
)
test_rmse = lr_evaluator.evaluate(lr_predictions)
print("RMSE on test data = %g" % test_rmse)

RMSE on test data = 66.607
