In [1]:
from pyspark.sql import SparkSession

# Create the Spark session used by the rest of the notebook, or reuse one that
# is already running in this process (getOrCreate).
spark = (
    SparkSession.builder
    .appName("Boston Housing Linear Regression example")
    .getOrCreate()
)

In [2]:
import os

# Directory that holds the input CSV. The default is the original author's local
# folder; set BOSTON_HOUSING_DATA_DIR to run the notebook on another machine
# without editing this cell. The value stays a plain string because later cells
# build file names with `path + "/..."`.
path = os.environ.get(
    "BOSTON_HOUSING_DATA_DIR",
    "C:/Users/Yeojun/Documents/GitHub/Programmers_DevCourse/학습내용/빅데이터 처리 시스템 하둡과 Spark/data",
)

In [8]:
# Load the CSV via chained reader options: the first row supplies the column
# names, and Spark infers each column's type from the file contents.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(path + "/boston_housing.csv")
)
# Print the inferred schema so the column types can be checked by eye.
data.printSchema()

root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- b: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



In [15]:
# Preview the first 20 rows with long cell values truncated; these are show()'s
# defaults, written out explicitly.
data.show(n=20, truncate=True)

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio|     b|lstat|medv|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311|   15.2| 395.6|12.43|22.9|
|0.14455|12.5| 7.87|   0|0.524|6.172| 96.1|5.9505|  5|311|   15.2| 396.9|19.15|27.1|
|0.21124|12.5| 7.87|   0|0.524|5.631|100.0|6.0821|  5|311|   15.2

### 피쳐 벡터를 만들기

In [10]:
from pyspark.ml.feature import VectorAssembler

# Every column except the regression label "medv" is a model input. Selecting by
# name rather than with the positional slice `columns[:-1]` keeps the feature set
# correct even if the label is not the last column of the CSV.
feature_columns = [column for column in data.columns if column != "medv"]

# Pack the input columns into a single vector column named "feature", which is
# the column the regression cell passes as featuresCol.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="feature")

In [11]:
# Append the assembled "feature" vector column to every row of the raw data.
data_2 = assembler.transform(data)
# Preview the first 20 rows (show()'s defaults, written out explicitly).
data_2.show(n=20, truncate=True)

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio|     b|lstat|medv|             feature|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|[0.00632,18.0,2.3...|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|[0.02731,0.0,7.07...|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|[0.02729,0.0,7.07...|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|[0.03237,0.0,2.18...|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|[0.06905,0.0,2.18...|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|[0.02985,0.0,2.18...|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5

### 훈련용과 테스트용 데이터를 나누고 Linear Regression 모델 만들기

In [12]:
# Split the rows into roughly 70% training and 30% test data. The fixed seed
# makes the split reproducible, so the metrics and prediction tables below stay
# the same across kernel restarts instead of changing on every run.
train, test = data_2.randomSplit([0.7, 0.3], seed=42)

In [20]:
from pyspark.ml.regression import LinearRegression

# Configure the estimator through its setters: inputs come from the assembled
# "feature" vector column and the target is the "medv" column.
algo = (
    LinearRegression()
    .setFeaturesCol("feature")
    .setLabelCol("medv")
)
# Fit on the training split only; the test split is kept for evaluation.
model = algo.fit(train)

### 모델 성능 측정

In [21]:
# Evaluate the fitted model on the held-out test split. The returned summary
# object carries the error metrics that the next cell prints.
evaluation_summary = model.evaluate(test)
# Bare last expression: the notebook displays the summary object's repr.
evaluation_summary

<pyspark.ml.regression.LinearRegressionSummary at 0x1f7a6187f40>

In [25]:
# Print the test-set metrics one per line, in this order: mean absolute error,
# root mean squared error, R^2.
for metric_name in ("meanAbsoluteError", "rootMeanSquaredError", "r2"):
    print(getattr(evaluation_summary, metric_name))

3.5812962994667887
4.807321116754133
0.748799519828782


### 모델 예측값 살펴보기

In [20]:
# Run the fitted model over the test split; transform() returns the rows with an
# added "prediction" column.
# NOTE(review): `model` must already exist in the kernel (it is created by the
# fit cell). The NameError recorded under this cell presumably comes from a run
# in a fresh kernel before fitting — confirm by re-running top to bottom.
predictions = model.transform(test)

NameError: name 'model' is not defined

In [27]:
# Show the actual label next to the model's prediction. The columns are chosen
# by name instead of the positional slice `columns[13:]`, which only worked while
# the input had exactly 13 feature columns ahead of "medv".
predictions.select("medv", "feature", "prediction").show()

+----+--------------------+------------------+
|medv|             feature|        prediction|
+----+--------------------+------------------+
|32.7|[0.01301,35.0,1.5...|30.193611961778362|
|35.4|[0.01311,90.0,1.2...|29.812965746291916|
|24.5|[0.01501,80.0,2.0...|26.787787109949562|
|50.0|[0.01501,90.0,1.2...| 43.30359321964781|
|44.0|[0.01538,90.0,3.7...|36.736010586764294|
|30.1|[0.01709,90.0,2.0...|24.076492714463466|
|33.0|[0.01951,17.5,1.3...|22.497100085505494|
|50.0|[0.02009,95.0,2.6...| 42.14580939050474|
|16.5|[0.02498,0.0,1.89...|22.832271405610456|
|34.7|[0.02729,0.0,7.07...|30.670558646289173|
|30.8|[0.02763,75.0,2.9...|30.804642281770462|
|28.7|[0.02985,0.0,2.18...|25.255370231122985|
|17.5|[0.03113,0.0,4.39...|16.563281091402153|
|45.4|[0.03578,20.0,3.3...|38.392851453907696|
|33.3|[0.04011,80.0,1.5...|35.928743698912974|
|28.0|[0.04113,25.0,4.8...|28.174418069930557|
|22.9|[0.04203,28.0,15....|  28.5500357464719|
|24.8|[0.04297,52.5,5.3...|   26.005786366639|
|23.4|[0.0498

In [3]:
import os

# Directory under which the fitted model is saved and reloaded. The default is
# the original author's local folder; set BOSTON_HOUSING_MODEL_DIR to run the
# notebook on another machine without editing this cell. The value stays a plain
# string because later cells build paths with `model_path + "/..."`.
model_path = os.environ.get(
    "BOSTON_HOUSING_MODEL_DIR",
    "C:/Users/Yeojun/Documents/GitHub/Programmers_DevCourse/학습내용/빅데이터 처리 시스템 하둡과 Spark/model",
)

In [36]:
# Persist the fitted model to disk. Going through write().overwrite() lets this
# cell be re-run: a plain save() raises an error when the target directory
# already exists from an earlier run.
model.write().overwrite().save(model_path + "/boston_housing_model")

In [None]:
from pyspark.ml.regression import LinearRegressionModel
# Restore the persisted model from the same directory the save cell wrote to.
loaded_model = LinearRegressionModel.load(f"{model_path}/boston_housing_model")

In [None]:
# Run the reloaded model over the test split; transform() returns the rows with
# an added "prediction" column.
prediction2 = loaded_model.transform(test)

In [21]:
# Show the actual label next to the reloaded model's prediction. The columns are
# chosen by name instead of the positional slice `columns[13:]`, which only
# worked while the input had exactly 13 feature columns ahead of "medv".
prediction2.select("medv", "feature", "prediction").show()

+----+--------------------+------------------+
|medv|             feature|        prediction|
+----+--------------------+------------------+
|22.0|[0.01096,55.0,2.2...| 26.99340480332676|
|31.6|[0.01432,100.0,1....|32.273652440375365|
|20.1|[0.01965,80.0,1.7...|18.769280865511835|
|24.7|[0.02055,85.0,0.7...| 24.39507135076819|
|30.8|[0.02763,75.0,2.9...|30.804642281770462|
|18.5|[0.03041,0.0,5.19...|18.719451586293317|
|31.2|[0.03049,55.0,3.7...|28.499417153140257|
|34.9|[0.0315,95.0,1.47...| 29.82141970218025|
|33.4|[0.03237,0.0,2.18...| 28.62582374305508|
|34.9|[0.03359,75.0,2.9...|33.759639352985545|
|19.5|[0.03427,0.0,5.19...|19.660552639054448|
|24.1|[0.03445,82.5,2.0...|29.142824029672234|
|19.4|[0.03466,35.0,6.0...| 23.39964791202941|
|22.9|[0.03551,25.0,4.8...| 24.82902083856888|
|24.8|[0.03659,25.0,4.8...|25.786547592195657|
|20.7|[0.03738,0.0,5.19...| 21.19482740196808|
|28.0|[0.04113,25.0,4.8...|28.174418069930557|
|22.9|[0.04203,28.0,15....|  28.5500357464719|
|20.6|[0.0429