In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc

### 머신러닝 파이프라인 설계

In [3]:
# SparkSession 생성
spark = (SparkSession
         .builder
         .appName("SparkMllibExampleApp")
         .getOrCreate())

23/04/01 17:43:15 WARN Utils: Your hostname, choeyunseoui-MacBookAir.local resolves to a loopback address: 127.0.0.1; using 172.30.34.243 instead (on interface en0)
23/04/01 17:43:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/01 17:43:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# 데이터 불러오기
airbnbDF = spark.read.parquet("../data/sf-airbnb-clean.parquet/")
airbnbDF.select("neighbourhood_cleansed", 'room_type', 'bedrooms', 'bathrooms', 'number_of_reviews', 'price').show(5)

[Stage 1:>                                                          (0 + 1) / 1]

+----------------------+---------------+--------+---------+-----------------+-----+
|neighbourhood_cleansed|      room_type|bedrooms|bathrooms|number_of_reviews|price|
+----------------------+---------------+--------+---------+-----------------+-----+
|      Western Addition|Entire home/apt|     1.0|      1.0|            180.0|170.0|
|        Bernal Heights|Entire home/apt|     2.0|      1.0|            111.0|235.0|
|        Haight Ashbury|   Private room|     1.0|      4.0|             17.0| 65.0|
|        Haight Ashbury|   Private room|     1.0|      4.0|              8.0| 65.0|
|      Western Addition|Entire home/apt|     2.0|      1.5|             27.0|785.0|
+----------------------+---------------+--------+---------+-----------------+-----+
only showing top 5 rows



                                                                                

In [7]:
# 훈련 및 테스트 데이터 분리
trainDF, testDF = airbnbDF.randomSplit([.8, .2], seed = 42)
print(f"""There are {trainDF.count()} rows in the training set, and {testDF.count()} in the test set""")

23/04/01 17:43:44 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 5:>                                                          (0 + 1) / 1]

There are 5780 rows in the training set, and 1366 in the test set


                                                                                

In [8]:
# VectorAssembler를 이용한 데이터 준비
from pyspark.ml.feature import VectorAssembler
vecAssembler = VectorAssembler(inputCols = ["bedrooms"], outputCol="features")
vecTrainDF = vecAssembler.transform(trainDF)
vecTrainDF.select("bedrooms", "features", "price").show(10)

[Stage 8:>                                                          (0 + 1) / 1]

+--------+--------+-----+
|bedrooms|features|price|
+--------+--------+-----+
|     1.0|   [1.0]|200.0|
|     1.0|   [1.0]|130.0|
|     1.0|   [1.0]| 95.0|
|     1.0|   [1.0]|250.0|
|     3.0|   [3.0]|250.0|
|     1.0|   [1.0]|115.0|
|     1.0|   [1.0]|105.0|
|     1.0|   [1.0]| 86.0|
|     1.0|   [1.0]|100.0|
|     2.0|   [2.0]|220.0|
+--------+--------+-----+
only showing top 10 rows



                                                                                

In [9]:
# 추정기를 사용하여 모델 구축
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features", labelCol="price")
lrModel = lr.fit(vecTrainDF)

23/04/01 17:43:56 WARN Instrumentation: [aaff0d4e] regParam is zero, which might cause numerical instability and overfitting.


[Stage 9:>                                                          (0 + 1) / 1]

23/04/01 17:43:57 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/04/01 17:43:57 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


                                                                                

23/04/01 17:43:57 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


                                                                                

In [10]:
# 회귀계수 출력
m = round(lrModel.coefficients[0], 2)
b = round(lrModel.intercept, 2)
print(f"""The formula for the linear regression line is price = {m}*bedrooms + {b}""")

The formula for the linear regression line is price = 123.68*bedrooms + 47.51


In [11]:
# 파이프라인 구축 (훈련데이터로 모델 학습)
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = [vecAssembler, lr])
pipelineModel = pipeline.fit(trainDF)

23/04/01 17:44:00 WARN Instrumentation: [c941f594] regParam is zero, which might cause numerical instability and overfitting.


In [12]:
# 파이프라인 구축 (테스트데이터 예측)
predDF = pipelineModel.transform(testDF)
predDF.select("bedrooms", "features", "price", "prediction").show(5)

+--------+--------+-----+------------------+
|bedrooms|features|price|        prediction|
+--------+--------+-----+------------------+
|     1.0|   [1.0]| 85.0|171.18598011578285|
|     1.0|   [1.0]| 45.0|171.18598011578285|
|     1.0|   [1.0]| 70.0|171.18598011578285|
|     1.0|   [1.0]|128.0|171.18598011578285|
|     1.0|   [1.0]|159.0|171.18598011578285|
+--------+--------+-----+------------------+
only showing top 5 rows



In [13]:
# 범주형 데이터 다루기 : one-hot encoding
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# 변환할 변수 지정 및 변환할 변수 명 생성
categoricalCols = [field for (field, dataType) in trainDF.dtypes if dataType == "string"]
indexOutputCols = [x + "Index" for x in categoricalCols]
oheOutputCols = [x + "OHE" for x in categoricalCols]

# StringIndexer
stringIndexer = StringIndexer(inputCols=categoricalCols, outputCols=indexOutputCols, handleInvalid = "skip")
# OneHotEncoder
oheEncoder = OneHotEncoder(inputCols=indexOutputCols, outputCols=oheOutputCols)

# 연속형 변수 지정
numericCols = [field for (field, dataType) in trainDF.dtypes if  ((dataType == "double") & (field != "price"))]

# vectorAssembler
assemblerInputs = oheOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

In [14]:
# RFormula 사용하기
from pyspark.ml.feature import RFormula

rFormula = RFormula(formula = "price ~ .",
                    featuresCol="features",
                    labelCol="price",
                    handleInvalid="skip")

In [15]:
# 범주형 데이터 처리를 포함한 파이프라인 설계
lr = LinearRegression(labelCol="price", featuresCol="features")
pipeline=Pipeline(stages = [stringIndexer, oheEncoder, vecAssembler, lr])

# RFormula
# pipeline=Pipeline(stages=[rFormula, lr])

pipelineModel = pipeline.fit(trainDF)
predDF = pipelineModel.transform(testDF)
predDF.select("features", "price", "prediction").show(5)

                                                                                

23/04/01 17:44:07 WARN Instrumentation: [5604f60d] regParam is zero, which might cause numerical instability and overfitting.


                                                                                

+--------------------+-----+------------------+
|            features|price|        prediction|
+--------------------+-----+------------------+
|(98,[0,3,6,22,43,...| 85.0| 55.24365707389188|
|(98,[0,3,6,22,43,...| 45.0|23.357685914717877|
|(98,[0,3,6,22,43,...| 70.0|28.474464479034395|
|(98,[0,3,6,12,42,...|128.0| -91.6079079594947|
|(98,[0,3,6,12,43,...|159.0| 95.05688229945372|
+--------------------+-----+------------------+
only showing top 5 rows



In [16]:
# RMSE 측정
from pyspark.ml.evaluation import RegressionEvaluator
regressionEvaluator = RegressionEvaluator(
    predictionCol = "prediction", # 예측값
    labelCol = "price", # 실제값
    metricName="rmse")
rmse = regressionEvaluator.evaluate(predDF)
print(f"RMSE if {rmse:.1f}")

RMSE if 220.6


In [17]:
# R-square
r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
print(f"R2 is {r2}")

R2 is 0.16043316698848087


In [18]:
# 모델 저장
# pipelineModel.write().overwrite().save("저장경로")

In [19]:
# 모델 로드
# from pyspark.ml import PipelineModel
# savedPipelineModel = PipelineModel.load("저장경로")

### 하이퍼파라미터 튜닝
- 트리 기반 모델

In [20]:
from pyspark.ml.regression import DecisionTreeRegressor

# 모델 객체 생성
dt = DecisionTreeRegressor(labelCol="price")

# 숫자 열만 필터링
numericCols = [field for (field, dataType) in trainDF.dtypes
               if ((dataType == "double") & (field != "price"))]

# 위에서 정의한 StingIndexer의 출력과 숫자 열 결합
assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols = assemblerInputs, outputCol="features")

# 단계를 파이프라인으로 결합
stages = [stringIndexer, vecAssembler, dt]
pipeline = Pipeline(stages = stages)

In [21]:
# 및 훈련 -> error
pipelineModel = pipeline.fit(trainDF)

23/04/01 17:44:16 ERROR Instrumentation: java.lang.IllegalArgumentException: requirement failed: DecisionTree requires maxBins (= 32) to be at least as large as the number of values in each categorical feature, but categorical feature 3 has 36 values. Consider removing this and other categorical features with a large number of values, or add more training examples.
	at scala.Predef$.require(Predef.scala:281)
	at org.apache.spark.ml.tree.impl.DecisionTreeMetadata$.buildMetadata(DecisionTreeMetadata.scala:151)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:274)
	at org.apache.spark.ml.regression.DecisionTreeRegressor.$anonfun$train$1(DecisionTreeRegressor.scala:126)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.regression.DecisionTreeRegressor.train(DecisionTreeRegr

IllegalArgumentException: requirement failed: DecisionTree requires maxBins (= 32) to be at least as large as the number of values in each categorical feature, but categorical feature 3 has 36 values. Consider removing this and other categorical features with a large number of values, or add more training examples.

In [22]:
dt.setMaxBins(40)
pipelineModel = pipeline.fit(trainDF)

                                                                                

In [23]:
# 규칙 프린트
dtModel = pipelineModel.stages[-1]
print(dtModel.toDebugString)

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_9ac4c456a13f, depth=5, numNodes=47, numFeatures=33
  If (feature 12 <= 2.5)
   If (feature 12 <= 1.5)
    If (feature 5 in {1.0,2.0})
     If (feature 4 in {0.0,1.0,3.0,5.0,9.0,10.0,11.0,13.0,14.0,16.0,18.0,24.0})
      If (feature 3 in {0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,23.0,24.0,25.0,26.0,27.0,28.0,29.0,30.0,31.0,32.0,33.0,34.0})
       Predict: 104.23992784125075
      Else (feature 3 not in {0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,23.0,24.0,25.0,26.0,27.0,28.0,29.0,30.0,31.0,32.0,33.0,34.0})
       Predict: 250.7111111111111
     Else (feature 4 not in {0.0,1.0,3.0,5.0,9.0,10.0,11.0,13.0,14.0,16.0,18.0,24.0})
      If (feature 3 in {0.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0,27.0,33.0,35.0})
       Predict: 151.94179894179894
      Else (feat

In [24]:
# 변수 중요도 출력
import pandas as pd

featureImp = pd.DataFrame(
    list(zip(vecAssembler.getInputCols(), dtModel.featureImportances)),
    columns = ["feature", "importance"]
)
featureImp.sort_values(by = "importance", ascending=False)

Unnamed: 0,feature,importance
12,bedrooms,0.283406
1,cancellation_policyIndex,0.167893
2,instant_bookableIndex,0.140081
4,property_typeIndex,0.128179
15,number_of_reviews,0.126233
3,neighbourhood_cleansedIndex,0.0562
9,longitude,0.03881
14,minimum_nights,0.029473
13,beds,0.015218
5,room_typeIndex,0.010905


In [25]:
# 랜덤포레스트
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(labelCol="price", maxBins=40, seed = 42)

### K-폴드 교차검증

In [26]:
# 모델 정의
pipeline = Pipeline(stages = [stringIndexer, vecAssembler, rf])

In [27]:
# ParamGridBuilder()
from pyspark.ml.tuning import ParamGridBuilder
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [2,4,6])
             .addGrid(rf.numTrees, [10, 100])
             .build())

In [28]:
# metric
evaluator = RegressionEvaluator(labelCol="price",
                                predictionCol="prediction",
                                metricName="rmse")

In [30]:
from pyspark.ml.tuning import CrossValidator
cv = CrossValidator(estimator=pipeline,
                    evaluator=evaluator,
                    estimatorParamMaps=paramGrid,
                    numFolds=3,
                    seed=42)
cvModel = cv.fit(trainDF)

                                                                                

23/04/01 17:51:13 WARN DAGScheduler: Broadcasting large task binary with size 1303.2 KiB
23/04/01 17:51:32 WARN DAGScheduler: Broadcasting large task binary with size 1136.8 KiB
23/04/01 17:51:49 WARN DAGScheduler: Broadcasting large task binary with size 1177.6 KiB
23/04/01 17:51:54 WARN DAGScheduler: Broadcasting large task binary with size 1202.5 KiB


In [32]:
cvModel.transform(testDF).toPandas().head()

Unnamed: 0,host_is_superhost,cancellation_policy,instant_bookable,host_total_listings_count,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,...,review_scores_value_na,host_is_superhostIndex,cancellation_policyIndex,instant_bookableIndex,neighbourhood_cleansedIndex,property_typeIndex,room_typeIndex,bed_typeIndex,features,prediction
0,f,flexible,f,1.0,Bayview,37.72001,-122.39249,House,Entire home/apt,2.0,...,0.0,0.0,2.0,0.0,15.0,1.0,0.0,0.0,"(0.0, 2.0, 0.0, 15.0, 1.0, 0.0, 0.0, 1.0, 37.7...",143.861264
1,f,flexible,f,1.0,Bayview,37.7325,-122.39221,House,Private room,1.0,...,1.0,0.0,2.0,0.0,15.0,1.0,1.0,0.0,"[0.0, 2.0, 0.0, 15.0, 1.0, 1.0, 0.0, 1.0, 37.7...",90.568314
2,f,flexible,f,1.0,Bayview,37.73555,-122.39779,House,Private room,1.0,...,1.0,0.0,2.0,0.0,15.0,1.0,1.0,0.0,"[0.0, 2.0, 0.0, 15.0, 1.0, 1.0, 0.0, 1.0, 37.7...",91.016124
3,f,flexible,f,1.0,Bernal Heights,37.73905,-122.41269,Apartment,Private room,1.0,...,0.0,0.0,2.0,0.0,5.0,0.0,1.0,0.0,"(0.0, 2.0, 0.0, 5.0, 0.0, 1.0, 0.0, 1.0, 37.73...",93.106212
4,f,flexible,f,1.0,Bernal Heights,37.74473,-122.41516,House,Private room,1.0,...,0.0,0.0,2.0,0.0,5.0,1.0,1.0,0.0,"(0.0, 2.0, 0.0, 5.0, 1.0, 1.0, 0.0, 1.0, 37.74...",113.129252


In [33]:
spark.stop()