In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

### MLflow

In [2]:
# SparkSession 생성
spark = (SparkSession
         .builder
         .appName("SparkMllibExampleApp")
         .getOrCreate())

23/04/01 16:06:54 WARN Utils: Your hostname, choeyunseoui-MacBookAir.local resolves to a loopback address: 127.0.0.1; using 172.30.34.243 instead (on interface en0)
23/04/01 16:06:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/01 16:06:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### randomforest model mlflow로 트래킹하기

In [3]:
filePath = "../data/sf-airbnb-clean.parquet"
airbnbDF = spark.read.parquet(filePath)
(trainDF, testDF) = airbnbDF.randomSplit([.8, .2], seed = 42)

categoricalCols = [field for (field, dataType) in trainDF.dtypes if dataType == "string"]
indexOutputCols = [x + "Index" for x in categoricalCols]
stringIndexer = StringIndexer(inputCols = categoricalCols,
                              outputCols = indexOutputCols,
                              handleInvalid = "skip")
numericCols = [field for (field, dataType) in trainDF.dtypes if ((dataType == "double") & (field != "price"))]
assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols = assemblerInputs,
                               outputCol = "features")

rf = RandomForestRegressor(labelCol = "price", maxBins = 40, maxDepth = 5, numTrees = 100, seed = 42)

pipeline = Pipeline(stages = [stringIndexer, vecAssembler, rf])

                                                                                

In [4]:
import mlflow
import mlflow.spark
import pandas as pd

In [5]:
with mlflow.start_run(run_name = "random-forest") as run:
    # 로그 매개변수
    mlflow.log_param("num_trees", rf.getNumTrees())
    mlflow.log_param("max_depth", rf.getMaxDepth())

    # 로그 모델
    pipelineModel = pipeline.fit(trainDF)
    mlflow.spark.log_model(pipelineModel, "model")

    # 로그 매트릭
    predDF = pipelineModel.transform(testDF)
    regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="price")
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})

    # 로그 아티팩트(artifact) : 기능 중요도 점수
    rfModel = pipelineModel.stages[-1]
    pandasDF = (pd.DataFrame(list(zip(vecAssembler.getInputCols(),
                                      rfModel.featureImportances)),
                             columns=["feature", "importance"])
                .sort_values(by = "importance", ascending = False))

    # 먼저 로컬에 파일을 저장하고 MLflow에 파일 경로를 알려준다.
    pandasDF.to_csv("feature-importance.csv", index = False)
    mlflow.log_artifact("feature-importance.csv")

23/04/01 16:07:07 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

```
cd (mlruns 폴더가 있는 폴더로 이동)
mlflow ui
```

MLflowClient로 트래킹 파일 접근

In [7]:
from mlflow.tracking import MlflowClient

In [24]:
client = MlflowClient()
runs = client.search_runs(run.info.experiment_id,
                         order_by= ["attributes.start_time desc"],
                          max_results=3) # max_results : 몇 개까지 찾을 건지
run_id = runs[2].info.run_id
run_name = runs[2].info.run_name
print(run_id)
print(run_name)
print(runs[2].data.metrics)

443c46fd533e4bfdb33dd3efe0372c2b
random-forest
{'r2': 0.22794251914574226, 'rmse': 211.5096898777315}


배치 배포

In [30]:
# mlflow를 사용하여 저장된 모델 로드
pipelineModel = mlflow.spark.load_model(f"runs:/{run_id}/model")
# 예측 생성
inputDF = spark.read.parquet("../data/sf-airbnb-clean.parquet")
predDF = pipelineModel.transform(inputDF)

2023/04/01 17:04:18 INFO mlflow.spark: 'runs:/443c46fd533e4bfdb33dd3efe0372c2b/model' resolved as 'file:///Users/yschoi/Library/CloudStorage/Dropbox/yunseo/development/BOAZ/spark-study/spark_exam_code/spark_study/Chap11/mlruns/0/443c46fd533e4bfdb33dd3efe0372c2b/artifacts/model'
2023/04/01 17:04:18 INFO mlflow.spark: File 'file:///Users/yschoi/Library/CloudStorage/Dropbox/yunseo/development/BOAZ/spark-study/spark_exam_code/spark_study/Chap11/mlruns/0/443c46fd533e4bfdb33dd3efe0372c2b/artifacts/model/sparkml' is already on DFS, copy is not necessary.


In [37]:
predDF.toPandas().head()

                                                                                

Unnamed: 0,host_is_superhost,cancellation_policy,instant_bookable,host_total_listings_count,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,...,review_scores_value_na,host_is_superhostIndex,cancellation_policyIndex,instant_bookableIndex,neighbourhood_cleansedIndex,property_typeIndex,room_typeIndex,bed_typeIndex,features,prediction
0,t,moderate,t,1.0,Western Addition,37.76931,-122.43386,Apartment,Entire home/apt,3.0,...,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,"(1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 37.76...",175.772744
1,f,strict_14_with_grace_period,f,2.0,Bernal Heights,37.74511,-122.42102,Apartment,Entire home/apt,5.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,"(0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 2.0, 37.74...",237.216472
2,f,strict_14_with_grace_period,f,10.0,Haight Ashbury,37.76669,-122.4525,Apartment,Private room,2.0,...,0.0,0.0,0.0,0.0,6.0,0.0,1.0,0.0,"(0.0, 0.0, 0.0, 6.0, 0.0, 1.0, 0.0, 10.0, 37.7...",103.335423
3,f,strict_14_with_grace_period,f,10.0,Haight Ashbury,37.76487,-122.45183,Apartment,Private room,2.0,...,0.0,0.0,0.0,0.0,6.0,0.0,1.0,0.0,"(0.0, 0.0, 0.0, 6.0, 0.0, 1.0, 0.0, 10.0, 37.7...",103.838724
4,f,strict_14_with_grace_period,f,2.0,Western Addition,37.77525,-122.43637,House,Entire home/apt,5.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,"(0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 37.77...",302.503672


스트리밍 배포

In [41]:
# 모델 로드
pipelineModel = mlflow.spark.load_model(f"runs:/{run_id}/model")

# 스트리밍 데이터 셋업
repartitionedPath = "../data/sf-airbnb-clean-100p.parquet"
schema = spark.read.parquet(repartitionedPath).schema

streamingData = (spark
                 .readStream
                 .schema(schema)
                 .option("maxFilesPerTrigger", 1)
                 .parquet(repartitionedPath))

# 예측 생성
streamPred = pipelineModel.transform(streamingData)

2023/04/01 17:32:05 INFO mlflow.spark: 'runs:/443c46fd533e4bfdb33dd3efe0372c2b/model' resolved as 'file:///Users/yschoi/Library/CloudStorage/Dropbox/yunseo/development/BOAZ/spark-study/spark_exam_code/spark_study/Chap11/mlruns/0/443c46fd533e4bfdb33dd3efe0372c2b/artifacts/model'
2023/04/01 17:32:05 INFO mlflow.spark: File 'file:///Users/yschoi/Library/CloudStorage/Dropbox/yunseo/development/BOAZ/spark-study/spark_exam_code/spark_study/Chap11/mlruns/0/443c46fd533e4bfdb33dd3efe0372c2b/artifacts/model/sparkml' is already on DFS, copy is not necessary.


In [56]:
#streamPred.writeStream.format("console").start().awaitTermination()

In [None]:
spark.stop()

### 비 MLlib 모델에 스파크 활용

In [1]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [2]:
# SparkSession 생성
spark = (SparkSession
         .builder
         .appName("learn pandas UDFs in Spark")
         .getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/05 15:09:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# sample data 생성
g = np.tile(['group a', 'group b'], 10)
x = np.linspace(0, 10., 20)
np.random.seed(3)
y_lin = 2*x + np.random.rand(len(x))/10.
y_qua = 3*x**2 + np.random.rand(len(x))
df = pd.DataFrame({'group': g, 'x':x, 'y_lin': y_lin, 'y_qua':y_qua})
schema = "group STRING, x DOUBLE, y_lin DOUBLE, y_qua DOUBLE"

df = spark.createDataFrame(df, schema=schema)

In [4]:
df.show(3)

                                                                                

+-------+------------------+-------------------+-------------------+
|  group|                 x|              y_lin|              y_qua|
+-------+------------------+-------------------+-------------------+
|group a|               0.0|0.05507979025745755|0.28352508177131874|
|group b|0.5263157894736842|  1.123446361209179| 1.5241628490609185|
|group a|1.0526315789473684|  2.134353631786031| 3.7645534406624286|
+-------+------------------+-------------------+-------------------+
only showing top 3 rows



In [5]:
# 표준화 작업 : series to series pandas UDF
@F.pandas_udf(T.DoubleType())
def standardise(col1: pd.Series) -> pd.Series:
    return (col1 - col1.mean()) / col1.std()

res = df.select(standardise(F.col('y_lin')).alias('result')) # alias : column 이름 변경

res.show(3)



+-------------------+
|             result|
+-------------------+
|-0.7071067811865475|
| 0.7071067811865475|
|-0.7071067811865472|
+-------------------+
only showing top 3 rows



                                                                                

In [24]:
# 위와 동일한 기능을 하는 코드입니다.
# def standardise_func(col1: pd.Series) -> pd.Series:
#     return (col1 - col1.mean()) / col1.std()
#
# standardise = F.pandas_udf(standardise_func, returnType=T.DoubleType())
#
# res = df.select(standardise(F.col('y_lin')).alias('result'))
# res.show(3)

In [6]:
# 전체 열에 *2 하기: Iterator of Series to Iterator of Series
from typing import Iterator

@F.pandas_udf(T.DoubleType())
def plus_one(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
    for x in iterator:
        yield x * 2

df.select(plus_one(F.col('x'))).show(3)

+------------------+
|       plus_one(x)|
+------------------+
|               0.0|
|1.0526315789473684|
|2.1052631578947367|
+------------------+
only showing top 3 rows



In [7]:
# 두 열의 RMSE 구하기 : Iterator of Multiple Series to Iterator of Series
from typing import Tuple

@F.pandas_udf(T.DoubleType())
def calculate_rmse(iterator: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[pd.Series]:
    for a, b in iterator:
        yield ((a * 2) - b) ** 2

df.select(calculate_rmse("x", "y_lin")).show(3)

+------------------------+
|calculate_rmse(x, y_lin)|
+------------------------+
|    0.003033783294805516|
|    0.005014733386787644|
|    8.462556712200734E-4|
+------------------------+
only showing top 3 rows



In [24]:
# applyInPandas() : group 별 평균 구하기
def subtract_mean(pdf):
    # pdf is a pandas.DataFrame
    x = pdf.x
    return pdf.assign(x = x.mean())

result = df.groupby("group").applyInPandas(subtract_mean, schema = "group STRING, x DOUBLE, y_lin DOUBLE, y_qua DOUBLE").toPandas()

print(result.head(3))
print(result.tail(3))



     group         x     y_lin      y_qua
0  group a  4.736842  0.055080   0.283525
1  group a  4.736842  2.134354   3.764553
2  group a  4.736842  4.299821  13.841048
      group         x      y_lin       y_qua
17  group b  5.263158  15.848560  187.072826
18  group b  5.263158  17.950622  240.724046
19  group b  5.263158  20.041510  300.225055


                                                                                

In [30]:
# mapInPandas() : group a 데이터만 반환하기
def filter_func(iterator):
    for pdf in iterator:
        yield pdf[pdf.group == "group a"]

result = df.mapInPandas(filter_func, schema=df.schema).toPandas()

print(result.shape)
print(result.group.value_counts())

[Stage 39:>                                                         (0 + 8) / 8]

(10, 4)
group a    10
Name: group, dtype: int64


                                                                                

In [31]:
spark.stop()

### Joblib과 Hyperopt

In [60]:
from pyspark.sql import SparkSession
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.utils import parallel_backend
from joblibspark import register_spark
import time

In [34]:
# SparkSession 생성
spark = (SparkSession
         .builder
         .appName("learn pandas UDFs in Spark")
         .getOrCreate())

In [35]:
register_spark() # 스파크 백엔드 등록

In [54]:
filePath = "../data/sf-airbnb-clean.parquet"
df = spark.read.parquet(filePath).toPandas()
df = df.drop(columns = ['host_is_superhost', 'cancellation_policy', 'instant_bookable', 'neighbourhood_cleansed', 'property_type', 'room_type', 'bed_type']) # 수치형 변수만 남기
X_train, X_test, y_train, y_test = train_test_split(df.drop(["price"], axis=1), df[["price"]].values.ravel(), random_state=42)

In [55]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5359, 26)
(1787, 26)
(5359,)
(1787,)


In [56]:
rf = RandomForestRegressor(random_state=42)

In [63]:
param_grid = {"max_depth" : [2,5,7,10], "n_estimators": [20, 50, 70, 100]}
gscv = GridSearchCV(rf, param_grid, cv = 3)

In [64]:
start_time = time.time()
with parallel_backend("spark", n_jobs = 3):
    gscv.fit(X_train, y_train)
end_time = time.time()

print("실행시간 : ", end_time - start_time) # 16.25

print(gscv.cv_results_)

                                                                                

실행시간 :  16.254735946655273
{'mean_fit_time': array([0.10824704, 0.25440931, 0.2828993 , 0.55776437, 0.18814659,
       0.67691731, 0.74300226, 0.94147054, 0.24972892, 0.57117605,
       0.78440499, 1.13093535, 0.31270862, 0.77415482, 1.10046101,
       1.51572259]), 'std_fit_time': array([0.00554672, 0.00740932, 0.00099481, 0.00452771, 0.0013116 ,
       0.00244232, 0.01217439, 0.00153647, 0.00077973, 0.00340503,
       0.00252918, 0.00261801, 0.00039822, 0.00121841, 0.00346425,
       0.00418205]), 'mean_score_time': array([0.00862328, 0.00863004, 0.01081292, 0.03232431, 0.0101157 ,
       0.01667802, 0.01397141, 0.01965793, 0.00889333, 0.01314616,
       0.01860468, 0.02376199, 0.00869926, 0.0159595 , 0.02241206,
       0.03062542]), 'std_score_time': array([0.00238347, 0.00057516, 0.00053839, 0.01019505, 0.00271928,
       0.00085583, 0.00066837, 0.00054107, 0.00022919, 0.00126802,
       0.00028632, 0.00066272, 0.00029524, 0.00032214, 0.00048547,
       0.00107858]), 'param_max_dep

In [65]:
start_time = time.time()
gscv.fit(X_train, y_train)
end_time = time.time()
print("실행시간:", end_time - start_time) # 25.99

실행시간: 25.99013590812683


Hyperopt (실행X) - 스니펫 코드

In [None]:
import hyperopt

best_hyperparameters = hyperopt.fmin(
    fn = training_function, # 모델 훈련 함수
    space = search_space, # 하이퍼파라미터 탐색 영역 지정
    algo = hyperopt.tpe.suggest, # 최적화 알고리즘 선택
    max_evals = 64, # 최대 반복 횟수
    trials = hyperopt.SparkTrials(parallelism = 4)
)