### Label Encoding 수행
* StringIndexer 객체를 이용하여 Label Encoding 적용
* StringIndexer 객체 생성 시 변환될 컬럼명과 변환 후 컬럼명을 입력 받음.
* StringIndexer 객체의 fit() 메소드 호출 시 DataFrame을 입력하면 StringIndexerModel이 반환됨.
* 반환된 StringIndexerModel 객체의 transform() 메소드 호출 시 DataFrame을 입력하면 Label Encoding이 적용된 outputCol이 추가된 DataFrame 반환.

In [0]:
# Small demo DataFrame: a numeric id plus a string category column to encode.
rows = [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")]
column_names = ["id", "category"]
df = spark.createDataFrame(rows, column_names)
df.show()

+---+--------+
| id|category|
+---+--------+
|  0|       a|
|  1|       b|
|  2|       c|
|  3|       a|
|  4|       a|
|  5|       c|
+---+--------+



In [0]:
from pyspark.ml.feature import StringIndexer

# StringIndexer is configured with the column to label-encode (inputCol) and
# the name of the column that will hold the encoded result (outputCol).
indexer = StringIndexer(
    inputCol='category',
    outputCol='category_index',
)

# fit() takes a DataFrame and returns a StringIndexerModel.
indexer_model = indexer.fit(df)
print(indexer_model)

StringIndexerModel: uid=StringIndexer_dcd275010a37, handleInvalid=error


In [0]:
# transform() on the fitted StringIndexerModel returns a new DataFrame that
# carries the label-encoded values in the column named by outputCol.
indexed_df = indexer_model.transform(df)
# show() prints the table itself and returns None, so wrapping it in
# display() only handed None to display(); call show() directly.
indexed_df.show()

+---+--------+--------------+
| id|category|category_index|
+---+--------+--------------+
|  0|       a|           0.0|
|  1|       b|           2.0|
|  2|       c|           1.0|
|  3|       a|           0.0|
|  4|       a|           0.0|
|  5|       c|           1.0|
+---+--------+--------------+



#### IndexToString 클래스를 이용하여 Label Encoding된 값을 원본 값으로 원복 할 수 있음.

In [0]:
from pyspark.ml.feature import IndexToString

# Map the label-encoded indices back to the original string values.
converter = IndexToString(
    inputCol='category_index',
    outputCol='original_category',
)
# Unlike StringIndexer, there is no fitted model here: transform() is called
# on the IndexToString object directly.
converted = converter.transform(indexed_df)
converted.show()

+---+--------+--------------+-----------------+
| id|category|category_index|original_category|
+---+--------+--------------+-----------------+
|  0|       a|           0.0|                a|
|  1|       b|           2.0|                b|
|  2|       c|           1.0|                c|
|  3|       a|           0.0|                a|
|  4|       a|           0.0|                a|
|  5|       c|           1.0|                c|
+---+--------+--------------+-----------------+



#### 여러개의 컬럼을 Label Encoding 수행. 
* StringIndexer 객체 생성 시 inputCols에 리스트로 변환될 컬럼들을 입력하고, outputCols에 새롭게 변환된 컬럼명을 입력

In [0]:
from pyspark.ml.feature import StringIndexer

# Demo DataFrame with two string category columns to encode together.
rows = [
    (0, "a", "A"),
    (1, "b", "A"),
    (2, "c", "K"),
    (3, "a", "D"),
    (4, "a", "C"),
    (5, "c", "B"),
]
column_names = ["id", "category1", "category2"]
df = spark.createDataFrame(rows, column_names)
df.show()

+---+---------+---------+
| id|category1|category2|
+---+---------+---------+
|  0|        a|        A|
|  1|        b|        A|
|  2|        c|        K|
|  3|        a|        D|
|  4|        a|        C|
|  5|        c|        B|
+---+---------+---------+



In [0]:
# Label-encode several columns in one pass: inputCols lists the source
# columns and outputCols lists the matching encoded column names.
indexer = StringIndexer(
    inputCols=["category1", "category2"],
    outputCols=["label_encoded1", "label_encoded2"],
)
indexed_model = indexer.fit(df)
indexed_df = indexed_model.transform(df)
indexed_df.show()

+---+---------+---------+--------------+--------------+
| id|category1|category2|label_encoded1|label_encoded2|
+---+---------+---------+--------------+--------------+
|  0|        a|        A|           0.0|           0.0|
|  1|        b|        A|           2.0|           0.0|
|  2|        c|        K|           1.0|           4.0|
|  3|        a|        D|           0.0|           3.0|
|  4|        a|        C|           0.0|           2.0|
|  5|        c|        B|           1.0|           1.0|
+---+---------+---------+--------------+--------------+



### One Hot Encoding 적용
* OneHotEncoder 클래스를 이용하여 변환
* OneHotEncoder가 적용될 컬럼은 반드시 숫자형으로 변환되어 있어야 함. 따라서 OneHotEncoder를 String 컬럼에 적용 시에는 Label Encoding을 먼저 적용 후에 변환해야 함.

In [0]:
from pyspark.ml.feature import OneHotEncoder

# Two already-numeric category index columns to one-hot encode.
df = spark.createDataFrame([
    (0.0, 1.0),
    (1.0, 0.0),
    (2.0, 1.0),
    (0.0, 2.0),
    (0.0, 1.0),
    (2.0, 0.0)
], ["categoryIndex1", "categoryIndex2"])

# dropLast controls whether the last category is dropped from the encoding (default is True).
# With 5 categories (0, 1, 2, 3, 4), 2 maps to [0.0, 0.0, 1.0, 0.0] and 4 maps to [0.0, 0.0, 0.0, 0.0].
encoder = OneHotEncoder(dropLast=True, inputCols=["categoryIndex1", "categoryIndex2"],
                        outputCols=["onehot_encoded1", "onehot_encoded2"])
encoded_model = encoder.fit(df)
# OneHotEncoder emits the one-hot result as sparse vectors.
encoded_df = encoded_model.transform(df)
# Shorter equivalent, similar to scikit-learn's fit_transform(); note that
# fit() belongs to the encoder (estimator), not to the fitted model:
# encoded_df = encoder.fit(df).transform(df)

# show() prints the table and returns None, so print(encoded_df.show())
# emitted a stray "None"; call show() directly.
encoded_df.show()
display(encoded_df)

+--------------+--------------+---------------+---------------+
|categoryIndex1|categoryIndex2|onehot_encoded1|onehot_encoded2|
+--------------+--------------+---------------+---------------+
|           0.0|           1.0|  (2,[0],[1.0])|  (2,[1],[1.0])|
|           1.0|           0.0|  (2,[1],[1.0])|  (2,[0],[1.0])|
|           2.0|           1.0|      (2,[],[])|  (2,[1],[1.0])|
|           0.0|           2.0|  (2,[0],[1.0])|      (2,[],[])|
|           0.0|           1.0|  (2,[0],[1.0])|  (2,[1],[1.0])|
|           2.0|           0.0|      (2,[],[])|  (2,[0],[1.0])|
+--------------+--------------+---------------+---------------+

None


categoryIndex1,categoryIndex2,onehot_encoded1,onehot_encoded2
0.0,1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))"
1.0,0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))"
2.0,1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))"
0.0,2.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())"
0.0,1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))"
2.0,0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))"


In [0]:
df = spark.createDataFrame(
    [(0, "a", "A"), (1, "b", "A"), (2, "c", "K"), (3, "a", "D"), (4, "a", "C"), (5, "c", "B")],
    ["id", "category1", "category2"])
df.show()  # show() prints the table and returns None, so it is not wrapped in print()
# Point OneHotEncoder directly at the raw string columns (no label encoding first).
encoder = OneHotEncoder(inputCols=["category1", "category2"],
                        outputCols=["onehot_encoded1", "onehot_encoded2"])

# Intentional failure: one-hot encoding is attempted on string values, so fit() raises an error.
encoded_model = encoder.fit(df)

+---+---------+---------+
| id|category1|category2|
+---+---------+---------+
|  0|        a|        A|
|  1|        b|        A|
|  2|        c|        K|
|  3|        a|        D|
|  4|        a|        C|
|  5|        c|        B|
+---+---------+---------+

None


[0;31m---------------------------------------------------------------------------[0m
[0;31mIllegalArgumentException[0m                  Traceback (most recent call last)
[0;32m<command-1916978023792376>[0m in [0;36m<module>[0;34m[0m
[1;32m      8[0m [0;34m[0m[0m
[1;32m      9[0m [0;31m# 아래 코드는 string값을 One Hot Encoding 적용 시도 하였기에 오류 발생.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 10[0;31m [0mencoded_model[0m [0;34m=[0m [0mencoder[0m[0;34m.[0m[0mfit[0m[0;34m([0m[0mdf[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m/databricks/python_shell/dbruntime/MLWorkloadsInstrumentation/_pyspark.py[0m in [0;36mpatched_method[0;34m(self, *args, **kwargs)[0m
[1;32m     28[0m             [0mcall_succeeded[0m [0;34m=[0m [0;32mFalse[0m[0;34m[0m[0;34m[0m[0m
[1;32m     29[0m             [0;32mtry[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 30[0;31m                 [0mresult[0m [0;34m=[0m [0moriginal_method[0m[0;34m([0m[0

In [0]:
# Source data with two string category columns.
rows = [
    (0, "a", "A"),
    (1, "b", "A"),
    (2, "c", "K"),
    (3, "a", "D"),
    (4, "a", "C"),
    (5, "c", "B"),
]
df = spark.createDataFrame(rows, ["id", "category1", "category2"])

# Step 1: label-encode the string columns with StringIndexer.
# fit() yields the model, and the model's transform() produces the encoded DataFrame.
label_encoder = StringIndexer(
    inputCols=["category1", "category2"],
    outputCols=["label_encoded1", "label_encoded2"],
)
label_encoder_model = label_encoder.fit(df)
label_encoded_df = label_encoder_model.transform(df)

# Step 2: one-hot encode the numeric columns produced by step 1.
onehot_encoder = OneHotEncoder(
    inputCols=["label_encoded1", "label_encoded2"],
    outputCols=["onehot_encoded1", "onehot_encoded2"],
)

# The one-hot encoder must be fitted and applied on the label-encoded
# DataFrame, not on the original df.
onehot_encoder_model = onehot_encoder.fit(label_encoded_df)
onehot_encoded_df = onehot_encoder_model.transform(label_encoded_df)

display(onehot_encoded_df)

id,category1,category2,label_encoded1,label_encoded2,onehot_encoded1,onehot_encoded2
0,a,A,0.0,0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))"
1,b,A,2.0,0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))"
2,c,K,1.0,4.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(), values -> List())"
3,a,D,0.0,3.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))"
4,a,C,0.0,2.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(2), values -> List(1.0))"
5,c,B,1.0,1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))"


### Pipeline을 이용하여 OneHot Encoding 적용 
* StringIndexer 객체와 OneHotEncoder 객체를 각각 stage로 Pipeline에 등록하여 encoding 변환.

In [0]:
from pyspark.ml import Pipeline

# Create the StringIndexer and OneHotEncoder objects that serve as pipeline stages.
stage_1 = StringIndexer(
    inputCols=['category1', 'category2'],
    outputCols=['label_encoded1', 'label_encoded2'],
)
stage_2 = OneHotEncoder(
    inputCols=['label_encoded1', 'label_encoded2'],
    outputCols=['onehot_encoded1', 'onehot_encoded2'],
)

# Register both stages, in order, on a Pipeline object.
encoding_stages = [stage_1, stage_2]
pipeline = Pipeline(stages=encoding_stages)

# pipeline.fit(df) builds a PipelineModel; calling transform(df) on that model
# performs the final conversion. The one-line form would be
# pipeline.fit(df).transform(df).
pipeline_model = pipeline.fit(df)
onehot_encoded_df = pipeline_model.transform(df)

display(onehot_encoded_df)

id,category1,category2,label_encoded1,label_encoded2,onehot_encoded1,onehot_encoded2
0,a,A,0.0,0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))"
1,b,A,2.0,0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))"
2,c,K,1.0,4.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(), values -> List())"
3,a,D,0.0,3.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))"
4,a,C,0.0,2.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(2), values -> List(1.0))"
5,c,B,1.0,1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))"


In [0]:
# Inspect the fitted stages held by the PipelineModel (one fitted model per registered stage).
print(pipeline_model.stages)

[StringIndexerModel: uid=StringIndexer_6cd34f0c9ef8, handleInvalid=error, numInputCols=2, numOutputCols=2, OneHotEncoderModel: uid=OneHotEncoder_1b8a0d970f7b, dropLast=true, handleInvalid=error, numInputCols=2, numOutputCols=2]


### Scaling의 적용
* Standard 스케일링은 StandardScaler 클래스로, Min Max 스케일링은 MinMaxScaler 클래스를 이용하여 적용.
* 주의할 점은 Scaling은 일반 컬럼형(숫자형)이 아니라 vector형에만 적용이 가능하다는 것임. 이는 Spark ML이 통계 전용의 기능을 제공하기 보다는 ML에 주로 특화 되었기 때문
* 때문에 단일 컬럼에 Scaling을 적용할 때도 반드시 VectorAssembler로 변환 후에 적용해야 함

In [0]:
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd

# Load the iris dataset; features and target arrive as numpy arrays.
iris = load_iris()
iris_data, iris_label = iris.data, iris.target

# Wrap the feature matrix in a pandas DataFrame and append the target as 'label'.
iris_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
iris_pdf = pd.DataFrame(iris_data, columns=iris_columns).assign(label=iris_label)

# Convert the pandas DataFrame into a Spark DataFrame.
iris_sdf = spark.createDataFrame(iris_pdf)

display(iris_sdf.limit(10))



sepal_length,sepal_width,petal_length,petal_width,label
5.1,3.5,1.4,0.2,0
4.9,3.0,1.4,0.2,0
4.7,3.2,1.3,0.2,0
4.6,3.1,1.5,0.2,0
5.0,3.6,1.4,0.2,0
5.4,3.9,1.7,0.4,0
4.6,3.4,1.4,0.3,0
5.0,3.4,1.5,0.2,0
4.4,2.9,1.4,0.2,0
4.9,3.1,1.5,0.1,0


In [0]:
from pyspark.ml.feature import StandardScaler

# Intentional failure: applying StandardScaler to a single numeric column raises an error. The column must first be converted to a Vector type.
standard_scaler = StandardScaler(inputCol='sepal_length', outputCol='scaled_sepal_length')
standard_scaler_model = standard_scaler.fit(iris_sdf)
standard_scaled_df = standard_scaler_model.transform(iris_sdf)

[0;31m---------------------------------------------------------------------------[0m
[0;31mIllegalArgumentException[0m                  Traceback (most recent call last)
[0;32m<command-328170807580035>[0m in [0;36m<cell line: 5>[0;34m()[0m
[1;32m      3[0m [0;31m# number type 단일 컬럼에 StandardScaler를 적용하면 오류 발생. Vector 형으로 해당 컬럼을 변경해야 함.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      4[0m [0mstandard_scaler[0m [0;34m=[0m [0mStandardScaler[0m[0;34m([0m[0minputCol[0m[0;34m=[0m[0;34m'sepal_length'[0m[0;34m,[0m [0moutputCol[0m[0;34m=[0m[0;34m'scaled_sepal_length'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 5[0;31m [0mstandard_scaler_model[0m [0;34m=[0m [0mstandard_scaler[0m[0;34m.[0m[0mfit[0m[0;34m([0m[0miris_sdf[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      6[0m [0mstandard_scaled_df[0m [0;34m=[0m [0mstandard_scaler_model[0m[0;34m.[0m[0mtransform[0m[0;34m([0m[0miris_sdf[0m[0;34m)[0m[0;34m[0m[0

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

# VectorAssembler must be given its source columns as a list through inputCols;
# a singular inputCol is not accepted. For example,
# VectorAssembler(inputCol='sepal_length', outputCol='sepal_length_vector') raises an error.
vec_assembler = VectorAssembler(
    inputCols=['sepal_length'],
    outputCol='sepal_length_vector',
)

# VectorAssembler has no fit(); transform() is called on it directly.
iris_sdf_vectorized = vec_assembler.transform(iris_sdf)

vectorized_preview = iris_sdf_vectorized.limit(10)
display(vectorized_preview)

sepal_length,sepal_width,petal_length,petal_width,label,sepal_length_vector
5.1,3.5,1.4,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(5.1))"
4.9,3.0,1.4,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(4.9))"
4.7,3.2,1.3,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(4.7))"
4.6,3.1,1.5,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(4.6))"
5.0,3.6,1.4,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(5.0))"
5.4,3.9,1.7,0.4,0,"Map(vectorType -> dense, length -> 1, values -> List(5.4))"
4.6,3.4,1.4,0.3,0,"Map(vectorType -> dense, length -> 1, values -> List(4.6))"
5.0,3.4,1.5,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(5.0))"
4.4,2.9,1.4,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(4.4))"
4.9,3.1,1.5,0.1,0,"Map(vectorType -> dense, length -> 1, values -> List(4.9))"


In [0]:
# Apply StandardScaler to the vectorized column. No withMean/withStd arguments
# are passed here, so the scaler's default settings are used.
standard_scaler = StandardScaler(
    inputCol='sepal_length_vector',
    outputCol='standard_scaled_vector_01',
)
standard_scaler_model = standard_scaler.fit(iris_sdf_vectorized)
standard_scaled_df = standard_scaler_model.transform(iris_sdf_vectorized)

scaled_preview = standard_scaled_df.limit(10)
display(scaled_preview)

sepal_length,sepal_width,petal_length,petal_width,label,sepal_length_vector,standard_scaled_vector_01
5.1,3.5,1.4,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(5.1))","Map(vectorType -> dense, length -> 1, values -> List(6.158928408838792))"
4.9,3.0,1.4,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(4.9))","Map(vectorType -> dense, length -> 1, values -> List(5.917401804570605))"
4.7,3.2,1.3,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(4.7))","Map(vectorType -> dense, length -> 1, values -> List(5.675875200302417))"
4.6,3.1,1.5,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(4.6))","Map(vectorType -> dense, length -> 1, values -> List(5.555111898168322))"
5.0,3.6,1.4,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(5.0))","Map(vectorType -> dense, length -> 1, values -> List(6.038165106704698))"
5.4,3.9,1.7,0.4,0,"Map(vectorType -> dense, length -> 1, values -> List(5.4))","Map(vectorType -> dense, length -> 1, values -> List(6.521218315241074))"
4.6,3.4,1.4,0.3,0,"Map(vectorType -> dense, length -> 1, values -> List(4.6))","Map(vectorType -> dense, length -> 1, values -> List(5.555111898168322))"
5.0,3.4,1.5,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(5.0))","Map(vectorType -> dense, length -> 1, values -> List(6.038165106704698))"
4.4,2.9,1.4,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(4.4))","Map(vectorType -> dense, length -> 1, values -> List(5.313585293900135))"
4.9,3.1,1.5,0.1,0,"Map(vectorType -> dense, length -> 1, values -> List(4.9))","Map(vectorType -> dense, length -> 1, values -> List(5.917401804570605))"


In [0]:
# Same vectorized column, this time with both mean-centering (withMean) and
# standard-deviation scaling (withStd) explicitly switched on.
standard_scaler = StandardScaler(
    inputCol='sepal_length_vector',
    outputCol='standard_scaled_vector_02',
    withMean=True,
    withStd=True,
)
standard_scaler_model = standard_scaler.fit(iris_sdf_vectorized)
standard_scaled_df = standard_scaler_model.transform(iris_sdf_vectorized)

scaled_preview = standard_scaled_df.limit(10)
display(scaled_preview)

sepal_length,sepal_width,petal_length,petal_width,label,sepal_length_vector,standard_scaled_vector_02
5.1,3.5,1.4,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(5.1))","Map(vectorType -> dense, length -> 1, values -> List(-0.8976738791967649))"
4.9,3.0,1.4,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(4.9))","Map(vectorType -> dense, length -> 1, values -> List(-1.139200483464952))"
4.7,3.2,1.3,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(4.7))","Map(vectorType -> dense, length -> 1, values -> List(-1.3807270877331401))"
4.6,3.1,1.5,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(4.6))","Map(vectorType -> dense, length -> 1, values -> List(-1.5014903898672347))"
5.0,3.6,1.4,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(5.0))","Map(vectorType -> dense, length -> 1, values -> List(-1.0184371813308586))"
5.4,3.9,1.7,0.4,0,"Map(vectorType -> dense, length -> 1, values -> List(5.4))","Map(vectorType -> dense, length -> 1, values -> List(-0.5353839727944822))"
4.6,3.4,1.4,0.3,0,"Map(vectorType -> dense, length -> 1, values -> List(4.6))","Map(vectorType -> dense, length -> 1, values -> List(-1.5014903898672347))"
5.0,3.4,1.5,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(5.0))","Map(vectorType -> dense, length -> 1, values -> List(-1.0184371813308586))"
4.4,2.9,1.4,0.2,0,"Map(vectorType -> dense, length -> 1, values -> List(4.4))","Map(vectorType -> dense, length -> 1, values -> List(-1.7430169941354219))"
4.9,3.1,1.5,0.1,0,"Map(vectorType -> dense, length -> 1, values -> List(4.9))","Map(vectorType -> dense, length -> 1, values -> List(-1.139200483464952))"


In [0]:
# Apply StandardScaler across all feature columns at once.
# First pack every iris feature column into a single 'features' vector.
vec_assembler = VectorAssembler(inputCols=iris_columns, outputCol='features')
iris_sdf_vectorized = vec_assembler.transform(iris_sdf)

# Main difference from the single-column cells: the scaler now reads the
# multi-column 'features' vector.
standard_scaler = StandardScaler(
    inputCol='features',
    outputCol='standard_scaled_features',
    withMean=True,
    withStd=True,
)
features_scaler_model = standard_scaler.fit(iris_sdf_vectorized)
standard_scaled_df = features_scaler_model.transform(iris_sdf_vectorized)

standard_scaled_df.limit(10).show(truncate=False)

+------------+-----------+------------+-----------+-----+-----------------+---------------------------------------------------------------------------------+
|sepal_length|sepal_width|petal_length|petal_width|label|features         |standard_scaled_features                                                         |
+------------+-----------+------------+-----------+-----+-----------------+---------------------------------------------------------------------------------+
|5.1         |3.5        |1.4         |0.2        |0    |[5.1,3.5,1.4,0.2]|[-0.8976738791967649,1.0156019907136327,-1.3357516342415199,-1.3110521482051305] |
|4.9         |3.0        |1.4         |0.2        |0    |[4.9,3.0,1.4,0.2]|[-1.139200483464952,-0.13153881205026055,-1.3357516342415199,-1.3110521482051305]|
|4.7         |3.2        |1.3         |0.2        |0    |[4.7,3.2,1.3,0.2]|[-1.3807270877331401,0.3273175090552971,-1.3923992862449772,-1.3110521482051305] |
|4.6         |3.1        |1.5         |0.2        |0

In [0]:
# Standard scaling through a Pipeline: the assembler and scaler objects
# created earlier are chained as stages.
from pyspark.ml import Pipeline

scaling_stages = [vec_assembler, standard_scaler]
pipeline = Pipeline(stages=scaling_stages)

# Fit the whole pipeline on the raw iris DataFrame, then transform that same DataFrame.
scaling_pipeline_model = pipeline.fit(iris_sdf)
standard_scaled_df = scaling_pipeline_model.transform(iris_sdf)

standard_scaled_df.limit(10).show(truncate=False)


+------------+-----------+------------+-----------+-----+-----------------+---------------------------------------------------------------------------------+
|sepal_length|sepal_width|petal_length|petal_width|label|features         |standard_scaled_features                                                         |
+------------+-----------+------------+-----------+-----+-----------------+---------------------------------------------------------------------------------+
|5.1         |3.5        |1.4         |0.2        |0    |[5.1,3.5,1.4,0.2]|[-0.8976738791967649,1.0156019907136327,-1.3357516342415199,-1.3110521482051305] |
|4.9         |3.0        |1.4         |0.2        |0    |[4.9,3.0,1.4,0.2]|[-1.139200483464952,-0.13153881205026055,-1.3357516342415199,-1.3110521482051305]|
|4.7         |3.2        |1.3         |0.2        |0    |[4.7,3.2,1.3,0.2]|[-1.3807270877331401,0.3273175090552971,-1.3923992862449772,-1.3110521482051305] |
|4.6         |3.1        |1.5         |0.2        |0

#### MinMax 스케일링 변환

In [0]:
from pyspark.ml.feature import MinMaxScaler

# Apply MinMaxScaler across all feature columns.
# Assemble every iris feature column into a single 'features' vector first.
vec_assembler = VectorAssembler(
    inputCols=iris_columns,
    outputCol='features',
)
iris_sdf_vectorized = vec_assembler.transform(iris_sdf)

# Fit the scaler on the vectorized data, then transform that same data.
minmax_scaler = MinMaxScaler(
    inputCol='features',
    outputCol='minmax_scaled_features',
)
minmax_scaler_model = minmax_scaler.fit(iris_sdf_vectorized)
minmax_scaled_df = minmax_scaler_model.transform(iris_sdf_vectorized)

display(minmax_scaled_df.limit(10))

sepal_length,sepal_width,petal_length,petal_width,label,features,minmax_scaled_features
5.1,3.5,1.4,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(5.1, 3.5, 1.4, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(0.22222222222222213, 0.625, 0.06779661016949151, 0.04166666666666667))"
4.9,3.0,1.4,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(4.9, 3.0, 1.4, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(0.1666666666666668, 0.41666666666666663, 0.06779661016949151, 0.04166666666666667))"
4.7,3.2,1.3,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(4.7, 3.2, 1.3, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(0.11111111111111119, 0.5, 0.05084745762711865, 0.04166666666666667))"
4.6,3.1,1.5,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(4.6, 3.1, 1.5, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(0.08333333333333327, 0.4583333333333333, 0.0847457627118644, 0.04166666666666667))"
5.0,3.6,1.4,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.6, 1.4, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(0.19444444444444448, 0.6666666666666666, 0.06779661016949151, 0.04166666666666667))"
5.4,3.9,1.7,0.4,0,"Map(vectorType -> dense, length -> 4, values -> List(5.4, 3.9, 1.7, 0.4))","Map(vectorType -> dense, length -> 4, values -> List(0.30555555555555564, 0.7916666666666665, 0.11864406779661016, 0.12500000000000003))"
4.6,3.4,1.4,0.3,0,"Map(vectorType -> dense, length -> 4, values -> List(4.6, 3.4, 1.4, 0.3))","Map(vectorType -> dense, length -> 4, values -> List(0.08333333333333327, 0.5833333333333333, 0.06779661016949151, 0.08333333333333333))"
5.0,3.4,1.5,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.4, 1.5, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(0.19444444444444448, 0.5833333333333333, 0.0847457627118644, 0.04166666666666667))"
4.4,2.9,1.4,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(4.4, 2.9, 1.4, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(0.027777777777777922, 0.37499999999999994, 0.06779661016949151, 0.04166666666666667))"
4.9,3.1,1.5,0.1,0,"Map(vectorType -> dense, length -> 4, values -> List(4.9, 3.1, 1.5, 0.1))","Map(vectorType -> dense, length -> 4, values -> List(0.1666666666666668, 0.4583333333333333, 0.0847457627118644, 0.0))"


In [0]:
from pyspark.ml import Pipeline

# MinMax scaling through a Pipeline: chain the existing assembler and scaler as stages.
minmax_stages = [vec_assembler, minmax_scaler]
pipeline = Pipeline(stages=minmax_stages)

# Fit on the raw iris DataFrame, then transform that same DataFrame.
minmax_pipeline_model = pipeline.fit(iris_sdf)
minmax_scaled_df = minmax_pipeline_model.transform(iris_sdf)

display(minmax_scaled_df.limit(10))

sepal_length,sepal_width,petal_length,petal_width,label,features,minmax_scaled_features
5.1,3.5,1.4,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(5.1, 3.5, 1.4, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(0.22222222222222213, 0.625, 0.06779661016949151, 0.04166666666666667))"
4.9,3.0,1.4,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(4.9, 3.0, 1.4, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(0.1666666666666668, 0.41666666666666663, 0.06779661016949151, 0.04166666666666667))"
4.7,3.2,1.3,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(4.7, 3.2, 1.3, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(0.11111111111111119, 0.5, 0.05084745762711865, 0.04166666666666667))"
4.6,3.1,1.5,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(4.6, 3.1, 1.5, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(0.08333333333333327, 0.4583333333333333, 0.0847457627118644, 0.04166666666666667))"
5.0,3.6,1.4,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.6, 1.4, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(0.19444444444444448, 0.6666666666666666, 0.06779661016949151, 0.04166666666666667))"
5.4,3.9,1.7,0.4,0,"Map(vectorType -> dense, length -> 4, values -> List(5.4, 3.9, 1.7, 0.4))","Map(vectorType -> dense, length -> 4, values -> List(0.30555555555555564, 0.7916666666666665, 0.11864406779661016, 0.12500000000000003))"
4.6,3.4,1.4,0.3,0,"Map(vectorType -> dense, length -> 4, values -> List(4.6, 3.4, 1.4, 0.3))","Map(vectorType -> dense, length -> 4, values -> List(0.08333333333333327, 0.5833333333333333, 0.06779661016949151, 0.08333333333333333))"
5.0,3.4,1.5,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.4, 1.5, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(0.19444444444444448, 0.5833333333333333, 0.0847457627118644, 0.04166666666666667))"
4.4,2.9,1.4,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(4.4, 2.9, 1.4, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(0.027777777777777922, 0.37499999999999994, 0.06779661016949151, 0.04166666666666667))"
4.9,3.1,1.5,0.1,0,"Map(vectorType -> dense, length -> 4, values -> List(4.9, 3.1, 1.5, 0.1))","Map(vectorType -> dense, length -> 4, values -> List(0.1666666666666668, 0.4583333333333333, 0.0847457627118644, 0.0))"
