In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Obtain the session through the builder so that re-running this cell reuses
# the active session instead of failing with
# "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.getOrCreate()
# Keep `sc` defined for code that works with the SparkContext directly.
sc = spark.sparkContext

In [2]:
import pandas as pd
# Toy dataset: x1/x2 hold strings, x3 holds integers, x4 holds floats,
# y1 is a 0/1 column and y2 is a yes/no column.
pdf = pd.DataFrame(dict(
    x1=['a', 'a', 'b', 'b', 'b', 'c'],
    x2=['apple', 'orange', 'orange', 'orange', 'peach', 'peach'],
    x3=[1, 1, 2, 2, 2, 4],
    x4=[2.4, 2.5, 3.5, 1.4, 2.1, 1.5],
    y1=[1, 0, 1, 0, 0, 1],
    y2=['yes', 'no', 'no', 'yes', 'yes', 'yes'],
))
# Turn the pandas frame into a Spark DataFrame and print its rows.
df = spark.createDataFrame(data=pdf)
df.show()

+---+------+---+---+---+---+
| x1|    x2| x3| x4| y1| y2|
+---+------+---+---+---+---+
|  a| apple|  1|2.4|  1|yes|
|  a|orange|  1|2.5|  0| no|
|  b|orange|  2|3.5|  1| no|
|  b|orange|  2|1.4|  0|yes|
|  b| peach|  2|2.1|  0|yes|
|  c| peach|  4|1.5|  1|yes|
+---+------+---+---+---+---+



# VectorAssembler
To fit an ML model, we need to combine all feature columns into one single column of vectors, the **featuresCol**. The VectorAssembler can be used to combine multiple OneHotEncoder columns and other continuous variable columns into one single column.

In [3]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

In [7]:


# Columns that are indexed and then one-hot encoded.
categorical_cols = ['x1', 'x2', 'x3']

# Stage order matters: every StringIndexer comes before the OneHotEncoders,
# because each encoder reads the 'idx_*' column written by an indexer.
indexer_stages = [
    StringIndexer(inputCol=name, outputCol='idx_' + name)
    for name in categorical_cols
]
encoder_stages = [
    OneHotEncoder(inputCol='idx_' + name, outputCol='ohe_' + name)
    for name in categorical_cols
]
all_stages = indexer_stages + encoder_stages
all_stages



[StringIndexer_55127baee48f,
 StringIndexer_e787dbf9ef20,
 StringIndexer_1d97e258ab79,
 OneHotEncoder_4d9c240fb539,
 OneHotEncoder_8e84adfbbb95,
 OneHotEncoder_d7667063c4a1]

In [8]:
# Fit all stages on df, then apply the fitted pipeline to the same data.
feature_pipeline = Pipeline(stages=all_stages)
fitted_pipeline = feature_pipeline.fit(df)
df_new = fitted_pipeline.transform(df)
df_new.show()

+---+------+---+---+---+---+------+------+------+-------------+-------------+-------------+
| x1|    x2| x3| x4| y1| y2|idx_x1|idx_x2|idx_x3|       ohe_x1|       ohe_x2|       ohe_x3|
+---+------+---+---+---+---+------+------+------+-------------+-------------+-------------+
|  a| apple|  1|2.4|  1|yes|   1.0|   2.0|   1.0|(2,[1],[1.0])|    (2,[],[])|(2,[1],[1.0])|
|  a|orange|  1|2.5|  0| no|   1.0|   0.0|   1.0|(2,[1],[1.0])|(2,[0],[1.0])|(2,[1],[1.0])|
|  b|orange|  2|3.5|  1| no|   0.0|   0.0|   0.0|(2,[0],[1.0])|(2,[0],[1.0])|(2,[0],[1.0])|
|  b|orange|  2|1.4|  0|yes|   0.0|   0.0|   0.0|(2,[0],[1.0])|(2,[0],[1.0])|(2,[0],[1.0])|
|  b| peach|  2|2.1|  0|yes|   0.0|   1.0|   0.0|(2,[0],[1.0])|(2,[1],[1.0])|(2,[0],[1.0])|
|  c| peach|  4|1.5|  1|yes|   2.0|   1.0|   2.0|    (2,[],[])|(2,[1],[1.0])|    (2,[],[])|
+---+------+---+---+---+---+------+------+------+-------------+-------------+-------------+



# Assemble feature columns into one single featuresCol with *VectorAssembler*.

In [9]:
# Concatenate the three one-hot columns and the continuous x4 column into
# one vector column named 'featuresCol'.
feature_assembler = VectorAssembler(
    inputCols=['ohe_x1', 'ohe_x2', 'ohe_x3', 'x4'],
    outputCol='featuresCol',
)
assembled_with_idx = feature_assembler.transform(df_new)
# The intermediate idx_* columns are not among the assembler inputs; drop them.
df_assembled = assembled_with_idx.drop('idx_x1', 'idx_x2', 'idx_x3')
df_assembled.show(truncate=False)

+---+------+---+---+---+---+-------------+-------------+-------------+-----------------------------+
|x1 |x2    |x3 |x4 |y1 |y2 |ohe_x1       |ohe_x2       |ohe_x3       |featuresCol                  |
+---+------+---+---+---+---+-------------+-------------+-------------+-----------------------------+
|a  |apple |1  |2.4|1  |yes|(2,[1],[1.0])|(2,[],[])    |(2,[1],[1.0])|(7,[1,5,6],[1.0,1.0,2.4])    |
|a  |orange|1  |2.5|0  |no |(2,[1],[1.0])|(2,[0],[1.0])|(2,[1],[1.0])|[0.0,1.0,1.0,0.0,0.0,1.0,2.5]|
|b  |orange|2  |3.5|1  |no |(2,[0],[1.0])|(2,[0],[1.0])|(2,[0],[1.0])|[1.0,0.0,1.0,0.0,1.0,0.0,3.5]|
|b  |orange|2  |1.4|0  |yes|(2,[0],[1.0])|(2,[0],[1.0])|(2,[0],[1.0])|[1.0,0.0,1.0,0.0,1.0,0.0,1.4]|
|b  |peach |2  |2.1|0  |yes|(2,[0],[1.0])|(2,[1],[1.0])|(2,[0],[1.0])|[1.0,0.0,0.0,1.0,1.0,0.0,2.1]|
|c  |peach |4  |1.5|1  |yes|(2,[],[])    |(2,[1],[1.0])|(2,[],[])    |(7,[3,6],[1.0,1.5])          |
+---+------+---+---+---+---+-------------+-------------+-------------+-----------------------------+

# Convert sparse vectors in featuresCol to dense vectors

In [10]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml.linalg import SparseVector, DenseVector

In [11]:
def dense_features_col(x):
    """Return the NumPy dtype name of a vector's dense array form.

    Parameters
    ----------
    x : object exposing ``toArray()``
        The vector to inspect.

    Returns
    -------
    str
        Name of the dtype of ``x.toArray()``, e.g. ``'float64'``.
    """
    # This function is registered as a UDF with StringType, so it must return
    # a plain str. A raw numpy.dtype object is not a string and cannot be
    # handed back to Spark as one.
    return str(x.toArray().dtype)
# Register the helper as a Spark SQL UDF whose declared result type is string.
dense_features_col_udf = udf(
    f=dense_features_col,
    returnType=StringType(),
)

In [12]:
# Look at the first four assembled vectors as Python objects.
df_assembled.rdd.map(lambda row: row['featuresCol']).take(4)

[SparseVector(7, {1: 1.0, 5: 1.0, 6: 2.4}),
 DenseVector([0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 2.5]),
 DenseVector([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 3.5]),
 DenseVector([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.4])]

In [13]:
# Expand the first five feature vectors into plain lists via toArray().
df_assembled.rdd.map(lambda row: list(row['featuresCol'].toArray())).take(5)

[[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 2.4],
 [0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 2.5],
 [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 3.5],
 [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.4],
 [1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 2.1]]