# Setup

In [1]:
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

In [2]:
import pyspark.sql.functions as F
from pyspark.sql import Row

In [31]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import Imputer, VectorAssembler, StandardScaler

In [4]:
from tykuo_spark_model import StringDisassembler, ImputeCategoricalWithMode, VectorDisassembler

# Sample Data

In [5]:
# Toy dataset: an id, two categorical columns (x1, x2) and two numeric
# columns (x3, x4). Each feature column contains exactly one missing value.
# NOTE(review): `sc` is not created in this notebook — presumably a
# SparkContext injected by the kernel; confirm before running elsewhere.
df = (
    sc.parallelize([
        (1, 'a', 'A', 56., 175.),
        (2, 'a', 'B', 66., None),
        (3, 'b', 'B', None, 182.),
        (4, 'c', None, 71., 171.),
        (5, None, 'B', 48., 173.),
    ])
    .toDF(['id', 'x1', 'x2', 'x3', 'x4'])
)

In [6]:
# Print the sample rows so the position of each missing value is visible.
df.show()

+---+----+----+----+-----+
| id|  x1|  x2|  x3|   x4|
+---+----+----+----+-----+
|  1|   a|   A|56.0|175.0|
|  2|   a|   B|66.0| null|
|  3|   b|   B|null|182.0|
|  4|   c|null|71.0|171.0|
|  5|null|   B|48.0|173.0|
+---+----+----+----+-----+



# Data Processing Pipeline

In [7]:
# Ordered pipeline stages: impute missing values first, then derive the
# numeric and categorical model features from the imputed columns.
stage = [
    # --- Imputation ---
    # Categorical columns: missing entries are replaced by the column's mode.
    ImputeCategoricalWithMode()
    .setInputCols(['x1', 'x2'])
    .setOutputCols(['x1_im', 'x2_im']),
    # Numeric columns: missing entries are replaced by the column's median.
    Imputer(
        strategy='median',
        inputCols=['x3', 'x4'],
        outputCols=['x3_im', 'x4_im'],
    ),

    # --- Numeric features ---
    # Pack the imputed numerics into one vector, scale it with StandardScaler's
    # default settings, then unpack the vector back into one column per feature.
    VectorAssembler(
        inputCols=['x3_im', 'x4_im'],
        outputCol='numericFeatures',
    ),
    StandardScaler(
        inputCol='numericFeatures',
        outputCol='scaledNumericFeatures',
    ),
    VectorDisassembler()
    .setInputCol('scaledNumericFeatures')
    .setOutputCols(['x3_scaled', 'x4_scaled']),

    # --- Categorical features ---
    # Expand each imputed categorical column into is_<column>_<value> indicators.
    StringDisassembler().setInputCol('x1_im'),
    StringDisassembler().setInputCol('x2_im'),
]

In [8]:
# Fit all stages, in order, on the sample data to obtain a fitted pipeline.
pipelineModel = Pipeline(stages=stage).fit(df)

In [9]:
# Apply the fitted pipeline to the same sample data; the original columns are
# kept and the imputed, scaled and indicator columns are appended.
df_with_features = pipelineModel.transform(df)

In [11]:
# Inspect the names and types of the columns produced by the pipeline.
df_with_features.printSchema()

root
 |-- id: long (nullable = true)
 |-- x1: string (nullable = true)
 |-- x2: string (nullable = true)
 |-- x3: double (nullable = true)
 |-- x4: double (nullable = true)
 |-- x1_im: string (nullable = true)
 |-- x2_im: string (nullable = true)
 |-- x3_im: double (nullable = true)
 |-- x4_im: double (nullable = true)
 |-- numericFeatures: vector (nullable = true)
 |-- scaledNumericFeatures: vector (nullable = true)
 |-- x3_scaled: double (nullable = true)
 |-- x4_scaled: double (nullable = true)
 |-- is_x1_im_a: double (nullable = true)
 |-- is_x1_im_c: double (nullable = true)
 |-- is_x1_im_b: double (nullable = true)
 |-- is_x2_im_A: double (nullable = true)
 |-- is_x2_im_B: double (nullable = true)



In [23]:
# Collect the (tiny) transformed DataFrame to the driver as a pandas frame,
# indexed by `id`, so it renders as a rich table.
df_with_features_pd = df_with_features.toPandas().set_index('id')
df_with_features_pd

Unnamed: 0_level_0,x1,x2,x3,x4,x1_im,x2_im,x3_im,x4_im,numericFeatures,scaledNumericFeatures,x3_scaled,x4_scaled,is_x1_im_a,is_x1_im_c,is_x1_im_b,is_x2_im_A,is_x2_im_B
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,a,A,56.0,175.0,a,A,56.0,175.0,"[56.0, 175.0]","[6.15421775169, 41.0206327071]",6.154218,41.020633,1.0,0.0,0.0,1.0,0.0
2,a,B,66.0,,a,B,66.0,173.0,"[66.0, 173.0]","[7.25318520735, 40.5518254762]",7.253185,40.551825,1.0,0.0,0.0,0.0,1.0
3,b,B,,182.0,b,B,56.0,182.0,"[56.0, 182.0]","[6.15421775169, 42.6614580154]",6.154218,42.661458,0.0,0.0,1.0,0.0,1.0
4,c,,71.0,171.0,c,B,71.0,171.0,"[71.0, 171.0]","[7.80266893518, 40.0830182452]",7.802669,40.083018,0.0,1.0,0.0,0.0,1.0
5,,B,48.0,173.0,a,B,48.0,173.0,"[48.0, 173.0]","[5.27504378717, 40.5518254762]",5.275044,40.551825,1.0,0.0,0.0,0.0,1.0


In [27]:
# Model-ready columns: the scaled numerics followed by the indicator columns
# generated for each imputed categorical feature.
final_features = [
    # scaled numeric features
    'x3_scaled',
    'x4_scaled',
    # indicators for x1
    'is_x1_im_a',
    'is_x1_im_c',
    'is_x1_im_b',
    # indicators for x2
    'is_x2_im_A',
    'is_x2_im_B',
]

In [29]:
# Show only the model-ready feature columns.
df_with_features_pd[final_features]

Unnamed: 0_level_0,x3_scaled,x4_scaled,is_x1_im_a,is_x1_im_c,is_x1_im_b,is_x2_im_A,is_x2_im_B
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,6.154218,41.020633,1.0,0.0,0.0,1.0,0.0
2,7.253185,40.551825,1.0,0.0,0.0,0.0,1.0
3,6.154218,42.661458,0.0,0.0,1.0,0.0,1.0
4,7.802669,40.083018,0.0,1.0,0.0,0.0,1.0
5,5.275044,40.551825,1.0,0.0,0.0,0.0,1.0


# Pipeline IO

In [30]:
# Persist the fitted pipeline. A plain `save()` raises if the target path
# already exists, which breaks re-running the notebook; going through the
# writer with `overwrite()` makes this cell safe to run repeatedly.
pipelineModel.write().overwrite().save('tmp/final')

In [None]:
# Load the fitted pipeline back from the directory it was saved to and list
# its stages to confirm the round trip.
# NOTE(review): this requires the custom tykuo_spark_model stages to support
# ML reading as well as writing — confirm against that package.
loadedPipelineModel = PipelineModel.load('tmp/final')
loadedPipelineModel.stages