# Setup

In [1]:
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

In [2]:
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler

In [3]:
from tykuo_spark_model.imputer import TykuoImputer
from tykuo_spark_model.onehot import StringDisassembler
from tykuo_spark_model.vec_disamb import VectorDisassembler

In [4]:
# Sanity check: display the Spark entry points used by the rest of the notebook.
# NOTE(review): none of these names is created in a cell above; presumably they
# are injected by the PySpark shell/kernel startup -- confirm before running
# this notebook in a plain Python kernel.
spark, sc, sql

(<pyspark.sql.session.SparkSession at 0x11184d090>,
 <pyspark.context.SparkContext at 0x11175be50>,
 <bound method SparkSession.sql of <pyspark.sql.session.SparkSession object at 0x11184d090>>)

# Sample Dataset

In [5]:
# Toy frame with one string column (x1) and two numeric columns (x2, x3).
# Rows 1, 3 and 5 each carry exactly one missing value (x3, x2 and x1
# respectively) so that every imputation strategy below has work to do.
df = (
    sc.parallelize([
        (1, 'a', 300, None),
        (2, 'a', 400, 30.3),
        (3, 'b', None, 27.8),
        (4, 'c', 600, 31.2),
        (5, None, 700, 32.5),
    ])
    .toDF(['id', 'x1', 'x2', 'x3'])
)

In [6]:
# Print the raw sample rows so the nulls in x1, x2 and x3 are visible
# before any imputation is applied.
df.show()

+---+----+----+----+
| id|  x1|  x2|  x3|
+---+----+----+----+
|  1|   a| 300|null|
|  2|   a| 400|30.3|
|  3|   b|null|27.8|
|  4|   c| 600|31.2|
|  5|null| 700|32.5|
+---+----+----+----+



# Full Pipeline

In [7]:
# End-to-end preprocessing pipeline; stages run in the order listed, and each
# later stage consumes a column produced by an earlier one.
pipeline = Pipeline(stages=[
    # 1-3. Fill the missing values, one imputer per column:
    #      categorical x1 by mode, numeric x2 by median, numeric x3 by mean.
    TykuoImputer()
        .setInputCol('x1')
        .setOutputCol('x1_imp')
        .setStrategy('mode'),
    TykuoImputer()
        .setInputCol('x2')
        .setOutputCol('x2_imp')
        .setStrategy('median'),
    TykuoImputer()
        .setInputCol('x3')
        .setOutputCol('x3_imp')
        .setStrategy('mean'),

    # 4. Expand the imputed string column into 0/1 indicator columns
    #    (they appear as is_x1_imp_<value> in the transformed output).
    StringDisassembler()
        .setInputCol('x1_imp'),

    # 5. Pack the two imputed numeric columns into a single vector column,
    #    which is the input format StandardScaler requires.
    VectorAssembler(
        inputCols=['x2_imp', 'x3_imp'],
        outputCol='features',
    ),

    # 6. Scale the vector with StandardScaler's default settings. The output
    #    shown below is divided by the standard deviation but not mean-centred.
    StandardScaler(
        inputCol='features',
        outputCol='scaledFeatures',
    ),

    # 7. Unpack the scaled vector back into one plain column per feature,
    #    in the same order the features were assembled.
    VectorDisassembler()
        .setInputCol('scaledFeatures')
        .setOutputCols(['x2_scaled', 'x3_scaled']),
])

In [8]:
# Fit all pipeline stages on the sample frame; the fitted result is kept in
# `model` and reused by the transform cell below.
model = pipeline.fit(df)

In [9]:
# Apply the fitted pipeline to the same sample rows and print the result:
# the original columns plus every intermediate and final column added by
# the stages.
df_transformed = model.transform(df)
df_transformed.show()

+---+----+----+----+------+------+------+-----------+-----------+-----------+-------------+--------------------+------------------+------------------+
| id|  x1|  x2|  x3|x1_imp|x2_imp|x3_imp|is_x1_imp_a|is_x1_imp_c|is_x1_imp_b|     features|      scaledFeatures|         x2_scaled|         x3_scaled|
+---+----+----+----+------+------+------+-----------+-----------+-----------+-------------+--------------------+------------------+------------------+
|  1|   a| 300|null|     a| 300.0| 30.45|        1.0|        0.0|        0.0|[300.0,30.45]|[1.82574185835055...|1.8257418583505538|17.721168042270467|
|  2|   a| 400|30.3|     a| 400.0|  30.3|        1.0|        0.0|        0.0| [400.0,30.3]|[2.43432247780073...|2.4343224778007384|17.633871647973567|
|  3|   b|null|27.8|     b| 400.0|  27.8|        0.0|        0.0|        1.0| [400.0,27.8]|[2.43432247780073...|2.4343224778007384|16.178931743025252|
|  4|   c| 600|31.2|     c| 600.0|  31.2|        0.0|        1.0|        0.0| [600.0,31.2]|[3.