# Setup

In [1]:
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

In [2]:
from pyspark.ml.pipeline import Pipeline

In [3]:
# Echo the Spark handles so the cell output confirms they exist.
# NOTE(review): spark, sc and sql are never assigned in this notebook --
# presumably they are injected by the PySpark shell/kernel startup; confirm,
# otherwise a plain Python kernel will raise NameError here.
spark, sc, sql

(<pyspark.sql.session.SparkSession at 0x1049f7090>,
 <pyspark.context.SparkContext at 0x104905e50>,
 <bound method SparkSession.sql of <pyspark.sql.session.SparkSession object at 0x1049f7090>>)

In [4]:
from tykuo_spark_model import TykuoImputer

# Sample Dataset

In [5]:
# Toy data: five records keyed by id, with exactly one missing value in
# each of x1 (string), x2 (integer) and x3 (float).
sample_rows = [
    (1, 'a', 300, None),
    (2, 'a', 400, 30.3),
    (3, 'b', None, 27.8),
    (4, 'c', 600, 31.2),
    (5, None, 700, 32.5),
]
column_names = ["id", "x1", 'x2', 'x3']

# Distribute the rows as an RDD, then name the columns to get a DataFrame.
df = sc.parallelize(sample_rows).toDF(column_names)

In [6]:
# Print the sample rows; x1, x2 and x3 each contain one null to be imputed.
df.show()

+---+----+----+----+
| id|  x1|  x2|  x3|
+---+----+----+----+
|  1|   a| 300|null|
|  2|   a| 400|30.3|
|  3|   b|null|27.8|
|  4|   c| 600|31.2|
|  5|null| 700|32.5|
+---+----+----+----+



# Impute Pipeline

In [7]:
# One (input column, strategy) pair per imputer stage, in pipeline order.
# Each stage writes to a new column named after its input plus "_imp".
impute_plan = [
    ('x1', 'mode'),
    ('x2', 'median'),
    ('x3', 'mean'),
]

imputer_stages = [
    TykuoImputer()
    .setInputCol(source_col)
    .setOutputCol(source_col + '_imp')
    .setStrategy(strategy)
    for source_col, strategy in impute_plan
]

pipeline = Pipeline(stages=imputer_stages)

In [8]:
# Fit the three imputer stages on the sample data; `model` is the fitted
# pipeline that the next cell applies with transform().
model = pipeline.fit(df)

In [9]:
# Apply the fitted pipeline and print the result; x1_imp, x2_imp and x3_imp
# are the filled-in counterparts of x1, x2 and x3.
# NOTE(review): the displayed x2 fill is 400.0, while the exact median of
# 300/400/600/700 is 500 -- presumably the 'median' strategy uses an
# approximate or lower quantile; confirm in TykuoImputer.
imputed_df = model.transform(df)
imputed_df.show()

+---+----+----+----+------+------+------+
| id|  x1|  x2|  x3|x1_imp|x2_imp|x3_imp|
+---+----+----+----+------+------+------+
|  1|   a| 300|null|     a| 300.0| 30.45|
|  2|   a| 400|30.3|     a| 400.0|  30.3|
|  3|   b|null|27.8|     b| 400.0|  27.8|
|  4|   c| 600|31.2|     c| 600.0|  31.2|
|  5|null| 700|32.5|     a| 700.0|  32.5|
+---+----+----+----+------+------+------+

