# Setup

In [1]:
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

In [211]:
from pyspark.ml.pipeline import Estimator, Model, Pipeline, PipelineModel
from pyspark.ml.param.shared import *
import pyspark.sql.functions as F

In [178]:
from pyspark.ml.util import JavaMLReadable, JavaMLWritable, DefaultParamsReadable, DefaultParamsWritable

In [59]:
from pyspark.ml.feature import Imputer

In [3]:
# `spark` is never assigned in this notebook; it is assumed to be the
# SparkSession injected by the PySpark kernel/shell — TODO confirm for your setup.
spark

# Impute Categorical Features with Mode

In [179]:
class HasModeDict(Params):
    """Param mixin that gives a stage a ``mode_dict`` parameter.

    The param is documented as the "mode for every column". This mixin only
    stores and returns the value; it performs no validation of its contents.
    """

    # Declared against the dummy parent, like the shared Has* mixins it is
    # combined with elsewhere in this notebook.
    mode_dict = Param(Params._dummy(), "mode_dict", "mode for every column")

    def __init__(self):
        super(HasModeDict, self).__init__()

    def setModeDict(self, value):
        """Set ``mode_dict`` to ``value`` and return the result of ``_set``."""
        return self._set(mode_dict=value)

    def getModeDict(self):
        """Return the value of ``mode_dict`` via ``getOrDefault``."""
        param = self.mode_dict
        return self.getOrDefault(param)

In [196]:
class ImputeCategoricalWithModeModel(Model, HasInputCols, HasOutputCols, HasModeDict, DefaultParamsReadable, DefaultParamsWritable):
    """Fitted stage that replaces nulls in each input column with a stored value.

    Input and output columns are paired positionally. For each pair the output
    column holds the input value, or ``mode_dict[input]`` where the input is
    null. Persistence comes from the ``DefaultParamsReadable`` /
    ``DefaultParamsWritable`` mixins.
    """

    def _transform(self, dataset):
        """Return ``dataset`` with one imputed output column per input column.

        :param dataset: DataFrame containing every column named in ``inputCols``.
        :return: DataFrame with the columns named in ``outputCols`` added
            (or replaced, when an output name already exists).

        An input column with no entry in ``mode_dict`` is copied to its output
        column unchanged instead of raising ``KeyError``. The estimator records
        no mode for a column that has no non-null values, so the entry can
        legitimately be missing.
        """
        mode_dict = self.getModeDict()
        imputed_df = dataset
        for x, y in zip(self.getInputCols(), self.getOutputCols()):
            source = F.col(x)
            if x in mode_dict:
                filled = F.when(source.isNull(), F.lit(mode_dict[x])).otherwise(source)
            else:
                # No mode was learned for this column: pass values through as-is.
                filled = source
            imputed_df = imputed_df.withColumn(y, filled)
        return imputed_df

In [197]:
class ImputeCategoricalWithMode(Estimator, HasInputCols, HasOutputCols):
    """Estimator that learns the most frequent non-null value of each input column.

    ``fit`` produces an ``ImputeCategoricalWithModeModel`` carrying the learned
    values in its ``mode_dict`` param, keyed by input column name.
    """

    def prepare_io_params(self):
        """Return ``(input_cols, output_cols)`` with both lists the same length.

        When fewer output names than input names are configured (or none at
        all), the missing trailing outputs default to the corresponding input
        names, i.e. those columns are imputed in place. Surplus output names
        are dropped.

        :return: tuple ``(xs, ys)`` of input and output column-name lists.
        """
        xs = self.getInputCols()
        # Check the param explicitly rather than wrapping the getter in a bare
        # ``except``, which would also hide unrelated failures.
        ys = list(self.getOutputCols()) if self.isDefined(self.outputCols) else []
        n = len(xs) - len(ys)
        if n > 0:
            ys = ys + list(xs[-n:])
        elif n < 0:
            ys = ys[:n]
        return xs, ys

    def _fit(self, dataset):
        """Compute the per-column mode on ``dataset`` and build the model.

        :param dataset: DataFrame to learn the modes from.
        :return: ``ImputeCategoricalWithModeModel`` configured with the same
            input/output columns and the learned ``mode_dict``.

        Columns with no non-null values get no entry in ``mode_dict``.
        """
        xs, ys = self.prepare_io_params()
        # Alias chosen so it is unlikely to collide with an input column name.
        count_col = '__mode_count__'
        mode_dict = {}
        for c in xs:
            # Use the ``dataset`` argument, not a notebook-level global, so the
            # estimator fits on the data it is actually given.
            rows = (
                dataset
                .where(F.col(c).isNotNull())
                .groupBy(c)
                .agg(F.count('*').alias(count_col))
                # Secondary sort on the value makes ties deterministic.
                .orderBy(F.desc(count_col), F.col(c))
                .take(1)
            )
            if rows:
                mode_dict[c] = rows[0][c]
        impute_model = ImputeCategoricalWithModeModel() \
            .setInputCols(xs) \
            .setOutputCols(ys) \
            .setModeDict(mode_dict)

        return impute_model

In [198]:
# Toy frame: two string columns, each with one missing value.
# `sc` is not defined in this notebook; assumed to be the SparkContext
# provided by the PySpark kernel — TODO confirm.
df = (
    sc.parallelize([
        (1, 'a', 'A'),
        (2, 'a', 'B'),
        (3, 'b', 'B'),
        (4, 'c', None),
        (4, None, 'B'),
    ])
    .toDF(["id", "x1", 'x2'])
)

In [199]:
# Print the toy frame to see where the nulls are before imputing.
df.show()

+---+----+----+
| id|  x1|  x2|
+---+----+----+
|  1|   a|   A|
|  2|   a|   B|
|  3|   b|   B|
|  4|   c|null|
|  4|null|   B|
+---+----+----+



In [200]:
# Impute x1/x2 into new *_im columns, leaving the originals untouched.
impute_mode = (
    ImputeCategoricalWithMode()
    .setInputCols(['x1', 'x2'])
    .setOutputCols(['x1_im', 'x2_im'])
)

In [201]:
# Learn the per-column modes; the result is an ImputeCategoricalWithModeModel.
impute_model = impute_mode.fit(df)

In [202]:
# Apply the fitted model: the *_im columns carry the original values with
# nulls replaced by each column's learned mode.
impute_model.transform(df).show()

+---+----+----+-----+-----+
| id|  x1|  x2|x1_im|x2_im|
+---+----+----+-----+-----+
|  1|   a|   A|    a|    A|
|  2|   a|   B|    a|    B|
|  3|   b|   B|    b|    B|
|  4|   c|null|    c|    B|
|  4|null|   B|    a|    B|
+---+----+----+-----+-----+



# Impute Pipeline

In [204]:
# Second toy frame: adds a numeric `score` column with one missing value.
# NOTE: this rebinds `df`, replacing the frame built earlier in the notebook.
# `sc` is assumed to be the kernel-provided SparkContext — TODO confirm.
df = (
    sc.parallelize([
        (1, 'a', 'A', None),
        (2, 'a', 'B', 30.3),
        (3, 'b', 'B', 27.8),
        (4, 'c', None, 31.2),
        (5, None, 'B', 32.5),
    ])
    .toDF(["id", "x1", 'x2', 'score'])
)

In [205]:
# Print the frame to see the nulls in x1, x2 and score before imputing.
df.show()

+---+----+----+-----+
| id|  x1|  x2|score|
+---+----+----+-----+
|  1|   a|   A| null|
|  2|   a|   B| 30.3|
|  3|   b|   B| 27.8|
|  4|   c|null| 31.2|
|  5|null|   B| 32.5|
+---+----+----+-----+



In [206]:
# Categorical stage of the pipeline: mode-impute x1/x2 into *_im columns.
impute_mode = (
    ImputeCategoricalWithMode()
    .setInputCols(['x1', 'x2'])
    .setOutputCols(['x1_im', 'x2_im'])
)

In [207]:
# Numeric stage of the pipeline: Spark's built-in Imputer using the median.
impute_median = Imputer(
    strategy='median',
    inputCols=["score"],
    outputCols=["score_im"],
)

In [208]:
# Chain the custom mode imputer with the built-in median Imputer and fit both on df.
model = Pipeline(stages=[impute_mode, impute_median]).fit(df)

In [209]:
# Run the fitted pipeline: adds x1_im, x2_im and score_im next to the originals.
model.transform(df).show()

+---+----+----+-----+-----+-----+--------+
| id|  x1|  x2|score|x1_im|x2_im|score_im|
+---+----+----+-----+-----+-----+--------+
|  1|   a|   A| null|    a|    A|    30.3|
|  2|   a|   B| 30.3|    a|    B|    30.3|
|  3|   b|   B| 27.8|    b|    B|    27.8|
|  4|   c|null| 31.2|    c|    B|    31.2|
|  5|null|   B| 32.5|    a|    B|    32.5|
+---+----+----+-----+-----+-----+--------+



# Custom Pipeline Model Save/Load

In [210]:
# Persist the fitted pipeline. A plain `save` fails when the target path
# already exists, so overwrite it to keep the notebook re-runnable.
model.write().overwrite().save("tmp/impute-pipeline")

In [213]:
# Reload the pipeline from disk, including the custom Python stage.
# Presumably the custom model class must be defined in the session before
# loading — verify when loading from a fresh kernel.
loaded_model = PipelineModel.load('tmp/impute-pipeline')

In [214]:
# List the restored stages to check that the custom model stage was reloaded.
loaded_model.stages

[ImputeCategoricalWithModeModel_4fe986717ecaa3cd9327,
 Imputer_4bd58cecabb0b6930ac8]

In [215]:
# The reloaded pipeline should give the same output as the in-memory `model`.
loaded_model.transform(df).show()

+---+----+----+-----+-----+-----+--------+
| id|  x1|  x2|score|x1_im|x2_im|score_im|
+---+----+----+-----+-----+-----+--------+
|  1|   a|   A| null|    a|    A|    30.3|
|  2|   a|   B| 30.3|    a|    B|    30.3|
|  3|   b|   B| 27.8|    b|    B|    27.8|
|  4|   c|null| 31.2|    c|    B|    31.2|
|  5|null|   B| 32.5|    a|    B|    32.5|
+---+----+----+-----+-----+-----+--------+

