# Setup

In [1]:
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

In [2]:
from pyspark.ml.pipeline import Estimator, Model, Pipeline, PipelineModel
from pyspark.ml.param.shared import *
import pyspark.sql.functions as F

In [6]:
# Display the session handles used below. No cell shown here creates them,
# so they are presumably injected by the pyspark shell/kernel -- TODO confirm.
spark, sc, sql

(<pyspark.sql.session.SparkSession at 0x10e638090>,
 <pyspark.context.SparkContext at 0x10e546e50>,
 <bound method SparkSession.sql of <pyspark.sql.session.SparkSession object at 0x10e638090>>)

# Sample Dataset

In [8]:
# Five-row sample frame; x1, x2 and score each contain exactly one missing value.
df = (
    sc.parallelize([
        (1, 'a', 'A', None),
        (2, 'a', 'B', 30.3),
        (3, 'b', 'B', 27.8),
        (4, 'c', None, 31.2),
        (5, None, 'B', 32.5),
    ])
    .toDF(['id', 'x1', 'x2', 'score'])
)

In [9]:
# Print the sample frame so the missing entries are visible before imputing.
df.show()

+---+----+----+-----+
| id|  x1|  x2|score|
+---+----+----+-----+
|  1|   a|   A| null|
|  2|   a|   B| 30.3|
|  3|   b|   B| 27.8|
|  4|   c|null| 31.2|
|  5|null|   B| 32.5|
+---+----+----+-----+



# Default Params IO

In [56]:
import os

# Tykuo Imputer

In [69]:
class HasImputeValue(Params):
    """Params mixin that carries ``impute_value``, the replacement for missing entries.

    No default is registered for the param and no type converter is attached,
    so the value is stored exactly as given to ``setImputeValue``.
    """

    impute_value = Param(
        parent=Params._dummy(),
        name='impute_value',
        doc='value imputed to missing column',
    )

    def __init__(self):
        super(HasImputeValue, self).__init__()

    def setImputeValue(self, value):
        """Store ``value`` as the impute value and return ``self`` for chaining."""
        return self._set(impute_value=value)

    def getImputeValue(self):
        """Return the stored impute value (or its default, if one was set)."""
        param = self.impute_value
        return self.getOrDefault(param)

In [74]:
class TykuoImputerModel(Model, HasInputCol, HasOutputCol, HasImputeValue):
    """Model that replaces nulls in the input column with a fixed impute value."""

    def _transform(self, dataset):
        """Return ``dataset`` with the output column added.

        The output column holds the impute value wherever the input column is
        null, and the input column's own value everywhere else.
        """
        # Read the params in a fixed order: input, output, impute value.
        in_name = self.getInputCol()
        out_name = self.getOutputCol()
        fill = self.getImputeValue()

        source = F.col(in_name)
        filled = F.when(source.isNull(), fill).otherwise(source)
        return dataset.withColumn(out_name, filled)

In [131]:
class ImputeValueGenerator(object):
    """Base interface for objects that compute an impute value for a column."""

    def generate(self, dataset, inputCol):
        """Compute the impute value for ``inputCol`` of ``dataset``.

        Subclasses must override this; the base implementation always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError()

In [132]:
class ModeGenerator(ImputeValueGenerator):
    """Impute-value generator that returns the most frequent non-null value."""

    def generate(self, dataset, inputCol):
        """Return the mode of ``inputCol`` in ``dataset``.

        Parameters
        ----------
        dataset : DataFrame
            Frame containing ``inputCol``.
        inputCol : str or None
            Name of the column to inspect.

        Returns
        -------
        The most frequent non-null value, or ``None`` when ``inputCol`` is
        ``None`` or the column has no non-null values. Ties on frequency are
        broken by taking the smallest value so repeated fits agree.
        """
        if inputCol is None:
            return None

        value_col = F.col(inputCol)
        # Alias chosen so it cannot clash with an input column named 'count'.
        freq_alias = '__mode_count__'

        # Filter with a Column expression rather than formatting the name into
        # a SQL string, which fails to parse for names with spaces or keywords.
        top = dataset.where(value_col.isNotNull()) \
            .groupBy(value_col) \
            .agg(F.count('*').alias(freq_alias)) \
            .orderBy(F.desc(freq_alias), value_col.asc()) \
            .take(1)

        if not top:
            return None

        # The grouping column is the first field of each result row.
        return top[0][0]

In [133]:
# Most frequent non-null value of x1 in the sample frame.
ModeGenerator().generate(df, 'x1')

u'a'

In [134]:
class MedianGenerator(ImputeValueGenerator):
    """Impute-value generator that returns an approximate median.

    Parameters
    ----------
    relativeError : float, optional
        Relative error passed to ``DataFrame.approxQuantile``. Defaults to
        0.25, the value this class previously hard-coded; pass a smaller
        value for a tighter estimate.
    """

    def __init__(self, relativeError=0.25):
        self.relativeError = relativeError

    def generate(self, dataset, inputCol):
        """Return the approximate 0.5 quantile of ``inputCol`` in ``dataset``.

        Returns ``None`` when ``inputCol`` is ``None`` or when
        ``approxQuantile`` yields an empty list (no value to take a quantile
        of), instead of failing with ``IndexError``.
        """
        if inputCol is None:
            return None

        # str() is kept from the original call; presumably it guards against
        # unicode column names on Python 2 -- TODO confirm.
        quantiles = dataset.approxQuantile(
            str(inputCol), [0.5], self.relativeError)

        if isinstance(quantiles, list):
            return quantiles[0] if quantiles else None

        # Non-list results are passed through unchanged, as before.
        return quantiles

In [135]:
# Approximate median of score; the generator asks approxQuantile for the 0.5
# quantile with relative error 0.25, so this is an estimate, not an exact median.
MedianGenerator().generate(df, 'score')

30.3

In [136]:
class MeanGenerator(ImputeValueGenerator):
    """Impute-value generator that returns the column mean."""

    def generate(self, dataset, inputCol):
        """Return ``F.mean`` of ``inputCol`` over ``dataset``.

        Returns ``None`` when ``inputCol`` is ``None`` or when the aggregation
        produces no row. The original fell through with the empty ``take(1)``
        list in the latter case, which would then have been stored as the
        impute value.
        """
        if inputCol is None:
            return None

        mean_alias = 'mean'
        row = dataset.agg(F.mean(inputCol).alias(mean_alias)).first()

        if row is None:
            return None

        return row[mean_alias]

In [137]:
# Mean of score over the sample frame (the null in row 1 does not contribute).
MeanGenerator().generate(df, 'score')

30.45

In [147]:
class HasImputeStrategy(Params):
    """Params mixin that carries ``strategy``, the name of the imputation method.

    The value is converted with ``TypeConverters.toString`` and defaults to
    ``'median'``.
    """

    strategy = Param(
        parent=Params._dummy(),
        name='strategy',
        doc='strategy to impute value',
        typeConverter=TypeConverters.toString,
    )

    def __init__(self):
        super(HasImputeStrategy, self).__init__()
        self._setDefault(strategy='median')

    def setStrategy(self, value):
        """Store ``value`` as the strategy and return ``self`` for chaining."""
        return self._set(strategy=value)

    def getStrategy(self):
        """Return the configured strategy, falling back to the default."""
        param = self.strategy
        return self.getOrDefault(param)

In [157]:
class TykuoImputer(Estimator, HasInputCol, HasOutputCol, HasImputeStrategy):
    """Estimator that learns one impute value for a column.

    Fitting computes the value with the generator selected by ``strategy``
    ('median', 'mean' or 'mode') and returns a ``TykuoImputerModel`` configured
    with the same input/output columns and that value.
    """

    def get_value_generator(self, strategy='median'):
        """Return a new generator instance for ``strategy``.

        Raises
        ------
        ValueError
            If ``strategy`` is not 'median', 'mean' or 'mode'. ``ValueError``
            is a subclass of ``Exception``, so existing ``except Exception``
            handlers still catch it.
        """
        generators = {
            'median': MedianGenerator,
            'mean': MeanGenerator,
            'mode': ModeGenerator,
        }
        try:
            generator_cls = generators[strategy]
        except (KeyError, TypeError):
            # TypeError covers unhashable values, which are just as invalid.
            raise ValueError('Strategy should only be median, mean or mode')
        return generator_cls()

    def _fit(self, dataset):
        """Compute the impute value on ``dataset`` and build the fitted model."""
        # Pick the generator matching the configured strategy.
        value_gen = self.get_value_generator(self.getStrategy())

        # Compute the value to be imputed from the input column.
        input_col = self.getInputCol()
        impute_value = value_gen.generate(dataset, input_col)

        # Configure the model with the same columns plus the learned value.
        impute_model = TykuoImputerModel() \
            .setInputCol(input_col) \
            .setOutputCol(self.getOutputCol()) \
            .setImputeValue(impute_value)

        return impute_model

# Try TykuoImputer

In [158]:
# Fit a median imputer on score; the fitted model is reused in the next cell.
impute_model = (
    TykuoImputer()
    .setInputCol('score')
    .setOutputCol('score_imp')
    .setStrategy('median')
    .fit(df)
)

In [159]:
# Apply the fitted median model; score_imp carries the filled-in values.
impute_model.transform(df).show()

+---+----+----+-----+---------+
| id|  x1|  x2|score|score_imp|
+---+----+----+-----+---------+
|  1|   a|   A| null|     30.3|
|  2|   a|   B| 30.3|     30.3|
|  3|   b|   B| 27.8|     27.8|
|  4|   c|null| 31.2|     31.2|
|  5|null|   B| 32.5|     32.5|
+---+----+----+-----+---------+



In [160]:
# Same column, mean strategy: fit and transform in one chain.
(
    TykuoImputer()
    .setInputCol('score')
    .setOutputCol('score_imp')
    .setStrategy('mean')
    .fit(df)
    .transform(df)
    .show()
)

+---+----+----+-----+---------+
| id|  x1|  x2|score|score_imp|
+---+----+----+-----+---------+
|  1|   a|   A| null|    30.45|
|  2|   a|   B| 30.3|     30.3|
|  3|   b|   B| 27.8|     27.8|
|  4|   c|null| 31.2|     31.2|
|  5|null|   B| 32.5|     32.5|
+---+----+----+-----+---------+



In [161]:
# Mode strategy on the string column x1: fit and transform in one chain.
(
    TykuoImputer()
    .setInputCol('x1')
    .setOutputCol('x1_imp')
    .setStrategy('mode')
    .fit(df)
    .transform(df)
    .show()
)

+---+----+----+-----+------+
| id|  x1|  x2|score|x1_imp|
+---+----+----+-----+------+
|  1|   a|   A| null|     a|
|  2|   a|   B| 30.3|     a|
|  3|   b|   B| 27.8|     b|
|  4|   c|null| 31.2|     c|
|  5|null|   B| 32.5|     a|
+---+----+----+-----+------+

