# Setup 

In [1]:
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

In [20]:
import numpy as np
import pandas as pd
import random
import functools

In [3]:
import pyspark.sql.functions as F
from pyspark.sql import Row

In [4]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.pipeline import Pipeline

In [5]:
DATA_NUM = 100
sample_data = zip(
    range(DATA_NUM), 
    map(lambda d: float(d), np.random.normal(loc=50, scale=15, size=DATA_NUM)),
    map(lambda d: float(d), np.random.normal(loc=160, scale=30, size=DATA_NUM)))

df = sc.parallelize(sample_data).toDF(['id', 'weight', 'height'])
df.show()

+---+------------------+------------------+
| id|            weight|            height|
+---+------------------+------------------+
|  0|24.862496971660626|169.77471325769076|
|  1| 55.80551688815535|109.85509994731584|
|  2| 79.89623842556591|223.65330592276075|
|  3| 46.51574257351342|153.25551918704497|
|  4| 67.33359143317264|217.56658688765393|
|  5|34.950155924038555| 147.7942978656798|
|  6| 26.75763950912918|131.72153831365569|
|  7| 37.88466518272371|121.21302088408152|
|  8| 67.14547809537832|157.23391964988303|
|  9| 61.84796317335014|170.36848445078232|
| 10|  64.7433707910027|  137.474025005728|
| 11| 68.24412909395267|143.45330845636215|
| 12| 56.36909080787042| 167.0939978322262|
| 13|42.624468669977276|253.95564119099015|
| 14| 62.36030264447476| 164.2979476390597|
| 15|45.150395656233194|134.46706820513793|
| 16|53.834811394131904|231.46209828078543|
| 17|27.306832754873753|153.03802937001228|
| 18| 70.25860844015696| 150.5187264620797|
| 19|11.120871201509068|138.2136

In [6]:
features = ['weight', 'height']

# Vector Assembler

In [7]:
pipeline = Pipeline(stages=[
    VectorAssembler(inputCols=features, outputCol='features'),
    StandardScaler(inputCol='features', outputCol='scaledFeatures')
])

In [8]:
pipelineModel = pipeline.fit(df)

In [9]:
dataset = pipelineModel.transform(df)
dataset.select('id', 'features', 'scaledFeatures').show(5, truncate=False)

+---+---------------------------------------+--------------------------------------+
|id |features                               |scaledFeatures                        |
+---+---------------------------------------+--------------------------------------+
|0  |[24.862496971660626,169.77471325769076]|[1.6075393273534673,5.005036860183438]|
|1  |[55.80551688815535,109.85509994731584] |[3.608228215502761,3.2385790201918914]|
|2  |[79.89623842556591,223.65330592276075] |[5.165866707720625,6.593402624961242] |
|3  |[46.51574257351342,153.25551918704497] |[3.007577461475638,4.518043399039362] |
|4  |[67.33359143317264,217.56658688765393] |[4.353601185116437,6.413963340136475] |
+---+---------------------------------------+--------------------------------------+
only showing top 5 rows



# Vector Dessembler

In [10]:
sample_row = dataset.take(1)[0]

In [11]:
sample_vec = sample_row['scaledFeatures']
sample_vec

DenseVector([1.6075, 5.005])

In [12]:
type(sample_vec[0])

numpy.float64

In [13]:
def disassemble_vector_from_row(row, vec_col, col_names):
    vector = row[vec_col]
    values = map(lambda v: float(v), vector)
    vec_value_dict = dict(zip(col_names, values))
    row_dict = row.asDict()
    row_dict.update(vec_value_dict)
    return Row(**row_dict)

In [14]:
new_cols = map(lambda f: 'scaled_{}'.format(f), features)
disassemble_vector_from_row(sample_row, 'scaledFeatures', new_cols)

Row(features=DenseVector([24.8625, 169.7747]), height=169.77471325769076, id=0, scaledFeatures=DenseVector([1.6075, 5.005]), scaled_height=5.005036860183438, scaled_weight=1.6075393273534673, weight=24.862496971660626)

In [15]:
vec_dis_df = dataset.rdd \
    .map(lambda r: disassemble_vector_from_row(r, 'scaledFeatures', new_cols)) \
    .toDF() \
    .select(dataset.columns + new_cols)

In [16]:
vec_dis_df.show(5, truncate=False)

+---+------------------+------------------+---------------------------------------+--------------------------------------+------------------+------------------+
|id |weight            |height            |features                               |scaledFeatures                        |scaled_weight     |scaled_height     |
+---+------------------+------------------+---------------------------------------+--------------------------------------+------------------+------------------+
|0  |24.862496971660626|169.77471325769076|[24.862496971660626,169.77471325769076]|[1.6075393273534673,5.005036860183438]|1.6075393273534673|5.005036860183438 |
|1  |55.80551688815535 |109.85509994731584|[55.80551688815535,109.85509994731584] |[3.608228215502761,3.2385790201918914]|3.608228215502761 |3.2385790201918914|
|2  |79.89623842556591 |223.65330592276075|[79.89623842556591,223.65330592276075] |[5.165866707720625,6.593402624961242] |5.165866707720625 |6.593402624961242 |
|3  |46.51574257351342 |153.255519

# Custom Estimator

In [22]:
from pyspark import keyword_only
from pyspark.ml.pipeline import Pipeline, PipelineModel, Transformer
from pyspark.ml.param.shared import *
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable

In [49]:
class VectorDisassembler(Transformer, HasInputCol, HasOutputCols, DefaultParamsReadable, DefaultParamsWritable):
    
    def get_ordered_columns(self, cols):
        ys = self.getOutputCols()
        return cols + [y for y in ys if y not in cols]
    
    @staticmethod
    def disassemble(vec_col, output_cols, row):
        vector = row[vec_col]
        values = map(lambda v: float(v), vector)
        vec_value_dict = dict(zip(output_cols, values))
        row_dict = row.asDict()
        row_dict.update(vec_value_dict)
        return Row(**row_dict)
    
    def _transform(self, dataset):
        x = self.getInputCol()
        ys = self.getOutputCols()
        disamb_func = functools.partial(self.disassemble, x, ys)
        final_cols = self.get_ordered_columns(dataset.columns)
        print(final_cols)
        return dataset.rdd \
            .map(disamb_func) \
            .toDF() \
            .select(final_cols)

In [39]:
new_cols

[u'scaled_weight', u'scaled_height']

In [41]:
VectorDisassembler() \
    .setInputCol('scaledFeatures') \
    .setOutputCols(new_cols) \
    .transform(dataset).show()

['id', 'weight', 'height', 'features', 'scaledFeatures', u'scaled_weight', u'scaled_height']
+---+------------------+------------------+--------------------+--------------------+------------------+------------------+
| id|            weight|            height|            features|      scaledFeatures|     scaled_weight|     scaled_height|
+---+------------------+------------------+--------------------+--------------------+------------------+------------------+
|  0|24.862496971660626|169.77471325769076|[24.8624969716606...|[1.60753932735346...|1.6075393273534673| 5.005036860183438|
|  1| 55.80551688815535|109.85509994731584|[55.8055168881553...|[3.60822821550276...| 3.608228215502761|3.2385790201918914|
|  2| 79.89623842556591|223.65330592276075|[79.8962384255659...|[5.16586670772062...| 5.165866707720625| 6.593402624961242|
|  3| 46.51574257351342|153.25551918704497|[46.5157425735134...|[3.00757746147563...| 3.007577461475638| 4.518043399039362|
|  4| 67.33359143317264|217.56658688765

# Put All Together

In [50]:
features, new_cols

([u'weight', u'height'], [u'scaled_weight', u'scaled_height'])

In [51]:
pipeline = Pipeline(stages=[
    VectorAssembler(inputCols=features, outputCol='features'),
    StandardScaler(inputCol='features', outputCol='scaledFeatures'),
    VectorDisassembler().setInputCol('scaledFeatures').setOutputCols(new_cols)
])

In [52]:
pipelineModel = pipeline.fit(df)

In [53]:
pipelineModel.transform(df).show(5)

['id', 'weight', 'height', 'features', 'scaledFeatures', u'scaled_weight', u'scaled_height']
+---+------------------+------------------+--------------------+--------------------+------------------+------------------+
| id|            weight|            height|            features|      scaledFeatures|     scaled_weight|     scaled_height|
+---+------------------+------------------+--------------------+--------------------+------------------+------------------+
|  0|24.862496971660626|169.77471325769076|[24.8624969716606...|[1.60753932735346...|1.6075393273534673| 5.005036860183438|
|  1| 55.80551688815535|109.85509994731584|[55.8055168881553...|[3.60822821550276...| 3.608228215502761|3.2385790201918914|
|  2| 79.89623842556591|223.65330592276075|[79.8962384255659...|[5.16586670772062...| 5.165866707720625| 6.593402624961242|
|  3| 46.51574257351342|153.25551918704497|[46.5157425735134...|[3.00757746147563...| 3.007577461475638| 4.518043399039362|
|  4| 67.33359143317264|217.56658688765

In [54]:
pipelineModel.save('tmp/vec_disassembe_pipeline')

In [55]:
loadedModel = PipelineModel.load('tmp/vec_disassembe_pipeline')

In [56]:
loadedModel.transform(df).show(5)

['id', 'weight', 'height', 'features', 'scaledFeatures', u'scaled_weight', u'scaled_height']
+---+------------------+------------------+--------------------+--------------------+------------------+------------------+
| id|            weight|            height|            features|      scaledFeatures|     scaled_weight|     scaled_height|
+---+------------------+------------------+--------------------+--------------------+------------------+------------------+
|  0|24.862496971660626|169.77471325769076|[24.8624969716606...|[1.60753932735346...|1.6075393273534673| 5.005036860183438|
|  1| 55.80551688815535|109.85509994731584|[55.8055168881553...|[3.60822821550276...| 3.608228215502761|3.2385790201918914|
|  2| 79.89623842556591|223.65330592276075|[79.8962384255659...|[5.16586670772062...| 5.165866707720625| 6.593402624961242|
|  3| 46.51574257351342|153.25551918704497|[46.5157425735134...|[3.00757746147563...| 3.007577461475638| 4.518043399039362|
|  4| 67.33359143317264|217.56658688765