# Setup

In [1]:
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

In [7]:
import numpy as np

In [9]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.pipeline import Pipeline

In [61]:
from pyspark.ml.pipeline import Transformer, Pipeline, PipelineModel
from pyspark.ml.param.shared import *
import pyspark.sql.functions as F

In [33]:
from pyspark.sql.types import DoubleType, StringType

In [3]:
# Display the Spark entry points used by the rest of the notebook.
# NOTE(review): none of these names is defined in this notebook; presumably
# they are injected by the PySpark shell/kernel -- confirm before running elsewhere.
spark, sc, sql

(<pyspark.sql.session.SparkSession at 0x1088d9090>,
 <pyspark.context.SparkContext at 0x1087e7e50>,
 <bound method SparkSession.sql of <pyspark.sql.session.SparkSession object at 0x1088d9090>>)

# Sample Dataset

In [11]:
# Build a toy table with DATA_NUM rows: an integer id plus normally distributed
# weight (mean 50, sd 15) and height (mean 160, sd 30) as plain Python floats.
# NOTE: no random seed is set, so the generated values differ on every run.
DATA_NUM = 100
ids = range(DATA_NUM)
weights = [float(w) for w in np.random.normal(loc=50, scale=15, size=DATA_NUM)]
heights = [float(h) for h in np.random.normal(loc=160, scale=30, size=DATA_NUM)]
sample_data = zip(ids, weights, heights)

# Distribute the (id, weight, height) tuples and name the resulting columns.
df = sc.parallelize(sample_data).toDF(['id', 'weight', 'height'])
df.show()

+---+------------------+------------------+
| id|            weight|            height|
+---+------------------+------------------+
|  0| 43.47904739506433|178.67303210378856|
|  1| 32.89517343790158|126.11088876632151|
|  2| 53.71225421154686|170.33061904219957|
|  3|30.625379263194358|179.66304415390425|
|  4|26.761923249706616|137.75022300836957|
|  5| 45.18700418555743|198.95573632859129|
|  6| 66.10391443448712| 158.7765870351398|
|  7| 54.10505203029727|130.99487942095664|
|  8|26.940086496609005|210.21310222126573|
|  9| 61.18553946952882|187.32136784669873|
| 10|58.166526414811464| 212.1387568033278|
| 11|52.923679462270606|144.23857732344854|
| 12| 58.42308286576034|119.20756980020424|
| 13| 39.46929511972158| 191.7576007820383|
| 14| 59.96293834692389|163.02444813167617|
| 15| 49.45566242687979|149.58587671946088|
| 16| 49.04942386738486|199.45066227167976|
| 17| 55.72142811047398|134.41477565590012|
| 18| 8.850630603941937|172.17499159292177|
| 19|45.897648138143026| 197.288

In [12]:
# Names of the numeric columns that VectorAssembler packs into the feature
# vector; reused later as explicit output names for VectorDisassembler.
features = ['weight', 'height']

# Vector Assembler

In [13]:
# Stage 1: pack the raw numeric columns into a single vector column.
assembler = VectorAssembler(inputCols=features, outputCol='features')
# Stage 2: scale the assembled vector (StandardScaler left at its defaults).
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')

pipeline = Pipeline(stages=[assembler, scaler])

In [15]:
# Fit every pipeline stage on the sample DataFrame.
pipelineModel = pipeline.fit(df)

In [17]:
# Apply the fitted pipeline; this adds the 'features' and 'scaledFeatures'
# vector columns next to the original id/weight/height columns.
dataset = pipelineModel.transform(df)
# Show the first 5 rows without truncating the vector columns.
dataset.select('id', 'features', 'scaledFeatures').show(5, truncate=False)

+---+---------------------------------------+--------------------------------------+
|id |features                               |scaledFeatures                        |
+---+---------------------------------------+--------------------------------------+
|0  |[43.47904739506433,178.67303210378856] |[2.539503998971415,5.967459603765877] |
|1  |[32.89517343790158,126.11088876632151] |[1.9213260063718043,4.211948638510167]|
|2  |[53.71225421154686,170.33061904219957] |[3.137200388145502,5.688833264039114] |
|3  |[30.625379263194358,179.66304415390425]|[1.7887529228095973,6.000524789075292]|
|4  |[26.761923249706616,137.75022300836957]|[1.5630979790166926,4.600688092284052]|
+---+---------------------------------------+--------------------------------------+
only showing top 5 rows



In [44]:
# Pull a single Row to the driver so the vector column can be inspected locally.
# take(1)[0] raises IndexError if the dataset is empty.
row_data = dataset.take(1)[0]
row_data

Row(id=0, weight=43.47904739506433, height=178.67303210378856, features=DenseVector([43.479, 178.673]), scaledFeatures=DenseVector([2.5395, 5.9675]))

In [46]:
# Number of components in the assembled vector of the sampled row.
row_data['features'].size

2

In [48]:
# UDF that returns component `idx` of a vector column as a double.
get_element = F.udf(lambda vec, idx: float(vec[idx]), returnType=DoubleType())

# Try it out: extract the second component (index 1) of 'features' as column 'f1'.
second_component = get_element('features', F.lit(1)).alias('f1')
dataset.select(second_component).show()

+------------------+
|                f1|
+------------------+
|178.67303210378856|
|126.11088876632151|
|170.33061904219957|
|179.66304415390425|
|137.75022300836957|
|198.95573632859129|
| 158.7765870351398|
|130.99487942095664|
|210.21310222126573|
|187.32136784669873|
| 212.1387568033278|
|144.23857732344854|
|119.20756980020424|
| 191.7576007820383|
|163.02444813167617|
|149.58587671946088|
|199.45066227167976|
|134.41477565590012|
|172.17499159292177|
| 197.2888613061005|
+------------------+
only showing top 20 rows



In [55]:
# Manually "disassemble" the vector: add one scalar column (f0, f1, ...) per
# component, using the size of the sampled row's vector as the component count.
field = 'features'
vec_disamb_data = dataset.select('id', field)
for i in range(row_data[field].size):
    column_name = 'f{}'.format(i)
    component = get_element(field, F.lit(i))
    vec_disamb_data = vec_disamb_data.withColumn(column_name, component)

In [57]:
# Display the vector column next to its extracted components, untruncated.
vec_disamb_data.show(truncate=False)

+---+---------------------------------------+------------------+------------------+
|id |features                               |f0                |f1                |
+---+---------------------------------------+------------------+------------------+
|0  |[43.47904739506433,178.67303210378856] |43.47904739506433 |178.67303210378856|
|1  |[32.89517343790158,126.11088876632151] |32.89517343790158 |126.11088876632151|
|2  |[53.71225421154686,170.33061904219957] |53.71225421154686 |170.33061904219957|
|3  |[30.625379263194358,179.66304415390425]|30.625379263194358|179.66304415390425|
|4  |[26.761923249706616,137.75022300836957]|26.761923249706616|137.75022300836957|
|5  |[45.18700418555743,198.95573632859129] |45.18700418555743 |198.95573632859129|
|6  |[66.10391443448712,158.7765870351398]  |66.10391443448712 |158.7765870351398 |
|7  |[54.10505203029727,130.99487942095664] |54.10505203029727 |130.99487942095664|
|8  |[26.940086496609005,210.21310222126573]|26.940086496609005|210.21310222

# Vector Disassembler

In [75]:
class HasOutputCols(Params):
    """Mixin adding an ``outputCols`` param: a list of output column names.

    Follows the structure of the shared param mixins in
    ``pyspark.ml.param.shared`` (e.g. ``HasInputCol``).
    """

    # toListString (rather than toList) rejects lists containing non-string
    # items at set time, instead of failing later when the names are used.
    outputCols = Param(Params._dummy(), "outputCols",
                       "output columns",
                       typeConverter=TypeConverters.toListString)

    def __init__(self):
        super(HasOutputCols, self).__init__()

    def setOutputCols(self, value):
        """Set ``outputCols`` to ``value`` (a list of column names); returns ``self``."""
        return self._set(outputCols=value)

    def getOutputCols(self):
        """Return the value of ``outputCols``, or its default if one exists.

        Raises ``KeyError`` (from ``getOrDefault``) when the param is neither
        set nor has a default.
        """
        return self.getOrDefault(self.outputCols)

In [76]:
class VectorDisassembler(Transformer, HasInputCol, HasOutputCols):
    """Transformer that splits a vector column into one double column per component.

    Component ``i`` of the vector in ``inputCol`` is written to the ``i``-th
    name in ``outputCols``. When ``outputCols`` is not defined, the names
    ``<inputCol>_<i>`` are generated, one per component of the vector found in
    the first row of the dataset.
    """

    def _transform(self, dataset):
        """Return ``dataset`` with one extra double column per vector component.

        :param dataset: DataFrame containing the ``inputCol`` vector column.
        :return: ``dataset`` itself when it has no rows; otherwise a new
                 DataFrame with the component columns appended. A null vector
                 yields null in every component column.
        """
        x = self.getInputCol()

        # Peek at one row (this runs a Spark job); an empty dataset is
        # returned untouched because there is nothing to disassemble.
        rows = dataset.select(x).take(1)
        if len(rows) == 0:
            return dataset

        # Ask the param machinery whether outputCols is available instead of
        # relying on a bare ``except`` around the getter, which would also
        # hide unrelated failures.
        if self.isDefined(self.outputCols):
            ys = self.getOutputCols()
        else:
            vec_size = rows[0][x].size
            ys = ['{}_{}'.format(x, i) for i in range(vec_size)]

        def _component(vec, idx):
            # Propagate nulls instead of failing on ``None[idx]``.
            return None if vec is None else float(vec[idx])

        get_element = F.udf(_component, DoubleType())

        vec_disamb_data = dataset
        for i, y in enumerate(ys):
            vec_disamb_data = vec_disamb_data \
                .withColumn(y, get_element(x, F.lit(i)))

        return vec_disamb_data

# Try VectorDisassembler

In [80]:
# Keep only 'id' and the assembled 'features' vector, so the disassembler can
# re-create columns named 'weight'/'height' without clashing with the originals.
test_data = dataset.drop(*features).drop('scaledFeatures')
test_data.show(truncate=False)

+---+---------------------------------------+
|id |features                               |
+---+---------------------------------------+
|0  |[43.47904739506433,178.67303210378856] |
|1  |[32.89517343790158,126.11088876632151] |
|2  |[53.71225421154686,170.33061904219957] |
|3  |[30.625379263194358,179.66304415390425]|
|4  |[26.761923249706616,137.75022300836957]|
|5  |[45.18700418555743,198.95573632859129] |
|6  |[66.10391443448712,158.7765870351398]  |
|7  |[54.10505203029727,130.99487942095664] |
|8  |[26.940086496609005,210.21310222126573]|
|9  |[61.18553946952882,187.32136784669873] |
|10 |[58.166526414811464,212.1387568033278] |
|11 |[52.923679462270606,144.23857732344854]|
|12 |[58.42308286576034,119.20756980020424] |
|13 |[39.46929511972158,191.7576007820383]  |
|14 |[59.96293834692389,163.02444813167617] |
|15 |[49.45566242687979,149.58587671946088] |
|16 |[49.04942386738486,199.45066227167976] |
|17 |[55.72142811047398,134.41477565590012] |
|18 |[8.850630603941937,172.174991

In [83]:
# No outputCols given: the generated names features_0, features_1, ... are used.
(
    VectorDisassembler()
    .setInputCol('features')
    .transform(test_data)
    .show(truncate=False)
)

+---+---------------------------------------+------------------+------------------+
|id |features                               |features_0        |features_1        |
+---+---------------------------------------+------------------+------------------+
|0  |[43.47904739506433,178.67303210378856] |43.47904739506433 |178.67303210378856|
|1  |[32.89517343790158,126.11088876632151] |32.89517343790158 |126.11088876632151|
|2  |[53.71225421154686,170.33061904219957] |53.71225421154686 |170.33061904219957|
|3  |[30.625379263194358,179.66304415390425]|30.625379263194358|179.66304415390425|
|4  |[26.761923249706616,137.75022300836957]|26.761923249706616|137.75022300836957|
|5  |[45.18700418555743,198.95573632859129] |45.18700418555743 |198.95573632859129|
|6  |[66.10391443448712,158.7765870351398]  |66.10391443448712 |158.7765870351398 |
|7  |[54.10505203029727,130.99487942095664] |54.10505203029727 |130.99487942095664|
|8  |[26.940086496609005,210.21310222126573]|26.940086496609005|210.21310222

In [84]:
# Explicit outputCols: the components are written back as 'weight' and 'height'.
(
    VectorDisassembler()
    .setInputCol('features')
    .setOutputCols(features)
    .transform(test_data)
    .show(truncate=False)
)

+---+---------------------------------------+------------------+------------------+
|id |features                               |weight            |height            |
+---+---------------------------------------+------------------+------------------+
|0  |[43.47904739506433,178.67303210378856] |43.47904739506433 |178.67303210378856|
|1  |[32.89517343790158,126.11088876632151] |32.89517343790158 |126.11088876632151|
|2  |[53.71225421154686,170.33061904219957] |53.71225421154686 |170.33061904219957|
|3  |[30.625379263194358,179.66304415390425]|30.625379263194358|179.66304415390425|
|4  |[26.761923249706616,137.75022300836957]|26.761923249706616|137.75022300836957|
|5  |[45.18700418555743,198.95573632859129] |45.18700418555743 |198.95573632859129|
|6  |[66.10391443448712,158.7765870351398]  |66.10391443448712 |158.7765870351398 |
|7  |[54.10505203029727,130.99487942095664] |54.10505203029727 |130.99487942095664|
|8  |[26.940086496609005,210.21310222126573]|26.940086496609005|210.21310222