In [21]:
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division

In [22]:
import numpy as np
import pandas as pd
import datetime as dt

In [71]:
import h5py

In [23]:
from pyspark.sql import Row
from pyspark.sql.functions import col

In [24]:
spark

# Load Train DF

In [25]:
train_df = spark.read.parquet('tmp/train')

In [26]:
train_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- 0: double (nullable = true)
 |-- 1: double (nullable = true)
 |-- 2: double (nullable = true)
 |-- 3: double (nullable = true)
 |-- 4: double (nullable = true)
 |-- 5: double (nullable = true)
 |-- 6: double (nullable = true)
 |-- 7: double (nullable = true)
 |-- 8: double (nullable = true)
 |-- 9: double (nullable = true)
 |-- 10: double (nullable = true)
 |-- 11: double (nullable = true)
 |-- 12: double (nullable = true)
 |-- 13: double (nullable = true)
 |-- 14: double (nullable = true)
 |-- 15: double (nullable = true)
 |-- 16: double (nullable = true)
 |-- 17: double (nullable = true)
 |-- 18: double (nullable = true)
 |-- 19: double (nullable = true)
 |-- 20: double (nullable = true)
 |-- 21: double (nullable = true)
 |-- 22: double (nullable = true)
 |-- 23: double (nullable = true)
 |-- 24: double (nullable = true)
 |-- 25: double (nullable = true)
 |-- 26: double (nullable = true)
 |-- 27: double (nullable = true)
 |-- 28: doubl

In [27]:
train_df.select('user_id', '0', '1', '895', 'time_slot_0', 'time_slot_1', 'time_slot_27').show()

+-------+---+---+----+-----------+-----------+------------+
|user_id|  0|  1| 895|time_slot_0|time_slot_1|time_slot_27|
+-------+---+---+----+-----------+-----------+------------+
|  10311|0.0|0.0| 0.0|        0.0|        1.0|         0.0|
|   1078|0.0|0.0| 0.0|        0.0|        0.0|         0.0|
|  10988|0.0|0.0| 0.0|        0.0|        0.0|         0.0|
|  11003|0.0|0.0| 0.0|        0.0|        0.0|         0.0|
|  11005|0.0|0.0|10.0|        1.0|        1.0|         0.0|
|  11062|0.0|0.0| 0.0|        0.0|        0.0|         0.0|
|   1107|0.0|0.0| 0.0|        0.0|        0.0|         0.0|
|  11373|0.0|0.0| 0.0|        0.0|        0.0|         0.0|
|   1149|0.0|0.0| 0.0|        0.0|        0.0|         0.0|
|  11836|0.0|0.0| 0.0|        0.0|        0.0|         0.0|
|   1184|0.0|0.0| 0.0|        0.0|        0.0|         0.0|
|  11913|0.0|0.0| 0.0|        0.0|        0.0|         0.0|
|  12749|0.0|0.0| 0.0|        0.0|        0.0|         0.0|
|  12887|0.0|0.0| 0.0|        0.0|      

# Standardization

In [28]:
from pyspark.ml.feature import StandardScaler, VectorAssembler

In [29]:
features = map(lambda i: str(i), range(896))
dataAssembler = VectorAssembler(inputCols=features, outputCol='features')
dataScaler = StandardScaler(inputCol='features', outputCol='std_features')

In [30]:
labels = map(lambda i: 'time_slot_{}'.format(i), range(28))
labelAssembler = VectorAssembler(inputCols=labels, outputCol='labels')

In [31]:
from pyspark.ml.pipeline import Pipeline

In [32]:
pipeline = Pipeline(stages=[dataAssembler, dataScaler, labelAssembler])

In [33]:
pipeModel = pipeline.fit(train_df)

In [34]:
train_features_df = pipeModel.transform(train_df)

In [35]:
train_features_df.select('user_id', 'std_features', 'labels').show()

+-------+--------------------+--------------------+
|user_id|        std_features|              labels|
+-------+--------------------+--------------------+
|  10311|(896,[554,556,557...|(28,[1,5,6,9,13],...|
|   1078|(896,[839,840,841...|          (28,[],[])|
|  10988|(896,[625,626,834...|          (28,[],[])|
|  11003|(896,[772,775,776...|     (28,[20],[1.0])|
|  11005|(896,[489,491,492...|(28,[0,1,4,7,9,11...|
|  11062|(896,[842,844,845...|          (28,[],[])|
|   1107|(896,[236,237,238...|(28,[24,25,26],[1...|
|  11373|(896,[5,561,562,5...|(28,[4,5,9,12,13,...|
|   1149|(896,[373,376,382...|          (28,[],[])|
|  11836|(896,[836,837,839...|          (28,[],[])|
|   1184|(896,[830,832,838...|          (28,[],[])|
|  11913|(896,[649,651,652...|          (28,[],[])|
|  12749|(896,[5,9,16,21,2...|(28,[9,16],[1.0,1...|
|  12887|(896,[76,85,89,93...|          (28,[],[])|
|  12984|(896,[377,381,382...|          (28,[],[])|
|  13186|(896,[137,848,849...|(28,[4,5,8,9,12,1...|
|  13403|(89

In [36]:
train_features_df.count()

57139

In [18]:
train_features_df.show()

+-------+---+---+---+---+---+----+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+----+---+---+---+---+----+---+---+---+---+---+----+---+---+---+---+----+---+---+---+---+---+---+----+---+---+---+---+----+---+---+---+----+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+----+---+---+---+---+---+---+----+---+---+---+---+---+---+---+----+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+----+----+---+---+---+----+---+---+----+----+----+---+----+----+---+----+----+----+---+---+----+----+---+---+---+----+----+---+---+----+---+---+---+----+---+---+---+----+---+---+---+---+---+---+----+---+---+---+---+----+---+---+----+----+---+---+---+----+---+---+----+----+---+---+---+----+---+---+----+----+---+---+----+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+----+---+---+---+----+---+----+----+----+---+---+----+----+---+---+---+----+---+---+----+----+---+---+---+----+---+---+---+----+---+---+----+---+---+---+----+---+---+---+---+

In [43]:
train_features_df \
    .select('user_id', 'std_features', 'labels') \
    .write.mode('overwrite') \
    .format('parquet') \
    .save('tmp/train_features')

In [44]:
spark.read.parquet('tmp/train_features').show()

+-------+--------------------+--------------------+
|user_id|        std_features|              labels|
+-------+--------------------+--------------------+
|  10311|(896,[554,556,557...|(28,[1,5,6,9,13],...|
|   1078|(896,[839,840,841...|          (28,[],[])|
|  10988|(896,[625,626,834...|          (28,[],[])|
|  11003|(896,[772,775,776...|     (28,[20],[1.0])|
|  11005|(896,[489,491,492...|(28,[0,1,4,7,9,11...|
|  11062|(896,[842,844,845...|          (28,[],[])|
|   1107|(896,[236,237,238...|(28,[24,25,26],[1...|
|  11373|(896,[5,561,562,5...|(28,[4,5,9,12,13,...|
|   1149|(896,[373,376,382...|          (28,[],[])|
|  11836|(896,[836,837,839...|          (28,[],[])|
|   1184|(896,[830,832,838...|          (28,[],[])|
|  11913|(896,[649,651,652...|          (28,[],[])|
|  12749|(896,[5,9,16,21,2...|(28,[9,16],[1.0,1...|
|  12887|(896,[76,85,89,93...|          (28,[],[])|
|  12984|(896,[377,381,382...|          (28,[],[])|
|  13186|(896,[137,848,849...|(28,[4,5,8,9,12,1...|
|  13403|(89

# Sample User Tensors

In [45]:
features_df = spark.read.parquet('tmp/train_features')

In [46]:
user_tensor = features_df.take(1)[0]

In [50]:
sample_x = user_tensor.std_features.toArray()

In [51]:
sample_y = user_tensor.labels.toArray()

In [55]:
sample_x.reshape(1, 32, 28, 1)

array([[[[ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ]],

        [[ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0.        ],
         [ 0. 

# Build Tensors

In [62]:
def build_tensor(d):
    t = {
        'x': d.std_features.toArray().reshape(1, 32, 28, 1), 
        'y': d.labels.toArray().reshape(1, 28)
    }
    return t

def merge_tensor(t1, t2):
    t = {
        'x': np.concatenate([t1['x'], t2['x']]),
        'y': np.concatenate([t1['y'], t2['y']])
    }
    return t

u_tensors = features_df.sample(False, 0.001).rdd \
    .map(build_tensor) \
    .reduce(merge_tensor)

In [64]:
u_tensors['x'].shape

(55, 32, 28, 1)

In [65]:
u_tensors['y'].shape

(55, 28)

In [66]:
train_tensors = features_df.rdd \
    .map(build_tensor) \
    .reduce(merge_tensor)

In [69]:
train_tensors['x'].shape

(57139, 32, 28, 1)

In [70]:
train_tensors['y'].shape

(57139, 28)

In [72]:
with h5py.File('tmp/train_tensor.h5', 'w') as hf:
    hf.create_dataset("features",  data=train_tensors['x'])
    hf.create_dataset('labels', data=train_tensors['y'])