# Understanding Petastorm converter and types (Chapter 7)

In [1]:
# Sample-size constant; not referenced in the cells visible here - TODO confirm where it is consumed.
SAMPLE_SIZE = 50

Define schema

In [2]:
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

from petastorm.codecs import ScalarCodec, CompressedImageCodec, NdarrayCodec
from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.unischema import dict_to_spark_row, Unischema, UnischemaField

from petastorm.spark import SparkDatasetConverter, make_spark_converter
from petastorm import TransformSpec 

from pyspark.sql.functions import col
from pyspark.sql.types import *


from pyspark.sql.types import (ArrayType, BinaryType, BooleanType, ByteType,
                               DoubleType, FloatType, IntegerType, LongType,
                               ShortType, StringType, StructField, StructType)


  from pyarrow import LocalFileSystem


In [3]:
# Build (or reuse) the Spark session for this notebook, with off-heap memory
# enabled and sized through the two config entries below.
spark = (
    SparkSession.builder
    .appName("Create petastorm store")
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "30g")
    .getOrCreate()
)

# Set the converter's parent cache directory URL on the session conf.
spark.conf.set(
    SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF,
    'file:/home/jovyan/petastorm_tmp_cache',
)

## Understand primitives

In [4]:
 # Schema with one non-nullable column per Spark primitive type used in this notebook.
 # Built incrementally: each add() appends a field and returns the same StructType.
 schema = (
     StructType()
     .add("bool_col", BooleanType(), False)
     .add("float_col", FloatType(), False)
     .add("double_col", DoubleType(), False)
     .add("short_col", ShortType(), False)
     .add("int_col", IntegerType(), False)
     .add("long_col", LongType(), False)
     .add("str_col", StringType(), False)
     .add("bin_col", BinaryType(), False)
     .add("byte_col", ByteType(), False)
 )

In [5]:
# Two hand-written rows, one value per column of `schema`.
# coalesce(1) keeps the data in a single partition: with numPartition > 1 the
# order of the loaded dataset would be non-deterministic.
df = spark.createDataFrame(
    [
        (True, 0.12, 432.1, 5, 5, 0, "hello", bytearray(b"spark\x01\x02"), -128),
        (False, 123.45, 0.987, 9, 908, 765, "petastorm", bytearray(b"\x0012345"), 127),
    ],
    schema=schema,
).coalesce(1)

# Collecting the rows into a local list is for the learning phase only -
# DO NOT USE IN PRODUCTION!
expected_df = df.collect()

In [6]:
# Display the rows collected from `df` as the cell's output.
expected_df

[Row(bool_col=True, float_col=0.11999999731779099, double_col=432.1, short_col=5, int_col=5, long_col=0, str_col='hello', bin_col=bytearray(b'spark\x01\x02'), byte_col=-128),
 Row(bool_col=False, float_col=123.44999694824219, double_col=0.987, short_col=9, int_col=908, long_col=765, str_col='petastorm', bin_col=bytearray(b'\x0012345'), byte_col=127)]

 ## Test TensorFlow dataset

In [7]:
# create tf operations graph

In [8]:
import tensorflow as tf

# Wrap the Spark DataFrame in a Petastorm converter.
converter = make_spark_converter(df)

# make_tf_dataset() is used as a context manager, so every read from the
# dataset happens inside the `with` body.
with converter.make_tf_dataset() as dataset:
    iterator = iter(dataset)
    # Per-column TensorSpec (shape and dtype) of the elements the iterator yields.
    print(iterator.element_spec)

    # Fetch the first element from the iterator and show it.
    tensor = iterator.get_next()
    print(tensor)

  self._filesystem = pyarrow.localfs
Converting floating-point columns to float32


inferred_schema_view(bool_col=TensorSpec(shape=(None,), dtype=tf.bool, name=None), float_col=TensorSpec(shape=(None,), dtype=tf.float32, name=None), double_col=TensorSpec(shape=(None,), dtype=tf.float32, name=None), short_col=TensorSpec(shape=(None,), dtype=tf.int16, name=None), int_col=TensorSpec(shape=(None,), dtype=tf.int32, name=None), long_col=TensorSpec(shape=(None,), dtype=tf.int64, name=None), str_col=TensorSpec(shape=(None,), dtype=tf.string, name=None), bin_col=TensorSpec(shape=(None,), dtype=tf.string, name=None), byte_col=TensorSpec(shape=(None,), dtype=tf.int8, name=None))
inferred_schema_view(bool_col=<tf.Tensor: shape=(32,), dtype=bool, numpy=
array([ True, False,  True, False,  True, False,  True, False,  True,
       False,  True, False,  True, False,  True, False,  True, False,
        True, False,  True, False,  True, False,  True, False,  True,
       False,  True, False,  True, False])>, float_col=<tf.Tensor: shape=(32,), dtype=float32, numpy=
array([1.2000e-01, 1.