# Prepare the Inclusive Classifier Dataset using TFRecord and Parquet
This converts the full training and test datasets into TFRecord format (for TensorFlow) and Parquet format (for Petastorm)
 - Maps Parquet data into a dataframe
 - Saves the dataframe as TFRecords using spark-tensorflow-connector
   (see https://github.com/tensorflow/ecosystem/tree/master/spark/spark-tensorflow-connector)

Data in TFRecord format can be processed natively using TensorFlow and the tf.data API. Other formats, notably Apache Parquet, cannot be fed directly into TensorFlow, but require adapters, for example the Petastorm library.  
The TFRecord dataset is used as input for the TensorFlow Keras with TFRecord example notebook. 


In [2]:
# Start the Spark session used by the rest of this notebook.
#
# No need to run this when using CERN SWAN service
# Just add the configuration parameters for Spark on the "star" button integration

# pip install pyspark or use your favorite way to set Spark Home, here we use findspark
import findspark
findspark.init('/home/luca/Spark/spark-3.3.2-bin-hadoop3') #set path to SPARK_HOME

# Create Spark session and configure according to your environment
from pyspark.sql import SparkSession

# Spark-Tensorflow connector for scala 2.12
# for spark 2.4.8 and scala 2.11 use from maven central: 
# --packages org.tensorflow:spark-tensorflow-connector_2.11:1.14.0
JAR = "http://canali.web.cern.ch/res/spark-tensorflow-connector_2.12-1.11.0.jar"

# The connector jar is passed via spark.jars; it provides the "tfrecords"
# data source used when saving the datasets later in the notebook.
spark = ( SparkSession.builder
            .appName("Prepare TFRecord dataset")
            .master("yarn")
            .config("spark.driver.memory","2g")
            .config("spark.jars", JAR)
            .config("spark.executor.memory","64g")
            # The property is "spark.executor.cores"; the previous key
            # "spark.executor.cores.memory" is not a Spark setting and was ignored.
            .config("spark.executor.cores","8")
            .config("spark.ui.showConsoleProgress", "false")
            .getOrCreate()
        )


### Define and prepare the input datasets

In [4]:
# The datasets can be downloaded from https://github.com/cerndb/SparkDLTrigger/tree/master/Data
#
# CERN users can read the data directly from EOS or HDFS
# PATH = "/eos/project/s/sparkdltrigger/public/"
PATH = "hdfs://analytix/Training/Spark/TopologyClassifier/"

# Locations of the raw (undersampled) Parquet inputs under PATH
test_parquet_path = f"{PATH}testUndersampled.parquet"
train_parquet_path = f"{PATH}trainUndersampled.parquet"

# Load the test dataset and show its schema
df_test_raw = spark.read.parquet(test_parquet_path)
df_test_raw.printSchema()

# Load the training dataset and show its schema
df_train_raw = spark.read.parquet(train_parquet_path)
df_train_raw.printSchema()

root
 |-- hfeatures: vector (nullable = true)
 |-- label: long (nullable = true)
 |-- lfeatures: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: double (containsNull = true)
 |-- hfeatures_dense: vector (nullable = true)
 |-- encoded_label: vector (nullable = true)
 |-- HLF_input: vector (nullable = true)
 |-- GRU_input: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: double (containsNull = true)

root
 |-- hfeatures: vector (nullable = true)
 |-- label: long (nullable = true)
 |-- lfeatures: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: double (containsNull = true)
 |-- hfeatures_dense: vector (nullable = true)
 |-- encoded_label: vector (nullable = true)
 |-- HLF_input: vector (nullable = true)
 |-- GRU_input: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: double (containsNull = true)



In [9]:
# Extract the columns used by the Inclusive classifier:
# HLF_input, GRU_input and encoded_label.
# Vector columns are converted to Arrays, because Arrays are needed
# for the Petastorm example notebook.
# Note: GRU_input is flattened, this allows to use the Example record format (default),
# restoring the Array to its original shape will be handled in TensorFlow using tf.data and tf.io

from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import flatten


def _select_inclusive_classifier_columns(df_raw):
    """Return df_raw reduced to the three Inclusive classifier columns.

    HLF_input and encoded_label are converted from Vector to Array,
    GRU_input is flattened from an array of arrays into a single array.
    """
    hlf_input = vector_to_array('HLF_input').alias('HLF_input')
    gru_input = flatten('GRU_input').alias('GRU_input')
    encoded_label = vector_to_array('encoded_label').alias('encoded_label')
    return df_raw.select(hlf_input, gru_input, encoded_label)


df_test = _select_inclusive_classifier_columns(df_test_raw)
df_test.printSchema()

df_train = _select_inclusive_classifier_columns(df_train_raw)
df_train.printSchema()

root
 |-- HLF_input: array (nullable = false)
 |    |-- element: double (containsNull = false)
 |-- GRU_input: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- encoded_label: array (nullable = false)
 |    |-- element: double (containsNull = false)

root
 |-- HLF_input: array (nullable = false)
 |    |-- element: double (containsNull = false)
 |-- GRU_input: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- encoded_label: array (nullable = false)
 |    |-- element: double (containsNull = false)



### Save the datasets in TFRecord format
This will be used with TensorFlow to train the inclusive classifier


In [11]:
# Destination for the converted datasets (customize as needed)
outputPATH = PATH
#outputPATH = '/user/canali/training/'

# Number of partitions to coalesce each dataset into before writing
numPartitions = 200

# Write the test dataset in TFRecord format
# coalesce() compacts the output in numPartitions files
test_tfrecord_path = outputPATH + "testUndersampled_InclusiveClassifier.tfrecord"
test_tfrecord_writer = df_test.coalesce(numPartitions).write.format("tfrecords")
test_tfrecord_writer.save(test_tfrecord_path)


In [12]:
# Same procedure as for the test dataset, applied to the training dataset

# Write the training dataset in TFRecord format
# coalesce() compacts the output in numPartitions files
train_tfrecord_path = outputPATH + "trainUndersampled_InclusiveClassifier.tfrecord"
train_tfrecord_writer = df_train.coalesce(numPartitions).write.format("tfrecords")
train_tfrecord_writer.save(train_tfrecord_path)


### Save the datasets in Parquet format
This will be used with Petastorm to train the inclusive classifier

In [13]:
# Write both datasets in Parquet format, coalesced to numPartitions partitions
test_parquet_out = outputPATH + "testUndersampled_InclusiveClassifier.parquet"
train_parquet_out = outputPATH + "trainUndersampled_InclusiveClassifier.parquet"

# Test dataset first, then the training dataset
df_test_compact = df_test.coalesce(numPartitions)
df_test_compact.write.parquet(test_parquet_out)

df_train_compact = df_train.coalesce(numPartitions)
df_train_compact.write.parquet(train_parquet_out)

In [None]:
# Stop the Spark session now that the datasets have been written
spark.stop()