# Convert the High Level Features Dataset from Parquet to TFRecord
This notebook converts the training and test datasets of the High Level Features (HLF) classifier from Parquet to TFRecord format:
 - Reads Parquet data into a dataframe
 - Saves the dataframe as TFRecords using spark-tensorflow-connector
   (see https://github.com/tensorflow/ecosystem/tree/master/spark/spark-tensorflow-connector)

Data in TFRecord format can be processed natively by TensorFlow through the tf.data API. Other formats, notably Apache Parquet, cannot be fed directly into TensorFlow; they require an adapter, for example the Petastorm library.  
The resulting TFRecord dataset is used as input by the "TensorFlow Keras with TFRecord" example notebook. 


In [1]:
# Local Spark setup. Skip this cell on the CERN SWAN service: there, add the
# Spark configuration parameters through the "star" button integration instead.

# findspark makes a local Spark installation importable; alternatively
# `pip install pyspark` or set SPARK_HOME in whatever way you prefer.
import findspark
findspark.init('/home/luca/Spark/spark-3.3.2-bin-hadoop3')  # path to SPARK_HOME

# Build the Spark session; adapt the settings below to your environment
from pyspark.sql import SparkSession

# spark-tensorflow-connector jar for Scala 2.12.
# With Spark 2.4.8 and Scala 2.11, pull the connector from Maven Central instead:
#   --packages org.tensorflow:spark-tensorflow-connector_2.11:1.14.0
# NOTE(review): the jar is fetched over plain HTTP; consider an HTTPS or local copy.
JAR = "http://canali.web.cern.ch/res/spark-tensorflow-connector_2.12-1.11.0.jar"

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Prepare TFRecord dataset")
    .config("spark.jars", JAR)  # makes the "tfrecords" data source available
    .config("spark.driver.memory", "2g")
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/08 16:27:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Define the input datasets

In [2]:
# The datasets can be downloaded from
# https://github.com/cerndb/SparkDLTrigger/tree/master/Data
#
# CERN users can read them directly from EOS:
PATH = "/eos/project/s/sparkdltrigger/public/"

# Alternative base path (presumably for a locally downloaded copy of the data):
# PATH = "../Data/sparkdltrigger.web.cern.ch/sparkdltrigger/"

# Test set: load the Parquet files and show the schema
df_test = spark.read.parquet(f"{PATH}testUndersampled_HLF_features.parquet")
df_test.printSchema()

# Training set: same steps
df_train = spark.read.parquet(f"{PATH}trainUndersampled_HLF_features.parquet")
df_train.printSchema()

root
 |-- HLF_input: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- encoded_label: array (nullable = true)
 |    |-- element: double (containsNull = true)

root
 |-- HLF_input: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- encoded_label: array (nullable = true)
 |    |-- element: double (containsNull = true)



### Save the datasets in TFRecord format


In [3]:
# Base location for the TFRecord output; change it to suit your environment
outputPATH = PATH

# Write the test dataset using the "tfrecords" data source.
# coalesce(2) compacts the output into 2 files.
(
    df_test
    .coalesce(2)
    .write
    .format("tfrecords")
    .save(f"{outputPATH}testUndersampled_HLF_features.tfrecord")
)

In [4]:
# Same procedure for the training dataset

# Write the training dataset using the "tfrecords" data source.
# coalesce(4) compacts the output into 4 files.
(
    df_train
    .coalesce(4)
    .write
    .format("tfrecords")
    .save(f"{outputPATH}trainUndersampled_HLF_features.tfrecord")
)

In [5]:
# Stop the Spark session created at the top of the notebook
spark.stop()