# Extract the High Level Features Dataset from the full dataset
This extracts the High Level features training and test datasets from the full datasets.  
It is used as input for the TensorFlow and Petastorm example notebooks.  
Key actions are:
 - Converting Vectors into Arrays
 - Saving the resulting Spark DataFrames as Parquet files


In [1]:
# Spark session setup for this notebook.
# No need to run this cell when using the CERN SWAN service: there, just add
# the Spark configuration parameters via the "star" button integration.

# pip install pyspark or use your favorite way to set SPARK_HOME; here we use findspark
import findspark
findspark.init('/home/luca/Spark/spark-3.3.2-bin-hadoop3')  # set path to SPARK_HOME

# Create the Spark session and configure it according to your environment
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Prepare TFRecord dataset")
    .master("yarn")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "32g")
    # Number of cores per executor. The property name is "spark.executor.cores";
    # the previously used "spark.executor.cores.memory" is not a Spark property,
    # so the requested core count was never applied.
    .config("spark.executor.cores", "6")
    # Keep the notebook output clean: no console progress bars
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/03/08 16:44:13 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
23/03/08 16:44:33 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered!


### Define and prepare the input datasets

In [9]:
# The datasets can be downloaded from
# https://github.com/cerndb/SparkDLTrigger/tree/master/Data
#
# CERN users can read the data directly from EOS or HDFS
# PATH = "/eos/project/s/sparkdltrigger/public/"
PATH = "hdfs://analytix/Training/Spark/TopologyClassifier/"

# Locations of the full (undersampled) test and training datasets
test_input_path = PATH + "testUndersampled.parquet"
train_input_path = PATH + "trainUndersampled.parquet"

# Load the test dataset and display its schema
df_test_raw = spark.read.parquet(test_input_path)
df_test_raw.printSchema()

# Load the training dataset and display its schema
df_train_raw = spark.read.parquet(train_input_path)
df_train_raw.printSchema()

root
 |-- hfeatures: vector (nullable = true)
 |-- label: long (nullable = true)
 |-- lfeatures: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: double (containsNull = true)
 |-- hfeatures_dense: vector (nullable = true)
 |-- encoded_label: vector (nullable = true)
 |-- HLF_input: vector (nullable = true)
 |-- GRU_input: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: double (containsNull = true)

root
 |-- hfeatures: vector (nullable = true)
 |-- label: long (nullable = true)
 |-- lfeatures: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: double (containsNull = true)
 |-- hfeatures_dense: vector (nullable = true)
 |-- encoded_label: vector (nullable = true)
 |-- HLF_input: vector (nullable = true)
 |-- GRU_input: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: double (containsNull = true)



In [10]:
# Extract the columns of interest for the high level features classifier
# and convert them from Vectors to Arrays.
# Arrays are needed by the Petastorm example notebook.

from pyspark.ml.functions import vector_to_array


def _hlf_columns_as_arrays(raw_df):
    """Return raw_df reduced to HLF_input and encoded_label, both converted to arrays."""
    converted = raw_df
    # Replace each Vector column in place with its array representation
    for column_name in ('HLF_input', 'encoded_label'):
        converted = converted.withColumn(column_name, vector_to_array(column_name))
    return converted.select('HLF_input', 'encoded_label')


# Test dataset
df_test = _hlf_columns_as_arrays(df_test_raw)
df_test.printSchema()

# Training dataset
df_train = _hlf_columns_as_arrays(df_train_raw)
df_train.printSchema()

root
 |-- HLF_input: array (nullable = false)
 |    |-- element: double (containsNull = false)
 |-- encoded_label: array (nullable = false)
 |    |-- element: double (containsNull = false)

root
 |-- HLF_input: array (nullable = false)
 |    |-- element: double (containsNull = false)
 |-- encoded_label: array (nullable = false)
 |    |-- element: double (containsNull = false)



### Save the datasets in Parquet format

An additional (and optional) tuning step is to set the Parquet block size to 1 MB, which forces row groups to 1 MB.  
This is motivated by the use of Petastorm.  
By default, Petastorm uses the Parquet block size in make_batch_reader to determine the batch size to feed to TensorFlow.  
Note that it is also possible to change the batch size in Petastorm, so this setting is not strictly necessary; moreover, a row group size of 1 MB is not ideal in many cases (too small as a row group and too large as a batch size).  
If you don't need to use Petastorm, you can skip the option("parquet.block.size", 1024 * 1024) setting and use the defaults (128 MB for the row group size).


In [None]:
# Output location; customize as needed
outputPATH = PATH

# Write the test dataset.
# coalesce(1) compacts the output into a single file: the write then runs
# on 1 task only, so it is a bit slow, but the result is in compact form.
test_single_partition = df_test.coalesce(1)

# Parquet block size of 1 MB (1024 * 1024 bytes)
test_writer = test_single_partition.write.option("parquet.block.size", 1024 * 1024)
test_writer.parquet(outputPATH + "testUndersampled_HLF_features.parquet")


In [21]:
# Write the training dataset, same procedure as for the test dataset.

# coalesce(4) compacts the output into 4 files
train_four_partitions = df_train.coalesce(4)

# Parquet block size of 1 MB (1024 * 1024 bytes)
train_writer = train_four_partitions.write.option("parquet.block.size", 1024 * 1024)
train_writer.parquet(outputPATH + "trainUndersampled_HLF_features.parquet")

In [22]:
# Stop the Spark session at the end of the notebook
spark.stop()