## Workspace setup

In [1]:
from multiprocessing import Process, Queue
import tensorflow as tf
import numpy as np
from os.path import isfile
import io_functions as io


from tensorflow.data import Dataset, TFRecordDataset
from tensorflow.io import TFRecordWriter, TFRecordOptions
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Example, Features, Feature

from tensorflow.python.ops.numpy_ops import np_config
# Enable NumPy-style attributes on tensors; XYZtoUVWT below calls `.T` on the
# result of tf.stack, which is not available on a tensor without this.
np_config.enable_numpy_behavior()
# Root directory: input files are read from `{dataPath}/<name>.root` and the
# TFRecord shards are written under `{dataPath}/data/`.
# NOTE(review): hard-coded absolute path — adjust for the machine running this.
dataPath = '/scratch/pszyc'

2023-12-12 08:48:11.366836: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## TFRecord creation

In [2]:
def _bytes_feature(value):
  """Wrap a single string / bytes value in a ``tf.train.Feature``.

  Eager tensors are converted with ``.numpy()`` first, because ``BytesList``
  won't unpack a string from an EagerTensor by itself.
  """
  eager_tensor_type = type(tf.constant(0))
  raw = value.numpy() if isinstance(value, eager_tensor_type) else value
  payload = tf.train.BytesList(value=[raw])
  return tf.train.Feature(bytes_list=payload)

def serialize(charge_array, target):
  """Encode one (charge_array, target) pair as a serialized ``tf.train.Example``.

  Each tensor is serialized with ``tf.io.serialize_tensor`` and stored as a
  bytes feature, under the keys ``'myChargeArray'`` and ``'target'``.
  Returns the byte string produced by ``Example.SerializeToString()``.
  """
  named_tensors = (('myChargeArray', charge_array), ('target', target))
  feature = {}
  for key, tensor in named_tensors:
    feature[key] = _bytes_feature(tf.io.serialize_tensor(tensor))
  features = tf.train.Features(feature=feature)
  return tf.train.Example(features=features).SerializeToString()

def conversion(filename, queue):
    """Worker loop: drain `queue` and write every item to a GZIP TFRecord file.

    Args:
        filename: path of the TFRecord file to create.
        queue: queue yielding ``(charge_array, target)`` tuples; a ``None``
            item is the sentinel that ends the loop.

    Each charge array is passed through ``io.proc_features`` before being
    serialized together with its target.
    """
    options = TFRecordOptions(compression_type='GZIP')
    # The context manager guarantees the writer is flushed and closed, even if
    # processing an item raises; otherwise the compressed file can be left
    # truncated.
    with TFRecordWriter(filename, options=options) as writer:
        while True:
            item = queue.get()
            # Identity check: `== None` would be evaluated element-wise if an
            # array-like object were ever put on the queue.
            if item is None:
                break
            charge_array, target = item
            charge_array = io.proc_features(charge_array)

            writer.write(serialize(charge_array, target))

In [3]:
def XYZtoUVWT(data):
    """Map points given as (x, y, z) columns to (u, v, w, t) coordinates.

    `data` is read as ``data[:, 0]``, ``data[:, 1]`` and ``data[:, 2]``.
    u, v and w are linear combinations of x and y relative to fixed offsets,
    divided by the strip pitch; t is z divided by `f`, shifted by 256.
    (Presumably strip numbers of the three readout directions and a time
    bin — confirm against the detector geometry.)

    Returns the four coordinates stacked along axis 0 and then transposed, so
    column k of the result is the k-th of (u, v, w, t). The `.T` attribute
    requires NumPy behaviour to be enabled on tensors.
    """
    referencePoint = tf.constant([-138.9971, 98.25])
    phi = np.pi/6.0
    stripPitch = 1.5
    f = 1.0/25*6.46

    x, y, z = data[:, 0], data[:, 1], data[:, 2]
    # Offsets from the reference point, shared by the v and w projections.
    dx = x - referencePoint[0]
    dy = y - referencePoint[1]

    u = -(y - 99.75) / stripPitch
    v = (dx * np.cos(phi) - dy * np.sin(phi)) / stripPitch
    w = (dx * np.cos(-phi) - dy * np.sin(-phi) + 98.75) / stripPitch
    t = z/f + 256
    return tf.stack([u, v, w, t], axis=0).T

def conversion_uvwt(filename, queue):
    """Worker loop: convert batches to per-projection (t, strip) targets and write them.

    Args:
        filename: path of the GZIP-compressed TFRecord file to create.
        queue: queue yielding ``(myChargeArray, target)`` batches; a ``None``
            item is the sentinel that ends the loop.

    For every batch the charge array goes through ``io.proc_features`` and is
    transposed with ``perm=[0, 3, 1, 2]``. Columns 0:3, 3:6 and 6:9 of
    `target` are scaled by 100 and converted with ``XYZtoUVWT`` (assumes each
    triple is one (x, y, z) point — TODO confirm units against the generator).
    For each of the three projections the target row is
    ``[t1, c1, t2, c2, t3, c3]``, where ``c`` is that projection's coordinate.
    One record is written per element along the first axis of the batch.
    """
    options = TFRecordOptions(compression_type='GZIP')
    scale = 100
    n_projections = 3
    # The context manager guarantees the writer is flushed and closed, even if
    # processing a batch raises; otherwise the compressed file can be left
    # truncated.
    with TFRecordWriter(filename, options=options) as writer:
        while True:
            item = queue.get()
            # Identity check: `== None` would be evaluated element-wise if an
            # array-like object were ever put on the queue.
            if item is None:
                break
            myChargeArray, target = item
            charge_array = io.proc_features(myChargeArray)
            charge_array = tf.transpose(charge_array, perm=[0, 3, 1, 2])

            # One UVWT conversion per consecutive (x, y, z) triple in `target`.
            uvwt_points = [
                XYZtoUVWT(scale*target[:, start:start + 3])
                for start in (0, 3, 6)
            ]

            points = []
            for i in range(n_projections):
                row = []
                for uvwt in uvwt_points:
                    # Column 3 is t; column i is the coordinate of projection i.
                    row.append(uvwt[:, 3])
                    row.append(uvwt[:, i])
                points.append(row)
            # Stack to (6, n_projections, batch), then reverse the axes so the
            # first axis indexes the element within the batch.
            points = np.stack(points, axis=1).T
            for index in range(points.shape[0]):
                writer.write(serialize(charge_array[index], points[index]))

In [4]:
def process_and_save(output_files, datasetGenerator, conversion_function):
    """Stream items from a generator to one writer process per output file.

    Args:
        output_files: paths of the files to create; one worker process is
            started per path.
        datasetGenerator: iterable whose items are put on a shared queue.
        conversion_function: worker target called as
            ``conversion_function(path, queue)``; it must stop when it
            receives ``None`` from the queue.

    Raises:
        FileExistsError: if any of `output_files` already exists (checked
            before any process is started).

    Nothing beyond the existence check happens unless this code runs with
    ``__name__ == '__main__'``.
    """
    for path in output_files:
        if isfile(path):
            raise FileExistsError(f'output file already exists: {path}')

    if __name__ != '__main__':
        return

    nFiles = len(output_files)
    # Bounded queue so the reader cannot run arbitrarily far ahead of the writers.
    q = Queue(2*nFiles)

    processes = []
    for name in output_files:
        p = Process(target=conversion_function, args=(name, q))
        processes.append(p)
        p.start()
        print(p.name + ' started')

    try:
        for counter, item in enumerate(datasetGenerator, start=1):
            q.put(item)
            if counter % 100 == 0:
                print(f'read {counter} batches')
    finally:
        # Always deliver one sentinel per worker and join them, so a failure
        # while reading the input does not leave the workers blocked on
        # queue.get() forever.
        for _ in processes:
            q.put(None)

        for p in processes:
            p.join()
            print(p.name + ' done')

### Training set

In [5]:
# Base name of the input file (without the `.root` extension); also used as
# the prefix of the generated TFRecord shard names.
file = "out_random_sigma-001"
# Forwarded to io.minimal_generator as `batchSize`.
batchSize = 5
nFiles = 5 # number of output files, equal to number of processes

# NOTE(review): the `:TPCData` suffix presumably names the object to read
# inside the ROOT file — confirm against io_functions.
input_files = [f'{dataPath}/{file}.root:TPCData']
# One output shard per worker process, written under `{dataPath}/data/`.
output_files = [f"{dataPath}/data/{file}-part-{i}.tfrecord" for i in range(nFiles)]
datasetGenerator = io.minimal_generator(files=input_files, batchSize=batchSize)

In [6]:
%%time
# Starts one conversion_uvwt worker per output file and feeds them the
# generator's batches; raises if any output file already exists.
process_and_save(output_files, datasetGenerator, conversion_uvwt)

Process-1 started
Process-2 started
Process-3 started
Process-4 started
Process-5 started


2023-12-12 08:03:28.324323: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: UNKNOWN ERROR (34)
2023-12-12 08:03:28.480230: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: UNKNOWN ERROR (34)
2023-12-12 08:03:28.596708: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: UNKNOWN ERROR (34)
2023-12-12 08:03:28.784603: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: UNKNOWN ERROR (34)
2023-12-12 08:03:28.899872: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: UNKNOWN ERROR (34)


read 100 batches
read 200 batches
read 300 batches
read 400 batches
read 500 batches
read 600 batches
read 700 batches
read 800 batches
read 900 batches
read 1000 batches
read 1100 batches
read 1200 batches
read 1300 batches
read 1400 batches
read 1500 batches
read 1600 batches
read 1700 batches
read 1800 batches
read 1900 batches
read 2000 batches
read 2100 batches
read 2200 batches
read 2300 batches
read 2400 batches
read 2500 batches
read 2600 batches
read 2700 batches
read 2800 batches
read 2900 batches
read 3000 batches
read 3100 batches
read 3200 batches
read 3300 batches
read 3400 batches
read 3500 batches
read 3600 batches
read 3700 batches
read 3800 batches
read 3900 batches
read 4000 batches
read 4100 batches
read 4200 batches
read 4300 batches
read 4400 batches
read 4500 batches
read 4600 batches
read 4700 batches
read 4800 batches
read 4900 batches
read 5000 batches
read 5100 batches
read 5200 batches
read 5300 batches
read 5400 batches
read 5500 batches
read 5600 batches
r

### Test set

In [5]:
# NOTE(review): this cell rebinds the same global names (`file`,
# `output_files`, `datasetGenerator`, ...) as the other dataset sections;
# later cells see whichever section was executed last.
# Base name of the input file (without the `.root` extension); also used as
# the prefix of the generated TFRecord shard names.
file = "out_random_sigma2k2mm"
# Forwarded to io.minimal_generator as `batchSize`.
batchSize = 5
nFiles = 5 # number of output files, equal to number of processes

# NOTE(review): the `:TPCData` suffix presumably names the object to read
# inside the ROOT file — confirm against io_functions.
input_files = [f'{dataPath}/{file}.root:TPCData']
# One output shard per worker process, written under `{dataPath}/data/`.
output_files = [f"{dataPath}/data/{file}-part-{i}.tfrecord" for i in range(nFiles)]
datasetGenerator = io.minimal_generator(files=input_files, batchSize=batchSize)

In [6]:
%%time
# Starts one conversion_uvwt worker per output file and feeds them the
# generator's batches; raises if any output file already exists.
process_and_save(output_files, datasetGenerator, conversion_uvwt)

Process-1 started
Process-2 started
Process-3 started
Process-4 started
Process-5 started


2023-12-12 08:48:17.698296: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: UNKNOWN ERROR (34)
2023-12-12 08:48:17.849620: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: UNKNOWN ERROR (34)
2023-12-12 08:48:17.951207: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: UNKNOWN ERROR (34)
2023-12-12 08:48:18.052097: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: UNKNOWN ERROR (34)
2023-12-12 08:48:18.223847: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: UNKNOWN ERROR (34)


read 100 batches
read 200 batches
read 300 batches
read 400 batches
Process-1 done
Process-2 done
Process-3 done
Process-4 done
Process-5 done
CPU times: user 34.7 s, sys: 18.1 s, total: 52.7 s
Wall time: 44.5 s


### Validation set

In [9]:
# NOTE(review): this cell rebinds the same global names (`file`,
# `output_files`, `datasetGenerator`, ...) as the other dataset sections;
# later cells see whichever section was executed last.
# Base name of the input file (without the `.root` extension); also used as
# the prefix of the generated TFRecord shard names.
file = "out_random_sigma"
# Forwarded to io.minimal_generator as `batchSize`.
batchSize = 5
nFiles = 5 # number of output files, equal to number of processes

# NOTE(review): the `:TPCData` suffix presumably names the object to read
# inside the ROOT file — confirm against io_functions.
input_files = [f'{dataPath}/{file}.root:TPCData']
# One output shard per worker process, written under `{dataPath}/data/`.
output_files = [f"{dataPath}/data/{file}-part-{i}.tfrecord" for i in range(nFiles)]
datasetGenerator = io.minimal_generator(files=input_files, batchSize=batchSize)

In [10]:
%%time
# Starts one conversion_uvwt worker per output file and feeds them the
# generator's batches; raises if any output file already exists.
process_and_save(output_files, datasetGenerator, conversion_uvwt)

Process-6 started
Process-7 started
Process-8 started
Process-9 started
Process-10 started


2023-12-12 08:50:53.097174: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: UNKNOWN ERROR (34)
2023-12-12 08:50:53.263273: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: UNKNOWN ERROR (34)
2023-12-12 08:50:53.416226: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: UNKNOWN ERROR (34)
2023-12-12 08:50:53.618560: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: UNKNOWN ERROR (34)
2023-12-12 08:50:53.678759: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: UNKNOWN ERROR (34)


read 100 batches
read 200 batches
Process-6 done
Process-7 done
Process-8 done
Process-9 done
Process-10 done
CPU times: user 18.6 s, sys: 8.99 s, total: 27.6 s
Wall time: 23.9 s


### Read TFRecord

In [9]:
# NOTE(review): this reads from `{dataPath}/test/`, while the conversion cells
# above write their shards to `{dataPath}/data/` — confirm the intended
# directory. It also relies on `file` and `nFiles` from whichever dataset
# section was executed last.
filenames = [f"{dataPath}/test/{file}-part-{i}.tfrecord" for i in range(nFiles)]
# GZIP matches the compression the writer functions use for the shards.
train_dataset = tf.data.TFRecordDataset(filenames, compression_type='GZIP', num_parallel_reads=5)
# Create a description of the features.
# Both entries are scalar byte strings; the keys match those written by serialize().
feature_description = {
    'myChargeArray': tf.io.FixedLenFeature([], tf.string),
    'target': tf.io.FixedLenFeature([], tf.string),

}

def _parse_function(example_proto):
    """Decode one serialized ``tf.train.Example`` into a (charge, target) pair.

    The proto is parsed with the module-level `feature_description`; the two
    byte-string features are then deserialized back into float64 tensors.
    """
    record = tf.io.parse_single_example(example_proto, feature_description)
    # Each feature holds the bytes of a serialized tensor.
    charge = tf.io.parse_tensor(record['myChargeArray'], tf.float64)
    target = tf.io.parse_tensor(record['target'], tf.float64)
    return charge, target


# Decode every record into (charge, target) tensors, letting tf.data choose the parallelism.
train_dataset = train_dataset.map(_parse_function, num_parallel_calls=tf.data.AUTOTUNE)

In [11]:
# Sanity check: print the target tensor of the first decoded record
# (`image` is unpacked but not used here).
for image, target in train_dataset.take(1):
  print(target)

tf.Tensor(
[[ 73.44932381  68.96072799 199.87771658  67.77527623  51.1
   69.17049803]
 [ 73.44932381 149.05760926 199.87771658 157.46395649  51.1
  147.28143112]
 [ 73.44932381 146.9302146  199.87771658 156.52201359  51.1
  144.94426642]], shape=(3, 6), dtype=float64)
