## Workspace setup

In [1]:
from datetime import datetime 
import uproot
import awkward as ak
import tensorflow as tf
import numpy as np
import importlib
from functools import partial

from tensorflow.data import Dataset, TFRecordDataset
from tensorflow.data.experimental import TFRecordWriter
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Example, Features, Feature

2023-05-30 14:08:02.828531: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## TFRecord creation

In [3]:
%%time

import io_functions as io
importlib.reload(io)

# Input for the read-throughput check: a single ROOT file addressed as "<path>:<tree>".
train_files = ['/scratch_ssd/akalinow/ELITPC/data/E_11_sigma_2/out_C_arr_1.root:TPCData']
batchSize = 200

# from_generator is handed a callable without extra args here, so the file
# list and batch size are bound in advance.
datasetGenerator = partial(io.generator, files=train_files, batchSize=batchSize)

# Every element has two components, both with a fixed leading batch dimension.
# NOTE(review): the meaning of the 9 values per entry is not visible here -- confirm against io.generator.
featuresSpec = tf.TensorSpec(shape=(batchSize,) + io.projections.shape, dtype=tf.int32)
targetSpec = tf.TensorSpec(shape=(batchSize, 9), dtype=tf.float32)

train_dataset = tf.data.Dataset.from_generator(
    datasetGenerator,
    output_signature=(featuresSpec, targetSpec),
)

# Drain the dataset once; nothing is kept, the %%time report of the cell is the result.
for aBatch in train_dataset:
    pass

2023-05-30 14:13:23.960551: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


CPU times: user 1min 37s, sys: 24.7 s, total: 2min 2s
Wall time: 2min 1s


In [104]:
%%time

# Input ROOT file and tree; the three pieces are glued together into the single
# string that is passed as `files` to the generator defined below.
dataDirectory = "/scratch_hdd/akalinow/ELITPC/PythonAnalysis/data/E_11_sigma_2/"
dataFile = "out_C_arr_1.root"
treeName = ":TPCData"
inputObj = "".join((dataDirectory, dataFile, treeName))

# Step size for uproot.iterate; generator() reads this as a global.
batchSize = 1

# Branch-name patterns used as filter_name by generator(); the commented
# entries are currently switched off.
fields = [
    #"SimEvent/reactionType",
    "SimEvent/tracks/tracks.startPos",
    "SimEvent/tracks/tracks.stopPos",
    #"SimEvent/tracks/tracks.prim.pID",
    #"SimEvent/tracks/tracks.prim.fourMomentum",
    #"Event/myChargeMap",
    "Event/myChargeArray*",
    "SimEvent/tracks/tracks.truncatedStartPosUVWT.*",
    "SimEvent/tracks/tracks.truncatedStopPosUVWT.*",
]


def generator(files):
    """Yield one (features, target) pair of NumPy arrays per chunk read from `files`.

    Chunks are produced by uproot.iterate with awkward-array output. The step
    size and the branch filter are read from the notebook globals `batchSize`
    and `fields`, so both must be defined before the generator is consumed.

    Yields:
        features: the "myChargeArray[3][3][256][512]" branch summed over axis 2,
            with axis 1 then moved to the last position.
            NOTE(review): presumably (batch, 256, 512, 3) given the branch name -- confirm.
        target: x/y/z of the start positions restricted to index 0 of the last
            axis, concatenated along axis 2 with x/y/z of all stop positions.
            NOTE(review): assumes the track branches convert to rectangular
            NumPy arrays (same number of tracks in every entry) -- confirm.
    """
    def stackXYZ(chunk, branch):
        # Put the fX, fY, fZ members of one position branch side by side on a new axis 1.
        position = chunk[branch]
        return np.stack(
            [position[member].to_numpy() for member in ("fX", "fY", "fZ")],
            axis=1,
        )

    for chunk in uproot.iterate(files, step_size=batchSize, filter_name=fields, library="ak"):
        # Only index 0 along the last axis is kept for the start position;
        # the list index [0] preserves that axis so the concatenate below lines up.
        firstStart = stackXYZ(chunk, 'tracks.startPos')[:, :, [0]]
        allStops = stackXYZ(chunk, 'tracks.stopPos')
        target = np.concatenate([firstStart, allStops], axis=2)

        charge = chunk["myChargeArray[3][3][256][512]"].to_numpy()
        features = np.moveaxis(np.sum(charge, axis=2), 1, -1)

        yield features, target
        
# Exhaust the generator once without using what it yields; the %%time report
# of the cell is the measurement.
for item in generator(files=inputObj):
    pass

CPU times: user 2min 24s, sys: 663 ms, total: 2min 25s
Wall time: 2min 24s


In [80]:
%%time

import io_functions as io
# Reload so that edits to io_functions take effect without restarting the kernel.
importlib.reload(io)

# Exhaust the module-level generator once over the same input; only the
# %%time report of the cell is of interest.
for item in io.generator(files=inputObj):
    pass

CPU times: user 1min 50s, sys: 10.3 s, total: 2min
Wall time: 2min


In [None]:




def saveDatasetToTFRecord(dataset, fileName):
    """Write `dataset` to a GZIP-compressed TFRecord file at `fileName`.

    Each element is first converted to a serialized-tensor byte string with
    tf.io.serialize_tensor; the resulting dataset of strings is then written
    with tf.data.experimental.TFRecordWriter.

    Args:
        dataset: tf.data.Dataset to store.
            NOTE(review): the map below passes each element straight to
            tf.io.serialize_tensor, so elements are assumed to be single
            tensors rather than tuples -- confirm against the caller.
        fileName: path of the TFRecord file to create.
    """
    # The original line read `.io.serialize_tensor` (missing `tf`), which is a
    # syntax error and prevented the cell from being defined at all.
    serialized = dataset.map(tf.io.serialize_tensor)
    writer = tf.data.experimental.TFRecordWriter(fileName, compression_type="GZIP")
    writer.write(serialized)

In [None]:
# Output file name and branch name for the start-position records; both are
# passed to generate_tfrecord further down.
path_tf = 'startPos.tfrecord'
item_of_TPCData_list = 'SimEvent/tracks/tracks.startPos'

In [None]:
# Output file name and branch name for the stop-position records.
# This rebinds path_tf / item_of_TPCData_list, which the startPos cell also assigns.
path_tf = 'stopPos.tfrecord'
item_of_TPCData_list = 'SimEvent/tracks/tracks.stopPos'

In [None]:
def generate_tfrecord(path_tf, item_of_TPCData_list):
    """Write one tf.train.Example per iteration step of TPCData into `path_tf`.

    Every Example holds three bytes features, "fX", "fY" and "fZ", each being
    the serialized tensor of the corresponding member of the
    `item_of_TPCData_list` entry of the current step.

    Args:
        path_tf: path of the TFRecord file to create.
        item_of_TPCData_list: name passed to TPCData.iterate and used again as
            the key into each chunk it returns.

    NOTE(review): `TPCData` is read as a global and is not defined in the part
    of the notebook shown here; it must exist in the kernel before this runs.
    """
    members = ("fX", "fY", "fZ")
    with tf.io.TFRecordWriter(path_tf) as file_writer:
        for chunk in TPCData.iterate(item_of_TPCData_list, step_size=1):
            position = chunk[item_of_TPCData_list]
            # One bytes feature per coordinate, keyed by the member name.
            feature = {
                member: tf.train.Feature(
                    bytes_list=BytesList(value=[tf.io.serialize_tensor(position[member]).numpy()])
                )
                for member in members
            }
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            file_writer.write(example.SerializeToString())

In [None]:
# Write both position files here. With a single call, a fresh top-to-bottom run
# would only produce the file named by whichever parameter cell ran last, while
# the merge cell further down reads 'stopPos.tfrecord' and 'startPos.tfrecord'.
# The loop leaves path_tf / item_of_TPCData_list at the stopPos values, which is
# what a linear run of the parameter cells leaves them at as well.
for path_tf, item_of_TPCData_list in (
    ('startPos.tfrecord', 'SimEvent/tracks/tracks.startPos'),
    ('stopPos.tfrecord', 'SimEvent/tracks/tracks.stopPos'),
):
    generate_tfrecord(path_tf, item_of_TPCData_list)

In [None]:
# Read the two per-branch TFRecord files back as one dataset of raw records.
list_of_tfrecord_files = ['stopPos.tfrecord', 'startPos.tfrecord']
dataset = tf.data.TFRecordDataset(list_of_tfrecord_files)

# Write those records out again into a single TFRecord file.
filename = 'root.tfrecord'
writer = tf.data.experimental.TFRecordWriter(filename)
writer.write(dataset)

In [None]:
#train_dataset = tf.data.Dataset.from_generator(
#     datasetGenerator,
#     output_signature=(
#         tf.TensorSpec(shape=(io.projections.shape), dtype=tf.float32),
#         tf.TensorSpec(shape=(9), dtype=tf.float64)))

In [None]:
# Dimensions of the zero-filled template array created below.
nStrips, nTimeSlices, nProj = 256, 512, 3

# All-zero float array of shape (nStrips, nTimeSlices, nProj).
projections = np.zeros(shape=(nStrips, nTimeSlices, nProj))

In [None]:
def generate_tfrecord_tfDataset(path_tf):
    """Write every element of the global `train_dataset` to `path_tf` as a tf.train.Example.

    `train_dataset` is built in this notebook with a two-component
    output_signature, so iterating it yields tuples. The original code passed
    the whole tuple to tf.io.serialize_tensor, which cannot pack two tensors
    of different shape and dtype into one. Each component is now serialized
    on its own:

        "projections": first component of the element
        "target":      second component (only written for tuple elements)

    If `train_dataset` yields single tensors instead, the element is stored
    under "projections" exactly as before.

    Args:
        path_tf: path of the TFRecord file to create.
    """
    def bytesFeature(tensor):
        # Wrap one tensor as a bytes feature holding its serialized form.
        return tf.train.Feature(
            bytes_list=BytesList(value=[tf.io.serialize_tensor(tensor).numpy()])
        )

    with tf.io.TFRecordWriter(path_tf) as file_writer:
        for element in train_dataset:
            if isinstance(element, (tuple, list)):
                features, target = element
            else:
                features, target = element, None

            feature = {"projections": bytesFeature(features)}
            if target is not None:
                feature["target"] = bytesFeature(target)

            example = tf.train.Example(features=tf.train.Features(feature=feature))
            file_writer.write(example.SerializeToString())

In [None]:
#generate_tfrecord_tfDataset('projections_test.tfrecord')