In [1]:
import pathlib
import glob
import os
import pandas as pd
import numpy as np
from root_numpy import root2array
from root_pandas import read_root
from collections import OrderedDict

import tensorflow as tf

# Choose the input data prefix from the login name: when $USER contains
# "plg" the /net/people/... prefix is used, otherwise the local scratch area.
workerEnv = str(os.getenv("USER"))
workOnPrometheus = "plg" in workerEnv
inputDataPrefix = (
    "/net/people/plgakalinow/plggcmsml/"
    if workOnPrometheus
    else "/home/user1/scratch/akalinow/"
)
dataDir = inputDataPrefix+"/ProgrammingProjects/MachineLearning/TauTauMass/data/25_01_2021/"

Welcome to JupyROOT 6.18/04


## Transform pickled data to pandas DataFrame

In [None]:
fileName = "/home/user1/scratch/akalinow/ProgrammingProjects/RootAnalysis/build_Docker/RootAnalysis_SVfitMLAnalysisMuTau_Pythia8.root"

# Quick look at the input: read the "Summary/tree" tree in chunks of 1000
# entries, print the first chunk and stop.
for iChunk, dfChunk in enumerate(
    read_root(paths=fileName, key="Summary/tree", chunksize=1000)
):
    print("\tProcessing chunk: {}".format(iChunk))
    print(dfChunk)
    break

In [3]:
def getNumpyMatricesFromRawData(filePath):
    """Read a pickled event sample and return it as a flat pandas DataFrame.

    The pickle is expected to unpack into four objects,
    ``(legs, jets, global_params, properties)``:

    * ``legs[0]`` and ``legs[1]`` are transposed and become the
      ``leg1_*`` / ``leg2_*`` columns (e, px, py, pz)
      -- assumes each is laid out as (4, nEvents); TODO confirm.
    * ``jets[0][0:3]`` is transposed and becomes ``met``, ``met_x``, ``met_y``.
    * ``global_params`` must provide ``genMass``, ``visMass``, ``caMass``,
      ``fastMTTMass`` and ``covMET00/01/10/11``; each becomes one column.
    * ``properties`` must provide ``leg_2_decayMode``.

    Other entries of ``legs``, ``jets`` and ``properties`` are not used and
    therefore not required to be present.

    Parameters
    ----------
    filePath : str or path-like
        Location of the pickle file, passed to ``pandas.read_pickle``.

    Returns
    -------
    pandas.DataFrame
        One row per event with 20 columns, in the order given by ``columns``
        below.

    Raises
    ------
    KeyError, IndexError
        If one of the entries listed above is missing from the pickle.
    """
    # NOTE(security): unpickling executes arbitrary code embedded in the
    # file. Only call this on pickle files coming from a trusted source.
    legs, jets, global_params, properties = pd.read_pickle(filePath)
    properties = OrderedDict(sorted(properties.items(), key=lambda t: t[0]))

    print("no of legs: ", len(legs))
    print("no of jets: ", len(jets))
    print("global params: ", global_params.keys())
    print("object properties:",properties.keys())

    def asColumn(values):
        # Per-event scalar quantity -> (nEvents, 1) column for np.hstack.
        return np.reshape(np.array(values), (-1, 1))

    # Keys of global_params, listed in output column order. Note that the
    # "fastMTTMass" entry is exported under the column name "fastMTT".
    globalKeys = ("genMass", "visMass", "caMass", "fastMTTMass",
                  "covMET00", "covMET01", "covMET10", "covMET11")
    globalColumns = [asColumn(global_params[key]) for key in globalKeys]

    # Four-momenta are stored component-major; transpose to one row per event.
    leg1P4 = np.transpose(np.array(legs[0]))
    leg2P4 = np.transpose(np.array(legs[1]))
    leg2Properties = asColumn(properties["leg_2_decayMode"])
    # Only the first three components of jets[0] are exported (as met, met_x, met_y).
    met = np.transpose(np.array(jets[0][0:3]))

    columns = ["genMass", "visMass", "caMass", "fastMTT",
               "covMET00", "covMET01", "covMET10", "covMET11",
               "leg1_e", "leg1_px","leg1_py","leg1_pz",
               "leg2_e", "leg2_px","leg2_py","leg2_pz",
               "leg_2_decayMode",
               "met", "met_x", "met_y"]

    features = np.hstack((*globalColumns, leg1P4, leg2P4, leg2Properties, met))
    return pd.DataFrame(data=features, columns=columns)

## Convert ROOT files to shuffled parquet datasets

In [4]:
# Convert every matching ROOT file in dataDir into a shuffled, gzip-compressed
# parquet file written next to the input.
dataDir = inputDataPrefix+"/ProgrammingProjects/MachineLearning/TauTauMass/data/25_01_2021/"
fileNames = glob.glob(dataDir + 'RootAnalysis_SVfitMLAnalysisMuTau*.root')

#fileNames = ["/home/user1/scratch/akalinow/ProgrammingProjects/RootAnalysis/build/RootAnalysis_SVfitMLAnalysisMuTau_Pythia8.root",
#             "/home/user1/scratch/akalinow/ProgrammingProjects/RootAnalysis/build/RootAnalysis_SVfitMLAnalysisMuTau_Pythia8_smearMET.root",
#             "/home/user1/scratch/akalinow/ProgrammingProjects/RootAnalysis/build/RootAnalysis_SVfitMLAnalysisMuTau_DY_ggH125.root"
#            ]
 
for fileName in fileNames:
    # glob.glob() already returns paths that include dataDir, so the path is
    # printed as-is (prepending dataDir again would duplicate the directory).
    print("Processing file:",fileName)
    label = pathlib.Path(fileName).name
    # Drop a trailing ".pkl" and/or ".root" extension. str.rstrip() must not
    # be used for this: it strips a *set of characters*, so it would also eat
    # trailing letters of the actual name (e.g. "foo_boost.root" -> "foo_b").
    for suffix in (".pkl", ".root"):
        if label.endswith(suffix):
            label = label[:-len(suffix)]
    path = str(pathlib.Path(fileName).parent)+"/"
    parquetFile = path+'df.parquet_{}.gzip'.format(label)
    #df = getNumpyMatricesFromRawData(fileName)
    df = read_root(paths=fileName, key="Summary/tree")
    # Shuffle all rows before writing.
    # NOTE(review): the shuffle is unseeded, so the row order differs between
    # runs; pass random_state=... to sample() if reproducibility is needed.
    df = df.sample(frac=1.0)
    df.to_parquet(parquetFile, compression='gzip')

Processing file: /home/user1/scratch/akalinow//ProgrammingProjects/MachineLearning/TauTauMass/data/25_01_2021//home/user1/scratch/akalinow//ProgrammingProjects/MachineLearning/TauTauMass/data/25_01_2021/RootAnalysis_SVfitMLAnalysisMuTau_Pythia8.root


## Importing data into TF

In [5]:
# Read back the parquet file produced for the Pythia8 sample.
parquetFile = dataDir + 'df.parquet_RootAnalysis_SVfitMLAnalysisMuTau_Pythia8.gzip'
df = pd.read_parquet(parquetFile)

# Take the "genMass" column out of the frame to serve as labels; every
# remaining column goes into the feature matrix.
labels = df.pop("genMass")
features = df.values
dataset = tf.data.Dataset.from_tensor_slices((features, labels))

# Show the dataset spec, the labels and the first (features, label) element.
print(dataset)
print(labels)
for item in dataset.take(1):
    print(item)

<TensorSliceDataset shapes: ((38,), ()), types: (tf.float64, tf.float32)>
1080323    287.847656
1617005    159.656158
833214     237.852951
1663734    167.583023
243256      94.823570
              ...    
2404742     80.385017
2890340    186.847946
1657987    166.550674
3073100    233.095627
2613613    122.759987
Name: genMass, Length: 3404302, dtype: float32
(<tf.Tensor: shape=(38,), dtype=float64, numpy=
array([ 1.00000000e+00,  0.00000000e+00,  1.77789902e+02,  2.89348358e+02,
        2.86383240e+02,  9.99999978e-03,  0.00000000e+00,  0.00000000e+00,
        9.99999978e-03,  1.26785290e+02,  4.01327466e+01, -1.79280704e+01,
       -1.18922015e+02,  6.59298203e+01, -4.21450848e+01, -9.42902597e+00,
        4.98032481e+01,  1.26785290e+02,  4.01327466e+01, -1.79280704e+01,
       -1.18922015e+02,  6.59298203e+01, -4.21450848e+01, -9.42902597e+00,
        4.98032481e+01,  4.26391736e+01,  3.60728146e+01, -2.27343611e+01,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.000

2021-01-25 09:58:13.725772: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-01-25 09:58:13.725787: E tensorflow/stream_executor/cuda/cuda_driver.cc:351] failed call to cuInit: UNKNOWN ERROR (303)
2021-01-25 09:58:13.725802: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:163] no NVIDIA GPU device is present: /dev/nvidia0 does not exist
2021-01-25 09:58:13.726260: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2021-01-25 09:58:13.752642: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 3600000000 Hz
2021-01-25 09:58:13.753297: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55d2e1d60490 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-01-25 09:58:13.75330