In [2]:
import pathlib
import glob
import os
import pandas as pd
import numpy as np
from root_numpy import root2array
from root_pandas import read_root
from collections import OrderedDict

import tensorflow as tf

# Choose where the input data lives from the login name: accounts containing
# "plg" select the /net/people/... prefix, everything else the local scratch area.
workerEnv = str(os.getenv("USER"))
workOnPrometheus = "plg" in workerEnv
inputDataPrefix = (
    "/net/people/plgakalinow/plggcmsml/"
    if workOnPrometheus
    else "/home/user1/scratch/akalinow/"
)
# Both prefixes already end in "/", so dataDir contains a harmless "//".
dataDir = inputDataPrefix+"/ProgrammingProjects/MachineLearning/TauTauMass/data/15_09_2020/"

Welcome to JupyROOT 6.18/04


## Transform pickled data to pandas DataFrame

In [2]:
# Preview the "Summary/tree" tree: iterate over it in chunks of 1000 rows
# and stop after printing the first chunk.
fileName = "/home/user1/scratch/akalinow/ProgrammingProjects/RootAnalysis/build/RootAnalysis_SVfitMLAnalysisMuTau_Pythia8.root"

chunkReader = read_root(paths=fileName, key="Summary/tree", chunksize=1000)
for iChunk, dfChunk in enumerate(chunkReader):
    print("\tProcessing chunk: {}".format(iChunk))
    print(dfChunk)
    # One chunk is enough for inspection.
    break

	Processing chunk: 0
     eventWeight  sampleType     genMass     visMass      caMass  fastMTTMass  \
0            1.0         0.0  300.463104  146.578888  262.584076   261.000061   
1            1.0         0.0  298.187927  126.315224  329.410095   311.258087   
2            1.0         0.0  300.600220  155.025940  357.113220   380.608032   
3            1.0         0.0  302.933014  197.339188  310.018890   315.109985   
4            1.0         0.0  302.955383  227.782898  321.198700   320.852264   
..           ...         ...         ...         ...         ...          ...   
995          1.0         0.0  303.151154  118.697479  302.291138   158.263138   
996          1.0         0.0  302.037842  137.270767  324.324402   318.719910   
997          1.0         0.0  266.697906  188.864731  265.739258   266.429047   
998          1.0         0.0  301.575165  192.724030  268.356659   279.543030   
999          1.0         0.0  301.725769  126.662796  292.734711   295.039886   

     c

In [3]:
def getNumpyMatricesFromRawData(filePath):
    """Load a pickled event sample and flatten it into a pandas DataFrame.

    The pickle must contain a 4-tuple ``(legs, jets, global_params, properties)``:

    * ``legs[0]`` / ``legs[1]`` -- four-momentum components of the two legs.
    * ``jets[0]`` -- only its first three entries are used; they are stored in
      the ``met``, ``met_x`` and ``met_y`` columns.
    * ``global_params`` -- mapping with the keys ``genMass``, ``visMass``,
      ``caMass``, ``fastMTTMass`` and ``covMET00/01/10/11``.
    * ``properties`` -- mapping with at least the key ``leg_2_decayMode``.

    NOTE(review): each ``legs[i]`` and ``jets[0]`` is assumed to be laid out
    component-major, i.e. shape ``(4, nEvents)`` in (e, px, py, pz) order; the
    transposes and the 20 column names below only line up under that
    assumption -- confirm against the producer of the pickle.

    Only the inputs that end up in the output are read, so files that lack
    generator-level legs, extra jets or ``leg_1_combreliso`` are accepted.

    Parameters
    ----------
    filePath : str or path-like
        Location of the pickle file.

    Returns
    -------
    pandas.DataFrame
        One row per event with 20 columns (see ``columns`` below).

    Raises
    ------
    KeyError
        If a required key is missing from ``global_params`` or ``properties``.
    ValueError
        If the stacked arrays do not match the 20 expected columns.
    """
    # NOTE(security): unpickling executes arbitrary code embedded in the file;
    # only call this on files from a trusted source.
    legs, jets, global_params, properties = pd.read_pickle(filePath)
    properties = OrderedDict(sorted(properties.items(), key=lambda t: t[0]))

    print("no of legs: ", len(legs))
    print("no of jets: ", len(jets))
    print("global params: ", global_params.keys())
    print("object properties:",properties.keys())

    def asColumn(values):
        # Per-event scalars become (nEvents, 1) so they can be hstack-ed.
        return np.reshape(np.array(values), (-1, 1))

    # Order here must match the first eight entries of `columns`; note that
    # the "fastMTT" column is filled from the "fastMTTMass" key.
    globalKeys = ("genMass", "visMass", "caMass", "fastMTTMass",
                  "covMET00", "covMET01", "covMET10", "covMET11")
    globalColumns = [asColumn(global_params[key]) for key in globalKeys]

    # Transpose so that events run along the rows.
    leg1P4 = np.transpose(np.array(legs[0]))
    leg2P4 = np.transpose(np.array(legs[1]))
    leg2Properties = asColumn(properties["leg_2_decayMode"])
    met = np.transpose(np.array(jets[0][0:3]))

    columns = ["genMass", "visMass", "caMass", "fastMTT",
               "covMET00", "covMET01", "covMET10", "covMET11",
               "leg1_e", "leg1_px","leg1_py","leg1_pz",
               "leg2_e", "leg2_px","leg2_py","leg2_pz",
               "leg_2_decayMode",
               "met", "met_x", "met_y"]

    features = np.hstack(globalColumns + [leg1P4, leg2P4, leg2Properties, met])
    return pd.DataFrame(data=features, columns=columns)

## Load train and test datasets (convert ROOT files to shuffled Parquet files)

In [4]:
dataDir = inputDataPrefix+"/ProgrammingProjects/MachineLearning/TauTauMass/data/15_09_2020/"
# glob returns matches in arbitrary order; sort so every run processes the
# files in the same sequence.
fileNames = sorted(glob.glob(dataDir + 'RootAnalysis_SVfitMLAnalysisMuTau*.root'))

#fileNames = ["/home/user1/scratch/akalinow/ProgrammingProjects/RootAnalysis/build/RootAnalysis_SVfitMLAnalysisMuTau_Pythia8.root",
#             "/home/user1/scratch/akalinow/ProgrammingProjects/RootAnalysis/build/RootAnalysis_SVfitMLAnalysisMuTau_Pythia8_smearMET.root",
#             "/home/user1/scratch/akalinow/ProgrammingProjects/RootAnalysis/build/RootAnalysis_SVfitMLAnalysisMuTau_DY_ggH125.root"
#            ]

# Fixed seed so the shuffled row order written to disk is reproducible.
shuffleSeed = 42

for fileName in fileNames:
    # glob already returns the full path, so dataDir must not be prepended again.
    print("Processing file:", fileName)
    label = fileName.split("/")[-1]
    # Remove the extension as a suffix. str.rstrip() would instead strip any
    # trailing run of the characters ".pkl" / ".root" and can eat into the name.
    for suffix in (".pkl", ".root"):
        if label.endswith(suffix):
            label = label[:-len(suffix)]
    path = str(pathlib.Path(fileName).parent)+"/"
    # The odd "df.parquet_<label>.gzip" name is kept because later cells read it back.
    parquetFile = path+'df.parquet_{}.gzip'.format(label)
    # Pickled inputs would go through getNumpyMatricesFromRawData(fileName) instead.
    df = read_root(paths=fileName, key="Summary/tree")
    # frac=1.0 keeps every row and only randomises their order.
    df = df.sample(frac=1.0, random_state=shuffleSeed)
    df.to_parquet(parquetFile, compression='gzip')

Processing file: /home/user1/scratch/akalinow//ProgrammingProjects/MachineLearning/TauTauMass/data/15_09_2020//home/user1/scratch/akalinow//ProgrammingProjects/MachineLearning/TauTauMass/data/15_09_2020/RootAnalysis_SVfitMLAnalysisMuTau_Pythia8.root


## Importing data into TF

In [3]:
# Read the shuffled sample back from Parquet, split off genMass as the labels
# and wrap the remaining columns plus labels in a tf.data.Dataset.
parquetFile = dataDir + 'df.parquet_RootAnalysis_SVfitMLAnalysisMuTau_Pythia8.gzip'
df = pd.read_parquet(parquetFile)
# pop() removes the column from df, so `features` no longer contains genMass.
labels = df.pop("genMass")
features = df.values
tensorPair = (features, labels)
dataset = tf.data.Dataset.from_tensor_slices(tensorPair)
print(dataset)
print(labels)
# Show just the first (features, label) element as a sanity check.
for item in dataset.take(1):
    print(item)

<TensorSliceDataset shapes: ((38,), ()), types: (tf.float64, tf.float32)>
1803439    270.445007
1893565    150.100311
1987509     92.666321
46326      297.996185
861716      98.532669
              ...    
359049     184.680939
775161     147.959152
622367     251.188965
1262186    211.023300
509647     211.381958
Name: genMass, Length: 2054631, dtype: float32
(<tf.Tensor: shape=(38,), dtype=float64, numpy=
array([ 1.00000000e+00,  0.00000000e+00,  1.58413696e+02,  2.93184998e+02,
        2.90967590e+02,  9.99999978e-03,  0.00000000e+00,  0.00000000e+00,
        9.99999978e-03,  5.23411705e+01,  7.50387624e+00,  4.01947952e+01,
        3.26750249e+01,  1.20929497e+02, -2.00881389e+01, -7.66965717e+01,
       -9.13016417e+01,  5.23411705e+01,  7.50387624e+00,  4.01947952e+01,
        3.26750249e+01,  1.20929497e+02, -2.00881389e+01, -7.66965717e+01,
       -9.13016417e+01,  4.42923396e+01,  6.33742140e+00,  4.38366107e+01,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.000

2020-09-16 09:00:36.114390: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2020-09-16 09:00:36.114404: E tensorflow/stream_executor/cuda/cuda_driver.cc:351] failed call to cuInit: UNKNOWN ERROR (303)
2020-09-16 09:00:36.114419: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:163] no NVIDIA GPU device is present: /dev/nvidia0 does not exist
2020-09-16 09:00:36.114756: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2020-09-16 09:00:36.120818: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 3600000000 Hz
2020-09-16 09:00:36.121556: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55fba8bb6520 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-09-16 09:00:36.12156