In [127]:
import pathlib
import glob
import os
import pandas as pd
import numpy as np
from collections import OrderedDict

import tensorflow as tf

# Choose the data root from the login name: when USER contains "plg" the
# Prometheus path is used, otherwise the local scratch path.
workerEnv = str(os.getenv("USER"))
workOnPrometheus = "plg" in workerEnv
inputDataPrefix = (
    "/net/people/plgakalinow/plggcmsml/"
    if workOnPrometheus
    else "/home/user1/scratch/akalinow/"
)
# Both prefixes end with "/" and the suffix starts with "/", so the resulting
# path contains a doubled slash.
dataDir = inputDataPrefix+"/ProgrammingProjects/MachineLearning/TauTauMass/data/"

## Transform pickled data to pandas DataFrame

In [113]:
def getNumpyMatricesFromRawData(filePath):
    """Read a pickled data file and flatten it into a pandas DataFrame.

    The pickle must unpack into four objects, in this order:
    ``legs, jets, global_params, properties``. Only the following parts
    are read:

    * ``global_params["genMass"]``, ``["visMass"]``, ``["caMass"]`` and
      ``["fastMTTMass"]`` -- each becomes a single column,
    * ``legs[0]`` and ``legs[1]`` -- transposed and stored as the
      ``leg1_*`` / ``leg2_*`` columns (e, px, py, pz),
    * ``properties["leg_2_decayMode"]`` -- a single column,
    * ``jets[0][0:3]`` -- transposed and stored as ``met, met_x, met_y``.

    NOTE(review): the transposes assume ``legs[i]`` and ``jets[0]`` are laid
    out as (component, entry) -- confirm against the code that writes the
    pickle.

    Other entries (further legs/jets, other properties) are not required to
    be present; the summary printout only reports their count and keys.

    Parameters
    ----------
    filePath : str or path-like
        Path of the pickle file, handed directly to ``pd.read_pickle``.

    Returns
    -------
    pandas.DataFrame
        One row per entry, 16 columns: ``genMass, visMass, caMass, fastMTT,
        leg1_e, leg1_px, leg1_py, leg1_pz, leg2_e, leg2_px, leg2_py,
        leg2_pz, leg_2_decayMode, met, met_x, met_y``.

    Raises
    ------
    KeyError, IndexError
        If one of the entries listed above is missing from the pickle.
    ValueError
        If the pieces do not all have the same number of entries.
    """
    # SECURITY: unpickling executes code embedded in the file. Only call this
    # on files from a trusted source.
    legs, jets, global_params, properties = pd.read_pickle(filePath)
    # Sort by key so the printed list of properties has a stable order.
    properties = OrderedDict(sorted(properties.items(), key=lambda t: t[0]))

    print("no of legs: ", len(legs))
    print("no of jets: ", len(jets))
    print("global params: ", global_params.keys())
    print("object properties:",properties.keys())

    def asColumn(values):
        # Turn a flat sequence into an (nEntries, 1) column for hstack.
        return np.reshape(np.array(values), (-1, 1))

    def asRows(values):
        # Swap axes so that each entry becomes one row.
        return np.transpose(np.array(values))

    genMass = asColumn(global_params["genMass"])
    visMass = asColumn(global_params["visMass"])
    caMass = asColumn(global_params["caMass"])
    fastMTT = asColumn(global_params["fastMTTMass"])
    leg1P4 = asRows(legs[0])
    leg2P4 = asRows(legs[1])
    leg2Properties = asColumn(properties["leg_2_decayMode"])
    # Only the first three components of jets[0] are kept.
    met = asRows(jets[0][0:3])

    columns = ["genMass", "visMass", "caMass", "fastMTT",
               "leg1_e", "leg1_px","leg1_py","leg1_pz",
               "leg2_e", "leg2_px","leg2_py","leg2_pz",
               "leg_2_decayMode",
               "met", "met_x", "met_y"]

    # The order of the stacked pieces must match `columns` above.
    features = np.hstack((genMass, visMass, caMass, fastMTT,
                          leg1P4, leg2P4, leg2Properties, met))
    return pd.DataFrame(data=features, columns=columns)

## Load train and test datasets

In [125]:
fileNames = ['htt_features_train.pkl','htt_features_DY_ggH125.pkl']

# Fixed seed so the shuffled row order written to disk is the same on every
# re-run of this cell.
shuffleSeed = 0

for fileName in fileNames:
    # Build the input path from the file being processed. It was previously
    # read from a `dataPath` variable that no visible cell defines, which
    # fails on a fresh kernel.
    dataPath = dataDir+fileName
    print("Processing file:",dataPath)
    # Path.stem removes only the final ".pkl" suffix (and any directory part).
    # str.rstrip(".pkl") strips trailing '.', 'p', 'k', 'l' characters instead
    # and would mangle names such as "bulk.pkl".
    label = pathlib.Path(fileName).stem
    parquetFile = dataDir+'df.parquet_{}.gzip'.format(label)
    print(parquetFile)
    df = getNumpyMatricesFromRawData(dataPath)
    # frac=1.0 keeps every row and only permutes their order.
    df = df.sample(frac=1.0, random_state=shuffleSeed)
    df.to_parquet(parquetFile, compression='gzip')

Processing file: /home/user1/scratch/akalinow//ProgrammingProjects/MachineLearning/TauTauMass/data/htt_features_train.pkl
/home/user1/scratch/akalinow//ProgrammingProjects/MachineLearning/TauTauMass/data/df.parquet_htt_features_train.gzip
no of legs:  4
no of jets:  3
global params:  dict_keys(['genMass', 'caMass', 'covMET11', 'covMET10', 'fastMTTMass', 'visMass', 'covMET00', 'covMET01'])
object properties: odict_keys(['leg_1_charge', 'leg_1_combreliso', 'leg_2_DPFTau_2016_v1tauVSall', 'leg_2_byCombinedIsolationDeltaBetaCorrRaw3Hits', 'leg_2_byIsolationMVArun2v1DBoldDMwLTraw', 'leg_2_byIsolationMVArun2v1DBoldDMwLTraw2017v2', 'leg_2_charge', 'leg_2_chargedIsoPtSum', 'leg_2_decayDistMag', 'leg_2_decayMode', 'leg_2_deepTau2017v1tauVSall', 'leg_2_deepTau2017v1tauVSjet', 'leg_2_dxy', 'leg_2_dxy_Sig', 'leg_2_eRatio', 'leg_2_flightLengthSig', 'leg_2_gjAngleDiff', 'leg_2_hasSecondaryVertex', 'leg_2_ip3d', 'leg_2_nPhoton', 'leg_2_neutralIsoPtSum', 'leg_2_photonPtSumOutsideSignalCone', 'leg_2_pt

## Importing data into TF

In [134]:
# Load the parquet file holding the "htt_features_train" frame.
parquetFile = dataDir + 'df.parquet_htt_features_train.gzip'
df = pd.read_parquet(parquetFile)

# pop() removes "genMass" from df in place, so the remaining columns of df
# are exactly the feature columns.
labels = df.pop("genMass")
features = df.to_numpy()

# Slice along the first axis: each dataset element is (feature row, label).
tensorPair = (features, labels)
dataset = tf.data.Dataset.from_tensor_slices(tensorPair)
print(dataset)

<TensorSliceDataset shapes: ((15,), ()), types: (tf.float64, tf.float64)>
