In [None]:
import uproot as uproot
import pandas as pd
import numpy as np
from os import listdir, walk
from os.path import isfile, join, isdir
import os
import h5py
import awkward as awkward
#import awkward0 as awkward0
import time
from itertools import combinations
import sys
import vector

In [6]:
# Usage summary shown when the library is loaded. The entries are printed one
# per line (sep="\n"), exactly as a sequence of individual print calls would.
print(
    "This library contains handy functions to ease the access and use of the 13TeV ATLAS OpenData release",
    "\ngetBkgCategories()",
    "\t Dumps the name of the various background cataegories available \n\t as well as the number of samples contained in each category.\n\t Returns a vector with the name of the categories",
    "\ngetSamplesInCategory(cat)",
    "\t Dumps the name of the samples contained in a given category (cat)\n\t Returns dictionary with keys being DSIDs and values physics process name from filename.",
    "\ngetMCCategory()",
    "\t Returns dictionary with keys DSID and values MC category",
    "\ninitialize(indir)",
    "\t Collects all the root files available in a certain directory (indir)",
    "\ngetSkims(indir)",
    "\t Prints all available skims in the directory",
    "\n\n",
    "Setting luminosity to 10064 pb^-1\n",
    sep="\n",
)

# Integrated luminosity, in pb^-1 as stated by the message printed above.
lumi = 10064.0

This library contains handy functions to ease the access and use of the 13TeV ATLAS OpenData release

getBkgCategories()
	 Dumps the name of the various background cataegories available 
	 as well as the number of samples contained in each category.
	 Returns a vector with the name of the categories

getSamplesInCategory(cat)
	 Dumps the name of the samples contained in a given category (cat)
	 Returns dictionary with keys being DSIDs and values physics process name from filename.

getMCCategory()
	 Returns dictionary with keys DSID and values MC category

initialize(indir)
	 Collects all the root files available in a certain directory (indir)

getSkims(indir)
	 Prints all available skims in the directory



Setting luminosity to 10064 pb^-1



# Metadata

The following cells define the categories of SM background and signal samples available in the 13 TeV OpenData release. They are read from the sample lists `Background_samples_13TeV.txt` and `Signal_samples_13TeV.txt`.

In [None]:
# Two independent, initially empty module-level lists.
mcfiles, datafiles = [], []

In [None]:
# Locate the background sample list: prefer a copy in the working directory,
# otherwise fall back to <sys.path[1]>/Input/.
# NOTE(review): sys.path[1] depends on how the interpreter was started - confirm
# that it points at the directory holding "Input/" in every environment used.
bkg_list_path = "Background_samples_13TeV.txt"
if not os.path.isfile(bkg_list_path):
    bkg_list_path = sys.path[1]+"/Input/Background_samples_13TeV.txt"

# Sample name -> {"cat": category, "DSID": int}. The signal dictionary is
# created here as well; it is filled from the signal sample list.
bkg_dsid_toplot = {}
sig_dsid_toplot = {}

# The context manager closes the file even if a malformed DSID makes int() raise.
with open(bkg_list_path, "r") as infofile:
    for line in infofile:
        if line.startswith("#"): continue
        fields = line.split()
        # Each entry must be exactly "<name> <dsid> <category>"; anything else
        # (blank lines, partial lines) is skipped.
        if len(fields) != 3:
            continue
        fname,dsid,cat = fields
        bkg_dsid_toplot[fname] = {"cat":cat,"DSID":int(dsid)}

In [None]:
# Locate the signal sample list: prefer a copy in the working directory,
# otherwise fall back to <sys.path[1]>/Input/.
# NOTE(review): sys.path[1] depends on how the interpreter was started - confirm
# that it points at the directory holding "Input/" in every environment used.
sig_list_path = "Signal_samples_13TeV.txt"
if not os.path.isfile(sig_list_path):
    sig_list_path = sys.path[1]+"/Input/Signal_samples_13TeV.txt"

# Fill sig_dsid_toplot (sample name -> {"cat": category, "DSID": int}).
# The context manager closes the file even if a malformed DSID makes int() raise.
with open(sig_list_path, "r") as infofile:
    for line in infofile:
        if line.startswith("#"): continue
        fields = line.split()
        # Each entry must be exactly "<name> <dsid> <category>"; anything else
        # (blank lines, partial lines) is skipped.
        if len(fields) != 3:
            continue
        fname,dsid,cat = fields
        sig_dsid_toplot[fname] = {"cat":cat,"DSID":int(dsid)}

In [4]:
def initialize(indir):
    """Group the ROOT files found in ``indir`` by sample category.

    The second dot-separated field of each file name is looked up in
    ``bkg_dsid_toplot`` and ``sig_dsid_toplot`` (a signal match takes precedence).
    A file that matches no sample but has "data_" in its name goes into the
    category "data". Any other file is treated as a merged ("hadded") file
    named ``<cat>.root`` and is stored under that name with an empty type.

    Known samples for which no file was found are listed in a warning, unless
    the directory holds data files only.

    Parameters
    ----------
    indir : str
        Directory that is scanned (not recursively) for ``*.root`` files.

    Returns
    -------
    dict
        ``{category: {"files": [paths], "type": "bkg" | "signal" | "data" | "",
        "dsid": [str]}}``. Each "dsid" entry is the text after the last "_" in
        the first dot-separated field of the file name. The dictionary is empty
        when ``indir`` contains no ROOT files.
    """
    filedic = {}
    files = [f for f in listdir(indir) if (isfile(join(indir, f)) and f.endswith(".root"))]

    # Register every known sample up front so that a sample without a file is
    # always reported, independent of which files happen to be present.
    isFound = {key: False for key in list(bkg_dsid_toplot) + list(sig_dsid_toplot)}

    for f in files:
        ident = f.split(".")[1]
        cat = ""
        typ = ""
        if ident in bkg_dsid_toplot:
            cat = bkg_dsid_toplot[ident]["cat"]
            typ = "bkg"
            isFound[ident] = True
        if ident in sig_dsid_toplot:
            cat = sig_dsid_toplot[ident]["cat"]
            typ = "signal"
            isFound[ident] = True
        if "data_" in f and not cat:
            cat = "data"
            typ = "data"

        # True when the file matched a listed sample or was recognised as data
        in_sample_list = bool(cat)
        if not cat:
            # the hadded files have cat in their name (i.e. <cat>.root)
            cat = f.split(".")[0]

        if cat:
            entry = filedic.setdefault(cat, {"files":[],"type":typ,"dsid":[]})
            entry["files"].append(indir+"/"+f)
            entry["dsid"].append(f.split(".")[0].split("_")[-1])
        else:
            print("WARNING \t Could not find category for %s"%ident)
        if not in_sample_list:
            print("WARNING \t File %s not added as sample in Background_samples_13TeV.txt/Signal_samples_13TeV.txt"%(f))

    # A directory holding only data is not expected to contain any MC sample,
    # so the missing-sample report is skipped in that case. The decision is
    # based on all files found rather than on whichever file was listed last.
    data_only = bool(filedic) and set(filedic) == {"data"}
    if not data_only:
        missing_sig = [key for key, found in isFound.items()
                       if not found and key in sig_dsid_toplot]
        missing_bkg = [key for key, found in isFound.items()
                       if not found and key not in sig_dsid_toplot and key in bkg_dsid_toplot]
        for title, missing in (("SIGNAL SAMPLES", missing_sig), ("BACKGROIUND SAMPLES", missing_bkg)):
            if not missing:
                continue
            report = "#"*100 + "\n" + title + "\n" + "#"*100 + "\n"
            for key in missing:
                report += "WARNING \t File for %s not found in %s\n"%(key,indir)
            print(report)
    return filedic

In [None]:
def getSignalCategories():
    """Print a table of the signal categories and return their sorted names.

    The number of samples per category is counted from ``sig_dsid_toplot``.

    Returns
    -------
    list
        Category names in sorted order.
    """
    counts = {}
    for info in sig_dsid_toplot.values():
        counts[info["cat"]] = counts.get(info["cat"], 0) + 1
    categories = sorted(counts)

    frame = "#"*31
    print(frame)
    print("#### Signal categories ####")
    print(frame)
    print("%-20s %10s"%("Category","N(samples)"))
    print("-"*31)
    for name in categories:
        print("%-20s %10s"%(name,counts[name]))
    return categories

In [None]:
def getBkgCategories():
    """Print a table of the background categories and return their sorted names.

    The number of samples per category is counted from ``bkg_dsid_toplot``.

    Returns
    -------
    list
        Category names in sorted order.
    """
    n_per_cat = {}
    for sample in bkg_dsid_toplot:
        category = bkg_dsid_toplot[sample]["cat"]
        n_per_cat.setdefault(category, 0)
        n_per_cat[category] += 1

    header = ["#"*31, "#### Background categories ####", "#"*31,
              "%-20s %10s"%("Category","N(samples)"), "-"*31]
    for text in header:
        print(text)

    ordered = sorted(n_per_cat.keys())
    for category in ordered:
        print("%-20s %10s"%(category,n_per_cat[category]))
    return ordered

In [None]:
def getSamplesInCategory(cat):
    """Print and return the samples that belong to category ``cat``.

    Both the background and the signal sample lists are searched; on a DSID
    clash the signal entry replaces the background one.

    Parameters
    ----------
    cat : str
        Category name to look up.

    Returns
    -------
    dict
        ``{DSID: sample name}`` for every sample in the category.
    """
    samples = {}
    for table in (bkg_dsid_toplot, sig_dsid_toplot):
        for sample_name, info in table.items():
            if info["cat"] == cat:
                samples[info["DSID"]] = sample_name

    frame = "#"*31
    print(frame)
    print("####### Category %s #######" %cat)
    print(frame)
    print("%-20s %10s"%("DSID","Description"))
    print("-"*31)
    for dsid in sorted(samples):
        print("%-20s %10s"%(dsid,samples[dsid]))
    return samples

In [None]:
def getMCCategory(filedic):
    """Invert a file dictionary into a DSID -> category lookup.

    Parameters
    ----------
    filedic : dict
        Mapping ``{category: {"dsid": [...], ...}}`` as returned by ``initialize``.

    Returns
    -------
    dict
        ``{dsid: category}``; if a DSID is listed under several categories the
        last category (in iteration order of ``filedic``) wins.
    """
    return {dsid: category
            for category, entry in filedic.items()
            for dsid in entry["dsid"]}

In [None]:
# Print the category tables once and keep the sorted category names.
Backgrounds = getBkgCategories()
Signals = getSignalCategories()

In [7]:
def getSkims(folder):
    """Print the name of every sub-directory of ``folder``, one per line."""
    for entry in listdir(folder):
        if isdir(join(folder, entry)):
            print(entry)
    

# Plotting

In [None]:
# Per-variable binning settings, keyed by variable name
plotdic = dict()

In [None]:
# Binning entry for lep_type
plotdic["lep_type"] = {"nbin": 2, "nmax": 2, "nmin": 0}

In [None]:
# Binning entry for lep_charge
plotdic["lep_charge"] = {"nbin": 3, "nmax": 1, "nmin": -1}

In [None]:
# Binning entry for lep_pt
plotdic["lep_pt"] = {"nbin": 100, "nmax": 1000, "nmin": 0}

In [None]:
# Binning entry for jet_pt
plotdic["jet_pt"] = {"nbin": 100, "nmax": 1000, "nmin": 0}

In [None]:
# Binning entry for met_et
plotdic["met_et"] = {"nbin": 100, "nmax": 1000, "nmin": 0}

In [None]:
# Binning entry for lep_eta
plotdic["lep_eta"] = {"nbin": 80, "nmax": 4, "nmin": -4}

In [None]:
# Binning entry for lep_phi
plotdic["lep_phi"] = {"nbin": 80, "nmax": 4, "nmin": -4}

In [None]:
# Binning entry for lep_E
plotdic["lep_E"] = {"nbin": 100, "nmax": 1000, "nmin": 0}

In [None]:
# Binning entry for lep_trackd0pvunbiased
plotdic["lep_trackd0pvunbiased"] = {"nbin": 200, "nmax": 10, "nmin": -10}

In [None]:
# Binning entry for lep_z0
plotdic["lep_z0"] = {"nbin": 200, "nmax": 10, "nmin": -10}

In [None]:
# Binning entry for lep_z0SinTheta
plotdic["lep_z0SinTheta"] = {"nbin": 100, "nmax": 10, "nmin": 0}

In [None]:
# Binning entry for mll
plotdic["mll"] = {"nbin": 200, "nmax": 2000, "nmin": 0}

In [None]:
# Binning entry for mt2
plotdic["mt2"] = {"nbin": 200, "nmax": 2000, "nmin": 0}

In [None]:
# Binning entry for costhstar
plotdic["costhstar"] = {"nbin": 10, "nmax": 1, "nmin": 0}

In [None]:
# Plot colour per background category, as (R, G, B) tuples with 8-bit channels.
# The last two entries previously contained the out-of-range channel values
# 265 and 256; they are clamped to the 8-bit maximum of 255.
# NOTE(review): confirm the intended shades for these two colours.
bkg_plot_dic = {}
colors = [(208, 240, 193), (195, 138, 145), (155, 152, 204), (248, 206, 104),
          (222, 90, 106), (182, 70, 45), (153, 70, 15), (255, 17, 24), (245, 29, 255)]
for i, b in enumerate(Backgrounds):
    # Cycle through the palette so that more categories than colours cannot
    # cause an IndexError.
    bkg_plot_dic[b] = {"color":colors[i % len(colors)]}

This function calculates the scale factor needed to scale the MC to the right luminosity. Instead of storing all the individual scale factors, weights and cross-sections, it calculates the final scale factor so that the individual branches (i.e. scaleFactor_*, mcWeight, etc.) do not need to be stored.

In [None]:
def calc_sf(xsec,lumi,nev,mcWeight,scaleFactor_PILEUP,scaleFactor_ELE,scaleFactor_MUON,scaleFactor_BTAG,scaleFactor_LepTRIGGER):
    """Return the total event weight that scales a simulated event to ``lumi``.

    The weight is the product of ``mcWeight`` and the five scale factors,
    multiplied by ``xsec*lumi/nev``.

    Parameters
    ----------
    xsec : float
        Cross-section of the sample.
    lumi : float
        Target luminosity; must be positive.
    nev : float
        Normalisation of the sample (the caller passes the SumWeights branch).
    mcWeight, scaleFactor_PILEUP, scaleFactor_ELE, scaleFactor_MUON,
    scaleFactor_BTAG, scaleFactor_LepTRIGGER : float
        Per-event weight and scale factors.

    Returns
    -------
    float or int
        The total weight, or 0 (after printing a message) if ``lumi`` is not
        positive.
    """
    if lumi <= 0:
        # "{}" instead of "{:d}": the luminosity is normally a float, for which
        # the integer format code raises ValueError.
        print("Lumi {} is not valid".format(lumi))
        return 0
    wgt = (mcWeight)*(scaleFactor_PILEUP)*(scaleFactor_ELE)*(scaleFactor_MUON)*(scaleFactor_BTAG)*(scaleFactor_LepTRIGGER)
    return wgt * ((xsec*lumi)/nev)

The following function is not really used anymore, but it may become useful at some point, so it is kept here :-)

# Skimming<br>
The following function specifies the skimming. Due to memory consumption it is not feasible to read all events, so some skimming is required.

In [None]:
def skimming(df,events,prev_n,next_n,nlep,lep_ptcut):
    """Keep only events with exactly ``nlep`` leptons that pass the pT thresholds.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk of events; the column 'lep_pt' holds one sequence of lepton pT
        values per event.
    events, prev_n, next_n
        Not used by this function.
    nlep : int
        Required number of leptons in the event.
    lep_ptcut : sequence of float
        ``lep_ptcut[i]`` is the minimum pT required of the (i+1)-th hardest
        lepton. An empty sequence applies no pT requirement.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that satisfy every requirement.
    """
    pt = awkward.from_iter(df['lep_pt'])

    # One boolean array per requirement; an event is kept only if all are True.
    cuts = []

    # Require leptons to have enough pt (according to values set in lep_ptcut)
    if len(lep_ptcut) > 0:
        # Order the leptons by decreasing pT and pad with -999, so that the i-th
        # hardest lepton can be read as column i even when an event has fewer
        # leptons (the padded value fails any sensible threshold).
        pt_sorted = awkward.sort(pt, axis=1, ascending=False)
        pt_padded = awkward.fill_none(awkward.pad_none(pt_sorted, len(lep_ptcut), axis=1), -999)
        for i in range(len(lep_ptcut)):
            cuts.append(awkward.to_numpy(pt_padded[:, i] > lep_ptcut[i]))

    # Make sure we only have exactly nlep leptons
    cuts.append(awkward.to_numpy(awkward.count(pt, axis=1) == nlep))

    # < here one can add additional cuts. Remember to append the result to cuts,
    #   e.g. MET > 50 GeV: cuts.append((df['met_et'] > 50000).to_numpy()) >

    # Keep only rows where every requirement is fulfilled
    keep = np.logical_and.reduce(cuts)
    return df[keep]

# Augmentation<br>
The following function lets you add new jet variables to the pandas data frame. The jet information is removed a priori, as the exact number of jets varies from event to event.

In [None]:
def jetaugmentation(df,events,prev_n,next_n):
    """Add per-event jet summary variables to ``df``.

    Currently adds the column 'jet_n60': the number of jets in the event with
    'jet_pt' above 60000 (presumably MeV - confirm against the ntuple units).

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk of events; the column 'jet_pt' holds one sequence of jet pT
        values per event.
    events, prev_n, next_n
        Not used by this function.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the new column(s) added.
    """
    # Only the jet pT is needed; converting the other jet branches to awkward
    # arrays would cost time without being used.
    pt  = awkward.from_iter(df['jet_pt'])
    df['jet_n60'] = awkward.count(pt[pt > 60000],axis=1)
    return df

The following function lets you add lepton variables. The jagged arrays need to be turned into variables for each lepton. If you apply skimming on 2 leptons, the information of the two hardest leptons will be saved. If skimming is set to 3, it saves the 3 hardest leptons, etc. One can also add higher-level variables like mll, deltaR etc. See the documentation [here](https://github.com/scikit-hep/uproot#multiple-values-per-event-jagged-arrays).

In [None]:
def lepaugmentation(df,events,prev_n,next_n,nlep):
    """Flatten the per-lepton branches into one set of columns per lepton.

    For k = 1..nlep the columns 'lep<k>_pt', 'lep<k>_eta', 'lep<k>_phi' and
    'lep<k>_E' are filled with the kinematics of the k-th hardest lepton. For
    every lepton pair (a, b) with a < b the columns 'mll_<a><b>', 'dphi_<a><b>'
    and 'dR_<a><b>' are added.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk of events holding the jagged columns 'lep_pt', 'lep_eta',
        'lep_phi' and 'lep_E'.
    events, prev_n, next_n
        Not used by this function.
    nlep : int
        Number of leptons to store per event.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the new columns added.
    """
    pt  = awkward.from_iter(df['lep_pt'])
    eta = awkward.from_iter(df['lep_eta'])
    phi = awkward.from_iter(df['lep_phi'])
    e   = awkward.from_iter(df['lep_E'])

    pt  = awkward.pad_none(pt, nlep,axis=1)
    eta = awkward.pad_none(eta,nlep,axis=1)
    phi = awkward.pad_none(phi,nlep,axis=1)
    e   = awkward.pad_none(e,  nlep,axis=1)

    # Make the lepton variables: pick the hardest remaining lepton of each
    # event, store it, and remove it before the next iteration.
    for i in range(1,nlep+1):
        lead_pt = awkward.max(pt,axis=1)
        mask = np.logical_and(pt == lead_pt, lead_pt != None)

        # Exactly one lepton per event must be selected by the mask
        ambiguous = awkward.sum(mask,axis=1) != 1
        if awkward.sum(ambiguous) > 0:
            idx = awkward.argmax(ambiguous)
            print("There's a problem with idx %i in pt array %s"%(idx,pt[idx]))
            # The diagnostic loop uses its own index (k): reusing i here would
            # overwrite the lepton number used for the column names below.
            for k in range(0,len(pt[idx])):
                print("pt = %f, eta = %f, E = %f"%(pt[idx][k],eta[idx][k],e[idx][k]))
                vec = vector.obj(pt=pt[idx][k],eta=eta[idx][k],phi=phi[idx][k],e=e[idx][k])
                print("x = %f, y = %f, z = %f, pt = %f"%(vec.x,vec.y,vec.z,vec.rho))
            print("Take the one with highest energy")
            lead_e = awkward.max(e,axis=1)
            mask = np.logical_and(e == lead_e, lead_e != None)
            print("mask = ",mask[idx])
            ambiguous = awkward.sum(mask,axis=1) != 1
            if awkward.sum(ambiguous) > 0:
                idx = awkward.argmax(ambiguous)
                print("There's still a problem with idx %i in e array %s"%(idx,e[idx]))

        df['lep%i_pt'%i]  = awkward.to_numpy(pt[mask])
        df['lep%i_eta'%i] = awkward.to_numpy(eta[mask])
        df['lep%i_phi'%i] = awkward.to_numpy(phi[mask])
        df['lep%i_E'%i]   = awkward.to_numpy(e[mask])

        pt  = pt[~mask]
        eta = eta[~mask]
        phi = phi[~mask]
        e   = e[~mask]

    # One array of four-vectors per stored lepton
    tlv = []
    for i in range(1,nlep+1):
        tlv.append(vector.array({"pt":df['lep%i_pt'%i].to_list(),"eta":df['lep%i_eta'%i].to_list(),"phi":df['lep%i_phi'%i].to_list(),"e":df['lep%i_E'%i].to_list()}))

    # Pair variables for every combination of two stored leptons. The index
    # pairs follow from nlep alone, so they do not need to be read off the
    # first event of the chunk.
    for i, j in combinations(range(nlep), 2):
        df['mll_%i%i'%(i+1,j+1)]   = (tlv[i]+tlv[j]).mass
        df['dphi_%i%i'%(i+1,j+1)] = tlv[i].deltaphi(tlv[j])
        df['dR_%i%i'%(i+1,j+1)]   = tlv[i].deltaR(tlv[j])

    return df

In [None]:
def _append_to_hdf(frames, path):
    """Concatenate the pending chunks and append them to the 'mini' table in ``path``."""
    result = pd.concat(frames, axis=0, ignore_index=True)
    # append=True: without it each call would replace the 'mini' table written
    # by the previous call instead of extending it.
    result.to_hdf(path, key='mini', mode='a', format='table', append=True)
    return result


def createDataFrames(indir,nlep,chunksize,printevry,datatype,skimtag,lep_ptcut,branch_selection,lumi,categories = [], vb = 0):
    """Convert the ROOT files in ``indir`` into skimmed and slimmed HDF5 tables.

    The files are grouped into categories with ``initialize``. Each file is read
    in chunks from its "mini" tree; every chunk gets a weight column ('wgt'),
    the labels 'isSignal' and 'MCType', is skimmed, augmented with jet and
    lepton variables and finally slimmed. The result of one category is written
    to ``<cwd>/<datatype>/hdf5/<cat>_<skimtag>_num.h5`` under the key 'mini'.
    A category whose output file already exists is skipped.

    Parameters
    ----------
    indir : str
        Directory holding the input ROOT files.
    nlep : int
        Number of leptons required per event (passed to the skimming and
        lepton augmentation).
    chunksize : int
        Number of entries read per chunk.
    printevry : int
        The pending events are written to file as soon as more than this many
        skimmed events have been collected.
    datatype : str
        Name of the output sub-directory.
    skimtag : str
        Tag used in the output file name.
    lep_ptcut : sequence of float
        pT thresholds passed to ``skimming``.
    branch_selection : str
        Branch filter passed to uproot as ``filter_name``.
    lumi : float
        Luminosity used for the MC weights.
    categories : list of str, optional
        If non-empty, only these categories are processed.
    vb : int, optional
        Verbosity; values above 4 enable debug messages.

    Returns
    -------
    None
    """
    # Branches that are only needed as input to the weight and the augmentation
    # and are dropped before writing (columns that are not present are ignored).
    slim_columns = [
        'mcWeight','scaleFactor_PILEUP','scaleFactor_ELE','scaleFactor_MUON','scaleFactor_BTAG','scaleFactor_LepTRIGGER',
        'jet_n', 'jet_pt', 'jet_eta', 'jet_phi', 'jet_E', 'jet_jvt',
        'jet_trueflav', 'jet_truthMatched', 'jet_MV2c10', 'jet_pt_syst',
        'lep_n', 'lep_truthMatched', 'lep_trigMatched', 'lep_pt', 'lep_eta',
        'lep_phi', 'lep_E', 'lep_z0', 'lep_charge', 'lep_type', 'lep_isTightID',
        'lep_ptcone30', 'lep_etcone20', 'lep_trackd0pvunbiased',
        'lep_tracksigd0pvunbiased', 'lep_pt_syst',
    ]

    # Events read / kept since the last write to file
    totev = 0
    totev_skim = 0

    root_files = initialize(indir)

    # realpath() of the literal string '__file__' is resolved relative to the
    # current working directory, so dir_path is the directory the notebook runs in.
    dir_path = os.path.dirname(os.path.realpath('__file__'))

    datatypedir = dir_path+"/"+datatype+"/"
    if not os.path.isdir(datatypedir):
        os.makedirs(datatypedir)
        print("INFO Created folder : ", datatypedir)
    hdf5dir = datatypedir+"/hdf5/"
    if not os.path.isdir(hdf5dir):
        os.makedirs(hdf5dir)
        print("INFO \t Created folder : ", hdf5dir)
    else:
        print("WARNING \t Output folder %s already exists" %hdf5dir)

    for cat in root_files.keys():
        path = hdf5dir+"/%s_%s_num.h5" %(cat,skimtag)

        if os.path.isfile(path):
            print("INFO \t Skipping %s since file %s already exists"%(cat,path))
            continue

        if len(categories) and not cat in categories:
            print("INFO \t Skipping category %s"%cat)
            continue

        sample_type = root_files[cat]['type']
        isSignal = sample_type == 'signal'
        isBkg = sample_type == 'bkg'
        isData = sample_type == 'data'
        if not (isSignal or isBkg or isData):
            # NOTE(review): the category is still processed (with isSignal = 0)
            # although the message says "Skipping"; this is what happens for
            # hadded <cat>.root files, whose type is empty. Confirm the intent.
            print("ERROR \t Could not find type of data set for category {:s}. Skipping".format(cat))
        print("INFO \t Category %s is %s"%(cat,"signal" if isSignal else ("background" if isBkg else "data")))

        nfile = 0
        n_allfiles = 0
        # Skimmed chunks of this category that have not been written yet
        pending = []
        for f in root_files[cat]["files"]:
            n_allfiles += 1
            # Only the file name decides whether this is data: the directory
            # part of the path may itself contain the word "data".
            is_data_file = "data" in os.path.basename(f)

            # The context manager closes the ROOT file once it has been read
            with uproot.open(f) as rootfile:
                events = rootfile["mini"]
                nentries = events.num_entries

                print("INFO  \t Opening file {:d}/{:d} for category {:s}: {:s} with {:d} events".format(n_allfiles,
                                                                                          len(root_files[cat]["files"]), cat,
                                                                                          f.split("/")[-1],
                                                                                          nentries))

                chunk_clock = None
                # Chunks are [prev_n, next_n): entry_stop is exclusive, so the
                # next chunk starts exactly where the previous one stopped.
                for prev_n in range(0, nentries, chunksize):
                    next_n = min(prev_n + chunksize, nentries)

                    # Report the speed of the previous chunk and the estimated
                    # time needed for the rest of the file
                    now = time.time()
                    if chunk_clock is not None and now > chunk_clock:
                        dur_sec = chunksize/(now - chunk_clock)
                        m, s = divmod((nentries-prev_n)/dur_sec, 60)
                        h, m = divmod(m, 60)
                        print("INFO  \t Event/sec  = {:.0f}. ETC = {:d}h{:02d}m{:02d}s".format(dur_sec,int(h),int(m),int(s)))
                    chunk_clock = now

                    print("INFO  \t Reading entries {:d} - {:d} of {:d}. Total so far: {:d}".format(prev_n,next_n,nentries,totev_skim))
                    np_array = events.arrays(filter_name=branch_selection,entry_start=prev_n,
                                             entry_stop=next_n,library="np")
                    df = pd.DataFrame(np_array)

                    # NOTE(review): float32 represents integers exactly only up
                    # to 2**24, so larger event numbers are rounded - confirm
                    # that this is acceptable.
                    df = df.astype({'channelNumber': 'float32', 'eventNumber': 'float32'})
                    totev += len(df.index)
                    if df.shape[0] == 0:
                        continue

                    # If MC: get the scale factor corresponding to the total data luminosity.
                    # If data: set scale factor to 1!
                    if not is_data_file:
                        df['wgt'] = np.vectorize(calc_sf)(df.XSection,lumi,df.SumWeights,df.mcWeight,
                                                         df.scaleFactor_PILEUP,df.scaleFactor_ELE,
                                                         df.scaleFactor_MUON,df.scaleFactor_BTAG,
                                                         df.scaleFactor_LepTRIGGER)
                    else:
                        df['wgt'] = 1.0
                    # Define if it is signal or background (important for ML classification)
                    df['isSignal'] = 1 if isSignal else 0
                    df['MCType'] = cat

                    ###########
                    # Skimming
                    ###########
                    if vb > 4: print("DEBUG \t Skimming dataframe with shape ",df.shape)
                    df = skimming(df,events,prev_n,next_n,nlep,lep_ptcut)
                    if vb > 4: print("DEBUG \t Shape after skimming is ",df.shape)
                    if df.shape[0] == 0:
                        continue

                    ##############
                    # Augmenting
                    ##############
                    if vb > 4: print("DEBUG \t Augumenting dataframe with shape ",df.shape)
                    df = jetaugmentation(df,events,prev_n,next_n)
                    df = lepaugmentation(df,events,prev_n,next_n,nlep)
                    if vb > 4: print("DEBUG \t Done augumenting dataframe, now is shape ",df.shape)

                    ###########
                    # Slimming
                    ###########
                    if vb > 4: print("DEBUG \t Slimming dataframe with shape ",df.shape)
                    df = df.drop(columns=slim_columns, errors="ignore")
                    if vb > 4: print("DEBUG \t Done slimming dataframe, now is shape ",df.shape)

                    pending.append(df)
                    totev_skim += len(df.index)

                    # Write to file once enough events have been collected, to
                    # limit the memory held by the pending chunks
                    if totev_skim > printevry:
                        if totev:
                            print("INFO  \t <-Read {:d} events in {:d} files for category {:s}, for which {:d} ({:.2f}%) were written to \n       \t{:s}"
                                .format(totev,nfile,cat,totev_skim,(float(totev_skim)/float(totev))*100.,path))
                        result = _append_to_hdf(pending, path)
                        if vb > 4: print("DEBUG \t Columns written: ",result.columns)
                        del result
                        pending = []
                        totev = 0
                        totev_skim = 0
            nfile += 1

        # Make sure we write the last events for this category
        if pending:
            _append_to_hdf(pending, path)
            if totev:
                print("INFO  \t ->Read {:d} events in {:d} files for category {:s}, for which {:d} ({:.2f}%) were written to \n       \t{:s}"
                        .format(totev,nfile,cat,totev_skim,(float(totev_skim)/float(totev))*100.,path))
        else:
            print("INFO \t Everything already written to file.")
        totev = 0
        totev_skim = 0

In [None]:
#branch_selection = "/(met_|jet_|lep_|scaleFactor_|eventNumber|channelNumber|XSection|SumWeights|mcWeight)/" 
#createDataFrames("/storage/shared/data/fys5555/ATLAS_opendata/4lep/MC/",4, 50000, 50000, "MC", "4L", [], branch_selection, 10.6)

In [None]:
def convertRDFCutflowToTex(cutflow, cutflow2=None):
    """Print (and return) the rows of a LaTeX table for one or two cutflow reports.

    For each cut the row holds the cut name followed by, per report, the number
    of passing events, the number of events entering the cut, the efficiency of
    the cut and the cumulative efficiency in percent with respect to the number
    of events entering the first cut.

    Parameters
    ----------
    cutflow : iterable
        Cutflow report; iterating over it must yield objects providing
        ``GetName()``, ``GetPass()``, ``GetAll()`` and ``GetEff()``.
    cutflow2 : optional
        Second report, providing ``At(name)`` to look up the cut with the same
        name. If omitted, the table only contains the columns of ``cutflow``.

    Returns
    -------
    str
        The table rows; the same text is also printed.
    """
    def _percent(npass, nall):
        # Guard against a report whose first cut saw no events
        return (npass/nall)*100. if nall else 0.

    def _cells(cut, nfirst):
        return ["$%.0f$"%cut.GetPass(), "$%.0f$"%cut.GetAll(),
                "$%.2f$"%cut.GetEff(), "$%.2f$"%_percent(cut.GetPass(), nfirst)]

    tabstr = ""
    nevc1 = None
    nevc2 = None
    for c in cutflow:
        cname = c.GetName()
        c2 = cutflow2.At(cname) if cutflow2 is not None else None
        # The first cut defines the reference number of events
        if nevc1 is None:
            nevc1 = c.GetAll()
            if c2 is not None:
                nevc2 = c2.GetAll()
        cname = cname.replace(">","$>$")
        cname = cname.replace("<","$<$")
        cells = ["%-30s"%cname] + _cells(c, nevc1)
        if c2 is not None:
            cells += _cells(c2, nevc2)
        # Each row ends with the LaTeX line break "\\"
        tabstr += " & ".join(cells) + " \\\\ \n"
    print(tabstr)
    return tabstr