In [1]:
import glob
import re
import ROOT
import time
import numpy as np
import sys
import shutil
import json
try:
    import pandas as pd
except:
    !{sys.executable} -m pip install pandas
    import pandas as pd
xsec_dic = {}

Welcome to JupyROOT 6.28/04


ModuleNotFoundError: No module named 'numpy'

In [None]:
# Compile helperFunctions.cxx into the shared library helperFunctions.so, taking the compiler and linker flags from root-config.
! g++ -shared -fPIC -o ./../commontools/helperFunctions.so ./../commontools/helperFunctions.cxx `root-config --cflags --glibs`

In [None]:
# Make the compiled C++ helper library usable from ROOT's interpreter.
# Add the commontools directory to ROOT's dynamic-library search path.
ROOT.gSystem.AddDynamicPath("../commontools/.")
# Register the commontools directory as an include path, both through the
# ".include" interpreter command and through AddIncludePath.
ROOT.gROOT.ProcessLine(".include ./../commontools");
ROOT.gInterpreter.AddIncludePath("./../commontools");
# Declare the helper header to the interpreter.
# NOTE(review): the original comments mention a "myFilter" function; a later
# cell calls getSumOfWeights inside a Define string, presumably declared in
# this header - confirm.
ROOT.gInterpreter.Declare('#include "./../commontools/helperFunctions.h"')
# Load the shared library. The return value of Load is not checked here; as the
# last expression of the cell it is what the cell displays.
ROOT.gSystem.Load("../commontools/helperFunctions.so")

In [None]:
def getCategory(dsid,dsname,vb = 0):
    """Return the physics-category label of a dataset.

    The DSID is compared against inclusive DSID ranges and against lists of
    individual DSIDs; a few categories are instead recognised from substrings
    of the dataset name.

    Args:
        dsid: numeric dataset identifier.
        dsname: dataset name, searched for the "Jpsi", "_Zp_", "_Zprime" and
            "_FxFx" markers.
        vb: when truthy, print a message if no category matches.

    Returns:
        The category label, or "unknown" when nothing matches.
    """
    # (label, first DSID, last DSID), both ends inclusive.
    spans = (
        ("singletop_nom", 601348, 601355),
        ("Diboson_nom", 700566, 700574),
        ("Diboson_nom", 700600, 700605),
        ("Triboson", 700760, 700763),
        ("Vgamma_nom", 700559, 700565),
        ("Higgs_tautau", 601469, 601474),
        ("Higgs_gamgam", 601481, 601484),
        ("Higgs_4lep", 601499, 601505),
        ("Higgs_4lep", 601525, 601530),
        ("Higgs_4lep", 601582, 601588),
        ("Higgs_mumu", 601506, 601512),
        ("diHiggs", 601477, 601480),
        ("multijet_nom", 801165, 801174),
        ("multijet_sherpa", 700688, 700696),
        ("singlephoton", 801663, 801676),
        ("singlephoton", 801649, 801660),
    )
    # (label, individually listed DSIDs).
    listed = (
        ("singletop_sys", (601455, 601457, 601459, 601461)),
        ("ttbar_singlelep_nom", (601229,)),
        ("ttbar_dilep_nom", (601230,)),
        ("ttbar_allhad_nom", (601237,)),
        ("ttbar_singlelep_PhH7EG", (601414,)),
        ("ttbar_dilep_PhH7EG", (601415,)),
        ("ttbar_allhad_sherpa", (700659,)),
        ("ttbar_dilep_sherpa", (700660,)),
        ("ttbar_singlelepM_sherpa", (700661,)),
        ("ttbar_singlelepP_sherpa", (700662,)),
        ("ttbar_singlelep_phpy", (601398,)),
        ("ttbar_dilep_phpy", (601399,)),
        ("ttW_0L", (700578,)),
        ("ttW_1L", (700579,)),
        ("ttW_dilep", (700706,)),
        ("Diboson_Sh2212", (700678,)),
        ("Weejets_phpy", (601183, 601186)),
        ("Wmumujets_phpy", (601184, 601187)),
        ("Wtauttaujets_phpy", (601185, 601188)),
        ("Zeejets_phpy", (601189,)),  # Try
        ("Zmumujets_phpy", (601190,)),
        ("Ztautaujets_phpy", (601191,)),
        ("Wenu_nom", (700606, 700607, 700608, 700627, 700628, 700629)),
        ("Wmunu_nom", (700609, 700610, 700611, 700630, 700631, 700632)),
        ("Wtaunu_nom", (700612, 700613, 700614, 700633, 700634, 700635)),
        ("Zee_nom", (700615, 700616, 700617, 700636, 700637, 700638)),  # Remove
        ("Zmumu_nom", (700618, 700619, 700620, 700639, 700640, 700641)),  # Remove
        ("Ztautau_nom", (700621, 700622, 700623, 700642, 700643, 700644)),  # Remove
        ("Znunu_nom", (700624, 700625, 700626)),  # Remove
        ("Zee_lowmll_nom", (700697, 700698, 700699)),  # Remove
        ("Zmumu_lowmll_nom", (700700, 700701, 700702)),  # Remove
        ("Ztautau_lowmll_nom", (700703, 700704, 700705)),  # Remove
        ("Higgs_tautau", (601580, 601581, 601400, 601402)),
    )

    # The ranges and the listed DSIDs never overlap, so a DSID matches at most
    # one entry of the two tables.
    for label, first, last in spans:
        if first <= dsid <= last:
            return label
    for label, members in listed:
        if dsid in members:
            return label

    # Name-based categories. The first two are tested before the BSM_qstar DSID
    # range, so a name match wins over that range.
    if "Jpsi" in dsname:
        return "Jpsi"
    if "_Zp_" in dsname or "_Zprime" in dsname:
        return "BSM_zprime"
    if 801697 <= dsid <= 801751:
        return "BSM_qstar"
    if "_FxFx" in dsname:
        return "CPsamples"

    if vb:
        print("Could not find category for DSID %i with name %s"%(dsid,dsname))
    return "unknown"

In [None]:
def count_tags(ds):
    """Count the production tags of each kind found in a dataset name.

    A tag is matched as a separator, one letter and three or four digits. For
    the "e" tag the separator may be "_" or any non-word character (such as
    "."); for the "s", "a", "r" and "p" tags it must be "_".

    Args:
        ds: dataset (folder) name to scan.

    Returns:
        Tuple (ne, ns, na, nr, npt) with the number of e-, s-, a-, r- and
        p-tags found.
    """
    # Raw strings keep "\W" and "\d" as regex escapes; in a plain string
    # literal they are invalid escape sequences (SyntaxWarning on recent Pythons).
    digits = r"\d{3,4}"

    def _count(separator, letter):
        # Number of non-overlapping "<separator><letter><3-4 digits>" matches.
        return len(re.findall(separator + letter + digits, ds))

    # "\W" does not match "_", so the two e-tag patterns never count the same
    # tag twice.
    ne = _count(r"\W", "e") + _count("_", "e")
    ns = _count("_", "s")
    na = _count("_", "a")
    nr = _count("_", "r")
    npt = _count("_", "p")

    return ne,ns,na,nr,npt

In [None]:
def loadxsec(dsids,xsec_dic,xsecfile="/cvmfs/atlas.cern.ch/repo/sw/database/GroupData/dev/PMGTools/PMGxsecDB_mc21.txt"):
    """Copy the cross-section file rows of the requested DSIDs into ``xsec_dic``.

    The first line of ``xsecfile`` is the header: its fields are separated by
    ":" and only the text before the first "/" of each field is used as the
    column name. Every following line is split on whitespace; its first field
    is the DSID. Rows whose DSID is in ``dsids`` are stored as
    ``xsec_dic[dsid][column] = value``, all values kept as strings. A later
    row for the same DSID overwrites the values of an earlier one. Blank lines
    and an empty file are ignored.

    Args:
        dsids: container of DSID strings to keep (tested with ``in``).
        xsec_dic: dictionary keyed by DSID string, updated in place.
        xsecfile: path of the cross-section text file.

    Returns:
        None; the result is written into ``xsec_dic``.

    Raises:
        IndexError: if a selected row has more fields than the header.
    """
    # Read through a context manager so the file handle is always closed.
    with open(xsecfile) as dbfile:
        lines = [line.rstrip() for line in dbfile]
    if not lines:
        return

    # Column names are parsed once from the header instead of once per value.
    columns = [field.split("/")[0] for field in lines[0].split(":")]

    for line in lines[1:]:
        fields = line.split()
        if not fields:
            # Blank line: there is no DSID field to read.
            continue
        dsid = fields[0]
        if dsid not in dsids:
            continue
        row = xsec_dic.setdefault(dsid, {})
        for j, value in enumerate(fields):
            row[columns[j]] = value

In [None]:
# Scan the PHYSLITE sample directory: keep one folder per DSID, skip folders
# whose name does not carry exactly four production tags, and record the
# category of every accepted DSID in xsec_dic.
infiles = glob.glob("/storage/shared/data/PHYSLITEforML/*")
inputDS = {}         # DSID string -> dataset folder path
multitag_dsid = []   # DSIDs of the folders rejected by the tag check
for infile in infiles:
    folder = infile.split("/")[-1]
    name_fields = folder.split(".")
    # The glob returns every directory entry; an entry without a "." has no
    # DSID field, so skip it instead of failing with an IndexError.
    if len(name_fields) < 2:
        print("WARNING \t Skipping %s: no DSID field in its name"%infile)
        continue
    dsid = name_fields[1]
    ne,ns,na,nr,npt = count_tags(folder)
    # Checking for 4-tag datasets (basically one tag for each step:
    # event gen. (e), simulation (s/a), reconstruction (r) and derivation (p)),
    # removing all which do not satisfy conditions
    if (ne+ns+na+nr+npt) != 4:
        multitag_dsid.append(dsid)
        try:
            print("remove %s"%infile)
            # uncomment if you want to clean up,
            # note that this is already done :-)
            #shutil.rmtree(infile)
        except OSError as e:
            print("Error: %s - %s." % (e.filename, e.strerror))
        continue
    # getCategory needs an integer DSID; skip names whose DSID field is not a
    # plain number instead of failing in int().
    if not dsid.isdecimal():
        print("WARNING \t Skipping %s: DSID field %s is not a number"%(infile,dsid))
        continue
    if dsid not in inputDS:
        inputDS[dsid] = infile
    else:
        print("ERROR \t DSID %s already added with %s. Trying to add with %s"%(dsid,inputDS[dsid],infile))
    # For a duplicated DSID the category is derived from the folder that was
    # registered first.
    category = getCategory(int(dsid),inputDS[dsid].split("/")[-1])
    xsec_dic.setdefault(dsid, {})["category"] = category
print("Added %i datasets" %len(inputDS))

In [None]:
# Add the cross-section file columns for every DSID already registered in
# xsec_dic; the dictionary is updated in place.
loadxsec(xsec_dic.keys(),xsec_dic)
# Display the resulting dictionary as the cell output.
xsec_dic

In [None]:
from pathlib import Path
import time
# One record per processed dataset: total file size (in units of 1e9 bytes),
# number of files and seconds spent reading the metadata.
timer_dic = []
ROOT.EnableImplicitMT(5)
for idid, did in enumerate(inputDS):
    if did not in xsec_dic:
        print("ERROR \t We don't have xsec for %s"%inputDS[did].split("/")[-1])
        continue
    root_directory = Path(inputDS[did])
    rfiles = glob.glob(inputDS[did]+"/*.pool.root.1")
    tot_size = sum(f.stat().st_size for f in root_directory.glob("*pool.root.1") if f.is_file())/1.0e9
    start = time.time()
    df = ROOT.RDataFrame("MetaData", inputDS[did]+"/*.pool.root.1")
    # getSumOfWeights is not defined in this notebook; it is presumably one of
    # the C++ helpers loaded from helperFunctions - confirm.
    df = df.Define("sow","getSumOfWeights(CutBookkeepersAux.name,CutBookkeepersAux.sumOfEventWeights)")
    # Book both results before reading either of them: RDataFrame actions are
    # lazy, so one pass over the MetaData tree fills the count and the sum,
    # instead of one pass per GetValue() call.
    entry_count = df.Count()
    sow_sum = df.Sum("sow")
    nentries = entry_count.GetValue()
    if nentries > len(rfiles):
        print("ERROR \t Ambigious number of entries is %i > %i"%(nentries,len(rfiles)))
    sow = sow_sum.GetValue()
    end = time.time()

    print("INFO \t Got metadata for DSID %s : %i/%i with %i files. Spent %.0f seconds"%(did,idid,len(inputDS.keys()),len(rfiles),end-start))

    timer_dic.append({"tsize":tot_size,"nfiles":len(rfiles),"time":(end-start)})

    xsec_dic[did]["sumofweights"] = sow
    # Drop the references to the data frame and its booked results before the
    # next dataset is opened.
    del entry_count, sow_sum, df

In [None]:
# Write the per-DSID metadata dictionary to metadata_physlite.json in the
# current working directory.
with open("metadata_physlite.json", "w") as outfile:
    json.dump(xsec_dic, outfile)

In [None]:
# Tabulate the per-DSID metadata (one row per DSID, one column per field),
# keeping only the rows in which no field is NaN.
pandas_df = pd.DataFrame.from_dict(xsec_dic,orient='index').dropna(axis=0, how='any')
# The DSID index is not written to the CSV; only the columns are.
pandas_df.to_csv('physlite_metadata.csv', index=False, header=True)

In [None]:
# Save the per-dataset timing records as JSON.
# NOTE(review): the file name mentions DASK, while the timing loop in this
# notebook uses ROOT.EnableImplicitMT(5) - confirm the name is intended.
with open("timer_5workers_DASK.json", "w") as outfile:
    json.dump(timer_dic, outfile)

In [None]:
# Check whether any row of the table lacks a dataset_number.
# NOTE(review): rows containing any NaN were already dropped when the table was
# built, so this presumably evaluates to False - confirm.
pandas_df['dataset_number'].isnull().values.any()
#pandas_df["dataset_number"]