In [1]:
import json
import ROOT
from ROOT import RDataFrame, TCanvas, THStack
ROOT.EnableImplicitMT()
import numpy as np
%jsroot on
verbosity = ROOT.Experimental.RLogScopedVerbosity(ROOT.Detail.RDF.RDFLogChannel(), ROOT.Experimental.ELogLevel.kInfo)

Welcome to JupyROOT 6.26/04


In [2]:
ROOT.gSystem.CompileMacro("helper.cpp", "kO")

1

In [3]:
from urllib.request import urlretrieve
import os

class TtbarAnalysis(dict):

    def __init__(self, n_files_max_per_sample = 1, num_bins=25, bin_low = 50, bin_high = 550, download_input_data=False, use_local_data=False):
        
        self.download_input_data = download_input_data
        self.use_local_data = use_local_data
        self._nevts_total = {}
        self.n_files_max_per_sample = n_files_max_per_sample
        self.input_data = self._construct_fileset()
        self.num_bins = num_bins
        self.bin_low = bin_low
        self.bin_high = bin_high
        self.xsec_info = {
            
            "ttbar": 396.87 + 332.97, # nonallhad + allhad, keep same x-sec for all
            "single_top_s_chan": 2.0268 + 1.2676,
            "single_top_t_chan": (36.993 + 22.175)/0.252,  # scale from lepton filter to inclusive
            "single_top_tW": 37.936 + 37.906,
            "wjets": 61457 * 0.252,  # e/mu+nu final states
            "data": None
        }
    
        

    def _construct_fileset(self):
        n_files_max_per_sample = self.n_files_max_per_sample
        with open ('ntuples.json') as f:
            file_info = json.load(f)
        fileset = {}
        for process in file_info.keys():
            if process == "data":
                continue  # skip data
            fileset[process] = {}
            self[process] = {}
            self._nevts_total[process] = {}
            for variation in file_info[process].keys():
#                 if variation != 'nominal': continue      
                file_list = file_info[process][variation]["files"]
                if n_files_max_per_sample != -1:
                    file_list = file_list[:n_files_max_per_sample]  # use partial set of samples
                file_paths = [f["path"] for f in file_list]
                nevts_total = sum([f["nevts"] for f in file_list])
                self._nevts_total[process].update({variation:nevts_total})
                fileset[process].update({variation: file_paths})
                if (self.download_input_data):
                    dir_name = f"input/{process}_{variation}"
                    os.makedirs(dir_name, exist_ok=True)
                    for i in range(len(file_paths)):
                        path = file_paths[i]
                        file = f"{dir_name}/{i}.root"
                        if not os.path.exists(file):
                            urlretrieve(path, file)   
                            print(f"directory {dir_name} has been created")
                        else:
                            print(f"{file} already exists")
                self[process][variation] = {}
                
        return fileset

    def fill(self, process, variation):

        input_data = f"root://eosuser.cern.ch//eos/user/a/afalko/analysis-grand-challenge/analyses/cms-open-data-ttbar/input/{process}_{variation}/*.root" if self.use_local_data else self.input_data[process][variation]               
        d = RDataFrame('events', input_data)

        x_sec = self.xsec_info[process]
        nevts_total = self._nevts_total[process][variation]
        lumi = 3378 # /pb
        xsec_weight = x_sec * lumi / nevts_total

        pt_variations = ["pt_nominal", "pt_scale_up", "pt_res_up"] if variation == "nominal" else ["pt_nominal"]
        
        for pt_var in pt_variations:
            jet_pt_modified = f"jet_pt*{pt_var}()" if "res" not in pt_var else f"jet_pt*pt_res_up(jet_pt)"
            d_mod = d.Redefine('jet_pt', jet_pt_modified)
            d_mod = d_mod.Define('electron_pt_mask', 'electron_pt>25').Define('muon_pt_mask', 'muon_pt>25').Define('jet_pt_mask', 'jet_pt>25')\
                 .Filter('Sum(electron_pt_mask) + Sum(muon_pt_mask) == 1')\
                 .Filter('Sum(jet_pt_mask) >= 4')\
                 .Filter('Sum(jet_btag[jet_pt_mask]>=0.5)>=1')\
                 .Define('weights', str(xsec_weight)) 


    #         #it gives number 1
    #         print(d.GetNRuns())
            measured = {"4j1b": "HT", "4j2b": 'trijet_mass'}
            for region in ["4j1b","4j2b"]:
                meas = measured[region]
                if region == "4j1b":

                    fork = d_mod.Filter('Sum(jet_btag[jet_pt_mask]>=0.5)==1').Define(meas, 'Sum(jet_pt[jet_pt_mask])')      

                elif region == "4j2b":

                    fork = d_mod.Filter('Sum(jet_btag[jet_pt_mask]>=0.5)>1').Define("jet_p4", 
                        "ROOT::VecOps::Construct<ROOT::Math::PxPyPzMVector>(jet_px[jet_pt_mask], jet_py[jet_pt_mask], jet_pz[jet_pt_mask], jet_mass[jet_pt_mask])"
                    )

                    fork = fork.Define('trijet', 
                        'ROOT::VecOps::Combinations(jet_pt[jet_pt_mask],3)'
                    ).Define('ntrijet', 'trijet[0].size()')

                    fork = fork.Define('trijet_p4', 
                                          'ROOT::VecOps::RVec<ROOT::Math::PxPyPzMVector> trijet_p4(ntrijet);'              +\
                                          'for (int i = 0; i < ntrijet; ++i) {'                                            +\
                                              'int j1 = trijet[0][i]; int j2 = trijet[1][i]; int j3 = trijet[2][i];'       +\
                                              'trijet_p4[i] = jet_p4[j1] + jet_p4[j2] + jet_p4[j3];'                       +\
                                          '}'                                                                              +\
                                          'return trijet_p4;'                                                                                                                          
                                         )

                    #TODO  implement references
                    fork = fork.Define('trijet_pt', 
                            'return ROOT::VecOps::Map(trijet_p4, [](ROOT::Math::PxPyPzMVector v) { return v.Pt(); })'
                                                )

                    fork = fork.Define('trijet_btag', 
                                                      'ROOT::VecOps::RVec<bool> btag(ntrijet);'                                   +\
                                                      'for (int i = 0; i < ntrijet; ++i) {'                                       +\
                                                       'int j1 = trijet[0][i]; int j2 = trijet[1][i]; int j3 = trijet[2][i];'     +\
                                                       'btag[i]=std::max({jet_btag[j1], jet_btag[j2], jet_btag[j3]})>0.5;'        +\
                                                      '}'                                                                         +\
                                                      'return btag;'
                                                )

                    fork=fork.Define(meas,
                                                      'double mass;'+\
                                                      'double Pt = 0;'+\
                                                      'double indx = 0;'+\
                                                      'for (int i = 0; i < ntrijet; ++i) {'               +\
                                                      '    if ((Pt < trijet_pt[i]) && (trijet_btag[i])) {'+\
                                                      '        Pt = trijet_pt[i];'+\
                                                      '        indx=i;'+\
                                                      '    }'                                            +\
                                                      '}'                                                +\
                                                      'mass = trijet_p4[indx].M();'             +\
                                                      'return mass;'
                                                     )

                if "nominal" in pt_var:   
                    res = fork.Histo1D((f'{process}_{variation}_{region}', process, self.num_bins, self.bin_low, self.bin_high), meas, 'weights')
                    self[process][variation][region] = res
                    self.hist.append(res)
                    print(f'histogram {region}_{process}_{variation} has been created')
                    
                    if variation == "nominal":
                        for i_var, weight_name in enumerate([f"btag_var_{i}" for i in range(4)]):
                            for i_dir, direction in enumerate(["up", "down"]):

                                btag_weight_variation=f"{weight_name}_{direction}"
                                var_weights = f'weights_{btag_weight_variation}_{region}'
                                fork = fork.Define(var_weights, 
                                           f"weights*btag_weight_variation({i_var}, jet_pt[jet_pt_mask])[{i_dir}]"
                                        )

                                res = fork.Histo1D(
                                                    (f'{process}_{btag_weight_variation}_{region}', btag_weight_variation, self.num_bins, self.bin_low, self.bin_high),
                                                    meas,
                                                    var_weights
                                )


                                if btag_weight_variation not in self[process]:
                                    self[process][btag_weight_variation] = {}

                                self[process][btag_weight_variation][region] = res                   

                                self.hist.append(res)

                        if process == 'wjets':
                            for i_dir, direction in enumerate(["up", "down"]):
                                weight_var = f"scale_var_{direction}"
                                res = fork.Define(weight_var, f"weights*flat_variation()[{i_dir}]")\
                                            .Histo1D(
                                                    (f"{region}_{process}_{weight_var}", weight_var, self.num_bins, self.bin_low, self.bin_high),
                                                    meas,
                                                    weight_var
                                                    )
                                if weight_var not in self[process]:
                                    self[process][weight_var] = {}
                                self[process][weight_var][region] = res
                                self.hist.append(res)
                                # print(f'histogram {region}_{process}_{weight_var} has been created')
                else:
                    res = fork.Histo1D((f'{process}_{pt_var}_{region}', process, self.num_bins, self.bin_low, self.bin_high), meas, 'weights')
                    if pt_var not in self[process]:
                         self[process][pt_var] = {}
                    self[process][pt_var][region] = res
                    self.hist.append(res)
#                     print(f'histogram {region}_{process}_{pt_var} has been created')                    



            

    def Fill(self):
        self.hist = []
        for process in self:
            
            for variation in self.input_data[process]:
                self.fill(process=process, variation=variation)

    def Accumulate(self):
        ROOT.RDF.RunGraphs(self.hist)            
        
    def GetProcStack(self, region, variation='nominal'):
        return [self[process][variation][region] for process in self]
    
    def GetVarStack(self, region, process="ttbar"):
        return [self[process][variation][region] for variation in self[process]]
    
    def ExportJSON(self):
        data = {}
        for process in self:
            data[process] = {}
            for variation in self[process]:
                data[process][variation] = [region for region in self[process][variation]]
        with open('data.json', 'w') as f:
            json.dump(data, f)
                
                
                    

In [4]:
analysisManager = TtbarAnalysis(download_input_data=True, use_local_data=True)
analysisManager.input_data

input/ttbar_nominal/0.root already exists
input/ttbar_scaledown/0.root already exists
input/ttbar_scaleup/0.root already exists
input/ttbar_ME_var/0.root already exists
input/ttbar_PS_var/0.root already exists
input/single_top_s_chan_nominal/0.root already exists
input/single_top_t_chan_nominal/0.root already exists
input/single_top_tW_nominal/0.root already exists
input/wjets_nominal/0.root already exists


{'ttbar': {'nominal': ['https://xrootd-local.unl.edu:1094//store/user/AGC/datasets/RunIIFall15MiniAODv2/TT_TuneCUETP8M1_13TeV-powheg-pythia8/MINIAODSIM//PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext3-v1/00000/00DF0A73-17C2-E511-B086-E41D2D08DE30.root'],
  'scaledown': ['https://xrootd-local.unl.edu:1094//store/user/AGC/datasets/RunIIFall15MiniAODv2/TT_TuneCUETP8M1_13TeV-powheg-scaledown-pythia8/MINIAODSIM/PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext3-v1/00000/00D45978-82D1-E511-9406-00237DF27440.root'],
  'scaleup': ['https://xrootd-local.unl.edu:1094//store/user/AGC/datasets/RunIIFall15MiniAODv2/TT_TuneCUETP8M1_13TeV-powheg-scaleup-pythia8/MINIAODSIM/PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext3-v1/10000/002A13F7-B8CB-E511-A112-003048CB7B30.root'],
  'ME_var': ['https://xrootd-local.unl.edu:1094//store/user/AGC/datasets/RunIIFall15MiniAODv2/TT_TuneCUETP8M1_13TeV-amcatnlo-pythia8/MINIAODSIM/PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext1-v1/60000/00F8750A-D4CB-E511-A1D1-00215E2EAD

In [5]:
import time
t0 = time.time()
analysisManager.Fill()
t1 = time.time()
print(f"\npreprocessing took {round(t1 - t0,2)} seconds")
analysisManager.Accumulate()
t2 = time.time()
print(f"processing took {round(t2 - t1,2)} seconds")
print(f"execution took {round(t2 - t0,2)} seconds")

histogram 4j1b_ttbar_nominal has been created
histogram 4j2b_ttbar_nominal has been created
histogram 4j1b_ttbar_scaledown has been created
histogram 4j2b_ttbar_scaledown has been created
histogram 4j1b_ttbar_scaleup has been created
histogram 4j2b_ttbar_scaleup has been created
histogram 4j1b_ttbar_ME_var has been created
histogram 4j2b_ttbar_ME_var has been created
histogram 4j1b_ttbar_PS_var has been created
histogram 4j2b_ttbar_PS_var has been created
histogram 4j1b_single_top_s_chan_nominal has been created
histogram 4j2b_single_top_s_chan_nominal has been created
histogram 4j1b_single_top_t_chan_nominal has been created
histogram 4j2b_single_top_t_chan_nominal has been created
histogram 4j1b_single_top_tW_nominal has been created
histogram 4j2b_single_top_tW_nominal has been created
histogram 4j1b_wjets_nominal has been created
histogram 4j2b_wjets_nominal has been created

preprocessing took 2.5 seconds
processing took 15.38 seconds
execution took 17.88 seconds


Info in <[ROOT.RDF] Info /build/jenkins/workspace/lcg_release_pipeline/build/projects/ROOT-6.26.04/src/ROOT/6.26.04/tree/dataframe/src/RLoopManager.cxx:747 in void ROOT::Detail::RDF::RLoopManager::Run()>: Starting event loop number 0.
Info in <[ROOT.RDF] Info /build/jenkins/workspace/lcg_release_pipeline/build/projects/ROOT-6.26.04/src/ROOT/6.26.04/tree/dataframe/src/RLoopManager.cxx:747 in void ROOT::Detail::RDF::RLoopManager::Run()>: Starting event loop number 0.
Info in <[ROOT.RDF] Info /build/jenkins/workspace/lcg_release_pipeline/build/projects/ROOT-6.26.04/src/ROOT/6.26.04/tree/dataframe/src/RLoopManager.cxx:747 in void ROOT::Detail::RDF::RLoopManager::Run()>: Starting event loop number 0.
Info in <[ROOT.RDF] Info /build/jenkins/workspace/lcg_release_pipeline/build/projects/ROOT-6.26.04/src/ROOT/6.26.04/tree/dataframe/src/RLoopManager.cxx:747 in void ROOT::Detail::RDF::RLoopManager::Run()>: Starting event loop number 0.
Info in <[ROOT.RDF] Info /build/jenkins/workspace/lcg_releas

In [6]:
output = ROOT.TFile.Open('rdf.root', 'RECREATE')
for process in analysisManager:
    for variation in analysisManager[process]:
        for region in analysisManager[process][variation]:
            hist_name = f"{region}_{process}_{variation}" if variation != 'nominal' else f"{region}_{process}"
            hist_ptr = analysisManager[process][variation][region].GetPtr()
            hist_sliced = ROOT.Slice(hist_ptr, 120, 550)
            hist_binned = hist_sliced.Rebin(2, hist_ptr.GetTitle())
            output.WriteObject(hist_binned, hist_name)
output.Close()



In [7]:
c = TCanvas('c', 'c', 600, 400) 
hlist = analysisManager.GetProcStack(region='4j2b')
hs = THStack('j4b1', '>=4 jets, 1 b-tag (RDF); H_{T} [GeV]')
for h in hlist:
    ptr = h.Rebin(2, h.GetTitle())
    hs.Add(ptr)
hs.Draw('hist pfc plc')
c.Draw()
x = hs.GetXaxis()
x.SetTitleOffset(1.5)
x.CenterTitle()
c.BuildLegend(0.65, 0.7, 0.9, 0.9)

<cppyy.gbl.TLegend object at 0x1b8e5850>



In [8]:
freshstack = analysisManager.GetVarStack(region='4j1b')
hs = THStack('j4b1btag', 'b-tagging variations (RDF); H_{T} [GeV]')
for h in freshstack:
    ptr = h.Rebin(2, h.GetTitle())
    ptr.SetFillColor(0)
    ptr.SetLineWidth(1)
    hs.Add(ptr)
hs.Draw('hist nostack')
c.Draw()
x = hs.GetXaxis()
x.SetRangeUser(120, 500)
x.SetTitleOffset(1.5)
x.CenterTitle()
c.BuildLegend(0.65, 0.7, 0.9, 0.9)

<cppyy.gbl.TLegend object at 0x1bbbcd30>



In [9]:
def get_values(h):
    return np.array([h.GetBinContent(i+1) for i in range (h.GetNbinsX())])
def get_variances(h):
    return np.array([h.GetBinError(i+1) for i in range (h.GetNbinsX())])

def match_histos (rdf_hist, coffea_hist, precision):
    rdf_values = get_values(rdf_hist)
    coffea_values = get_values(coffea_hist)
    rdf_values = np.round(rdf_values, precision)
    coffea_values = np.round(coffea_values, precision)
    mask = rdf_values == coffea_values
    return not (False in mask)

def get_deviations(rdf_hist, coffea_hist):
    rdf_values = get_values(rdf_hist)
    coffea_values = get_values(coffea_hist)
    deviations = np.zeros(len(rdf_values))
    for i in range(len(deviations)):
        rdf_value = rdf_values[i]
        coffea_value = coffea_values[i]
        deviations[i] = 0 if round(coffea_value, 5) == round(rdf_value, 5) == 0 else 100*abs(rdf_value-coffea_value)/max(coffea_value, rdf_value)
#         deviations[i] = 100*abs(rdf_value-coffea_value)/coffea_value

    return deviations


def get_mismatched (rdf, coffea):
    mismatched = []
    rdf = ROOT.TFile.Open(rdf)
    coffea = ROOT.TFile.Open(coffea)
    with open('data.json', 'r') as f:
        data = json.load(f)
    for process in data:
        for variation in data[process]:
            for region in data[process][variation]:
                hist_name = f"{region}_{process}_{variation}" if variation != 'nominal' else f"{region}_{process}"
                rdf_hist = rdf.Get(hist_name)
                coffea_hist = coffea.Get(hist_name)
                if (rdf_hist and coffea_hist):
                    deviations = get_deviations(rdf_hist, coffea_hist)
                    i = np.argmax(deviations); dev = deviations[i]
                    variance = coffea_hist.GetBinError(int(i+1))
#                     if deviation > 20:
                    # if deviation > 0.0001 and "res_up" not in hist_name:
                    if "res_up" in hist_name:
                        print(f"deviation={dev:.2f}% variance/deviation={100*variance/dev:.2f}%\t-\t{hist_name}")
                        mismatched.append(hist_name)
                else:
                    raise ValueError('rdf_hist and coffea_hist is Zombie')
    return mismatched
mism = get_mismatched('rdf.root', 'histograms_local.root')

deviation=29.71% variance/deviation=2625.07%	-	4j1b_ttbar_pt_res_up
deviation=8.06% variance/deviation=8141.77%	-	4j2b_ttbar_pt_res_up
deviation=29.41% variance/deviation=5.75%	-	4j1b_single_top_s_chan_pt_res_up
deviation=7.69% variance/deviation=22.60%	-	4j2b_single_top_s_chan_pt_res_up
deviation=31.11% variance/deviation=369.87%	-	4j1b_single_top_t_chan_pt_res_up
deviation=8.12% variance/deviation=3555.17%	-	4j2b_single_top_t_chan_pt_res_up
deviation=25.44% variance/deviation=485.02%	-	4j1b_single_top_tW_pt_res_up
deviation=7.32% variance/deviation=825.67%	-	4j2b_single_top_tW_pt_res_up
deviation=60.00% variance/deviation=5075.73%	-	4j1b_wjets_pt_res_up
deviation=50.00% variance/deviation=3852.21%	-	4j2b_wjets_pt_res_up
