In [1]:
import json
import ROOT
from ROOT import RDataFrame, TCanvas, THStack
ROOT.EnableImplicitMT()
import numpy as np
%jsroot on

Welcome to JupyROOT 6.26/04


In [2]:
#redeclaration is annoying
@ROOT.Numba.Declare(["unsigned int", "ROOT::VecOps::RVec<float>&"], "ROOT::VecOps::RVec<double>")
def btag_weight_variation(i_jet, jet_pt):
    return 1 + np.array((.1,-.1))*(jet_pt[i_jet]/50)

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


In [3]:
# ROOT.gSystem.SetIncludePath("-I$ROOTSYS/include")
ROOT.gSystem.CompileMacro("helper.cpp", "kO")

1

In [13]:
from urllib.request import urlretrieve
import os

class TtbarAnalysis(dict):

    def __init__(self, n_files_max_per_sample = 1, num_bins=25, bin_low = 50, bin_high = 550, download_input_data=False, use_local_data=False):
        
        self.download_input_data = download_input_data
        self.use_local_data = use_local_data
        self.n_files_max_per_sample = n_files_max_per_sample
        self.input_data = self._construct_fileset()
        self.num_bins = num_bins
        self.bin_low = bin_low
        self.bin_high = bin_high
        self.xsec_info = {
            
            "ttbar": 396.87 + 332.97, # nonallhad + allhad, keep same x-sec for all
            "single_top_s_chan": 2.0268 + 1.2676,
            "single_top_t_chan": (36.993 + 22.175)/0.252,  # scale from lepton filter to inclusive
            "single_top_tW": 37.936 + 37.906,
            "wjets": 61457 * 0.252,  # e/mu+nu final states
            "data": None
        }
    
        

    def _construct_fileset(self):
        n_files_max_per_sample = self.n_files_max_per_sample
        with open ('ntuples.json') as f:
            file_info = json.load(f)
        fileset = {}
        for process in file_info.keys():
            if process == "data":
                continue  # skip data
            fileset[process] = {}
            self[process] = {}
            for variation in file_info[process].keys():
                if variation != 'nominal': continue      
                file_list = file_info[process][variation]["files"]
                if n_files_max_per_sample != -1:
                    file_list = file_list[:n_files_max_per_sample]  # use partial set of samples
                file_paths = [f["path"] for f in file_list]
                fileset[process].update({variation: file_paths})
                if (self.download_input_data):
                    dir_name = f"input/{process}_{variation}"
                    os.makedirs(dir_name, exist_ok=True)
                    for i in range(len(file_paths)):
                        path = file_paths[i]
                        urlretrieve(path, f"{dir_name}/{i}.root")   
                    print(f"directory {dir_name} has been created")
                self[process][variation] = {}
        return fileset

    def fill(self, process, variation):
        
        input_data = f"input/{process}_{variation}/*.root" if self.use_local_data else self.input_data[process][variation]               
        d = RDataFrame('events', input_data)

        x_sec = self.xsec_info[process]
        nevts_total = d.Count().GetValue()
        lumi = 3378 # /pb
        xsec_weight = x_sec * lumi / nevts_total


        d = d.Define('electron_pt_mask', 'electron_pt>25').Define('muon_pt_mask', 'muon_pt>25').Define('jet_pt_mask', 'jet_pt>25')\
             .Filter('Sum(electron_pt_mask) + Sum(muon_pt_mask) == 1')\
             .Filter('Sum(jet_pt_mask) >= 4')\
             .Filter('Sum(jet_btag[jet_pt_mask]>=0.5)>=1')\
             .Define('weights', str(xsec_weight))             

        measured = {"4j1b": "HT", "4j2b": 'trijet_mass'}
        for region in ["4j1b","4j2b"]:
            meas = measured[region]
            if region == "4j1b":

                fork = d.Filter('Sum(jet_btag[jet_pt_mask]>=0.5)==1').Define(meas, 'Sum(jet_pt[jet_pt_mask])')      
           
            elif region == "4j2b":

                fork = d.Filter('Sum(jet_btag[jet_pt_mask]>=0.5)>1').Define("jet_p4", 
                    "ROOT::VecOps::Construct<ROOT::Math::PxPyPzMVector>(jet_px[jet_pt_mask], jet_py[jet_pt_mask], jet_pz[jet_pt_mask], jet_mass[jet_pt_mask])"
                )

                fork = fork.Define('trijet', 
                    'ROOT::VecOps::Combinations(jet_pt[jet_pt_mask],3)'
                ).Define('ntrijet', 'trijet[0].size()')

                fork = fork.Define('trijet_p4', 
                                      'ROOT::VecOps::RVec<ROOT::Math::PxPyPzMVector> trijet_p4(ntrijet);'              +\
                                      'for (int i = 0; i < ntrijet; ++i) {'                                            +\
                                          'int j1 = trijet[0][i]; int j2 = trijet[1][i]; int j3 = trijet[2][i];'       +\
                                          'trijet_p4[i] = jet_p4[j1] + jet_p4[j2] + jet_p4[j3];'                       +\
                                      '}'                                                                              +\
                                      'return trijet_p4;'                                                                                                                          
                                     )

                #TODO  implement references
                fork = fork.Define('trijet_pt', 
                        'return ROOT::VecOps::Map(trijet_p4, [](ROOT::Math::PxPyPzMVector v) { return v.Pt(); })'
                                            )

                fork = fork.Define('trijet_btag', 
                                                  'ROOT::VecOps::RVec<bool> btag(ntrijet);'                                   +\
                                                  'for (int i = 0; i < ntrijet; ++i) {'                                       +\
                                                   'int j1 = trijet[0][i]; int j2 = trijet[1][i]; int j3 = trijet[2][i];'     +\
                                                   'btag[i]=std::max({jet_btag[j1], jet_btag[j2], jet_btag[j3]})>0.5;'        +\
                                                  '}'                                                                         +\
                                                  'return btag;'
                                            )

                fork=fork.Define(meas,
                                                  'double mass;'+\
                                                  'double Pt = 0;'+\
                                                  'double indx = 0;'+\
                                                  'for (int i = 0; i < ntrijet; ++i) {'               +\
                                                  '    if ((Pt < trijet_pt[i]) && (trijet_btag[i])) {'+\
                                                  '        Pt = trijet_pt[i];'+\
                                                  '        indx=i;'+\
                                                  '    }'                                            +\
                                                  '}'                                                +\
                                                  'mass = trijet_p4[indx].M();'             +\
                                                  'return mass;'
                                                 )


            res = fork.Histo1D((f'{process}_{variation}_{region}', process, self.num_bins, self.bin_low, self.bin_high), meas, 'weights')
            self[process][variation][region] = res
            self.hist.append(res)
            print(f'histogram {region}_{process}_{variation} has been created')
            
            #TODO Vary without for loop
            
            for i_var, weight_name in enumerate([f"btag_var_{i}" for i in range(4)]):
                for i_dir, direction in enumerate(["up", "down"]):

                    variation=f"{weight_name}_{direction}"
                    var_weights = f'weights_{variation}_{region}'
                    fork = fork.Define(var_weights, 
                               f"weights*Numba::btag_weight_variation({i_var}, jet_pt[jet_pt_mask])[{i_dir}]"
                            )
                    
                    res = fork.Histo1D(
                                        (f'{process}_{variation}_{region}', variation, self.num_bins, self.bin_low, self.bin_high),
                                        meas,
                                        var_weights
                    )
                    
                    
                    if variation not in self[process]:
                        self[process][variation] = {}
                    self[process][variation][region] = res                   
                    
                    self.hist.append(self[process][variation][region])
                    print(f'histogram {region}_{process}_{variation} has been created')
            
            

    def Fill(self):
        self.hist = []
        for process in self:
            
            for variation in self.input_data[process]:
                self.fill(process=process, variation=variation)

    def Accumulate(self):
        ROOT.RDF.RunGraphs(self.hist)            
        
    def GetProcStack(self, region, variation='nominal'):
        return [self[process][variation][region] for process in self]
    
    def GetVarStack(self, region, process="ttbar"):
        return [self[process][variation][region] for variation in self[process]]

    

In [14]:
analysisManager = TtbarAnalysis(use_local_data=True)
analysisManager.input_data

{'ttbar': {'nominal': ['https://xrootd-local.unl.edu:1094//store/user/AGC/datasets/RunIIFall15MiniAODv2/TT_TuneCUETP8M1_13TeV-powheg-pythia8/MINIAODSIM//PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext3-v1/00000/00DF0A73-17C2-E511-B086-E41D2D08DE30.root']},
 'single_top_s_chan': {'nominal': ['https://xrootd-local.unl.edu:1094//store/user/AGC/datasets/RunIIFall15MiniAODv2/ST_s-channel_4f_InclusiveDecays_13TeV-amcatnlo-pythia8/MINIAODSIM/PU25nsData2015v1_76X_mcRun2_asymptotic_v12-v1/00000/0EB5E88C-FE0D-E611-915D-003048FFD76C.root']},
 'single_top_t_chan': {'nominal': ['https://xrootd-local.unl.edu:1094//store/user/AGC/datasets/RunIIFall15MiniAODv2/ST_t-channel_antitop_4f_inclusiveDecays_13TeV-powhegV2-madspin-pythia8_TuneCUETP8M1/MINIAODSIM/PU25nsData2015v1_76X_mcRun2_asymptotic_v12-v1/00000/00004F9A-E3D2-E511-ABEC-0CC47A78A478.root']},
 'single_top_tW': {'nominal': ['https://xrootd-local.unl.edu:1094//store/user/AGC/datasets/RunIIFall15MiniAODv2/ST_tW_antitop_5f_inclusiveDecays_13TeV-powh

In [15]:
import time
t0 = time.time()
analysisManager.Fill()
t1 = time.time()
print(f"\npreprocessing took {round(t1 - t0,2)} seconds")
analysisManager.Accumulate()
t2 = time.time()
print(f"processing took {round(t2 - t1,2)} seconds")
print(f"execution took {round(t2 - t0,2)} seconds")

histogram 4j1b_ttbar_nominal has been created
histogram 4j1b_ttbar_btag_var_0_up has been created
histogram 4j1b_ttbar_btag_var_0_down has been created
histogram 4j1b_ttbar_btag_var_1_up has been created
histogram 4j1b_ttbar_btag_var_1_down has been created
histogram 4j1b_ttbar_btag_var_2_up has been created
histogram 4j1b_ttbar_btag_var_2_down has been created
histogram 4j1b_ttbar_btag_var_3_up has been created
histogram 4j1b_ttbar_btag_var_3_down has been created
histogram 4j2b_ttbar_btag_var_3_down has been created
histogram 4j2b_ttbar_btag_var_0_up has been created
histogram 4j2b_ttbar_btag_var_0_down has been created
histogram 4j2b_ttbar_btag_var_1_up has been created
histogram 4j2b_ttbar_btag_var_1_down has been created
histogram 4j2b_ttbar_btag_var_2_up has been created
histogram 4j2b_ttbar_btag_var_2_down has been created
histogram 4j2b_ttbar_btag_var_3_up has been created
histogram 4j2b_ttbar_btag_var_3_down has been created
histogram 4j1b_single_top_s_chan_nominal has been cr

In [7]:
output = ROOT.TFile.Open('rdf.root', 'RECREATE')
for process in analysisManager:
    for variation in analysisManager[process]:
        for region in analysisManager[process][variation]:
            hist_name = f"{region}_{process}_{variation}" if variation != 'nominal' else f"{region}_{process}"
            hist_ptr = analysisManager[process][variation][region].GetPtr()
            hist_sliced = ROOT.Slice(hist_ptr, 120, 550)
            hist_binned = hist_sliced.Rebin(2, hist_ptr.GetTitle())
            output.WriteObject(hist_binned, hist_name)
output.Close()



In [8]:
rdf = ROOT.TFile.Open('rdf.root')
coffea = ROOT.TFile.Open('histograms.root')

In [9]:
import traceback

def get_values(h):
    return np.array([h.GetBinContent(i+1) for i in range (h.GetNbinsX())])

def match_histos (rdf_hist, coffea_hist, precision):
    rdf_values = get_values(rdf_hist)
    coffea_values = get_values(coffea_hist)
    rdf_values = np.round(rdf_values, precision)
    coffea_values = np.round(coffea_values, precision)
    mask = rdf_values == coffea_values
    return not (False in mask)

def compare_histos(precision):
    for process in analysisManager:
        for variation in analysisManager[process]:
            for region in analysisManager[process][variation]:
                hist_name = f"{region}_{process}_{variation}" if variation != 'nominal' else f"{region}_{process}"
                rdf_hist = rdf.Get(hist_name)
                coffea_hist = coffea.Get(hist_name)
    #             print(hist_name,'is processing')
                if (rdf_hist and coffea_hist):
                    if not match_histos(rdf_hist, coffea_hist, precision):
                        print('mismatch', hist_name)
                else:
                    raise ValueError('rdf_hist and coffea_hist is Zombie')

In [10]:
compare_histos(2)

mismatch 4j2b_single_top_s_chan_btag_var_3_down


In [16]:
c = TCanvas('c', 'c', 600, 400) 
hlist = analysisManager.GetProcStack(region='4j1b')
hs = THStack('j4b1', '>=4 jets, 1 b-tag (RDF); H_{T} [GeV]')
for h in hlist:
    ptr = h.Rebin(2, h.GetTitle())
    hs.Add(ptr)
hs.Draw('hist pfc plc')
c.Draw()
x = hs.GetXaxis()
x.SetRangeUser(120, 500)
x.SetTitleOffset(1.5)
x.CenterTitle()
c.BuildLegend(0.65, 0.7, 0.9, 0.9)

<cppyy.gbl.TLegend object at 0x2b9eeb00>



In [17]:
hlist = analysisManager.GetProcStack('4j2b')
hs = THStack('j4b1', '>=4 jets, >=2 b-tag (RDF); m_{bjj} [GeV]')
for h in hlist[1:]:
    hs.Add(h)
hs.Add(hlist[0])
hs.Draw('hist pfc plc')
c.Draw()
x = hs.GetXaxis()
x.SetTitleOffset(1.5)
x.CenterTitle()
c.BuildLegend(0.65, 0.7, 0.9, 0.9)

KeyError: '4j2b'

In [19]:
freshstack = analysisManager.GetVarStack(region='4j2b')
hs = THStack('j4b1btag', 'b-tagging variations (RDF); H_{T} [GeV]')
for h in freshstack:
    ptr = h.Rebin(2, h.GetTitle())
    ptr.SetFillColor(0)
    ptr.SetLineWidth(1)
    hs.Add(ptr)
hs.Draw('hist nostack')
c.Draw()
x = hs.GetXaxis()
x.SetRangeUser(120, 500)
x.SetTitleOffset(1.5)
x.CenterTitle()
c.BuildLegend(0.65, 0.7, 0.9, 0.9)

KeyError: '4j2b'

In [None]:
analysisManager