`TTbarResCoffea` Notebook to perform the data-driven mistag-rate-based ttbar hadronic analysis. 
This module must be run twice: 
   1. Make the mistag rate in the "anti-tag and probe" selection 
and the expectation in the signal region from MC,
   1. Applies that mistag rate and the mod-mass procedure to the single-tag selection. 

These are all done in bins of
b-tag categories (0, 1, $\ge 2$) and rapidity ($|y| \le 1.0$, $|y| > 1.0$).
The signal region is two top-tagged jets. 
The background estimate is the single-tag selection weighted by the mistag rate from the
"anti-tag and probe" region, with the mass of the weighted jet set to a random
value from QCD MC in the 1-ttag region. 


The preselection is:
- AK4-based $H_{T} > 1100$ GeV (to be on the trigger plateau). 
- $\ge 2$ AK8 jets with AK8 $p_{T} > 400$ GeV and $|y| < 2.5$, loose jet ID applied from matched AK4 jets

The 1-tag selection adds:
- $\ge 1$ AK8 jet with top tagging applied to randomly-assigned tag jet. 


The anti-tag selection is disjoint from the 1-tag selection:
- $\ge 1$ AK8 jet with top tagging VETO applied to randomly-assigned tag jet. 


The 2-tag selection is:
- $\ge 2$ AK8 jets with top tagging applied to both leading jets. 


The ttbar candidate mass assumes the two leading top-tagged jets are the top quarks. 


In [1]:
import time

from coffea import hist
from coffea.analysis_objects import JaggedCandidateArray
import coffea.processor as processor
from awkward import JaggedArray
import numpy as np
import glob as glob
import itertools

In [2]:
"""@TTbarResAnaHadronic Package to perform the data-driven mistag-rate-based ttbar hadronic analysis. 
"""
class TTbarResProcessor(processor.ProcessorABC):
    def __init__(self, htCut=1100., minMSD=105., maxMSD=210., tau32Cut=0.7, ak8PtMin=400., bdisc=0.7,
                writePredDist=True,isData=True,year=2019):
        
        self.htCut = htCut
        self.minMSD = minMSD
        self.maxMSD = maxMSD
        self.tau32Cut = tau32Cut
        self.ak8PtMin = ak8PtMin
        self.bdisc = bdisc
        self.writePredDist = writePredDist
        self.writeHistFile = True
        self.isData = isData
        self.year=year
        
        self.btagcats = ["0b", "1b", "2b"]   # 0, 1, >=2 btags
        self.ycats = ['cen', 'fwd']          # Central and forward
        # Combine categories like "0bcen", "0bfwd", etc:
        self.anacats = [ b+y for b,y in itertools.product( self.btagcats, self.ycats) ]
        self.anacats += ['pretag']
        print(self.anacats)
        
        dataset_axis = hist.Cat("dataset", "Primary dataset")
        cats_axis = hist.Cat("anacat", "Analysis Category")
        
        ht_axis = hist.Bin("h_ak4ht", r"AK4 Jet H_{T} [GeV]", 50, 0, 5000)
        jetmass_axis = hist.Bin("jetmass", r"Jet $m$ [GeV]", 50, 0, 500)
        jetpt_axis = hist.Bin("jetpt", r"Jet $p_{T}$ [GeV]", 50, 0, 5000)
        jetn3b1_axis = hist.Bin("n3b1", r"Jet N3", 50, 0, 1)
        ttbarmass_axis = hist.Bin("ttbarmass", r"$m_{t\bar{t}}$ [GeV]", 50, 0, 5000)
        

        self._accumulator = processor.dict_accumulator({
            'h_ak4ht'  : hist.Hist("Counts", dataset_axis, cats_axis, ht_axis),
            'ttbarmass': hist.Hist("Counts", dataset_axis, cats_axis, ttbarmass_axis),
            'jetmass':   hist.Hist("Counts", dataset_axis, cats_axis, jetmass_axis),
            'jetpt':     hist.Hist("Counts", dataset_axis, cats_axis, jetpt_axis),
            'cutflow': processor.defaultdict_accumulator(int),
        })

    @property
    def accumulator(self):
        return self._accumulator

    def process(self, df):
        output = self.accumulator.identity()

        dataset = df['dataset']
        FatJets = JaggedCandidateArray.candidatesfromcounts(
            df['nFatJet'],
            pt=df['FatJet_pt'],
            eta=df['FatJet_eta'],
            phi=df['FatJet_phi'],
            mass=df['FatJet_mass'],
            msoftdrop=df['FatJet_msoftdrop'],
            jetId=df['FatJet_jetId'],
            tau1=df['FatJet_tau1'],
            tau2=df['FatJet_tau2'],
            tau3=df['FatJet_tau3'],
            tau4=df['FatJet_tau4'],
            n3b1=df['FatJet_n3b1'],
            btagDeepB=df['FatJet_btagDeepB']
            )
        
        #weight = JaggedArray.fromcounts(
        #    np.ones_like(df['Generator_binvar'],dtype=int),
        #    df['Generator_weight']
        #)
        evtweights = df["Generator_weight"].reshape(-1, 1).flatten()
        output['cutflow']['all events'] += FatJets.size
        
        jet_id = (FatJets.jetId > 0)
        FatJets = FatJets[jet_id]
        output['cutflow']['jet id'] += jet_id.any().sum()
        
        jetkincut_index = (FatJets.pt > self.ak8PtMin) & (abs(FatJets.eta) < 2.5)
        FatJets = FatJets[ jetkincut_index ]
        
        oneFatJet = (FatJets.counts >=1)
        output['cutflow']['one FatJet'] += oneFatJet.sum()
        
        twoFatJets = (FatJets.counts >= 2)
        output['cutflow']['two FatJets'] += twoFatJets.sum()

        FatJets = FatJets[twoFatJets]
        evtweights = evtweights[twoFatJets]
        ttbarcands = FatJets[:,0:2].distincts()

        oneTTbar = (ttbarcands.counts >= 1)
        output['cutflow']['>= one oneTTbar'] += oneTTbar.sum()
        ttbarcands = ttbarcands[oneTTbar]
        evtweights = evtweights[oneTTbar]
        FatJets = FatJets[oneTTbar]

        
        dPhiCut = (ttbarcands.i0.p4.delta_phi(ttbarcands.i1.p4) > 2.1).flatten()
        output['cutflow']['dPhi > 2.1'] += dPhiCut.sum()
        ttbarcands = ttbarcands[dPhiCut]
        evtweights = evtweights[dPhiCut]
        FatJets = FatJets[dPhiCut]
        
        output['ttbarmass'].fill(dataset=dataset, anacat='pretag', 
                            ttbarmass=ttbarcands.p4.sum().mass.flatten(),
                            weight=evtweights.flatten())
        
        # Now get the analysis categories. 
        # They are (central, forward)   cross   (0b,1b,>=2b)
        cen = abs(ttbarcands.i0.p4.y - ttbarcands.i1.p4.y) < 1.0
        fwd = np.logical_not(cen)
        
        btag_i0 = (ttbarcands.i0.btagDeepB > 0.7)
        btag_i1 = (ttbarcands.i1.btagDeepB > 0.7)
        
        btag0 = np.logical_not(btag_i0) & np.logical_not(btag_i1)
        btag1 = btag_i0 ^ btag_i1
        btag2 = btag_i0 & btag_i1
        
        cat0 = (cen & btag0).flatten()
        cat1 = (fwd & btag0).flatten()
        cat2 = (cen & btag1).flatten()
        cat3 = (fwd & btag1).flatten()
        cat4 = (cen & btag2).flatten()
        cat5 = (fwd & btag2).flatten()
        
        output['cutflow']['0bcen'] += cat0.sum()
        output['cutflow']['0bfwd'] += cat1.sum()
        output['cutflow']['1bcen'] += cat2.sum()
        output['cutflow']['1bfwd'] += cat3.sum()
        output['cutflow']['2bcen'] += cat4.sum()
        output['cutflow']['2bfwd'] += cat5.sum()
        
        ttbarmass = ttbarcands.p4.sum().mass.flatten()    
        
        output['ttbarmass'].fill(dataset=dataset, anacat='0bcen', 
                            ttbarmass=ttbarmass[cat0],
                            weight=evtweights[cat0].flatten())
        
        output['ttbarmass'].fill(dataset=dataset, anacat='0bfwd', 
                            ttbarmass=ttbarmass[cat1],
                            weight=evtweights[cat1].flatten())
        
        output['ttbarmass'].fill(dataset=dataset, anacat='1bcen', 
                            ttbarmass=ttbarmass[cat2],
                            weight=evtweights[cat2].flatten())
        
        output['ttbarmass'].fill(dataset=dataset, anacat='1bfwd', 
                            ttbarmass=ttbarmass[cat3],
                            weight=evtweights[cat3].flatten())
        
        output['ttbarmass'].fill(dataset=dataset, anacat='2bcen', 
                            ttbarmass=ttbarmass[cat4],
                            weight=evtweights[cat4].flatten())
        
        output['ttbarmass'].fill(dataset=dataset, anacat='2bfwd', 
                            ttbarmass=ttbarmass[cat5],
                            weight=evtweights[cat5].flatten())
        
        return output

    def postprocess(self, accumulator):
        return accumulator

In [3]:
tstart = time.time()

qcdfiles = glob.glob('/mnt/cms-data/store/mc/RunIIFall17NanoAODv4/QCD_Pt-15to7000_TuneCP5_Flat_13TeV_pythia8/NANOAODSIM/PU2017_12Apr2018_Nano14Dec2018_102X_mc2017_realistic_v6-v1/10000/*.root')
fileset = {
    'QCD':qcdfiles # QCD_Pt-15to7000_TuneCP5_Flat_13TeV_pythia8
    #'ZZ to 4mu': [
    #    'data/ZZTo4mu.root'
    #]
}

output = processor.run_uproot_job(fileset,
                                  treename='Events',
                                  processor_instance=TTbarResProcessor(),
                                  executor=processor.futures_executor,
                                  executor_args={'workers': 4, 'flatten': True},
                                  chunksize=500000,
                                 )

elapsed = time.time() - tstart
print(output)

Preprocessing:   0%|          | 0/1 [00:00<?, ?it/s]

['0bcen', '0bfwd', '1bcen', '1bfwd', '2bcen', '2bfwd', 'pretag']


Preprocessing: 100%|██████████| 1/1 [00:00<00:00,  2.11it/s]
Processing: 100%|██████████| 42/42 [00:41<00:00,  1.02items/s]

{'h_ak4ht': <Hist (dataset,anacat,h_ak4ht) instance at 0x7f4f199acac8>, 'ttbarmass': <Hist (dataset,anacat,ttbarmass) instance at 0x7f4ede95c2e8>, 'jetmass': <Hist (dataset,anacat,jetmass) instance at 0x7f4ede95cef0>, 'jetpt': <Hist (dataset,anacat,jetpt) instance at 0x7f4ede95c358>, 'cutflow': defaultdict(<class 'int'>, {'all events': 18455107, 'jet id': 10092589, 'one FatJet': 4572164, 'two FatJets': 3566899, '>= one oneTTbar': 3566899, 'dPhi > 2.1': 1777885, '0bcen': 775, '0bfwd': 1732353, '1bcen': 30, '1bfwd': 44271, '2bcen': 0, '2bfwd': 456})}





In [4]:
stack_fill_opts = {'alpha': 0.8, 'edgecolor':(0,0,0,.5)}
stack_error_opts = {'label':'Stat. Unc.', 'hatch':'///', 'facecolor':'none', 'edgecolor':(0,0,0,.5), 'linewidth': 0}


In [5]:
import matplotlib.pyplot as plt
fig, ax = hist.plotgrid(output['ttbarmass'], row="anacat", overlay="dataset", stack=True,
                                  #fill_opts=stack_fill_opts,
                                  #error_opts=stack_error_opts,
                                 )
plt.yscale("log")
for iax in ax.flatten():
    iax.autoscale(axis='y')
fig.show()

In [6]:
print("Events/s:", output['cutflow']['all events']/elapsed)

Events/s: 442628.34385034646


In [7]:
for i,j in output['cutflow'].items():
    print( '%20s : %12d' % (i,j) )

          all events :     18455107
              jet id :     10092589
          one FatJet :      4572164
         two FatJets :      3566899
     >= one oneTTbar :      3566899
          dPhi > 2.1 :      1777885
               0bcen :          775
               0bfwd :      1732353
               1bcen :           30
               1bfwd :        44271
               2bcen :            0
               2bfwd :          456


In [9]:

#mapping = {
#    'QCD': ['QCD'],
#}
#output['ttbarmass'].group("dataset", hist.Cat("dataset", "dataset"), mapping)
#hist_noDS = output['ttbarmass_pretag'].integrate('dataset')
