# QCD Processor

In [1]:
import awkward as ak
import numpy as np
import coffea
import uproot
import hist
import vector
from coffea import util, processor
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema, BaseSchema
from distributed.diagnostics.plugin import UploadDirectory
import matplotlib.pyplot as plt
from collections import defaultdict
import os
import pickle
import correctionlib
from coffea.analysis_tools import PackedSelection
from dask.distributed import Client

To start with, we define the only correction function we will apply to our data: it returns the pileup scale factor, which is used as a per-jet weight when filling the weighted histograms. The scale factors are read from the JSON files in `data/pu_weights`.

In [2]:
def GetPUSF(IOV, nTrueInt, var='nominal'):
    """Evaluate the pileup weight correction for the given era.

    Parameters
    ----------
    IOV : str
        Era key; one of "2016APV", "2016", "2017" or "2018".
    nTrueInt : array-like
        Values passed (after conversion with ``np.array``) as the first
        input of the correction.
    var : str, optional
        Second input of the correction, ``'nominal'`` by default.

    Returns
    -------
    The result of ``correctionlib``'s ``evaluate`` for the era's correction.

    Raises
    ------
    KeyError
        If ``IOV`` is not one of the supported era keys.
    """
    # era -> (sub-directory of data/pu_weights, correction name inside the file)
    era_table = {
        "2016APV": ("2016preVFP_UL", "Collisions16_UltraLegacy_goldenJSON"),
        "2016": ("2016postVFP_UL", "Collisions16_UltraLegacy_goldenJSON"),
        "2017": ("2017_UL", "Collisions17_UltraLegacy_goldenJSON"),
        "2018": ("2018_UL", "Collisions18_UltraLegacy_goldenJSON"),
    }
    subdir, correction_name = era_table[IOV]

    json_path = f"data/pu_weights/{subdir}/puWeights.json.gz"
    correction_set = correctionlib.CorrectionSet.from_file(json_path)
    correction = correction_set[correction_name]

    return correction.evaluate(np.array(nTrueInt), var)

In [3]:
class QCDProcessor(processor.ProcessorABC):
    """Coffea processor that histograms the jet pT response (reco pT / gen pT).

    For every chunk of events it:

    1. keeps events whose generator and reconstructed primary-vertex ``z``
       differ by less than 0.2,
    2. keeps reconstructed jets with ``jetId > 0`` and events that still have
       at least one such jet,
    3. matches the first three ``GenJet`` entries of each event to the nearest
       selected reco jet (``nearest(..., threshold=0.2)``; coffea's default
       metric for ``nearest`` is delta-R),
    4. keeps events in which more than two gen jets were matched,
    5. fills weighted and unweighted histograms of the response and of
       pileup versus rho.

    The output is a dict with the keys ``pt_reco_over_gen``,
    ``pt_reco_over_gen_noweight``, ``pileup_rho``, ``pileup_rho_noweight``
    and ``cutflow``.
    """

    def __init__(self):
        # Empty accumulator kept on the instance so that ``self.hists`` and the
        # ``accumulator`` property stay available to existing callers.
        # ``process`` does NOT fill this object; it builds a fresh one per chunk.
        self.hists = self._make_accumulator()

    @staticmethod
    def _make_accumulator():
        """Return a new, empty output dict (four histograms plus the cutflow)."""

        ############################################
        ### Defining the axes for the histograms ###
        ############################################

        dataset_axis = hist.axis.StrCategory([], growth=True, name="dataset", label="Primary dataset")
        frac_axis = hist.axis.Regular(300, 0, 2.0, name="frac", label=r"Fraction")

        pt_axis = hist.axis.Variable(
            [10, 11, 12, 13, 14, 15, 17, 20, 23, 27, 30, 35, 40, 45, 57, 72, 90, 120, 150, 200, 300, 400, 550, 750,
             1000, 1500, 2000, 2500, 3000, 3500, 4000, 5000, 10000],
            name="pt", label=r"$p_{T}$ [GeV]",
        )

        pileup_axis = hist.axis.Variable([0, 10, 20, 30, 40, 50, 60, 70, 80], name="pileup", label=r"$\mu$")
        pileup_fine_axis = hist.axis.Regular(30, 0, 40, name="pileup_fine", label=r"$\mu$")
        rho_fine_axis = hist.axis.Regular(30, 0, 30, name="rho_fine", label=r"$\rho$")
        eta_axis = hist.axis.Variable(
            [0, 0.261, 0.522, 0.783, 1.044, 1.305, 1.566, 1.74, 1.93, 2.043, 2.172, 2.322, 2.5, 2.65, 2.853, 2.964,
             3.139, 3.489, 3.839, 5.191],
            name="eta", label=r"$\eta$",
        )

        ######################################
        ### Defining the histogram objects ###
        ######################################

        response_axes = (dataset_axis, pt_axis, frac_axis, eta_axis, pileup_axis)
        pileup_rho_axes = (dataset_axis, pileup_fine_axis, rho_fine_axis)

        return {
            "pt_reco_over_gen": hist.Hist(*response_axes, storage="weight", label="Counts"),
            "pt_reco_over_gen_noweight": hist.Hist(*response_axes, storage="weight", label="Counts"),
            "pileup_rho": hist.Hist(*pileup_rho_axes, storage="weight", label="Counts"),
            "pileup_rho_noweight": hist.Hist(*pileup_rho_axes, storage="weight", label="Counts"),
            "cutflow": {},
        }

    @property
    def accumulator(self):
        """The (empty) accumulator created at construction time."""
        return self.hists

    def process(self, events):
        """Fill and return the histograms for one chunk of events.

        ``events.metadata['dataset']`` is used both as the histogram category
        and as the era key passed to ``GetPUSF``, so dataset names must be
        valid era keys for that function.

        A new accumulator is created on every call. Returning a shared,
        instance-level object would double count whenever the executor reuses
        one processor instance for several chunks and then sums the results.
        """

        dataset = events.metadata['dataset']
        print(f"Processing ----- {dataset}")

        out = self._make_accumulator()
        cutflow = out["cutflow"].setdefault(dataset, defaultdict(int))
        cutflow["all events"] += len(events)

        #################################
        ### Getting gen and reco jets ###
        #################################

        # Keep events where the generated and reconstructed vertex z agree.
        gen_vtx = events.GenVtx.z
        reco_vtx = events.PV.z
        events = events[np.abs(gen_vtx - reco_vtx) < 0.2]
        cutflow["vertex matched"] += len(events)

        ### Apply jetId mask

        # The selected jets are held in a local array instead of being written
        # back with ``events.Jet = ...``: attribute assignment is not a
        # supported way to replace a record field of an awkward array, so the
        # selection could be rejected or lost when ``events`` is sliced.
        jets = events.Jet[events.Jet.jetId > 0]
        has_jet = ak.num(jets) > 0
        events = events[has_jet]
        jets = jets[has_jet]
        cutflow["with selected jet"] += len(events)

        ### Match generated jets (events.GenJet) to reconstructed jets (jets)

        genjets = events.GenJet[:, 0:3]
        recojets = genjets.nearest(jets, threshold=0.2)

        # Drop gen jets for which no reco jet was found within the threshold.
        matched = ~ak.is_none(recojets, axis=1)
        genjets = genjets[matched]
        recojets = recojets[matched]
        ptresponse = recojets.pt / genjets.pt

        # Drop any remaining missing response values.
        valid = ~ak.is_none(ptresponse, axis=1)
        genjets = genjets[valid]
        recojets = recojets[valid]
        ptresponse = ptresponse[valid]

        # Event-level requirement: more than two matched jets. Since at most
        # three gen jets are considered, this means all three were matched.
        all_matched = ak.num(ptresponse) > 2
        cutflow["three matched jets"] += int(ak.sum(all_matched))

        genjets = genjets[all_matched]
        recojets = recojets[all_matched]
        ptresponse = ptresponse[all_matched]

        ##########################################################################
        ### Getting the pileup quantities and rho (fixedGridRhoFastjetAll)     ###
        ##########################################################################

        n_pileup = events.Pileup.nPU[all_matched]
        rho = events.fixedGridRhoFastjetAll[all_matched]
        pu_nTrueInt = events.Pileup.nTrueInt[all_matched]

        ##############################################################################
        ### Broadcast across recojets, include pileup weights, and fill histograms ###
        ##############################################################################

        # Repeat each per-event value once per selected jet so that all fill
        # arrays have the same flattened length.
        n_pileup = ak.broadcast_arrays(n_pileup, recojets.pt)[0]
        rho = ak.broadcast_arrays(rho, recojets.pt)[0]
        pu_nTrueInt = ak.broadcast_arrays(pu_nTrueInt, recojets.pt)[0]

        flat_pt = ak.flatten(genjets.pt)
        flat_frac = ak.flatten(ptresponse)
        flat_abs_eta = np.abs(ak.flatten(genjets.eta))
        flat_pileup = ak.flatten(n_pileup)
        flat_rho = ak.flatten(rho)

        puWeight = GetPUSF(dataset, np.array(ak.flatten(pu_nTrueInt)))

        out["pt_reco_over_gen"].fill(dataset=dataset, pt=flat_pt, frac=flat_frac,
                                     eta=flat_abs_eta, pileup=flat_pileup, weight=puWeight)

        out["pt_reco_over_gen_noweight"].fill(dataset=dataset, pt=flat_pt, frac=flat_frac,
                                              eta=flat_abs_eta, pileup=flat_pileup)

        out["pileup_rho"].fill(dataset=dataset, rho_fine=flat_rho, pileup_fine=flat_pileup, weight=puWeight)

        out["pileup_rho_noweight"].fill(dataset=dataset, rho_fine=flat_rho, pileup_fine=flat_pileup)

        return out

    def postprocess(self, accumulator):
        """Return the merged accumulator unchanged."""
        return accumulator

In [4]:
# Alternative redirector, kept for reference:
#prependstr = "root://cmsxrootd.fnal.gov/"
prependstr = "root://xcache/"

filedir = "samples/"

filestr = "flatPU_JMENano_%s.txt"

eras = [
     '2016',
     '2016APV',
     '2017',
     '2018',
]

# Output location; the directory is created before the (long) run so that a
# missing folder cannot make the final pickle.dump fail after all the work.
fname_out = "pkl_files/QCD_pt_response_NEW.pkl"
os.makedirs(os.path.dirname(fname_out), exist_ok=True)

# Build {era: [file URLs]} from the per-era sample lists.
fileset = {}

for era in eras:
    filename = filedir + filestr % (era)
    with open(filename) as f:
        # Strip each line first so that blank lines and indented "#" comments
        # are skipped instead of turning into a bare "root://xcache/" entry.
        entries = (line.strip() for line in f)
        files = [prependstr + entry for entry in entries if entry and not entry.startswith("#")]
        fileset[era] = files

# NOTE: maxchunks=10 caps the number of chunks processed per dataset; raise or
# remove it for a full-statistics run.
futures_run = processor.Runner(
    executor = processor.FuturesExecutor(compression=None, workers=2),
    schema=NanoAODSchema,
    maxchunks=10,
    skipbadfiles=True,
)

out = futures_run(
    fileset,
    "Events",
    processor_instance=QCDProcessor()
)

with open(fname_out, "wb") as f:
    pickle.dump(out, f)

Output()

Output()

Processing ----- 2018Processing ----- 2018

Processing ----- 2018
Processing ----- 2018
Processing ----- 2018
Processing ----- 2018
Processing ----- 2018
Processing ----- 2018
Processing ----- 2018
Processing ----- 2018
Processing ----- 2017
Processing ----- 2017
Processing ----- 2017
Processing ----- 2017
Processing ----- 2017
Processing ----- 2017
Processing ----- 2017
Processing ----- 2017
Processing ----- 2017
Processing ----- 2017
Processing ----- 2016APV
Processing ----- 2016APV
Processing ----- 2016APV
Processing ----- 2016APV
Processing ----- 2016APV
Processing ----- 2016APV
Processing ----- 2016APV
Processing ----- 2016APV
Processing ----- 2016APV
Processing ----- 2016
Processing ----- 2016
Processing ----- 2016
Processing ----- 2016
Processing ----- 2016
Processing ----- 2016Processing ----- 2016

Processing ----- 2016
Processing ----- 2016
