## This notebook is for Data and MC corrections:
1. Trigger Efficiencies
2. Leptom effeciencies

## TRIGGER EFFICIENCY 

### Only for data

In [2]:
import os
import sys
import time
import gc 
import psutil
import json
from pathlib import Path

import uproot
import awkward as ak
import numpy as np

import vector
vector.register_awkward()

import dask
from dask.distributed import Client

print("All imports added")

All imports added


In [15]:
HOME_DIR = Path(os.environ.get("HOME", "/home/cms-jovyan"))
PROJECT_NAME = "H-to-WW-NanoAOD-analysis"

PROJECT_DIR = HOME_DIR / PROJECT_NAME
DATASETS_DIR = PROJECT_DIR / "Datasets"
DATA_DIR = DATASETS_DIR / "DATA"
MC_DIR = DATASETS_DIR / "MC_samples"
AUX_DIR = PROJECT_DIR / "Auxillary_files"

GOLDEN_JSON_PATH = AUX_DIR / "Cert_271036-284044_13TeV_Legacy2016_Collisions16_JSON.txt"

RUN_PERIODS_2016 = {
    "Run2016G": {"run_min": 278820, "run_max": 280385},
    "Run2016H": {"run_min": 280919, "run_max": 284044}
}

print(f"HOME_DIR:         {HOME_DIR}")
print(f"PROJECT_DIR:     {PROJECT_DIR}")
print(f"DATA_DIR:        {DATA_DIR}")
print(f"MC_DIR:          {MC_DIR}")
print(f"AUX_DIR:         {AUX_DIR}")
print(f"GOLDEN_JSON:      {GOLDEN_JSON_PATH}")
print(f"JSON exists:     {GOLDEN_JSON_PATH.exists()}")


HOME_DIR:         /home/cms-jovyan
PROJECT_DIR:     /home/cms-jovyan/H-to-WW-NanoAOD-analysis
DATA_DIR:        /home/cms-jovyan/H-to-WW-NanoAOD-analysis/Datasets/DATA
MC_DIR:          /home/cms-jovyan/H-to-WW-NanoAOD-analysis/Datasets/MC_samples
AUX_DIR:         /home/cms-jovyan/H-to-WW-NanoAOD-analysis/Auxillary_files
GOLDEN_JSON:      /home/cms-jovyan/H-to-WW-NanoAOD-analysis/Auxillary_files/Cert_271036-284044_13TeV_Legacy2016_Collisions16_JSON.txt
JSON exists:     True


In [16]:
SAMPLE_MAPPING = {
    'data' : "Data",
    # 'higgs' : "ggH_HWW",
    # 'dytoll' : "DY_to_Tau_Tau",
    # 'top' : "Top_antitop",
    # 'fakes' : "Fakes",
    # 'vz' : "Diboson",
    # 'ggww' : "ggWW",
    # 'ww' : 'WW',
    # 'vg' : 'VG'
}

def load_urls_from_files(filepath, max_files = None):
    urls = []

    if not os.path.exists(filepath):
        return urls

    with open(filepath, 'r') as f:
        for line in f:
            line = line.strip()
            if line and line.startswith('root://'):
                urls.append(line)
                if max_files and len(urls) >= max_files:
                    break
    return urls

def load_all_files(data_dir, mc_dir, max_per_sample = None):

    files_dict = {}

    for directory in [data_dir, mc_dir]:
        if not os.path.exists(directory):
            continue

        for filename in os.listdir(directory):
            if not filename.endswith(".txt"):
                continue

            filepath = os.path.join(directory, filename)
            filename_lower = filename.lower().replace('.txt', '')

            label = None

            for pattern, sample_label in SAMPLE_MAPPING.items():
                if pattern in filename_lower:
                    label = sample_label
                    break

            if not label:
                print(f" unknown file: {filename}- skipping")
                continue

            urls = load_urls_from_files(filepath, max_per_sample)

            if urls: 
                if label in files_dict:
                    files_dict[label].extend(urls)
                else:
                    files_dict[label] =urls

    return files_dict

files = load_all_files(DATA_DIR, MC_DIR, max_per_sample= 1)
# files = load_all_files(DATA_DIR, MC_DIR)

print("\n" + "="*70)
print("FILES TO PROCESS")
print("="*70)
total = 0
for label, urls in files.items():
    print(f"{label:20s}: {len(urls):4d} files")
    total += len(urls)
print("_"*70)
print(f"{'TOTAL':20s}: {total:4d} files")
print("="*70)

 unknown file: VG.txt- skipping
 unknown file: Higgs.txt- skipping
 unknown file: WW.txt- skipping
 unknown file: Fakes.txt- skipping
 unknown file: VZ.txt- skipping
 unknown file: DYtoLL.txt- skipping
 unknown file: ggWW.txt- skipping
 unknown file: Top.txt- skipping

FILES TO PROCESS
Data                :    1 files
______________________________________________________________________
TOTAL               :    1 files


In [17]:
def load_golden_json(json_input, run_periods=None):
    """
    Load golden JSON from either a file path (str) or a dict.
    """
    
    if isinstance(json_input, str):
        with open(json_input, 'r') as f:
            golden_json = json.load(f)
    elif isinstance(json_input, dict):
        golden_json = json_input
    else:
        raise TypeError(f"Expected str or dict, got {type(json_input)}")
    
    valid_lumis = {}
    for run_str, lumi_ranges in golden_json.items():
        run = int(run_str)
        
        # Filter by run periods 
        if run_periods is not None: 
            in_period = any(
                period['run_min'] <= run <= period['run_max']
                for period in run_periods.values()
            )
            if not in_period:
                continue
        
        valid_lumis[run] = [tuple(lr) for lr in lumi_ranges]
    
    return valid_lumis


def apply_json_mask(arrays, json_input, run_periods=None):

    valid_lumis = load_golden_json(json_input, run_periods)
    
    runs = ak.to_numpy(arrays.run)
    lumis = ak.to_numpy(arrays.luminosityBlock)
    
    mask = np. zeros(len(runs), dtype=bool)
    
    for run, lumi_ranges in valid_lumis.items():
        run_mask = (runs == run)
        
        if not np.any(run_mask):
            continue
        
        # Check lumi sections 
        run_lumis = lumis[run_mask]
        run_lumi_mask = np.zeros(len(run_lumis), dtype=bool)
        
        for lumi_start, lumi_end in lumi_ranges: 
            run_lumi_mask |= (run_lumis >= lumi_start) & (run_lumis <= lumi_end)
        
        mask[run_mask] = run_lumi_mask
    
    return ak.Array(mask)

In [5]:
#get name of the branch required for trigger efficiency 

# DATA
root_file_name = "root://eospublic.cern.ch//eos/opendata/cms/Run2016G/MuonEG/NANOAOD/UL2016_MiniAODv2_NanoAODv9-v1/120000/2ADBED61-A06A-D64B-BE90-E9B267D15700.root"

#  MC 

# file_url = "root://eospublic.cern.ch//eos/opendata/cms/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v1/40000/14B6A8AE-C9FE-D744-80A4-DDE5D008C1CD.root"

with uproot.open(root_file_name) as file:
        # Access the Events tree
        if "Events" not in file:
            print("Error: 'Events' tree not found in file.")
        else:
            tree = file["Events"]
            branches = tree.keys()
            
            print(f"\nConnection Successful!")
            print(f"Total Branches found: {len(branches)}")
            print("=" * 60)
            
            # Print all branches alphabetically
            for branch in sorted(branches):
                if "HLT_Mu12_TrkIsoVVL_Ele23_CaloIdL_TrackIdL_IsoVL_DZ" in branch or "HLT_Mu23_TrkIsoVVL_Ele12_CaloIdL_TrackIdL_IsoVL_DZ"  in branch:
                    print(branch)



Connection Successful!
Total Branches found: 1380
HLT_Mu12_TrkIsoVVL_Ele23_CaloIdL_TrackIdL_IsoVL_DZ
HLT_Mu23_TrkIsoVVL_Ele12_CaloIdL_TrackIdL_IsoVL_DZ


In [13]:
import time
import uproot

Batch_size = 1_250_000

def load_events(file_url, batch_size=1_250_000, timeout=600, max_retries=3, retry_wait=10, is_data=False):
    columns = [
        "Electron_pt", "Electron_eta", "Electron_phi", "Electron_mass", 
        "Electron_mvaFall17V2Iso_WP90", "Electron_charge",
        
        "Muon_pt", "Muon_eta", "Muon_phi", "Muon_mass", 
        "Muon_tightId", "Muon_charge", "Muon_pfRelIso04_all",
        "PuppiMET_pt", "PuppiMET_phi",
        
        "Jet_pt", "Jet_eta", "Jet_phi", "Jet_mass",
        "Jet_btagDeepFlavB", "nJet", "Jet_jetId", "Jet_puId",

        "HLT_Mu12_TrkIsoVVL_Ele23_CaloIdL_TrackIdL_IsoVL_DZ",
        "HLT_Mu23_TrkIsoVVL_Ele12_CaloIdL_TrackIdL_IsoVL_DZ"
    ]

    if is_data:
        columns.extend(["run", "luminosityBlock"])
    else:
        columns.append("genWeight")
        
    for attempt in range(max_retries):
        try:
            with uproot.open(file_url, timeout=timeout) as f:
                tree = f['Events']
                
                # Iterate through file chunks
                for arrays in tree.iterate(columns, step_size=batch_size, library="ak"):
                    yield arrays
                
                # If iteration finishes successfully, exit the function
                return
                
        except (TimeoutError, OSError, IOError, ConnectionError) as e:
            error_type = type(e).__name__
            file_name = file_url.split('/')[-1]
            
            if attempt < max_retries - 1:
                print(f"      {error_type} on {file_name}")
                print(f"       Retry {attempt+1}/{max_retries-1} in {retry_wait}s...")
                time.sleep(retry_wait)
            else:
                print(f"     FAILED after {max_retries} attempts: {file_name}")
                print(f"       Error: {str(e)[:100]}")
                raise
                
        except Exception as e:
            file_name = file_url.split('/')[-1]
            print(f"     Unexpected error on {file_name}: {str(e)[:100]}")
            raise

## TRIGGER PART

Trigger efficiency = $\frac{denominator + Trigger cut}{\#\ of\ events\ after\ passing\ preselection}$

> Preselection inlcudes:
> 1. 2 leptons
> 2. lepton ID (Electron \& Muon)
> 3. |$\eta$| < 2.5
> 4. pT requirement: lead >25 and sublead > 13

In [18]:
def preselection_passed_events():
    
    def select_tight_leptons(arrays, met_arrays):

        tight_electron_mask = arrays.Electron_mvaFall17V2Iso_WP90 ==1
        tight_muon_mask = (arrays.Muon_tightId == 1) & (arrays.Muon_pfRelIso4_all < 0.15)
    
        tight_electrons = ak.zip({
            "pt": arrays.Electron_pt[tight_electron_mask],
            "eta": arrays.Electron_eta[tight_electron_mask],
            "phi": arrays.Electron_phi[tight_electron_mask],
            "mass": arrays.Electron_mass[tight_electron_mask],
            "charge": arrays.Electron_charge[tight_electron_mask],
            "flavor": ak.ones_like(arrays.Electron_pt[tight_electron_mask]) * 11
        })
        
        tight_muons = ak.zip({
            "pt": arrays.Muon_pt[tight_muon_mask],
            "eta": arrays.Muon_eta[tight_muon_mask],
            "phi": arrays.Muon_phi[tight_muon_mask],
            "mass": arrays.Muon_mass[tight_muon_mask],
            "charge": arrays.Muon_charge[tight_muon_mask],
            "flavor": ak.ones_like(arrays.Muon_pt[tight_muon_mask]) * 13
        })
    
        tight_leptons = ak.concatenate([tight_electrons, tight_muons], axis = 1)

    return tight_leptons, tight_electron_mask, tight_muon_mask


    def select_e_mu_events(tight_leptons, met arrays):

        sorted_leptons = tight_leptons[ak.argsort(tight_leptons.pt, ascending = False)]

        mask_2lep = ak.num(sorted_leptons) == 2

        events_2lep = sorted_leptons[mask_2lep]
        met_2lep = met_arrays[mask_2lep]


        if len(events_2lep) ==0:
            return None, None, {}, None

        leading = events_2lep[:,0]
        subleading = events_2lep[:,1]

        mask_1e1mu = ((leading.flavor == 13) & (subleading == 11)) | ((leading.flavor ==11) & (subleading.flavor == 13))

        mask_opposite_charge = leading.charge*subleading.charge <0

        mask_pt = (leading.pt>25) & (subleading.pt > 13)

        mask_eta = (abs(leadin.eta)<2.5) & (abs(subleading.eta) < 2.5)

        final_mask = mask_1e1mu & mask_opposite_charge & mask_pt & mask_eta

    return final_mask



SyntaxError: invalid syntax (3080967479.py, line 31)

In [None]:
def HLT_passed_events(arrays, met_arra)