## X-Sec Weights Factors Calculation for H $\to$ WW Analysis

## Imports

In [1]:
try: 
    import uproot
    import awkward
    import numpy as np
    import os
    import time
    import dask
    from dask import delayed, compute
    from dask.distributed import Client, progress
    from pathlib import Path
    from tqdm import tqdm 
    import pandas as pd
    from datetime import datetime
    import gc

    print("All imports done!")

except Exception as e:
    print(f"Error: {e}")
    




All imports done!


In [2]:
BASE_DIR = "/home/cms-jovyan/H-to-WW-NanoAOD-analysis/Datasets/MC_samples"
os.chdir(BASE_DIR)
for files in os.listdir():
    if files.endswith("txt"):
        print(f"-{files}")

-VG.txt
-Higgs.txt
-WW.txt
-Fakes.txt
-VZ.txt
-DYtoLL.txt
-ggWW.txt
-Top.txt


# Step 1: Find sum of gen weights

In [4]:
import dask
import dask.bag as db
from dask.distributed import Client
import uproot
import numpy as np
import json
import time

def identify_process(filename):
    """
    Returns the unique process name based on the filename string.
    This ensures we split 'Top.txt' into TTTo2L, ST_t-channel, etc.
    """
    fn = filename
    
    # 1. Drell-Yan
    if "DYJetsToLL_M-50" in fn: return "DYJetsToLL_M-50"
    
    # 2. Top (Exact Matches)
    if "TTTo2L2Nu" in fn:            return "TTTo2L2Nu"
    if "ST_t-channel_top" in fn:     return "ST_t-channel_top"
    if "ST_t-channel_antitop" in fn: return "ST_t-channel_antitop"
    if "ST_tW_antitop" in fn:        return "ST_tW_antitop"
    if "ST_tW_top" in fn:            return "ST_tW_top"
    if "ST_s-channel" in fn:         return "ST_s-channel"
    
    # 3. Fakes (W+Jets & Semileptonic Top)
    if "WJetsToLNu" in fn:       return "WJetsToLNu"
    if "TTToSemiLeptonic" in fn: return "TTToSemiLeptonic"
    
    # 4. Diboson (Split WZ and ZZ)
    if "WZTo2Q2L" in fn: return "WZTo2Q2L"
    if "WZTo3LNu" in fn: return "WZTo3LNu"
    if "ZZ_TuneCP5" in fn:       return "ZZ" 
    
    # 5. ggWW (Continuum Background)
    if "GluGluToWW" in fn: return "GluGluToWW"
    
    # 6. WW (Signal-like Background)
    if "WWTo2L2Nu" in fn: return "WWTo2L2Nu"
    
    # 7. VG (Vector Boson + Gamma)
    if "ZGToLLG" in fn:  return "ZGToLLG"
    if "WGToLNuG" in fn: return "WGToLNuG"

    # 8. Higgs Signal (Assuming standard naming if present)
    if "GluGluHToWW" in fn: return "Higgs"
    
    return "Unknown"

# ==============================================================================
# 2. WORKER FUNCTION (Run on Dask)
# ==============================================================================
def process_root_file(file_path):
    """
    Opens file, identifies process, returns (process_name, sum_weight)
    """
    process_name = identify_process(file_path)
    
    # Skip unknown files or Data (Data has no genWeight)
    if process_name == "Unknown" or "Run2016" in file_path:
        return (None, 0.0)

    try:
        with uproot.open(file_path) as f:
            # FAST METHOD: Use Runs tree
            if "Runs" in f:
                w = np.sum(f["Runs"]["genEventSumw"].array())
                return (process_name, float(w))
            
            elif "Events" in f:
                w = np.sum(f["Events"]["genWeight"].array())
                return (process_name, float(w))
            
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return (None, 0.0)
        
    return (None, 0.0)

# ==============================================================================
# 3. MAIN EXECUTION
# ==============================================================================
def calculate_weights_per_process(txt_files):
    
    # Start Dask Client
    client = Client("tls://localhost:8786")
    print(f"Dask Dashboard: {client.dashboard_link}")
    
    all_file_paths = []
    for txt in txt_files:
        with open(txt, 'r') as f:
            lines = [line.strip() for line in f if line.strip()]
            all_file_paths.extend(lines)
            
    print(f"Total files to scan: {len(all_file_paths)}")
    
    bag = db.from_sequence(all_file_paths, partition_size=20)
    results = bag.map(process_root_file).compute()
    
    # 3. Aggregate Results locally
    final_sums = {}
    
    for proc_name, weight in results:
        if proc_name is None: continue
        
        if proc_name not in final_sums:
            final_sums[proc_name] = 0.0
        final_sums[proc_name] += weight
        
    # 4. Print & Save
    print("\n" + "="*50)
    print("FINAL SUM OF WEIGHTS (PER PROCESS)")
    print("="*50)
    
    # Print formatted for Python Dictionary copy-paste
    print("sample_weights = {")
    for proc, weight in sorted(final_sums.items()):
        print(f'    "{proc}": {weight:.6f},')
    print("}")
    
    # Save to JSON for safety
    with open("detailed_sum_weights.json", "w") as f:
        json.dump(final_sums, f, indent=4)

    client.close()

# ==============================================================================
# RUN IT
# ==============================================================================
if __name__ == "__main__":
    txt_files = [
        "VG.txt", "Higgs.txt", "WW.txt", "Fakes.txt",
        "VZ.txt", "DYtoLL.txt", "ggWW.txt", "Top.txt"
    ]
    
    start_time = time.time()
    calculate_weights_per_process(txt_files)
    print(f"\nTime taken: {time.time() - start_time:.2f}s")

Dask Dashboard: /user/anujraghav.physics@gmail.com/proxy/8787/status
Total files to scan: 728

FINAL SUM OF WEIGHTS (PER PROCESS)
sample_weights = {
    "DYJetsToLL_M-50": 82448537.000000,
    "GluGluToWW": 17662000.000000,
    "ST_s-channel": 19429336.178924,
    "ST_t-channel_antitop": 1522100315.652050,
    "ST_t-channel_top": 6703802049.125998,
    "ST_tW_antitop": 27306324.658000,
    "ST_tW_top": 20635251.100800,
    "TTTo2L2Nu": 3140127171.474801,
    "TTToSemiLeptonic": 43548253725.284019,
    "WGToLNuG": 3353413.000000,
    "WJetsToLNu": 9697410121705.164062,
    "WWTo2L2Nu": 32147079.594800,
    "WZTo2Q2L": 129756627.882064,
    "WZTo3LNu": 4077550.631840,
    "ZGToLLG": 3106465270.711500,
    "ZZ": 1151000.000000,
}

Time taken: 185.02s
