## Cross-Section Weight Factor Calculation for the H $\to$ WW Analysis

## Imports

In [1]:
try:
    import uproot
    import awkward
    import numpy as np
    import os
    import time
    import dask
    from dask import delayed, compute
    from dask.distributed import Client, progress
    from pathlib import Path
    from tqdm import tqdm
    import pandas as pd
    from datetime import datetime
    import gc

    print("All imports done!")

except ImportError as e:
    # Name the missing dependency, then re-raise: continuing with a
    # half-populated namespace only defers the failure to a later cell,
    # where it shows up as an unrelated NameError.
    print(f"Error: {e}")
    raise
    




All imports done!


In [2]:
# Directory holding the per-sample "<sample>.txt" lists of ROOT files.
BASE_DIR = "/home/cms-jovyan/H-to-WW-NanoAOD-analysis/Datasets/MC_samples"

# Later cells open the sample lists by bare file name, so work from BASE_DIR.
os.chdir(BASE_DIR)

# Show which sample lists are available in the working directory.
for files in filter(lambda entry: entry.endswith("txt"), os.listdir()):
    print(f"-{files}")

-VG.txt
-Higgs.txt
-WW.txt
-Fakes.txt
-VZ.txt
-DYtoLL.txt
-ggWW.txt
-Top.txt


# Step 1: Compute the sum of genWeights for each sample

In [3]:
def extract_and_save_genweight(txt_file, output_file, output, sample_name=None):
    """Collect every event's ``genWeight`` for one sample and store it in ``output``.

    Parameters
    ----------
    txt_file : str
        Text file with one ROOT file path per line; blank lines are ignored.
    output_file : str
        Not used by this function; kept so existing callers keep working.
    output : mapping-like
        Open, writable container (the caller passes an ``uproot.recreate``
        handle). The weights are stored as
        ``output[sample_name] = {"<sample_name>_genWeight": array}``.
    sample_name : str, optional
        Name of the output tree. Defaults to ``txt_file`` with ``'.txt'`` removed.

    Returns
    -------
    tuple
        ``(sample_name, sum_gen_weight, n_events, n_files_processed, elapsed_time)``.
        ``sum_gen_weight`` is a Python float accumulated in double precision.

    Notes
    -----
    A file that raises while being read is reported and skipped, so one bad
    file does not abort the sample. Files without an ``Events`` tree are
    skipped silently and are not counted in ``n_files_processed``.
    """
    start_time = time.time()

    if sample_name is None:
        sample_name = txt_file.replace('.txt', '')

    with open(txt_file) as listing:
        root_files = [line.strip() for line in listing if line.strip()]

    sum_gen_weight = 0.0
    n_events = 0
    n_files_processed = 0
    all_genweights = []

    print(f"Processing {sample_name}")

    for i, file in enumerate(root_files, 1):
        try:
            with uproot.open(file) as root_file:
                if "Events" not in root_file:
                    continue

                tree = root_file['Events']

                for data in tree.iterate("genWeight", step_size="100MB"):
                    weights = data["genWeight"].to_numpy()
                    # Accumulate in float64: np.sum otherwise works in the
                    # array's own dtype, and a float32 total can no longer
                    # resolve single units once it exceeds 2**24.
                    sum_gen_weight += float(np.sum(weights, dtype=np.float64))
                    n_events += len(weights)
                    all_genweights.append(weights)

            n_files_processed += 1
            if i % 50 == 0:
                print(f"  {sample_name}: processed {i}/{len(root_files)} files")

        except Exception as e:
            # Best effort: report the failing file and carry on with the rest.
            print(f" Error in {sample_name}/{file}: {e}")

    elapsed_time = time.time() - start_time
    print(f" Completed: {sample_name} ({n_files_processed} files) - Time: {elapsed_time:.2f}s ({elapsed_time/60:.2f}min)")

    if all_genweights:
        # Write the whole sample as a single tree with one branch.
        combined_genweights = np.concatenate(all_genweights)
        print(f"Total events: {len(combined_genweights)}")
        print(f"Sum of genWeights: {sum_gen_weight:.6e}")

        branch_name = f"{sample_name}_genWeight"
        output[sample_name] = {branch_name: combined_genweights}
        print(f"Saved tree: {sample_name}\n")
    else:
        print(f" No genWeight\n")

    return sample_name, sum_gen_weight, n_events, n_files_processed, elapsed_time


def save_all_samples(txt_files, output_file="/home/cms-jovyan/H-to-WW-NanoAOD-analysis/Auxillary_root_files/genWeight.root"):
    """Run ``extract_and_save_genweight`` for each sample list into one ROOT file.

    ``output_file`` is created (or overwritten) with ``uproot.recreate`` and
    stays open while the sample lists in ``txt_files`` are processed in order.
    A timing summary is printed at the end; nothing is returned.
    """
    began = time.time()

    with uproot.recreate(output_file) as root_out:
        for sample_list in txt_files:
            extract_and_save_genweight(sample_list, output_file, root_out)

    elapsed_time = time.time() - began

    # Same four summary lines as before, emitted with a single print call.
    print(
        f"\n{'='*60}",
        f"All samples saved to {output_file}",
        f"Total time: {elapsed_time:.2f}s ({elapsed_time/60:.2f}min)",
        f"{'='*60}",
        sep="\n",
    )


# Sample lists to process, resolved relative to the current working directory.
txt_files = [
    "VG.txt",
    "Higgs.txt",
    "WW.txt",
    "Fakes.txt",
    "VZ.txt",
    "DYtoLL.txt",
    "ggWW.txt",
    "Top.txt",
]

# Wall-clock time for the whole run, measured outside save_all_samples().
total_start_time = time.time()
save_all_samples(
    txt_files,
    output_file="/home/cms-jovyan/H-to-WW-NanoAOD-analysis/Auxillary_root_files/genWeight.root",
)
total_time = time.time() - total_start_time

print(
    f"\n{'='*60}",
    f"FINAL Total processing time: {total_time:.2f}s ({total_time/60:.2f}min)",
    f"{'='*60}",
    sep="\n",
)

Processing VG
 Completed: VG (32 files) - Time: 132.93s (2.22min)
 Total events: 34915878
 Sum of genWeights: 3.109819e+09
 ✓ Saved tree: VG

Processing Higgs
 Completed: Higgs (40 files) - Time: 85.66s (1.43min)
 Total events: 2946000
 Sum of genWeights: 6.328183e+07
 ✓ Saved tree: Higgs

Processing WW
 Completed: WW (7 files) - Time: 17.35s (0.29min)
 Total events: 2900000
 Sum of genWeights: 3.214710e+07
 ✓ Saved tree: WW

Processing Fakes
  Fakes: processed 50/206 files
  Fakes: processed 100/206 files
  Fakes: processed 150/206 files
  Fakes: processed 200/206 files
 Completed: Fakes (206 files) - Time: 803.50s (13.39min)
 Total events: 225680227
 Sum of genWeights: 9.740959e+12
 ✓ Saved tree: Fakes

Processing VZ
  VZ: processed 50/73 files
 Completed: VZ (73 files) - Time: 157.47s (2.62min)
 Total events: 15551954
 Sum of genWeights: 1.349852e+08
 ✓ Saved tree: VZ

Processing DYtoLL
  DYtoLL: processed 50/61 files
 Completed: DYtoLL (61 files) - Time: 227.86s (3.80min)
 Total ev

# Step 2: Compute the cross-section weight per unit genWeight for each process

In [3]:
# Per-process inputs for the normalisation:
#   xsec      - cross-section (presumably in pb, to match Luminosity_pb; TODO confirm)
#   genWeight - sum of generator weights for the sample, as printed in Step 1
sample_info = {
    "Higgs": dict(xsec=1.0315, genWeight=63_281_828.0),
    "DYtoLL": dict(xsec=6189.39, genWeight=82_448_512.0),
    "Top": dict(xsec=232.58, genWeight=11_433_399_296.0),
    "Fakes": dict(xsec=61_891.05, genWeight=9_740_958_564_352.0),
    "VZ": dict(xsec=26.54765, genWeight=134_985_184.0),
    "ggWW": dict(xsec=5.7483, genWeight=17_662_000.0),
    "WW": dict(xsec=12.178, genWeight=32_147_096.0),
    "VG": dict(xsec=464.101, genWeight=3_109_819_392.0),
}

# Integrated luminosity the weights are normalised to, in pb^-1.
Luminosity_pb = 1_000

In [4]:
def get_xsec_weight(sample_info, Luminosity_pb):
    """Return ``{sample: xsec * Luminosity_pb / genWeight}`` for every sample.

    ``sample_info`` maps a sample name to a dict with the keys ``'xsec'`` and
    ``'genWeight'``; the result keeps the same key order.
    """
    return {
        name: (entry['xsec'] * Luminosity_pb) / entry['genWeight']
        for name, entry in sample_info.items()
    }

In [5]:
# Weight per unit genWeight for each sample; the bare name below displays it.
xsec_weight_dict = get_xsec_weight(sample_info=sample_info, Luminosity_pb=Luminosity_pb)
xsec_weight_dict

{'Higgs': 1.630009803130213e-05,
 'DYtoLL': 0.07506975990057892,
 'Top': 2.0342156691874536e-05,
 'Fakes': 6.3536919483978106e-06,
 'VZ': 0.00019667084352013034,
 'ggWW': 0.00032546144264522704,
 'WW': 0.0003788211538609895,
 'VG': 0.00014923728406668834}

In [41]:
# Print the weights as an aligned table.
# The header takes the luminosity from Luminosity_pb (1000 pb^-1 = 1 fb^-1),
# the value the weights were computed with; it previously claimed 1 pb^-1.
max_len = max((len(s) for s in xsec_weight_dict), default=0)
print(f"The x-sec Weights/genWeight for {Luminosity_pb} pb^-1 are \n")
for sample, weight in xsec_weight_dict.items():
    print(f"{sample:<{max_len}} : {weight:.9f}")


The x-sec Weights/genWeight for 1pb^-1 are 

Higgs  : 0.000016300
DYtoLL : 0.075069760
Top    : 0.000020342
Fakes  : 0.000006354
VZ     : 0.000196671
ggWW   : 0.000325461
WW     : 0.000378821
VG     : 0.000149237


In [42]:
# Show the kernel's current working directory; relative paths in this notebook resolve against it.
!pwd

/home/cms-jovyan/H-to-WW-NanoAOD-analysis/Datasets/MC_samples


In [43]:
# List the trees stored in the genWeight file.
# Open it by the absolute path that save_all_samples() writes to: the working
# directory is the MC_samples folder, so the bare name "genWeight.root" would
# resolve to a different (possibly stale or missing) file.
with uproot.open("/home/cms-jovyan/H-to-WW-NanoAOD-analysis/Auxillary_root_files/genWeight.root") as f:
    for keys in f.keys():
        print(f"-{keys}")

-VG;1
-Higgs;1
-WW;1
-Fakes;1
-VZ;1
-DYtoLL;1
-ggWW;1
-Top;1


### Step 2a: Write per-event cross-section weights to `xsec_weight.root`

In [6]:
# Weight per unit genWeight for each sample, hard-coded from the values
# displayed earlier in this notebook.
xsec_weight_dict = dict(
    Higgs=1.630009803130213e-05,
    DYtoLL=0.07506975990057892,
    Top=2.0342156691874536e-05,
    Fakes=6.3536919483978106e-06,
    VZ=0.00019667084352013034,
    ggWW=0.00032546144264522704,
    WW=0.0003788211538609895,
    VG=0.00014923728406668834,
)

# Sum of generator weights for each sample, as reported by Step 1.
sum_genweight_dict = dict(
    VG=3_109_819_392.0,
    Higgs=63_281_828.0,
    WW=32_147_096.0,
    Fakes=9_740_958_564_352.0,
    VZ=134_985_184.0,
    DYtoLL=82_448_512.0,
    ggWW=17_662_000.0,
    Top=11_433_399_296.0,
)

# Build xsec_weight.root: for every sample, read the "<sample>_genWeight"
# branch from genWeight.root chunk by chunk and write a tree named after the
# sample with three branches:
#   genWeight     - the per-event generator weight as read
#   xsec_Weight   - genWeight * xsec_weight_dict[sample]
#   sum_genWeight - sum_genweight_dict[sample], repeated for every event
print("="*60)
print("Creating xsec_weight.root with multiple branches")
print("="*60)


# Order in which the sample trees are written to the output file.
sample_order = ['WW', 'Higgs', 'ggWW', 'VZ', 'VG', 'DYtoLL', 'Top', 'Fakes']

with uproot.open("/home/cms-jovyan/H-to-WW-NanoAOD-analysis/Auxillary_root_files/genWeight.root") as f_in:
    with uproot.recreate("/home/cms-jovyan/H-to-WW-NanoAOD-analysis/Auxillary_root_files/xsec_weight.root") as f_out:

        for sample_name in sample_order:

            # Keys in the input file carry a ";<cycle>" suffix (e.g. "VG;1"),
            # so compare only the part before the semicolon.
            tree_name = None
            for tname in f_in.keys():
                if tname.split(';')[0] == sample_name:
                    tree_name = tname
                    break

            # NOTE(review): a sample with no tree in the input is skipped
            # without any message.
            if tree_name is None:
                continue

            print(f"\nProcessing: {sample_name}")

            if sample_name not in xsec_weight_dict or sample_name not in sum_genweight_dict:
                print(f"  Skipping (missing from dictionary)")
                continue

            xsec_weight = xsec_weight_dict[sample_name]
            sum_genweight = sum_genweight_dict[sample_name]

            print(f" xsec_weight: {xsec_weight:.6e}")
            print(f" sum_genWeight: {sum_genweight:.2f}")

            branch_name = f"{sample_name}_genWeight"
            tree = f_in[tree_name]

            n_events = 0
            n_chunks = 0
            first_chunk = True


            # Per-sample read step; presumably smaller steps are used for the
            # larger samples to bound peak memory - TODO confirm.
            if sample_name == 'Fakes':
                chunk_size = "5MB"
            elif sample_name in ['Top', 'DYtoLL']:
                chunk_size = "10MB"
            else:
                chunk_size = "25MB"

            for chunk in tree.iterate(branch_name, step_size=chunk_size):
                n_chunks += 1
                genweight_chunk = chunk[branch_name].to_numpy()
                chunk_len = len(genweight_chunk)


                if sample_name == 'Fakes':
                    # Fakes is stored as float32 to save memory; the scaling
                    # itself is done in float64 and the product is cast back.
                    genweight_chunk = genweight_chunk.astype(np.float32)
                    scaled_chunk = (genweight_chunk.astype(np.float64) * xsec_weight).astype(np.float32)
                else:
                    scaled_chunk = genweight_chunk * xsec_weight

                # Constant column: the sample-wide sum repeated once per event
                # (float32 for Fakes, float64 for every other sample).
                if sample_name == 'Fakes':
                    sum_chunk = np.full(chunk_len, sum_genweight, dtype=np.float32)
                else:
                    sum_chunk = np.full(chunk_len, sum_genweight, dtype=np.float64)

                n_events += chunk_len

                # Branches written for this chunk; all three have chunk_len entries.
                branches = {
                    "genWeight": genweight_chunk,
                    "xsec_Weight": scaled_chunk,
                    "sum_genWeight": sum_chunk
                }

                try:
                    if first_chunk:
                        # Assigning the dict creates the tree from the first chunk ...
                        f_out[sample_name] = branches
                        first_chunk = False
                        print(f"  Created tree (chunk {n_chunks}, {chunk_len} events)")
                    else:
                        # ... and every later chunk is appended to that tree.
                        f_out[sample_name].extend(branches)
                        if n_chunks % 50 == 0:
                            print(f"  Processed chunk {n_chunks} ({chunk_len:,} events, {n_events:,} total)")
                except Exception as e:
                    # Log which write failed, then let the error propagate.
                    print(f"  ERROR writing chunk: {e}")
                    raise

                # Drop every reference to this chunk's arrays before the next read.
                del genweight_chunk
                del scaled_chunk
                del sum_chunk
                del branches
                del chunk

                # Run a full garbage collection after every chunk.
                gc.collect()


                # NOTE(review): redundant - the unconditional gc.collect()
                # above has already run for this chunk.
                if sample_name == 'Fakes' and n_chunks % 10 == 0:
                    gc.collect()

            print(f" Completed: {n_events:,} events in {n_chunks} chunks")

# NOTE(review): printed unconditionally, even if every sample was skipped.
print("\n" + "="*60)
print("Successfully saved to: xsec_weight.root")
print("="*60)

Creating xsec_weight.root with multiple branches

Processing: WW
 xsec_weight: 3.788212e-04
 sum_genWeight: 32147096.00
  Created tree (chunk 1, 2900000 events)
 Completed: 2,900,000 events in 1 chunks

Processing: Higgs
 xsec_weight: 1.630010e-05
 sum_genWeight: 63281828.00
  Created tree (chunk 1, 2946000 events)
 Completed: 2,946,000 events in 1 chunks

Processing: ggWW
 xsec_weight: 3.254614e-04
 sum_genWeight: 17662000.00
  Created tree (chunk 1, 17662000 events)
 Completed: 17,662,000 events in 1 chunks

Processing: VZ
 xsec_weight: 1.966708e-04
 sum_genWeight: 134985184.00
  Created tree (chunk 1, 15551954 events)
 Completed: 15,551,954 events in 1 chunks

Processing: VG
 xsec_weight: 1.492373e-04
 sum_genWeight: 3109819392.00
  Created tree (chunk 1, 34915878 events)
 Completed: 34,915,878 events in 1 chunks

Processing: DYtoLL
 xsec_weight: 7.506976e-02
 sum_genWeight: 82448512.00
  Created tree (chunk 1, 82448537 events)
 Completed: 82,448,537 events in 1 chunks

Processing: 

In [57]:
# Directory that holds the auxiliary ROOT files written by this notebook.
ROOT_DIR = "/home/cms-jovyan/H-to-WW-NanoAOD-analysis/Auxillary_root_files"

# Print the name of every entry in ROOT_DIR that ends in "root".
for files in (entry for entry in os.listdir(ROOT_DIR) if entry.endswith("root")):
    print(files)

xsec_weight.root
genWeight.root


In [60]:
def get_info(sample_name, root_file="/home/cms-jovyan/H-to-WW-NanoAOD-analysis/Auxillary_root_files/xsec_weight.root"):
    """Print a summary of one sample tree stored in the cross-section weight file.

    Parameters
    ----------
    sample_name : str
        Name of the tree to read (for example ``'WW'``).
    root_file : str, optional
        ROOT file to read from. Defaults to the ``xsec_weight.root`` path this
        notebook writes.

    The function prints the ``genWeight`` and ``xsec_Weight`` arrays and the
    first entry of ``sum_genWeight`` between two separator lines. It returns
    ``None``.
    """
    with uproot.open(root_file) as x:
        tree = x[sample_name]
        gw = tree['genWeight'].array()
        xw = tree['xsec_Weight'].array()
        # Only the first entry is reported, so read one entry instead of
        # loading the whole branch into memory.
        sum_gw = tree['sum_genWeight'].array(entry_stop=1)[0]

        print("_" * 120)
        print(f"# For {sample_name}:\n")
        print(f"-GenWeights are {gw}")
        print(f"-X-sec Weights are {xw}")
        print(f"-The sum of genWeights is {sum_gw}")
        print("_" * 120)


In [61]:
# Print the stored weights for every sample tree, in the order they were written.
samples = ['WW', 'Higgs', 'ggWW', 'VZ', 'VG', 'DYtoLL', 'Top', 'Fakes']
print("NOTE: These xsec weights are for 1 fb^-1 luminosity.\n\n")
for sample in samples:
    get_info(sample)

NOTE: These xsec weights are for 1 fb^-1 luminosity.


________________________________________________________________________________________________________________________
# For WW:

-GenWeights are [11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, ... 11.1, 11.1, 11.1, 11.1, 11.1, 11.1]
-X-sec Weights are [0.00422, 0.00422, 0.00422, 0.00422, 0.00422, ... 0.00422, 0.00422, 0.00422, 0.00422]
-The sum of genWeights is 32147096.0
________________________________________________________________________________________________________________________
________________________________________________________________________________________________________________________
# For Higgs:

-GenWeights are [21.5, 21.5, 21.5, 21.5, 21.5, 21.5, 21.5, ... 21.5, 21.5, 21.5, 21.5, 21.5, 21.5]
-X-sec Weights are [0.00035, 0.00035, 0.00035, 0.00035, 0.00035, ... 0.00035, 0.00035, 0.00035, 0.00035]
-The sum of genWeights is 63281828.0
___________________________________________________________________________