# Debugging duplicate lumi sections in central production

Given a workflow name, retrieve its description from ReqMgr2 and fetch a bunch of data from DBS for both the input and output datasets.

In [58]:
import os
import sys
import requests
import json
import statistics
import traceback
import urllib.parse
from pprint import pprint, pformat
from tabulate import tabulate
from tqdm import tqdm
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

myCert = "/Users/amaltar2/Devel/DockerWMCore/certs/servicecert.pem"
myKey = "/Users/amaltar2/Devel/DockerWMCore/certs/servicekey.pem"

## Fetch workflows from ReqMgr2

In [74]:
# FIXME: update the workflow name
wflowName = "cmsunified_task_EXO-Run3Summer22GS-00318__v1_T_230302_175516_6976"

headers = {"Content-type": "application/json", "Accept": "application/json"}
reqmgrUrl = "https://cmsweb.cern.ch/reqmgr2/data/request"
params = {"name": wflowName}
data = requests.get(reqmgrUrl, params=params, headers=headers, cert=(myCert, myKey), verify=False)
if data.ok is True:
    data = data.json()['result']
data = data[0][wflowName]
if "Step1" in data:
    inputDset = data['Step1'].get('InputDataset', "")
elif "Task1" in data:
    inputDset = data['Task1'].get('InputDataset', "")
else:
    inputDset = data['InputDataset']

outputDsets = data['OutputDatasets']

## Define DBS functions

In [43]:
dbsUrl = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"

### filesummaries for a dataset

In [75]:
def getFileSummaries(dset, verbose=False):
    dbsFileSummaries = os.path.join(dbsUrl, "filesummaries")
    params = {"dataset": dset, "validFileOnly": 1}
    #print(f"Fetching filesummaries information for dataset: {dset}")
    data = requests.get(dbsFileSummaries, params=params, headers=headers, cert=(myCert, myKey), verify=False)
    if data.ok is True:
        data = data.json()[0]
    if verbose:
        print(f"filesummaries data for dataset {dset} is: \n{data}")
    return {"num_block": data["num_block"], "num_event": data["num_event"], "num_file": data["num_file"], "num_lumi": data["num_lumi"]}

### filesummaries for a list of blocks

In [77]:
def getFileSummariesBlock(block_list, verbose=False):
    thisSummary = {"num_block": 0, "num_event": 0, "num_file": 0, "num_lumi": 0}

    dbsFileSummaries = os.path.join(dbsUrl, "filesummaries")
    for block in block_list:
        params = {"block_name": block, "validFileOnly": 1}
        #print(f"Fetching filesummaries information for block: {block}")
        data = requests.get(dbsFileSummaries, params=params, headers=headers, cert=(myCert, myKey), verify=False)
        if data.ok is True:
            data = data.json()[0]
        if verbose:
            print(f"filesummaries data for block {block} is: \n{data}")
        try:
            thisSummary["num_block"] += 1
            thisSummary["num_event"] += data["num_event"]
            thisSummary["num_file"] += data["num_file"]
            thisSummary["num_lumi"] += data["num_lumi"]
        except Exception as exc:
            msg = f"Failed to fetch filesummaries for block: {block}. "
            msg += f"Data retrieved: {data}. Error: {str(exc)}"
            print(msg)
    return thisSummary

### filelumis for a list of blocks

In [79]:
def getFileSummariesFile(block_list, verbose=False):
    thisSummary = {"num_block": 0, "num_event": 0, "num_file": 0, "num_lumi": 0}
    dbsFileLumis = os.path.join(dbsUrl, "filelumis")
    for block in block_list:
        file_list = []
        params = {"block_name": block, "validFileOnly": 1}
        #print(f"Fetching filesummaries information for block: {block}")
        data = requests.get(dbsFileLumis, params=params, headers=headers, cert=(myCert, myKey), verify=False)
        if data.ok is True:
            data = data.json()
        if verbose:
            print(f"filelumis data for block {block} is: \n{pformat(data)}")
        for item in data:
            try:
                if item["logical_file_name"] not in file_list:
                    file_list.append(item["logical_file_name"])
                    thisSummary["num_file"] += 1
                thisSummary["num_lumi"] += 1
                thisSummary["num_event"] += item["event_count"]
            except Exception as exc:
                msg = f"Failed to fetch filelumis for block: {block}. "
                msg += f"Data retrieved: {item}. Error: {str(exc)}"
                print(msg)
        thisSummary["num_block"] += 1
    return thisSummary

### blocks for a given dataset

In [80]:
def getBlocks(dset, verbose=False):
    dbsBlocks = os.path.join(dbsUrl, "blocks")
    params = {"dataset": dset}
    #print(f"Fetching blocks information for dataset: {dset}")
    data = requests.get(dbsBlocks, params=params, headers=headers, cert=(myCert, myKey), verify=False)
    if data.ok is True:
        data = data.json()
    if verbose:
        print(f"blocks data for dataset {dset} is: \n{pformat(data)}")
    return [item['block_name'] for item in data]

### files for a given dataset

In [81]:
def getFiles(dset, verbose=False):
    dbsFiles = os.path.join(dbsUrl, "files")
    params = {"dataset": dset, "validFileOnly": 1}
    #print(f"Fetching blocks information for dataset: {dset}")
    data = requests.get(dbsFiles, params=params, headers=headers, cert=(myCert, myKey), verify=False)
    if data.ok is True:
        data = data.json()
    if verbose:
        print(f"files data for dataset {dset} is: \n{pformat(data)}")
    return [item['logical_file_name'] for item in data]

### list of lumis per file for a list of files

In [82]:
def getFileLumis(files_list, verbose=False):
    resp = {}
    dbsFileLumis = os.path.join(dbsUrl, "filelumis")
    for fname in files_list:
        file_list = []
        params = {"logical_file_name": fname, "validFileOnly": 1}
        #print(f"Fetching filesummaries information for block: {block}")
        data = requests.get(dbsFileLumis, params=params, headers=headers, cert=(myCert, myKey), verify=False)
        if data.ok is True:
            data = data.json()
        if verbose:
            print(f"filelumis data for file {fname} is: \n{pformat(data)}")
        for item in data:
            lfn = item['logical_file_name']
            lumi = item['lumi_section_num']
            resp.setdefault(lfn, [])
            if lumi in resp[lfn]:
                print(f"ERROR: Lumi {lumi} in LFN {lfn} is duplicate")
            else:
                resp[lfn].append(lumi)
    return resp

## Fetch summary for each dataset


In [83]:
%%time
# retrieve information from DBS at 3 levels: dataset, block and files
summary = {"dataset": [], "block": [], "file": []}
if not inputDset:
    thisDict = {"dataset_name": None, "num_block": 0, "num_event": 0, "num_file": 0, "num_lumi": 0}
    summary["dataset"].append(thisDict)
else:
    data = getFileSummaries(inputDset)
    data["dataset_name"] = inputDset
    summary["dataset"].append(data)

for dset in tqdm(outputDsets, total=len(outputDsets)):
    data = getFileSummaries(dset)
    data["dataset_name"] = dset
    summary["dataset"].append(data)

100%|██████████| 2/2 [00:01<00:00,  1.55it/s]

CPU times: user 70.2 ms, sys: 11.2 ms, total: 81.5 ms
Wall time: 1.29 s





## Fetch summary for each block in the dataset

In [84]:
%%time
if not inputDset:
    thisDict = {"dataset_name": None, "num_block": 0, "num_event": 0, "num_file": 0, "num_lumi": 0}
    summary["block"].append(thisDict)
else:
    blocks = getBlocks(inputDset)
    data = getFileSummariesBlock(blocks)
    data["dataset_name"] = inputDset
    summary["block"].append(data)

for dset in tqdm(outputDsets, total=len(outputDsets)):
    blocks = getBlocks(dset)
    data = getFileSummariesBlock(blocks)
    data["dataset_name"] = dset
    summary["block"].append(data)

100%|██████████| 2/2 [00:22<00:00, 11.39s/it]

CPU times: user 1.28 s, sys: 82.6 ms, total: 1.37 s
Wall time: 22.8 s





## Fetch summary for each file in the dataset

In [85]:
#%%time
if not inputDset:
    thisDict = {"dataset_name": None, "num_block": 0, "num_event": 0, "num_file": 0, "num_lumi": 0}
    summary["file"].append(thisDict)
else:
    blocks = getBlocks(inputDset)
    data = getFileSummariesFile(blocks)
    data["dataset_name"] = inputDset
    summary["file"].append(data)

for dset in tqdm(outputDsets, total=len(outputDsets)):
    blocks = getBlocks(dset)
    data = getFileSummariesFile(blocks)
    data["dataset_name"] = dset
    summary["file"].append(data)    

100%|██████████| 2/2 [00:33<00:00, 16.73s/it]


In [86]:
num_datasets = len(summary['dataset'])
for i in range(num_datasets):
    print(f"\nSummary for: {summary['dataset'][i]['dataset_name']}")
    header_table = ["metric", "dataset level", "block level", "file level"]
    data_table = [["num_block", summary['dataset'][i]['num_block'], summary['block'][i]['num_block'], summary['file'][i]['num_block']],
                  ["num_file", summary['dataset'][i]['num_file'], summary['block'][i]['num_file'], summary['file'][i]['num_file']],
                  ["num_event", summary['dataset'][i]['num_event'], summary['block'][i]['num_event'], summary['file'][i]['num_event']],
                  ["num_lumi", summary['dataset'][i]['num_lumi'], summary['block'][i]['num_lumi'], summary['file'][i]['num_lumi']]]
    print(tabulate(data_table, headers=header_table, tablefmt="github"))



Summary for: None
| metric    |   dataset level |   block level |   file level |
|-----------|-----------------|---------------|--------------|
| num_block |               0 |             0 |            0 |
| num_file  |               0 |             0 |            0 |
| num_event |               0 |             0 |            0 |
| num_lumi  |               0 |             0 |            0 |

Summary for: /ZPrime2DarkPhoton_HMass-1000_DPMass-0p3_TuneCP5_13p6TeV-pythia8/Run3Summer22DRPremix-124X_mcRun3_2022_realistic_v12-v2/AODSIM
| metric    |   dataset level |   block level |   file level |
|-----------|-----------------|---------------|--------------|
| num_block |              16 |            16 |           16 |
| num_file  |              14 |            14 |           14 |
| num_event |           12896 |         12896 |        12896 |
| num_lumi  |              15 |            19 |           19 |

Summary for: /ZPrime2DarkPhoton_HMass-1000_DPMass-0p3_TuneCP5_13p6TeV-pythia8/Run3S

## Save this summary in a JSON file

In [48]:
with open("summary_dbs.json", "w") as fo:
    json.dump(summary, fo, indent=2)

## Fetching list of lumis per file from input dataset

In [72]:
if not inputDset:
    thisDict = {"dataset_name": None, "num_block": 0, "num_event": 0, "num_file": 0, "num_lumi": 0}
else:
    files = getFiles(inputDset)
    #print(f"List of files: {pformat(files)}")
    data = getFileLumis(files)
    #print("Map of lumis per file is:")
    #for lfn, lumis in data.items():
    #    print(f"{lfn} contains {sorted(lumis)}")
    print("Map of duplicate files and lumis is:")
    for lfn, lumis in data.items():
        for lfn2, lumis2 in data.items():
            if lfn == lfn2:
                continue
            dupLumis = set(lumis) & set(lumis2)
            if dupLumis:
                print(f"Lumi {dupLumis} is common between {lfn} and {lfn2}")


Map of duplicate files and lumis is:
Lumi {7} is common between /store/mc/Run3Summer22DRPremix/ZPrime2DarkPhoton_HMass-1000_DPMass-0p3_TuneCP5_13p6TeV-pythia8/AODSIM/124X_mcRun3_2022_realistic_v12-v2/2810000/f956c9e4-c5ae-4ec6-8c77-71bfc3002cf9.root and /store/mc/Run3Summer22DRPremix/ZPrime2DarkPhoton_HMass-1000_DPMass-0p3_TuneCP5_13p6TeV-pythia8/AODSIM/124X_mcRun3_2022_realistic_v12-v2/2810000/5d37dc29-67f7-41fb-8a02-4f4645464938.root
Lumi {7} is common between /store/mc/Run3Summer22DRPremix/ZPrime2DarkPhoton_HMass-1000_DPMass-0p3_TuneCP5_13p6TeV-pythia8/AODSIM/124X_mcRun3_2022_realistic_v12-v2/2810000/5d37dc29-67f7-41fb-8a02-4f4645464938.root and /store/mc/Run3Summer22DRPremix/ZPrime2DarkPhoton_HMass-1000_DPMass-0p3_TuneCP5_13p6TeV-pythia8/AODSIM/124X_mcRun3_2022_realistic_v12-v2/2810000/f956c9e4-c5ae-4ec6-8c77-71bfc3002cf9.root
Lumi {18} is common between /store/mc/Run3Summer22DRPremix/ZPrime2DarkPhoton_HMass-1000_DPMass-0p3_TuneCP5_13p6TeV-pythia8/AODSIM/124X_mcRun3_2022_realistic_

## Fetching list of lumis for one of the output datasets

In [None]:
for dset in tqdm(outputDsets, total=len(outputDsets)):
    blocks = getBlocks(dset)
    data = getFileSummariesFile(blocks)
    data["dataset_name"] = dset
    summary["file"].append(data)  