# Inspect Record ID Schemas

This notebook fetches sample files from specific CERN Open Data record IDs and inspects their branch structure to determine the schema.


In [13]:
import sys
from pathlib import Path
import requests
import json
import uproot
import pandas as pd
from collections import defaultdict

# Add project root to path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

from src.parse_atlas import parser, schemas, consts


In [14]:
# Record IDs to inspect
RECORD_IDS = [30512, 30546]

# Number of sample files to inspect per record
NUM_SAMPLES = 2


In [15]:
def fetch_file_uris_from_record(record_id: int) -> list:
    """Fetch all file URIs from a record ID."""
    r = requests.get(consts.CMS_RECID_FILEPAGE_URL.format(record_id))
    json_r = json.loads(r.text)
    
    file_uris = []
    if json_r.get("index_files", {}).get("files"):
        for file_group in json_r["index_files"]["files"]:
            for f in file_group["files"]:
                file_uris.append(f["uri"])
    
    return file_uris

def inspect_file_branches(file_uri: str) -> dict:
    """Inspect branches in a ROOT file."""
    try:
        with uproot.open(file_uri) as file:
            # Try common tree names
            tree_names = ["CollectionTree", "mini", "Events"]
            tree = None
            tree_name = None
            
            for name in tree_names:
                if name in file:
                    tree = file[name]
                    tree_name = name
                    break
            
            if tree is None:
                # Use first available tree
                available_trees = list(file.keys())
                if available_trees:
                    tree_name = available_trees[0].rstrip(";1")
                    tree = file[tree_name]
            
            if tree is None:
                return {"error": "No tree found", "available_keys": list(file.keys())}
            
            all_branches = list(tree.keys())
            print(all_branches)
            # Analyze branch structure
            dotted_branches = [b for b in all_branches if "." in b]
            flat_branches = [b for b in all_branches if "_" in b and "." not in b]
            
            # Group branches by base name
            base_branches = defaultdict(list)
            for branch in all_branches:
                if "." in branch:
                    base, field = branch.rsplit(".", 1)
                    base_branches[base].append(field)
                elif "_" in branch:
                    parts = branch.split("_", 1)
                    if len(parts) == 2:
                        base_branches[parts[0]].append(parts[1])
            
            return {
                "tree_name": tree_name,
                "total_branches": len(all_branches),
                "all_branches": all_branches,
                "dotted_branches": dotted_branches,
                "flat_branches": flat_branches,
                "base_branches": dict(base_branches),
                "naming_pattern": "dotted" if dotted_branches else "flat" if flat_branches else "unknown"
            }
    except Exception as e:
        return {"error": str(e)}


In [16]:
# Fetch and inspect files for each record ID
results = {}

for record_id in RECORD_IDS:
    print(f"\n{'='*80}")
    print(f"Record ID: {record_id}")
    print(f"URL: https://opendata.cern.ch/record/{record_id}")
    print(f"{'='*80}")
    
    # Fetch file URIs
    file_uris = fetch_file_uris_from_record(record_id)
    print(f"\nFound {len(file_uris)} files in record {record_id}")
    
    # Inspect sample files
    sample_files = file_uris[:NUM_SAMPLES]
    record_results = []
    
    for i, file_uri in enumerate(sample_files, 1):
        print(f"\n--- Inspecting file {i}/{len(sample_files)} ---")
        print(f"URI: {file_uri}")
        
        inspection = inspect_file_branches(file_uri)
        
        if "error" in inspection:
            print(f"ERROR: {inspection['error']}")
            if "available_keys" in inspection:
                print(f"Available keys: {inspection['available_keys']}")
        else:
            print(f"Tree: {inspection['tree_name']}")
            print(f"Total branches: {inspection['total_branches']}")
            print(f"Naming pattern: {inspection['naming_pattern']}")
            print(f"Dotted branches: {len(inspection['dotted_branches'])}")
            print(f"Flat branches: {len(inspection['flat_branches'])}")
            
            # Show sample branches
            if inspection['dotted_branches']:
                print(f"\nSample dotted branches (first 10):")
                for branch in inspection['dotted_branches'][:10]:
                    print(f"  {branch}")
            
            if inspection['flat_branches']:
                print(f"\nSample flat branches (first 10):")
                for branch in inspection['flat_branches'][:10]:
                    print(f"  {branch}")
            
            # Show base branches
            print(f"\nBase branches (first 20):")
            for base, fields in list(inspection['base_branches'].items())[:20]:
                print(f"  {base}: {fields}")
        
        record_results.append({
            "file_uri": file_uri,
            "inspection": inspection
        })
    
    results[record_id] = record_results



Record ID: 30512
URL: https://opendata.cern.ch/record/30512

Found 1257 files in record 30512

--- Inspecting file 1/2 ---
URI: root://eospublic.cern.ch//eos/opendata/cms/Run2016G/SingleElectron/MINIAOD/UL2016_MiniAODv2-v2/120000/0014ADC0-08B8-1347-B496-CDB3A3A32317.root
['EventAuxiliary', 'EventProductProvenance', 'EventSelections', 'BranchListIndexes', 'edmTriggerResults_TriggerResults__HLT.', 'edmTriggerResults_TriggerResults__HLT./edmTriggerResults_TriggerResults__HLT.present', 'edmTriggerResults_TriggerResults__HLT./edmTriggerResults_TriggerResults__HLT.obj', 'GlobalAlgBlkBXVector_gtStage2Digis__RECO.', 'GlobalAlgBlkBXVector_gtStage2Digis__RECO./GlobalAlgBlkBXVector_gtStage2Digis__RECO.present', 'GlobalAlgBlkBXVector_gtStage2Digis__RECO./GlobalAlgBlkBXVector_gtStage2Digis__RECO.obj', 'GlobalExtBlkBXVector_gtStage2Digis__RECO.', 'GlobalExtBlkBXVector_gtStage2Digis__RECO./GlobalExtBlkBXVector_gtStage2Digis__RECO.present', 'GlobalExtBlkBXVector_gtStage2Digis__RECO./GlobalExtBlkBXVec

In [17]:
# Extract schemas using the schema extraction function
print("\n" + "="*80)
print("EXTRACTING SCHEMAS")
print("="*80)

extracted_schemas = {}

for record_id in RECORD_IDS:
    print(f"\n--- Extracting schema for record {record_id} ---")
    try:
        schema = schemas.extract_schema_from_record_id(record_id)
        extracted_schemas[record_id] = schema
        
        print(f"\nExtracted schema:")
        print(json.dumps(schema, indent=2))
        
        # Show summary
        print(f"\nSchema Summary:")
        print(f"  Naming pattern: {schema.get('naming_pattern', 'unknown')}")
        print(f"  Branch prefix: '{schema.get('branch_prefix', '')}'")
        print(f"  Branch suffix: '{schema.get('branch_suffix', '')}'")
        print(f"  Objects found: {list(schema.get('objects', {}).keys())}")
        print(f"  Object mappings: {schema.get('object_mappings', {})}")
        
        for obj_name, fields in schema.get('objects', {}).items():
            print(f"    {obj_name}: {fields}")
            
    except Exception as e:
        print(f"ERROR extracting schema: {e}")
        import traceback
        traceback.print_exc()



EXTRACTING SCHEMAS

--- Extracting schema for record 30512 ---

Extracted schema:
{
  "naming_pattern": "dotted",
  "branch_prefix": "",
  "branch_suffix": "",
  "object_mappings": {},
  "objects": {}
}

Schema Summary:
  Naming pattern: dotted
  Branch prefix: ''
  Branch suffix: ''
  Objects found: []
  Object mappings: {}

--- Extracting schema for record 30546 ---

Extracted schema:
{
  "naming_pattern": "dotted",
  "branch_prefix": "",
  "branch_suffix": "",
  "object_mappings": {},
  "objects": {}
}

Schema Summary:
  Naming pattern: dotted
  Branch prefix: ''
  Branch suffix: ''
  Objects found: []
  Object mappings: {}


In [18]:
# Compare schemas between records
if len(extracted_schemas) > 1:
    print("\n" + "="*80)
    print("SCHEMA COMPARISON")
    print("="*80)
    
    record_ids = list(extracted_schemas.keys())
    
    print(f"\nComparing records: {record_ids}")
    
    # Compare naming patterns
    patterns = {rid: extracted_schemas[rid].get('naming_pattern') for rid in record_ids}
    print(f"\nNaming patterns: {patterns}")
    
    # Compare objects
    all_objects = set()
    for schema in extracted_schemas.values():
        all_objects.update(schema.get('objects', {}).keys())
    
    print(f"\nObjects found across all records: {sorted(all_objects)}")
    
    # Compare object mappings
    for obj in sorted(all_objects):
        mappings = {}
        for rid in record_ids:
            schema = extracted_schemas[rid]
            obj_mappings = schema.get('object_mappings', {})
            if obj in obj_mappings:
                mappings[rid] = obj_mappings[obj]
        if mappings:
            print(f"\n  {obj} mappings: {mappings}")



SCHEMA COMPARISON

Comparing records: [30512, 30546]

Naming patterns: {30512: 'dotted', 30546: 'dotted'}

Objects found across all records: []


In [19]:
# Create a summary DataFrame
summary_data = []

for record_id, schema in extracted_schemas.items():
    summary_data.append({
        "record_id": record_id,
        "naming_pattern": schema.get('naming_pattern', 'unknown'),
        "branch_prefix": schema.get('branch_prefix', ''),
        "branch_suffix": schema.get('branch_suffix', ''),
        "objects": ', '.join(schema.get('objects', {}).keys()),
        "num_objects": len(schema.get('objects', {})),
    })

if summary_data:
    df_summary = pd.DataFrame(summary_data)
    print("\n" + "="*80)
    print("SCHEMA SUMMARY TABLE")
    print("="*80)
    print(df_summary.to_string(index=False))



SCHEMA SUMMARY TABLE
 record_id naming_pattern branch_prefix branch_suffix objects  num_objects
     30512         dotted                                                0
     30546         dotted                                                0


In [20]:
# Save extracted schemas to a JSON file for reference
output_file = project_root / "testing" / "extracted_schemas.json"

with open(output_file, 'w') as f:
    json.dump(extracted_schemas, f, indent=2)

print(f"\nSchemas saved to: {output_file}")



Schemas saved to: /srv01/agrp/netalev/atlas_utilization/testing/extracted_schemas.json
