# ICCS

## OA: OpenAlex

Using DBLP metadata: works are looked up in OpenAlex by DOI, with a title search as the fallback.

In [None]:
import json
import os
import time
from pathlib import Path
import pandas as pd
import requests
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from dotenv import load_dotenv
load_dotenv()

def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty, naming the variable
            so the problem can be fixed in the .env file.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"Environment variable {name!r} is not set (check your .env file).")
    return value


# Input (DBLP export) and output (raw OpenAlex JSONL) locations.
IN_PATH = Path(_require_env("DBLP_ICCS"))
OUT_RAW_JSONL = Path(_require_env("OA_ICCS_JSONL"))
# OUT_CORE_PARQUET = Path(os.getenv("OA_ICCS_PARQUET"))

# Base URL of the API; a trailing slash is dropped because request URLs are
# built as f"{OPENALEX_BASE}/works".
OPENALEX_BASE = _require_env("OA_BASE").rstrip("/")
# Sent as the User-Agent header; left optional, as in the original setup.
USER_AGENT = os.getenv("USER_OA")

BATCH_SIZE = 50   # number of DOIs combined into one filter query
SLEEP_SEC = 0.3   # pause between consecutive requests

# Shared session that retries throttled (429) and server-error responses
# with exponential backoff.
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))

def normalize_doi(doi):
    """Return a bare, lower-cased DOI, or None when no DOI is present.

    Args:
        doi: DOI in any common spelling (bare, ``doi:`` prefixed, or a
            ``doi.org`` / ``dx.doi.org`` URL). May be None, empty or NaN.

    Returns:
        The DOI without resolver prefix, stripped and lower-cased, or None
        if the input is missing or nothing remains after stripping.
    """
    if not doi:
        return None
    # pandas represents missing values as float NaN, which is truthy and
    # would otherwise be turned into the string "nan".
    if isinstance(doi, float) and doi != doi:
        return None
    doi = str(doi).strip().lower()
    for prefix in (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "doi:",
    ):
        if doi.startswith(prefix):
            # strip again so "doi: 10.x/y" does not keep a leading space
            doi = doi[len(prefix):].strip()
    return doi or None

def openalex_get(url, params=None):
    """Issue a GET request through the shared session and decode the JSON body.

    Args:
        url: Full request URL.
        params: Optional query parameters.

    Returns:
        The parsed JSON response, or None when the server answers 404 or the
        request/decoding fails (the error is printed, not raised).
    """
    try:
        response = session.get(
            url,
            headers={"User-Agent": USER_AGENT},
            params=params,
            timeout=60,
        )
        if response.status_code != 404:
            response.raise_for_status()
            return response.json()
    except Exception as err:
        # Best effort: report the failure and let the caller carry on.
        print(f"Request error: {err}")
    return None

# Stop with a real error when the DBLP export is missing. Raising (rather
# than calling exit()) halts "Run All" at this cell without ending the
# kernel session.
if not IN_PATH.exists():
    raise FileNotFoundError(f"Input file not found: {IN_PATH}")

# Read the DBLP export: one JSON object per non-blank line.
rows = []
with open(IN_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        record = json.loads(line)
        rows.append({
            "dblp_id": record.get("key"),
            "doi": record.get("doi"),
            "title": record.get("title"),
            "year": record.get("year")
        })

# Explicit columns keep the frame usable even if the input file is empty.
df = pd.DataFrame(rows, columns=["dblp_id", "doi", "title", "year"])
df["year_int"] = pd.to_numeric(df["year"], errors='coerce')

# Keep only records whose year parses and lies in 2001-2026 (inclusive).
df = df[df['year_int'].between(2001, 2026)].copy()
df["normalized_doi"] = df["doi"].apply(normalize_doi)

# Map each DOI to every row index that carries it (a DOI may repeat).
doi_to_indices = {}
for idx, doi in df["normalized_doi"].items():
    # Only non-empty strings are DOIs; None/NaN mark rows without one.
    if isinstance(doi, str) and doi:
        doi_to_indices.setdefault(doi, []).append(idx)

all_dois = list(doi_to_indices.keys())
work_by_doi = {}

print(f"Querying {len(all_dois)} ICCS DOIs from OpenAlex...")
for i in tqdm(range(0, len(all_dois), BATCH_SIZE)):
    batch = all_dois[i:i+BATCH_SIZE]
    # One request per batch: the DOIs are OR-ed together with "|".
    params = {"filter": "doi:" + "|".join(batch), "per-page": 200}

    page = openalex_get(f"{OPENALEX_BASE}/works", params=params)
    # A failed request yields None; its DOIs simply stay unmatched here.
    for w in (page or {}).get("results") or []:
        nd = normalize_doi(w.get("doi"))
        if nd:
            work_by_doi[nd] = w
    time.sleep(SLEEP_SEC)

def search_by_title(title, year, min_similarity=0.9):
    """Search OpenAlex for a work by title and return the best verified hit.

    The search endpoint always returns its most relevant results, even when
    none of them is the requested paper, so each candidate's title is
    compared with the query before it is accepted.

    Args:
        title: Title to search for. None, empty or non-string values
            return None.
        year: Publication year used to restrict the search; ignored when
            falsy or NaN.
        min_similarity: Minimum similarity (0..1) between the normalised
            query title and a candidate title. A value <= 0 disables the
            check and returns the first result, as earlier versions did.

    Returns:
        The first candidate work (dict) passing the similarity check, or
        None if the request fails or no candidate is close enough.
    """
    from difflib import SequenceMatcher

    if not title or not isinstance(title, str):
        return None
    query = title.strip()
    params = {"search": query, "per-page": 5}
    if year and not pd.isna(year):
        params["filter"] = f"from_publication_date:{int(year)}-01-01,to_publication_date:{int(year)}-12-31"

    res = openalex_get(f"{OPENALEX_BASE}/works", params=params)
    candidates = (res or {}).get("results") or []
    if not candidates:
        return None
    if min_similarity <= 0:
        return candidates[0]

    def _norm(text):
        # Compare on letters and digits only, so case, punctuation and a
        # trailing period do not prevent a match.
        return "".join(ch for ch in str(text or "").casefold() if ch.isalnum())

    wanted = _norm(query)
    for cand in candidates:
        got = _norm(cand.get("display_name") or cand.get("title"))
        if got and SequenceMatcher(None, wanted, got).ratio() >= min_similarity:
            return cand
    return None

# DOIs the batch query did not return: retry them one by one via title search.
missing = list(set(all_dois) - set(work_by_doi.keys()))
print(f"Falling back to title search for {len(missing)} missing items...")
for doi in tqdm(missing):
    # Rows sharing a DOI share a title, so the first occurrence is enough.
    idx = doi_to_indices[doi][0]
    row = df.loc[idx]
    w = search_by_title(row["title"], row["year_int"])
    if w:
        work_by_doi[doi] = w
    # Same pacing as the batch queries.
    time.sleep(SLEEP_SEC)

# Write one JSON line per DBLP record, pairing it with its OpenAlex work
# (null when no work was found).
OUT_RAW_JSONL.parent.mkdir(parents=True, exist_ok=True)
with open(OUT_RAW_JSONL, "w", encoding="utf-8") as f:
    for _, row in df.iterrows():
        nd = row["normalized_doi"]
        item = {
            "dblp_id": row["dblp_id"],
            "dblp_title": row["title"],
            "dblp_year": row["year"],
            "dblp_doi": row["doi"],
            "normalized_doi": nd,
            "openalex_work": work_by_doi.get(nd),
        }
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

# Read the file back and flatten the matched works into "core" rows.
core_rows = []
if OUT_RAW_JSONL.exists():
    with open(OUT_RAW_JSONL, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            rec = json.loads(line)
            w = rec.get("openalex_work")
            if not w:
                continue

            # Nested objects may be null in the API response.
            loc = w.get("primary_location") or {}
            src = loc.get("source") or {}

            core_rows.append({
                "dblp_id": rec["dblp_id"],
                "dblp_title": rec["dblp_title"],
                "dblp_year": rec["dblp_year"],
                "work_id": w.get("id"),
                "display_name": w.get("display_name"),
                "cited_by_count": w.get("cited_by_count"),
                "is_oa": (w.get("open_access") or {}).get("is_oa"),
                "source_name": src.get("display_name"),
                # "or []" keeps a null topics field from becoming "null"
                "topics": json.dumps(w.get("topics") or [], ensure_ascii=False)
            })

if core_rows:
    # The parquet export of core_rows is currently disabled; only the raw
    # JSONL above is written to disk.
    # pd.DataFrame(core_rows).to_parquet(OUT_CORE_PARQUET, index=False)
    print(f"Finished. Saved {len(core_rows)} ICCS records with citation metadata.")

Querying 8355 ICCS DOIs from OpenAlex...


100%|██████████| 168/168 [03:50<00:00,  1.37s/it]


Falling back to title search for 41 missing items...


100%|██████████| 41/41 [01:11<00:00,  1.74s/it]


Finished. Saved 8355 ICCS records with citation metadata.


## Filling missing author affiliation data

Using GROBID-parsed data.

In [17]:
import json
import os
import re
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv()


# Pipeline locations. The environment variables used by the other cells take
# precedence; the original local paths remain as fallbacks when unset.
OPENALEX_JSONL = Path(
    os.getenv("OA_ICCS_JSONL")
    or r"D:\ITMO Big Data & ML School\semester 3\RI3\notebooks\data\interim\iccs\ICCS_openalex_works.jsonl"
)
# Directory of GROBID output, organised in per-year sub-folders.
GROBID_DIR = Path(r"D:\ITMO Big Data & ML School\semester 3\RI3\parsed\iccs")
OUTPUT_FILE = Path(
    os.getenv("ICCS_ENRICHED_OA_GROBID")
    or r"D:\ITMO Big Data & ML School\semester 3\RI3\notebooks\data\interim\iccs\ICCS_openalex_enriched.jsonl"
)


def normalize_name(name):
    """Return a comparison-friendly form of a person name.

    Periods become spaces, the text is lower-cased, runs of whitespace are
    collapsed to one space and the ends are trimmed. Falsy input gives "".
    """
    if not name:
        return ""
    lowered = name.lower().replace(".", " ")
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()

def extract_doi_from_filename(filename):
    """
    Recover a DOI from a GROBID output filename.

    Filename: '2023_10.1016_j.jocs.2023.102166.json'
    DOI: '10.1016/j.jocs.2023.102166'

    Returns the lower-cased DOI, or None when the name holds no DOI pattern.
    """
    # DOI pattern as it appears in filenames: registrant prefix, underscore,
    # then everything up to the ".json" extension.
    found = re.search(r'10\.\d{4,}_.*(?=\.json)', filename)
    if found is None:
        return None
    # Only the first underscore stands for the slash between the DOI prefix
    # and suffix; later underscores are kept as they are.
    registrant, _, suffix = found.group(0).partition("_")
    return f"{registrant}/{suffix}".lower().strip()

# Load GROBID data: one parsed JSON per paper, keyed by the DOI recovered
# from the filename.
grobid_lookup = {}
print("Loading GROBID JSON files from year folders...")
for year_folder in tqdm(list(GROBID_DIR.glob("*")), desc="Years"):
    if not year_folder.is_dir():
        continue
    for fpath in year_folder.glob("*.json"):
        doi = extract_doi_from_filename(fpath.name)
        if not doi:
            continue
        try:
            with open(fpath, 'r', encoding='utf-8') as f:
                grobid_lookup[doi] = json.load(f)
        except (OSError, ValueError) as e:
            # Unreadable or malformed file: report it and keep going.
            print(f"Error loading {fpath}: {e}")

print(f"Loaded {len(grobid_lookup)} DOIs from GROBID (year folders).")

# Process OpenAlex
merged_results = []
authors_filled = 0
doi_matches = 0
total_papers = 0

print("Merging data...")
with open(OPENALEX_JSONL, 'r', encoding='utf-8') as f_in:
    for line in f_in:
        if not line.strip():
            continue
        record = json.loads(line)
        total_papers += 1
        # "normalized_doi" can be JSON null, so .get() with a default is not enough.
        doi = (record.get("normalized_doi") or "").lower().strip()
        oa_work = record.get("openalex_work")

        # DOI Match Check
        if oa_work and doi in grobid_lookup:
            doi_matches += 1
            grobid_data = grobid_lookup[doi]
            authorships = oa_work.get("authorships") or []

            for auth_obj in authorships:
                # Initialize for matching papers
                auth_obj["raw_pdf_affiliation"] = None

                # Gap check: skip authors that already have any affiliation
                # data from OpenAlex (truthiness also covers null fields).
                has_oa = any(
                    auth_obj.get(key)
                    for key in ("institutions", "raw_affiliation_strings", "affiliations")
                )
                if has_oa:
                    continue

                # Name Matching
                oa_disp = normalize_name((auth_obj.get("author") or {}).get("display_name", ""))
                oa_raw = normalize_name(auth_obj.get("raw_author_name", ""))
                if not (oa_disp or oa_raw):
                    # Nothing to match on.
                    continue

                for g_auth in grobid_data.get("authors") or []:
                    g_name = normalize_name(g_auth.get("name", ""))
                    if not g_name:
                        # An empty name is a substring of every name and
                        # would match any author.
                        continue
                    exact = g_name in (oa_disp, oa_raw)
                    # The substring test is only meaningful for a non-empty name.
                    partial = bool(oa_disp) and (oa_disp in g_name or g_name in oa_disp)
                    if exact or partial:
                        affils = g_auth.get("affiliations") or []
                        if affils:
                            auth_obj["raw_pdf_affiliation"] = " ; ".join(affils)
                            # Count only real affiliations as filled gaps.
                            authors_filled += 1
                        else:
                            # Author found in the PDF, but without affiliation.
                            auth_obj["raw_pdf_affiliation"] = "Empty in PDF"
                        break

        merged_results.append(record)


OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f_out:
    for res in merged_results:
        f_out.write(json.dumps(res, ensure_ascii=False) + "\n")

print(f"\nDOI Matches found: {doi_matches}/{total_papers}")
print(f"Gaps filled in 'raw_pdf_affiliation': {authors_filled}")

Loading GROBID JSON files from year folders...


Years: 100%|██████████| 27/27 [00:01<00:00, 22.45it/s]


Loaded 8324 DOIs from GROBID (year folders).
Merging data...

DOI Matches found: 8324/8355
Gaps filled in 'raw_pdf_affiliation': 74


## Audit for missing values

In [18]:
import json
import os
from pathlib import Path
from collections import defaultdict
from dotenv import load_dotenv

load_dotenv()

# Enriched (OpenAlex + GROBID) records, one JSON object per line.
ENRICHED_JSONL = Path(os.getenv("ICCS_ENRICHED_OA_GROBID"))

print("ICCS DATASET: OPENALEX COVERAGE ANALYSIS")

# Values of "raw_pdf_affiliation" that do not count as affiliation data.
BAD_PDF_VALS = {None, "Not found in PDF", "Empty in PDF", 0}
# Authorship fields that carry affiliation data coming from OpenAlex.
OA_AFFIL_KEYS = ("institutions", "raw_affiliation_strings", "affiliations")

# year -> list of per-paper dicts (papers with uncovered authors) and
# "NO_OPENALEX" markers (papers without an OpenAlex work).
missing_authors_by_year = defaultdict(list)
total_papers = 0
total_authors = 0
covered_authors = 0

with open(ENRICHED_JSONL , "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue

        rec = json.loads(line)
        total_papers += 1
        year = rec.get("dblp_year")

        oa_work = rec.get("openalex_work")
        if not oa_work:
            if year:
                missing_authors_by_year[year].append("NO_OPENALEX")
            continue

        authorships = oa_work.get("authorships") or []
        total_authors += len(authorships)

        missing_count = 0
        for auth in authorships:
            # An author is covered by any non-empty OpenAlex affiliation
            # field (truthiness also tolerates null fields) ...
            has_oa = any(auth.get(key) for key in OA_AFFIL_KEYS)
            # ... or by a real affiliation string extracted from the PDF.
            pdf_aff = auth.get("raw_pdf_affiliation")
            has_pdf = bool(pdf_aff) and pdf_aff not in BAD_PDF_VALS

            if has_oa or has_pdf:
                covered_authors += 1
            else:
                missing_count += 1

        if missing_count > 0 and year:
            missing_authors_by_year[year].append({
                "doi": rec.get("normalized_doi") or rec.get("dblp_doi"),
                "missing_authors": missing_count,
                "total_authors": len(authorships)
            })

print(f"Total papers:     {total_papers:,}")
print(f"Total authors:    {total_authors:,}")

if total_authors > 0:
    print(f"Covered authors:  {covered_authors:,} ({covered_authors/total_authors*100:.1f}%)")
    print(f"Missing authors:  {total_authors-covered_authors:,} ({(total_authors-covered_authors)/total_authors*100:.1f}%)")
else:
    print("No authors found - check OpenAlex data structure")

print()

print("MISSING AUTHORS BY YEAR:")
# key=str keeps the sort working if years are a mix of strings and numbers.
for year in sorted(missing_authors_by_year.keys(), key=str):
    papers = [d for d in missing_authors_by_year[year] if isinstance(d, dict)]
    total_missing_year = sum(d["missing_authors"] for d in papers)
    print(f"{year}: {total_missing_year} missing authors ({len(papers)} papers)")

ICCS DATASET: OPENALEX COVERAGE ANALYSIS
Total papers:     8,355
Total authors:    28,414
Covered authors:  28,097 (98.9%)
Missing authors:  317 (1.1%)

MISSING AUTHORS BY YEAR:
2001: 5 missing authors (1 papers)
2002: 4 missing authors (1 papers)
2005: 7 missing authors (3 papers)
2006: 3 missing authors (3 papers)
2007: 15 missing authors (3 papers)
2008: 9 missing authors (2 papers)
2009: 17 missing authors (2 papers)
2018: 16 missing authors (3 papers)
2019: 49 missing authors (5 papers)
2020: 22 missing authors (3 papers)
2021: 30 missing authors (6 papers)
2022: 43 missing authors (5 papers)
2024: 43 missing authors (8 papers)
2025: 54 missing authors (10 papers)


In [None]:
import os
import json
from pathlib import Path
from collections import defaultdict
from dotenv import load_dotenv

load_dotenv()

# Path of the enriched JSONL: read from the environment when set, otherwise
# fall back to the original local path.
ENRICHED_JSONL = Path(
    os.getenv("ICCS_ENRICHED_OA_GROBID")
    or r"D:\ITMO Big Data & ML School\semester 3\RI3\notebooks\data\interim\iccs\ICCS_openalex_enriched.jsonl"
)

print("ICCS DATASET: OPENALEX & GROBID MISSING AUTHOR ANALYSIS")
print("-" * 60)

# Values of "raw_pdf_affiliation" that do not count as affiliation data.
BAD_PDF_VALS = {None, "Not found in PDF", "Empty in PDF", 0, ""}
# Authorship fields that carry affiliation data coming from OpenAlex.
OA_AFFIL_KEYS = ("institutions", "raw_affiliation_strings", "affiliations")

missing_details = []
total_papers = 0
total_authors = 0
covered_authors = 0

# Raise instead of exit(1): the error is shown in the cell and "Run All"
# stops here without ending the kernel session.
if not ENRICHED_JSONL.exists():
    raise FileNotFoundError(f"File not found at {ENRICHED_JSONL}")

with open(ENRICHED_JSONL, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue

        rec = json.loads(line)
        total_papers += 1

        # Extract metadata
        doi = rec.get("normalized_doi") or rec.get("dblp_doi") or "NO_DOI"
        title = rec.get("dblp_title") or "Unknown Title"
        oa_work = rec.get("openalex_work")

        if not oa_work:
            continue

        authorships = oa_work.get("authorships") or []
        total_authors += len(authorships)

        for auth in authorships:
            # Check for any affiliation data (OpenAlex sources + GROBID PDF
            # extraction); truthiness also tolerates null fields.
            has_oa = any(auth.get(key) for key in OA_AFFIL_KEYS)
            pdf_aff = auth.get("raw_pdf_affiliation")
            has_pdf = bool(pdf_aff) and pdf_aff not in BAD_PDF_VALS

            if has_oa or has_pdf:
                covered_authors += 1
            else:
                # Capture the name of the author missing data ("author" may be null)
                author_name = (auth.get("author") or {}).get("display_name") or "Unknown Author"
                missing_details.append({
                    "doi": doi,
                    "title": title[:70] + "..." if len(title) > 70 else title,
                    "author": author_name
                })

# SCREEN OUTPUT
print(f"{'AUTHOR NAME':<25} | {'DOI':<25} | {'TITLE'}")
print("-" * 100)

for item in missing_details:
    print(f"{item['author']:<25} | {item['doi']:<25} | {item['title']}")

print("-" * 100)
print(f"Total Papers Scanned:  {total_papers:,}")
print(f"Total Authors Found:   {total_authors:,}")

if total_authors > 0:
    missing_count = total_authors - covered_authors
    print(f"Covered Authors:       {covered_authors:,} ({covered_authors/total_authors*100:.1f}%)")
    print(f"Missing Authors:       {missing_count:,} ({missing_count/total_authors*100:.1f}%)")

ICCS DATASET: OPENALEX & GROBID MISSING AUTHOR ANALYSIS
------------------------------------------------------------
AUTHOR NAME               | DOI                       | TITLE
----------------------------------------------------------------------------------------------------
International Conference on Computational Science 2025 Singapur | 10.1007/978-3-031-97626-1 | Computational Science - ICCS 2025 - 25th International Conference, Sin...
Lees, Michael             | 10.1007/978-3-031-97626-1 | Computational Science - ICCS 2025 - 25th International Conference, Sin...
Cai, Wentong              | 10.1007/978-3-031-97626-1 | Computational Science - ICCS 2025 - 25th International Conference, Sin...
Cheong, Siew Ann          | 10.1007/978-3-031-97626-1 | Computational Science - ICCS 2025 - 25th International Conference, Sin...
Su, Yi                    | 10.1007/978-3-031-97626-1 | Computational Science - ICCS 2025 - 25th International Conference, Sin...
Abramson, David           | 10.1

Export a CSV of the papers whose authors are still missing affiliation data.

In [20]:
import json
import os
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from collections import defaultdict

load_dotenv()

ENRICHED_JSONL = Path(os.getenv("ICCS_ENRICHED_OA_GROBID"))
# Written next to the enriched JSONL instead of the current working
# directory, so the location does not depend on where the notebook is run.
# NOTE(review): the cleanup cell reads this CSV from the interim data
# folder - confirm that the enriched JSONL lives in that same folder.
OUTPUT_CSV = ENRICHED_JSONL.parent / "iccs_all_years_missing_authors.csv"

# Values of "raw_pdf_affiliation" that do not count as affiliation data.
BAD_PDF_VALS = {None, "Not found in PDF", "Empty in PDF", 0}
# Authorship fields that carry affiliation data coming from OpenAlex.
OA_AFFIL_KEYS = ("institutions", "raw_affiliation_strings", "affiliations")
CSV_COLUMNS = ["year", "doi", "title", "missing_authors", "total_authors", "missing_count"]

missing_data = []

print("ICCS ALL YEARS: EXPORTING MISSING AUTHORS")

with open(ENRICHED_JSONL, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue

        rec = json.loads(line)
        year = rec.get("dblp_year")
        if not year:
            continue

        oa_work = rec.get("openalex_work")
        if not oa_work:
            continue

        authorships = oa_work.get("authorships") or []
        missing_authors = []

        for auth in authorships:
            has_oa = any(auth.get(key) for key in OA_AFFIL_KEYS)
            pdf_aff = auth.get("raw_pdf_affiliation")
            has_pdf = bool(pdf_aff) and pdf_aff not in BAD_PDF_VALS

            if not (has_oa or has_pdf):
                # "or" chain: either name field may be absent or null, and
                # the join below needs a string.
                author_name = (
                    (auth.get("author") or {}).get("display_name")
                    or auth.get("raw_author_name")
                    or "Unknown"
                )
                missing_authors.append(author_name)

        if missing_authors:
            doi = rec.get("normalized_doi") or rec.get("dblp_doi") or "NO_DOI"
            title = rec.get("dblp_title", "No title")

            missing_data.append({
                "year": year,
                "doi": doi,
                "title": title,
                "missing_authors": "; ".join(missing_authors),
                "total_authors": len(authorships),
                "missing_count": len(missing_authors)
            })

# Explicit columns give a valid (header-only) CSV even when nothing is missing.
missing_df = pd.DataFrame(missing_data, columns=CSV_COLUMNS)
missing_df.to_csv(OUTPUT_CSV, index=False)

print(f"Exported {len(missing_data)} papers to {OUTPUT_CSV}")
print(f"Total missing authors across all years: {sum(r['missing_count'] for r in missing_data)}")
print()


ICCS ALL YEARS: EXPORTING MISSING AUTHORS
Exported 55 papers to iccs_all_years_missing_authors.csv
Total missing authors across all years: 317



In [None]:
import os
import json
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv

load_dotenv()

# CONFIGURATION
CSV_FILE = Path(r"D:\ITMO Big Data & ML School\semester 3\RI3\notebooks\data\interim\iccs\iccs_all_years_missing_authors.csv")
ENRICHED_JSONL = Path(os.getenv("ICCS_ENRICHED_OA_GROBID"))
FINAL_JSONL = Path(os.getenv("ICCS_FINAL"))

# SAFE LIST: Protecting the consortium and study team papers
EXCLUDE_FROM_DELETION = {
    "10.1007/978-3-031-63759-9_29", # GEMINI consortium paper
    "10.1007/978-3-031-08757-8_14"  # BIOCARD Study Team paper
}

# Raise instead of exit(1): the error is shown in the cell and "Run All"
# stops here without ending the kernel session.
if not CSV_FILE.exists():
    raise FileNotFoundError(f"Missing Authors CSV not found at {CSV_FILE}")

# Load the DOIs to be purged (normalised so the comparison below is not
# affected by case or stray whitespace).
df_missing = pd.read_csv(CSV_FILE)
flagged_dois = {str(d).strip().lower() for d in df_missing["doi"].dropna().unique()}

# Remove 'Safe List' from the bad_dois set so they are NOT deleted
protected_dois = flagged_dois & EXCLUDE_FROM_DELETION
bad_dois = flagged_dois - EXCLUDE_FROM_DELETION

complete_records = []
proceedings_count = 0
oa_null_count = 0
total_scanned = 0

print(f"Filtering Master File: {ENRICHED_JSONL.name}")


with open(ENRICHED_JSONL, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue

        rec = json.loads(line)
        total_scanned += 1
        doi = rec.get("normalized_doi") or rec.get("dblp_doi")
        doi_key = str(doi).strip().lower() if doi else None

        # 1. Filter out the records listed in the missing-authors CSV
        #    (reported below as Volume Headers/Proceedings)
        if doi_key in bad_dois:
            proceedings_count += 1
            continue

        # 2. Filter out records where OpenAlex failed to find a match
        if rec.get("openalex_work") is None:
            oa_null_count += 1
            continue

        complete_records.append(rec)


FINAL_JSONL.parent.mkdir(parents=True, exist_ok=True)
with open(FINAL_JSONL, "w", encoding="utf-8") as f:
    for rec in complete_records:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

# FINAL REPORT
print("-" * 40)
print("ICCS FINAL CLEANUP COMPLETE (2026)")
print("-" * 40)
print(f"Total Papers Scanned  : {total_scanned:,}")
print(f"Removed (Proceedings) : {proceedings_count:,}")
print(f"Removed (Null OA)     : {oa_null_count:,}")
# Count only safe-list DOIs that were actually flagged in the CSV.
print(f"Protected (Safe List) : {len(protected_dois)}")
print(f"Final Usable Dataset  : {len(complete_records):,}")
print(f"File Saved To         : {FINAL_JSONL}")

Filtering Master File: ICCS_openalex_enriched.jsonl
----------------------------------------
ICCS FINAL CLEANUP COMPLETE (2026)
----------------------------------------
Total Papers Scanned  : 8,355
Removed (Proceedings) : 53
Removed (Null OA)     : 0
Protected (Safe List) : 2
Final Usable Dataset  : 8,302
File Saved To         : D:\ITMO Big Data & ML School\semester 3\RI3\notebooks\data\processed\iccs_final_complete_authors.jsonl


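"Applied OpenAlex metadata (98.9% author-affiliation coverage)
plus GROBID affiliation gap-filling to 8,355 ICCS records.
Removed 53 of the 55 (96%) records that still had authors without affiliations (incomplete entries such as proceedings volumes),
retaining 2 substantive papers. Final: 8,302 records, with 2 authors in the retained papers still lacking an affiliation."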

In [25]:
import json
import os
from pathlib import Path
from collections import defaultdict
from dotenv import load_dotenv

load_dotenv()

# Final, cleaned dataset: read from the environment when set, otherwise fall
# back to the original local path. A dedicated name avoids rebinding
# ENRICHED_JSONL (the un-filtered file) to a different file.
FINAL_JSONL = Path(
    os.getenv("ICCS_FINAL")
    or r"D:\ITMO Big Data & ML School\semester 3\RI3\notebooks\data\processed\iccs_final_complete_authors.jsonl"
)

print("ICCS DATASET: OPENALEX COVERAGE ANALYSIS")

# Values of "raw_pdf_affiliation" that do not count as affiliation data.
BAD_PDF_VALS = {None, "Not found in PDF", "Empty in PDF", 0}
# Authorship fields that carry affiliation data coming from OpenAlex.
OA_AFFIL_KEYS = ("institutions", "raw_affiliation_strings", "affiliations")

# year -> list of per-paper dicts (papers with uncovered authors) and
# "NO_OPENALEX" markers (papers without an OpenAlex work).
missing_authors_by_year = defaultdict(list)
total_papers = 0
total_authors = 0
covered_authors = 0

with open(FINAL_JSONL, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue

        rec = json.loads(line)
        total_papers += 1
        year = rec.get("dblp_year")

        oa_work = rec.get("openalex_work")
        if not oa_work:
            if year:
                missing_authors_by_year[year].append("NO_OPENALEX")
            continue

        authorships = oa_work.get("authorships") or []
        total_authors += len(authorships)

        missing_count = 0
        for auth in authorships:
            # An author is covered by any non-empty OpenAlex affiliation
            # field (truthiness also tolerates null fields) ...
            has_oa = any(auth.get(key) for key in OA_AFFIL_KEYS)
            # ... or by a real affiliation string extracted from the PDF.
            pdf_aff = auth.get("raw_pdf_affiliation")
            has_pdf = bool(pdf_aff) and pdf_aff not in BAD_PDF_VALS

            if has_oa or has_pdf:
                covered_authors += 1
            else:
                missing_count += 1

        if missing_count > 0 and year:
            missing_authors_by_year[year].append({
                "doi": rec.get("normalized_doi") or rec.get("dblp_doi"),
                "missing_authors": missing_count,
                "total_authors": len(authorships)
            })

print(f"Total papers:     {total_papers:,}")
print(f"Total authors:    {total_authors:,}")

if total_authors > 0:
    print(f"Covered authors:  {covered_authors:,} ({covered_authors/total_authors*100:.1f}%)")
    print(f"Missing authors:  {total_authors-covered_authors:,} ({(total_authors-covered_authors)/total_authors*100:.1f}%)")
else:
    print("No authors found - check OpenAlex data structure")

print()

print("MISSING AUTHORS BY YEAR:")
# key=str keeps the sort working if years are a mix of strings and numbers.
for year in sorted(missing_authors_by_year.keys(), key=str):
    papers = [d for d in missing_authors_by_year[year] if isinstance(d, dict)]
    total_missing_year = sum(d["missing_authors"] for d in papers)
    print(f"{year}: {total_missing_year} missing authors ({len(papers)} papers)")

ICCS DATASET: OPENALEX COVERAGE ANALYSIS
Total papers:     8,302
Total authors:    28,010
Covered authors:  28,008 (100.0%)
Missing authors:  2 (0.0%)

MISSING AUTHORS BY YEAR:
2022: 1 missing authors (1 papers)
2024: 1 missing authors (1 papers)
