# OpenAlex by-year downloader to a single Parquet

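This notebook fetches all `works` in the Communication subfield (`subfields/3315`) with publication years 2000–2025 using cursor-based paging, and writes a flattened subset of each work's fields (id, title, publication year, type, DOI, authorship count, primary topic) to a single Parquet file at `data/processed/communication_works.parquet`.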

- Uses `OPENALEX_MAILTO` from environment for the polite pool
- Streams page results into Parquet to avoid high memory usage
- Includes progress logging and count validation per year
- Kernel: Python (InvisibleResearch venv)


In [1]:
import os
import json
import time
import math
from datetime import datetime
from typing import Dict, Any, Iterator, List, Optional

import requests
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
# Load variables from a .env file: prefer python-dotenv, otherwise fall back to
# a tiny built-in KEY=VALUE parser.
try:
    from dotenv import load_dotenv  # type: ignore
    load_dotenv()
except Exception:
    def _simple_load_dotenv(path: str = ".env") -> None:
        """Copy KEY=VALUE pairs from *path* into os.environ.

        Blank lines, lines starting with '#', and lines without '=' are
        skipped. Surrounding whitespace and quote characters are stripped from
        the value. Variables already present in the environment are not
        overwritten. A missing file is silently ignored.
        """
        if os.path.exists(path):
            with open(path, "r", encoding="utf-8") as handle:
                for entry in map(str.strip, handle):
                    if not entry or entry.startswith("#") or "=" not in entry:
                        continue
                    # Split on the first '=' only, so values may contain '='.
                    name, _, value = entry.partition("=")
                    os.environ.setdefault(name.strip(), value.strip().strip("'\""))
    _simple_load_dotenv()

# --- Download configuration -------------------------------------------------
# OpenAlex subfield to harvest (3315 = Communication).
SUBFIELD_ID = "subfields/3315"
# Inclusive publication-year range; overridable through the environment.
START_YEAR = int(os.environ.get("OPENALEX_START_YEAR", 2000))
END_YEAR = int(os.environ.get("OPENALEX_END_YEAR", 2025))
# Destination of the combined output file.
PARQUET_PATH = os.path.join("data", "processed", "communication_works.parquet")
# Page sizes to attempt, largest first; smaller ones are fallbacks.
PER_PAGE_CANDIDATES = [200, 150, 100]
# Per-request timeout, passed to requests.get().
REQUEST_TIMEOUT = 60
# Attempts per page size, and the base of the exponential backoff delay.
RETRY_MAX = 5
BACKOFF_BASE = 1.5

# Contact email sent as the `mailto` parameter (OpenAlex polite pool).
CONTACT_EMAIL = os.environ.get("OPENALEX_MAILTO")
if not CONTACT_EMAIL:
    raise RuntimeError("Please set environment variable OPENALEX_MAILTO to your contact email.")

# Make sure the output directory exists before any writer is opened.
os.makedirs(os.path.dirname(PARQUET_PATH), exist_ok=True)

BASE_URL = "https://api.openalex.org/works"


def fetch_page(year: int, cursor: str, per_page: int, mailto: str) -> Dict[str, Any]:
    """Request one page of works for *year* and return the decoded JSON body.

    Args:
        year: Publication year used in the filter.
        cursor: Cursor token for the page to fetch ("*" starts a new scan).
        per_page: Number of results requested per page.
        mailto: Contact email sent as the `mailto` query parameter.

    Raises:
        requests.HTTPError: If the response status indicates an error.
        requests.RequestException: For other request failures (e.g. timeout).
    """
    year_filter = f"primary_topic.subfield.id:{SUBFIELD_ID},publication_year:{year}"
    query = {
        "filter": year_filter,
        "per-page": per_page,
        "cursor": cursor,
        "mailto": mailto,
    }
    response = requests.get(BASE_URL, params=query, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


def iterate_year(year: int, per_page: int, mailto: str, sleep_seconds: float = 0.3) -> Iterator[Dict[str, Any]]:
    """Yield every work for *year*, following cursors until none is returned.

    The expected total is read from `meta.count` of the first page and printed.
    Once paging finishes, the number of yielded works is compared with it and
    either a validation message or a mismatch warning is printed.

    Args:
        year: Publication year to download.
        per_page: Page size passed to fetch_page.
        mailto: Contact email passed to fetch_page.
        sleep_seconds: Pause between consecutive page requests.
    """
    expected: Optional[int] = None
    seen = 0
    cursor = "*"
    while True:
        payload = fetch_page(year, cursor=cursor, per_page=per_page, mailto=mailto)
        meta = payload.get("meta", {})
        if expected is None:
            expected = meta.get("count")
            print(f"Year {year}: expected {expected} works with per_page={per_page}")
        for work in payload.get("results", []):
            seen += 1
            yield work
        cursor = meta.get("next_cursor")
        if not cursor:
            # No further cursor: the scan for this year is finished.
            break
        time.sleep(sleep_seconds)
    if expected is None or seen == expected:
        print(f"Year {year}: validated {seen} works")
    else:
        print(f"WARNING: Year {year} mismatch: retrieved {seen} vs meta.count {expected}")


def robust_iterate_year(year: int, mailto: str) -> Iterator[Dict[str, Any]]:
    """Yield all works for *year*, retrying on transient failures.

    Each page size in PER_PAGE_CANDIDATES is tried in order, with up to
    RETRY_MAX attempts per size and an exponential backoff
    (BACKOFF_BASE ** attempt seconds) between attempts. HTTP 429/502/503/504
    and other request-level errors are retried; any other HTTP error is
    re-raised immediately.

    A retry restarts iterate_year from the first page, so works that were
    already yielded before the failure would otherwise be emitted again and
    end up duplicated downstream. To prevent that, the ids of yielded works
    are remembered across attempts and repeats are skipped. Works without an
    "id" cannot be de-duplicated and are always yielded.

    Args:
        year: Publication year to download.
        mailto: Contact email forwarded to iterate_year.

    Raises:
        requests.RequestException: The last error seen once every page size
            and attempt has been exhausted, or a non-retryable HTTP error.
    """
    last_error: Optional[Exception] = None
    # Ids already handed to the caller, shared by all attempts for this year.
    seen_ids = set()
    for per_page in PER_PAGE_CANDIDATES:
        for attempt in range(1, RETRY_MAX + 1):
            try:
                for work in iterate_year(year, per_page=per_page, mailto=mailto):
                    work_id = work.get("id")
                    if work_id is not None:
                        if work_id in seen_ids:
                            # Already emitted by an earlier, interrupted attempt.
                            continue
                        seen_ids.add(work_id)
                    yield work
                # The full scan completed without error.
                return
            except requests.HTTPError as e:
                last_error = e
                status = getattr(e.response, "status_code", None)
                if status in (429, 502, 503, 504):
                    delay = BACKOFF_BASE ** attempt
                    print(f"HTTP {status} on year {year}, per_page={per_page}, retry {attempt}/{RETRY_MAX} after {delay:.1f}s...")
                    time.sleep(delay)
                    continue
                raise
            except requests.RequestException as e:
                last_error = e
                delay = BACKOFF_BASE ** attempt
                print(f"Network error on year {year}, per_page={per_page}, retry {attempt}/{RETRY_MAX} after {delay:.1f}s...")
                time.sleep(delay)
                continue
    if last_error is not None:
        raise last_error


def flatten_work(record: Dict[str, Any]) -> Dict[str, Any]:
    """Reduce a raw work record to a flat row of scalar columns.

    Copies id, title, publication_year, type and doi as-is, counts the entries
    in "authorships", and pulls id/display_name out of "primary_topic".
    A missing or None "authorships" counts as 0; a missing or None
    "primary_topic" yields None for both topic columns.
    """
    topic = record.get("primary_topic") or {}
    authorships = record.get("authorships", []) or []
    # Insertion order here fixes the column order of the resulting row.
    row: Dict[str, Any] = {
        field: record.get(field)
        for field in ("id", "title", "publication_year", "type", "doi")
    }
    row["authorships_count"] = len(authorships)
    row["primary_topic_id"] = topic.get("id")
    row["primary_topic_display_name"] = topic.get("display_name")
    return row


# Driver: download every configured year and stream the flattened rows into a
# single Parquet file, flushing in chunks so memory use stays bounded.
start = datetime.now()
print(f"Writing to {PARQUET_PATH}")

# Explicit schema for the columns produced by flatten_work. Inferring the
# schema from each batch is fragile: a batch in which a nullable column (for
# example `doi`) happens to be entirely None is inferred as type `null`, which
# no longer matches the schema the writer was opened with and makes
# write_table() fail in the middle of the download.
schema: pa.Schema = pa.schema(
    [
        ("id", pa.string()),
        ("title", pa.string()),
        ("publication_year", pa.int64()),
        ("type", pa.string()),
        ("doi", pa.string()),
        ("authorships_count", pa.int64()),
        ("primary_topic_id", pa.string()),
        ("primary_topic_display_name", pa.string()),
    ]
)
# Opened lazily on the first flush so no file is created when nothing is fetched.
writer: Optional[pq.ParquetWriter] = None

try:
    for year in range(START_YEAR, END_YEAR + 1):
        batch_rows: List[Dict[str, Any]] = []
        with tqdm(desc=f"Year {year}", unit="works") as pbar:
            for work in robust_iterate_year(year, mailto=CONTACT_EMAIL):
                batch_rows.append(flatten_work(work))
                # flush in chunks to manage memory
                if len(batch_rows) >= 2000:
                    table = pa.Table.from_pylist(batch_rows, schema=schema)
                    if writer is None:
                        writer = pq.ParquetWriter(PARQUET_PATH, schema=schema)
                    writer.write_table(table)
                    pbar.update(len(batch_rows))
                    batch_rows.clear()
            # flush remainder for the year
            if batch_rows:
                table = pa.Table.from_pylist(batch_rows, schema=schema)
                if writer is None:
                    writer = pq.ParquetWriter(PARQUET_PATH, schema=schema)
                writer.write_table(table)
                pbar.update(len(batch_rows))
                batch_rows.clear()
finally:
    # Always close the writer so the Parquet footer is written, even on error.
    if writer is not None:
        writer.close()
        print(f"Closed Parquet writer: {PARQUET_PATH}")
    elapsed = datetime.now() - start
    print(f"Completed in {elapsed}")


Writing to data/processed/communication_works.parquet


Year 2000: 0works [00:00, ?works/s]

Year 2000: expected 5935 works with per_page=200


Year 2000: 2000works [00:12, 159.71works/s]

Year 2000: 4000works [00:22, 177.20works/s]

Year 2000: 5935works [00:34, 169.53works/s]

Year 2000: 5935works [00:34, 169.73works/s]




Year 2000: validated 5935 works


Year 2001: 0works [00:00, ?works/s]

Year 2001: expected 6330 works with per_page=200


Year 2001: 2000works [00:12, 158.14works/s]

Year 2001: 4000works [00:23, 175.23works/s]

Year 2001: 6000works [00:34, 173.12works/s]

Year 2001: 6330works [00:37, 168.01works/s]

Year 2001: 6330works [00:37, 169.16works/s]




Year 2001: validated 6330 works


Year 2002: 0works [00:00, ?works/s]

Year 2002: expected 7409 works with per_page=200


Year 2002: 2000works [00:12, 165.94works/s]

Year 2002: 4000works [00:23, 168.32works/s]

Year 2002: 6000works [00:34, 174.90works/s]

Year 2002: 7409works [00:44, 164.69works/s]

Year 2002: 7409works [00:44, 167.01works/s]




Year 2002: validated 7409 works


Year 2003: 0works [00:00, ?works/s]

Year 2003: expected 8174 works with per_page=200


Year 2003: 2000works [00:14, 135.99works/s]

Year 2003: 4000works [00:27, 149.30works/s]

Year 2003: 6000works [00:38, 158.59works/s]

Year 2003: 8000works [00:50, 165.62works/s]

Year 2003: 8174works [00:51, 161.70works/s]

Year 2003: 8174works [00:51, 158.07works/s]




Year 2003: validated 8174 works


Year 2004: 0works [00:00, ?works/s]

Year 2004: expected 9280 works with per_page=200


Year 2004: 2000works [00:15, 130.85works/s]

Year 2004: 4000works [00:28, 141.84works/s]

Year 2004: 6000works [00:40, 154.66works/s]

Year 2004: 6000works [00:51, 154.66works/s]

Year 2004: 8000works [00:53, 153.91works/s]

Year 2004: 9280works [01:01, 152.72works/s]

Year 2004: 9280works [01:01, 150.28works/s]




Year 2004: validated 9280 works


Year 2005: 0works [00:00, ?works/s]

Year 2005: expected 10746 works with per_page=200


Year 2005: 2000works [00:14, 142.00works/s]

Year 2005: 4000works [00:26, 150.67works/s]

Year 2005: 4000works [00:39, 150.67works/s]

Year 2005: 6000works [00:40, 149.13works/s]

Year 2005: 8000works [00:54, 147.55works/s]

Year 2005: 10000works [01:07, 147.15works/s]

Year 2005: 10746works [01:13, 143.50works/s]

Year 2005: 10746works [01:13, 145.87works/s]




Year 2005: validated 10746 works


Year 2006: 0works [00:00, ?works/s]

Year 2006: expected 12056 works with per_page=200


Year 2006: 2000works [00:13, 148.13works/s]

Year 2006: 4000works [00:26, 153.58works/s]

Year 2006: 4000works [00:36, 153.58works/s]

Year 2006: 6000works [00:37, 161.15works/s]

Year 2006: 8000works [00:49, 163.36works/s]

Year 2006: 10000works [01:01, 168.36works/s]

Year 2006: 12000works [01:12, 171.40works/s]

Year 2006: 12056works [01:13, 163.82works/s]

Year 2006: 12056works [01:13, 163.35works/s]




Year 2006: validated 12056 works


Year 2007: 0works [00:00, ?works/s]

Year 2007: expected 13959 works with per_page=200


Year 2007: 2000works [00:12, 163.79works/s]

Year 2007: 2000works [00:22, 163.79works/s]

Year 2007: 4000works [00:26, 148.70works/s]

Year 2007: 6000works [00:41, 142.95works/s]

Year 2007: 6000works [00:52, 142.95works/s]

Year 2007: 8000works [00:53, 148.60works/s]

Year 2007: 10000works [01:06, 152.71works/s]

Year 2007: 12000works [01:17, 159.28works/s]

Year 2007: 13959works [01:30, 158.14works/s]

Year 2007: 13959works [01:30, 154.51works/s]




Year 2007: validated 13959 works


Year 2008: 0works [00:00, ?works/s]

Year 2008: expected 16663 works with per_page=200


Year 2008: 2000works [00:15, 128.07works/s]

Year 2008: 4000works [00:28, 140.08works/s]

Year 2008: 6000works [00:41, 147.71works/s]

Year 2008: 6000works [00:52, 147.71works/s]

Year 2008: 8000works [00:53, 157.02works/s]

Year 2008: 10000works [01:06, 154.43works/s]

Year 2008: 12000works [01:18, 158.53works/s]

Year 2008: 14000works [01:29, 166.87works/s]

Year 2008: 16000works [01:41, 167.05works/s]

Year 2008: 16663works [01:45, 164.44works/s]

Year 2008: 16663works [01:45, 157.94works/s]




Year 2008: validated 16663 works


Year 2009: 0works [00:00, ?works/s]

Year 2009: expected 19447 works with per_page=200


Year 2009: 2000works [00:14, 139.35works/s]

Year 2009: 2000works [00:26, 139.35works/s]

Year 2009: 4000works [00:27, 145.23works/s]

Year 2009: 6000works [00:41, 145.26works/s]

Year 2009: 8000works [00:54, 146.67works/s]

Year 2009: 8000works [01:06, 146.67works/s]

Year 2009: 10000works [01:08, 148.59works/s]

Year 2009: 12000works [01:21, 148.26works/s]

Year 2009: 14000works [01:33, 153.65works/s]

Year 2009: 16000works [01:45, 156.32works/s]

Year 2009: 18000works [01:56, 164.50works/s]

Year 2009: 19447works [02:06, 161.71works/s]

Year 2009: 19447works [02:06, 154.16works/s]




Year 2009: validated 19447 works


Year 2010: 0works [00:00, ?works/s]

Year 2010: expected 21854 works with per_page=200


Year 2010: 2000works [00:17, 114.11works/s]

Year 2010: 4000works [00:30, 135.92works/s]

Year 2010: 4000works [00:40, 135.92works/s]

Year 2010: 6000works [00:43, 144.36works/s]

Year 2010: 8000works [00:55, 150.86works/s]

Year 2010: 10000works [01:07, 156.46works/s]

Year 2010: 10000works [01:20, 156.46works/s]

Year 2010: 12000works [01:21, 150.95works/s]

Year 2010: 14000works [01:35, 149.93works/s]

Year 2010: 16000works [01:48, 151.26works/s]

Year 2010: 18000works [02:00, 153.84works/s]

Year 2010: 18000works [02:10, 153.84works/s]

Year 2010: 20000works [02:11, 160.45works/s]

Year 2010: 21854works [02:24, 155.07works/s]

Year 2010: 21854works [02:24, 150.97works/s]




Year 2010: validated 21854 works


Year 2011: 0works [00:00, ?works/s]

Year 2011: expected 25249 works with per_page=200


Year 2011: 2000works [00:14, 133.98works/s]

Year 2011: 2000works [00:26, 133.98works/s]

Year 2011: 4000works [00:32, 122.91works/s]

Year 2011: 4000works [00:46, 122.91works/s]

Year 2011: 6000works [00:50, 115.41works/s]

Year 2011: 8000works [01:04, 124.20works/s]

Year 2011: 8000works [01:16, 124.20works/s]

Year 2011: 10000works [01:17, 134.73works/s]

Year 2011: 12000works [01:30, 139.46works/s]

Year 2011: 14000works [01:44, 143.15works/s]

Year 2011: 16000works [01:55, 150.64works/s]

Year 2011: 16000works [02:06, 150.64works/s]

Year 2011: 18000works [02:08, 153.36works/s]

Year 2011: 20000works [02:20, 156.86works/s]

Year 2011: 22000works [02:32, 159.74works/s]

Year 2011: 24000works [02:45, 158.32works/s]

Year 2011: 25249works [02:56, 147.47works/s]

Year 2011: 25249works [02:56, 143.42works/s]




Year 2011: validated 25249 works


Year 2012: 0works [00:00, ?works/s]

Year 2012: expected 26398 works with per_page=200


Year 2012: 2000works [00:17, 114.68works/s]

Year 2012: 2000works [00:30, 114.68works/s]

Year 2012: 4000works [00:31, 130.39works/s]

Year 2012: 6000works [00:48, 125.04works/s]

Year 2012: 6000works [01:00, 125.04works/s]

Year 2012: 8000works [01:00, 137.66works/s]

Year 2012: 10000works [01:12, 146.00works/s]

Year 2012: 12000works [01:27, 142.05works/s]

Year 2012: 12000works [01:40, 142.05works/s]

Year 2012: 14000works [01:41, 143.04works/s]

Year 2012: 16000works [01:53, 147.63works/s]

Year 2012: 18000works [02:06, 152.06works/s]

Year 2012: 20000works [02:17, 157.80works/s]

Year 2012: 22000works [02:28, 163.24works/s]

Year 2012: 22000works [02:40, 163.24works/s]

Year 2012: 24000works [02:40, 165.02works/s]

Year 2012: 26000works [02:52, 165.51works/s]

Year 2012: 26398works [02:55, 164.56works/s]

Year 2012: 26398works [02:55, 150.51works/s]




Year 2012: validated 26398 works


Year 2013: 0works [00:00, ?works/s]

Year 2013: expected 27828 works with per_page=200


Year 2013: 2000works [00:15, 129.41works/s]

Year 2013: 4000works [00:29, 135.22works/s]

Year 2013: 6000works [00:43, 138.52works/s]

Year 2013: 6000works [00:54, 138.52works/s]

Year 2013: 8000works [00:58, 138.99works/s]

Year 2013: 10000works [01:10, 144.81works/s]

Year 2013: 12000works [01:24, 145.67works/s]

Year 2013: 12000works [01:34, 145.67works/s]

Year 2013: 14000works [01:36, 152.14works/s]

Year 2013: 16000works [01:48, 156.83works/s]

Year 2013: 18000works [02:04, 144.91works/s]

Year 2013: 18000works [02:15, 144.91works/s]

Year 2013: 20000works [02:16, 150.64works/s]

Year 2013: 22000works [02:27, 157.43works/s]

Year 2013: 24000works [02:38, 165.59works/s]

Year 2013: 26000works [02:49, 169.11works/s]

Year 2013: 27828works [03:01, 164.16works/s]

Year 2013: 27828works [03:01, 153.07works/s]




Year 2013: validated 27828 works


Year 2014: 0works [00:00, ?works/s]

Year 2014: expected 29861 works with per_page=200


Year 2014: 2000works [00:13, 142.90works/s]

Year 2014: 4000works [00:28, 141.99works/s]

Year 2014: 4000works [00:43, 141.99works/s]

Year 2014: 6000works [00:44, 133.14works/s]

Year 2014: 8000works [00:59, 132.74works/s]

Year 2014: 8000works [01:13, 132.74works/s]

Year 2014: 10000works [01:14, 133.21works/s]

Year 2014: 12000works [01:28, 136.98works/s]

Year 2014: 14000works [01:41, 141.36works/s]

Year 2014: 14000works [01:53, 141.36works/s]

Year 2014: 16000works [01:54, 143.74works/s]

Year 2014: 18000works [02:08, 144.23works/s]

Year 2014: 20000works [02:21, 148.02works/s]

Year 2014: 20000works [02:33, 148.02works/s]

Year 2014: 22000works [02:34, 150.25works/s]

Year 2014: 24000works [02:46, 154.92works/s]

Year 2014: 26000works [02:57, 160.05works/s]

Year 2014: 28000works [03:10, 159.01works/s]

Year 2014: 29861works [03:22, 156.84works/s]

Year 2014: 29861works [03:22, 147.34works/s]




Year 2014: validated 29861 works


Year 2015: 0works [00:00, ?works/s]

Year 2015: expected 30822 works with per_page=200


Year 2015: 2000works [00:15, 125.05works/s]

Year 2015: 2000works [00:30, 125.05works/s]

Year 2015: 4000works [01:30, 39.56works/s] 

Year 2015: 6000works [01:43, 60.45works/s]

Year 2015: 8000works [01:56, 79.43works/s]

Year 2015: 10000works [02:08, 97.07works/s]

Year 2015: 10000works [02:20, 97.07works/s]

Year 2015: 12000works [02:21, 110.40works/s]

Year 2015: 14000works [02:33, 126.28works/s]

Year 2015: 16000works [02:44, 139.02works/s]

Year 2015: 18000works [02:58, 140.86works/s]

Year 2015: 20000works [03:10, 146.53works/s]

Year 2015: 20000works [03:20, 146.53works/s]

Year 2015: 22000works [03:24, 146.99works/s]

Year 2015: 24000works [03:35, 155.83works/s]

Year 2015: 26000works [03:46, 160.88works/s]

Year 2015: 28000works [03:58, 161.69works/s]

Year 2015: 30000works [04:09, 168.67works/s]

Year 2015: 30822works [04:15, 164.58works/s]

Year 2015: 30822works [04:15, 120.79works/s]




Year 2015: validated 30822 works


Year 2016: 0works [00:00, ?works/s]

Year 2016: expected 31594 works with per_page=200


Year 2016: 2000works [00:13, 146.46works/s]

Year 2016: 2000works [00:25, 146.46works/s]

Year 2016: 4000works [00:28, 136.68works/s]

Year 2016: 6000works [00:42, 140.63works/s]

Year 2016: 6000works [00:55, 140.63works/s]

Year 2016: 8000works [00:59, 130.48works/s]

Year 2016: 10000works [01:15, 129.91works/s]

Year 2016: 10000works [01:25, 129.91works/s]

Year 2016: 12000works [01:27, 138.56works/s]

Year 2016: 14000works [01:42, 138.79works/s]

Year 2016: 16000works [01:55, 142.43works/s]

Year 2016: 16000works [02:05, 142.43works/s]

Year 2016: 18000works [02:08, 146.83works/s]

Year 2016: 20000works [02:22, 145.08works/s]

Year 2016: 22000works [02:34, 149.02works/s]

Year 2016: 22000works [02:45, 149.02works/s]

Year 2016: 24000works [02:47, 153.31works/s]

Year 2016: 26000works [02:59, 156.65works/s]

Year 2016: 28000works [03:10, 161.81works/s]

Year 2016: 30000works [03:21, 166.46works/s]

Year 2016: 31595works [03:31, 166.62works/s]

Year 2016: 31595works [03:31, 149.48works/s]






Year 2017: 0works [00:00, ?works/s]

Year 2017: expected 31461 works with per_page=200


Year 2017: 2000works [00:15, 127.80works/s]

Year 2017: 4000works [00:31, 125.49works/s]

Year 2017: 6000works [00:44, 139.22works/s]

Year 2017: 6000works [00:54, 139.22works/s]

Year 2017: 8000works [00:59, 137.82works/s]

Year 2017: 8000works [01:14, 137.82works/s]

Year 2017: 10000works [02:15, 54.09works/s]

Year 2017: 12000works [02:29, 68.95works/s]

Year 2017: 14000works [02:41, 85.05works/s]

Year 2017: 16000works [02:52, 102.70works/s]

Year 2017: 16000works [03:04, 102.70works/s]

Year 2017: 18000works [03:04, 115.98works/s]

Year 2017: 20000works [03:18, 124.39works/s]

Year 2017: 22000works [03:29, 136.58works/s]

Year 2017: 24000works [03:42, 142.71works/s]

Year 2017: 26000works [03:53, 153.19works/s]

Year 2017: 26000works [04:04, 153.19works/s]

Year 2017: 28000works [04:05, 156.56works/s]

Year 2017: 30000works [04:17, 158.84works/s]

Year 2017: 31461works [04:28, 151.39works/s]

Year 2017: 31461works [04:28, 117.22works/s]




Year 2017: validated 31461 works


Year 2018: 0works [00:00, ?works/s]

Year 2018: expected 31229 works with per_page=200


Year 2018: 2000works [00:14, 137.84works/s]

Year 2018: 2000works [00:26, 137.84works/s]

Year 2018: 4000works [00:29, 133.73works/s]

Year 2018: 6000works [00:44, 135.54works/s]

Year 2018: 6000works [00:56, 135.54works/s]

Year 2018: 8000works [00:57, 140.08works/s]

Year 2018: 10000works [01:10, 147.70works/s]

Year 2018: 12000works [01:22, 152.08works/s]

Year 2018: 14000works [01:34, 154.62works/s]

Year 2018: 14000works [01:46, 154.62works/s]

Year 2018: 16000works [01:47, 154.76works/s]

Year 2018: 18000works [02:01, 153.34works/s]

Year 2018: 20000works [02:14, 152.86works/s]

Year 2018: 20000works [02:26, 152.86works/s]

Year 2018: 22000works [02:26, 155.44works/s]

Year 2018: 24000works [02:38, 157.83works/s]

Year 2018: 26000works [02:52, 156.11works/s]

Year 2018: 28000works [03:03, 159.78works/s]

Year 2018: 30000works [03:16, 160.35works/s]

Year 2018: 31229works [03:26, 150.86works/s]

Year 2018: 31229works [03:26, 151.42works/s]




Year 2018: validated 31229 works


Year 2019: 0works [00:00, ?works/s]

Year 2019: expected 32066 works with per_page=200


Year 2019: 2000works [00:15, 125.66works/s]

Year 2019: 2000works [00:30, 125.66works/s]

Year 2019: 4000works [00:32, 121.79works/s]

Year 2019: 6000works [00:45, 135.62works/s]

Year 2019: 8000works [00:59, 138.54works/s]

Year 2019: 8000works [01:10, 138.54works/s]

Year 2019: 10000works [01:11, 148.29works/s]

Year 2019: 12000works [01:23, 152.73works/s]

Year 2019: 14000works [01:35, 155.38works/s]

Year 2019: 16000works [01:47, 162.18works/s]

Year 2019: 18000works [02:00, 159.45works/s]

Year 2019: 18000works [02:10, 159.45works/s]

Year 2019: 20000works [02:12, 159.12works/s]

Year 2019: 22000works [02:25, 157.94works/s]

Year 2019: 22000works [02:40, 157.94works/s]

Year 2019: 24000works [02:40, 150.01works/s]

Year 2019: 26000works [02:50, 161.06works/s]

Year 2019: 28000works [03:02, 163.14works/s]

Year 2019: 30000works [03:15, 159.62works/s]

Year 2019: 32000works [03:27, 162.52works/s]

Year 2019: 32066works [03:29, 156.79works/s]

Year 2019: 32066works [03:29, 153.34works/s]




Year 2019: validated 32066 works


Year 2020: 0works [00:00, ?works/s]

Year 2020: expected 34756 works with per_page=200


Year 2020: 2000works [00:14, 134.93works/s]

Year 2020: 4000works [00:28, 139.01works/s]

Year 2020: 6000works [00:40, 150.57works/s]

Year 2020: 6000works [00:51, 150.57works/s]

Year 2020: 8000works [00:53, 153.06works/s]

Year 2020: 10000works [01:07, 149.32works/s]

Year 2020: 10000works [01:21, 149.32works/s]

Year 2020: 12000works [01:23, 140.74works/s]

Year 2020: 14000works [01:36, 145.13works/s]

Year 2020: 16000works [01:49, 147.94works/s]

Year 2020: 16000works [02:01, 147.94works/s]

Year 2020: 18000works [02:04, 143.43works/s]

Year 2020: 18000works [02:21, 143.43works/s]

Year 2020: 20000works [03:17, 61.93works/s] 

Year 2020: 22000works [03:29, 76.34works/s]

Year 2020: 22000works [03:41, 76.34works/s]

Year 2020: 24000works [03:42, 90.73works/s]

Year 2020: 26000works [03:53, 106.25works/s]

Year 2020: 28000works [04:06, 118.58works/s]

Year 2020: 30000works [04:18, 130.20works/s]

Year 2020: 32000works [04:28, 142.77works/s]

Year 2020: 34000works [04:40, 151.25works/s]

Year 2020: 34756works [04:45, 149.39works/s]

Year 2020: 34756works [04:45, 121.63works/s]




Year 2020: validated 34756 works


Year 2021: 0works [00:00, ?works/s]

Year 2021: expected 28122 works with per_page=200


Year 2021: 2000works [00:18, 110.31works/s]

Year 2021: 2000works [00:35, 110.31works/s]

Year 2021: 4000works [00:36, 110.51works/s]

Year 2021: 6000works [00:54, 110.04works/s]

Year 2021: 6000works [01:05, 110.04works/s]

Year 2021: 8000works [01:07, 122.45works/s]

Year 2021: 10000works [01:20, 133.19works/s]

Year 2021: 12000works [01:35, 133.95works/s]

Year 2021: 12000works [01:45, 133.95works/s]

Year 2021: 14000works [01:47, 143.09works/s]

Year 2021: 16000works [01:59, 150.42works/s]

Year 2021: 16000works [02:15, 150.42works/s]

Year 2021: 18000works [02:15, 140.11works/s]

Year 2021: 20000works [02:30, 137.22works/s]

Year 2021: 22000works [02:44, 139.63works/s]

Year 2021: 22000works [02:55, 139.63works/s]

Year 2021: 24000works [02:56, 146.81works/s]

Year 2021: 26000works [03:10, 145.93works/s]

Year 2021: 28000works [03:24, 144.76works/s]

Year 2021: 28122works [03:26, 141.23works/s]

Year 2021: 28122works [03:26, 136.28works/s]




Year 2021: validated 28122 works


Year 2022: 0works [00:00, ?works/s]

Year 2022: expected 22353 works with per_page=200


Year 2022: 2000works [00:18, 106.30works/s]

Year 2022: 2000works [00:29, 106.30works/s]

Year 2022: 4000works [00:33, 123.53works/s]

Year 2022: 6000works [00:47, 132.20works/s]

Year 2022: 6000works [00:59, 132.20works/s]

Year 2022: 8000works [01:00, 139.32works/s]

Year 2022: 10000works [01:14, 139.44works/s]

Year 2022: 12000works [01:28, 140.50works/s]

Year 2022: 12000works [01:39, 140.50works/s]

Year 2022: 14000works [01:42, 143.05works/s]

Year 2022: 16000works [01:54, 147.77works/s]

Year 2022: 18000works [02:07, 150.31works/s]

Year 2022: 18000works [02:19, 150.31works/s]

Year 2022: 20000works [02:20, 151.72works/s]

Year 2022: 22000works [02:33, 151.59works/s]

Year 2022: 22353works [02:36, 148.81works/s]

Year 2022: 22353works [02:36, 142.87works/s]




Year 2022: validated 22353 works


Year 2023: 0works [00:00, ?works/s]

Year 2023: expected 22893 works with per_page=200


Year 2023: 2000works [00:16, 119.31works/s]

Year 2023: 2000works [00:33, 119.31works/s]

Year 2023: 4000works [00:34, 116.82works/s]

Year 2023: 4000works [00:53, 116.82works/s]

Year 2023: 6000works [00:54, 107.92works/s]

Year 2023: 8000works [01:12, 108.74works/s]

Year 2023: 8000works [01:23, 108.74works/s]

Year 2023: 10000works [01:27, 116.64works/s]

Year 2023: 12000works [01:40, 127.19works/s]

Year 2023: 14000works [01:51, 140.45works/s]

Year 2023: 14000works [02:03, 140.45works/s]

Year 2023: 16000works [02:03, 147.94works/s]

Year 2023: 18000works [02:21, 135.63works/s]

Year 2023: 18000works [02:33, 135.63works/s]

Year 2023: 20000works [02:40, 124.18works/s]

Year 2023: 20000works [02:53, 124.18works/s]

Year 2023: 22000works [02:57, 120.51works/s]

Year 2023: 22892works [03:05, 118.96works/s]

Year 2023: 22892works [03:05, 123.15works/s]






Year 2024: 0works [00:00, ?works/s]

Year 2024: expected 21748 works with per_page=200


Year 2024: 2000works [00:14, 135.65works/s]

Year 2024: 2000works [00:27, 135.65works/s]

Year 2024: 4000works [00:28, 139.85works/s]

Year 2024: 6000works [00:41, 144.93works/s]

Year 2024: 8000works [00:54, 148.12works/s]

Year 2024: 10000works [01:06, 158.06works/s]

Year 2024: 10000works [01:17, 158.06works/s]

Year 2024: 12000works [01:17, 162.59works/s]

Year 2024: 14000works [01:30, 158.74works/s]

Year 2024: 16000works [01:42, 161.28works/s]

Year 2024: 18000works [01:56, 156.07works/s]

Year 2024: 18000works [02:07, 156.07works/s]

Year 2024: 20000works [02:08, 158.40works/s]

Year 2024: 21748works [02:19, 160.25works/s]

Year 2024: 21748works [02:19, 155.96works/s]




Year 2024: validated 21748 works


Year 2025: 0works [00:00, ?works/s]

Year 2025: expected 12545 works with per_page=200


Year 2025: 2000works [00:14, 139.05works/s]

Year 2025: 4000works [00:26, 152.04works/s]

Year 2025: 4000works [00:37, 152.04works/s]

Year 2025: 6000works [00:38, 158.04works/s]

Year 2025: 8000works [00:56, 135.01works/s]

Year 2025: 8000works [01:08, 135.01works/s]

Year 2025: 10000works [01:15, 122.84works/s]

Year 2025: 10000works [01:28, 122.84works/s]

Year 2025: 12000works [01:34, 117.47works/s]

Year 2025: 12544works [01:40, 113.93works/s]

Year 2025: 12544works [01:40, 125.34works/s]

Closed Parquet writer: data/processed/communication_works.parquet
Completed in 1:03:24.669628





## How to run

1. Ensure your venv is active and the kernel is installed:

```bash
source /Users/yann.jy/InvisibleResearch/.venv/bin/activate
python -m ipykernel install --user --name invisible-research-venv --display-name "Python (InvisibleResearch venv)"
```

2. Set your email for the polite pool in the same shell, or put `OPENALEX_MAILTO=...` in a `.env` file in the working directory (no password is needed):

```bash
export OPENALEX_MAILTO="jinyi.yang@student.uva.nl"
```

3. Open the notebook and select kernel "Python (InvisibleResearch venv)", then run all cells.


In [2]:
# Smoke test: request a five-record sample for 2020 to confirm that the API is
# reachable, then print the response metadata and the ids of the sample.
import os
import requests

CONTACT_EMAIL = os.environ.get("OPENALEX_MAILTO")
params = {
    "filter": "primary_topic.subfield.id:subfields/3315,publication_year:2020",
    "per-page": 5,
    "cursor": "*",
    "mailto": CONTACT_EMAIL,
}
resp = requests.get(
    "https://api.openalex.org/works",
    params=params,
    timeout=30,
)
resp.raise_for_status()
js = resp.json()
print(js.get("meta", {}))
sample_ids = [item.get("id") for item in js.get("results", [])]
print("first ids:", sample_ids)


{'count': 34756, 'db_response_time_ms': 121, 'page': None, 'per_page': 5, 'next_cursor': 'IlsxMDAuMCwgNjkzLCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvVzMwMDE0Mjg1NjInXSI=', 'groups_count': None}
first ids: ['https://openalex.org/W3003646990', 'https://openalex.org/W3014719091', 'https://openalex.org/W4236836887', 'https://openalex.org/W239563548', 'https://openalex.org/W3001428562']
