Concept: Measure th LLM research ecosystem using netwroks and NLP.
We will quantify what topics exist, how they evolve, who drives them and wich papers are ahead of the curve by combining citations/author-networks with abstract/full-text analysis

Why is it interesting? It turns a hyped space into a measurable structure. It can surface non-obvious leaders and emerginf subfields.

Datasets: Queried 20k papers from OpenAlex. filtering them by key-words and year of publication. 

Networks to analyze: 
- citation networks: paper->paper
- author collaboration network: authors->authors

Research questions or hypoheses:

- Evolution of LLMs in the 21st century. 
- What are the most hyped topics and why is it that way?  
- Are there other hidden directions this field is going into, that can be determined in advance?
- Most impactufl/cited papers are the ones that bring most value
- There are specific people in the AI community who write the write stuff and are the real innovators in the field



In [16]:
import os
import json
import requests
from tqdm import tqdm
import time

# SEARCH_QUERY = "(large language model OR language model OR pretrained language model OR foundation model OR transformer OR LLM OR agentic AI OR AI agent OR tool use OR retrieval augmented generation OR retrieval-augmented OR model context protocol)"
OUTPUT_FILE = "ai_papers.json"

MAX_RESULTS = 20000
PER_PAGE = 200

EMAIL = "s243873@student.dtu.dk"  # <-- REAL email, not example.com

BASE_URL = "https://api.openalex.org/works"

params = {
    "filter": (
        "from_publication_date:2000-01-01,"
        "open_access.is_oa:true,"
        "referenced_works_count:>0,"
        "title.search:(large language model|language model|pretrained language model|foundation model|transformer|LLM|agentic AI|AI agent|tool use|retrieval augmented generation|retrieval-augmented|model context protocol)"
    ),
    "per-page": PER_PAGE,
    "cursor": "*",
    "mailto": EMAIL
}


headers = {
    "User-Agent": f"DTU MSc research (contact: {EMAIL})",
    "Accept": "application/json",
}

all_papers = []

# Sanity probe: tiny request first so you see the real 403 message if any
probe = requests.get(f"{BASE_URL}?sample=1", headers=headers, params={"mailto": EMAIL}, timeout=20)
probe.raise_for_status()

while len(all_papers) < MAX_RESULTS:
    try:
        r = requests.get(BASE_URL, params=params, headers=headers, timeout=60)
        if r.status_code == 403:
            # Print server message to understand why (often user-agent/mailto)
            print("403 body:", r.text[:500])
        r.raise_for_status()
        data = r.json()
    except requests.HTTPError as e:
        print("HTTP error:", e)
        print("Body preview:", getattr(r, "text", "")[:500])
        # brief backoff; if it's a WAF hiccup, try again
        time.sleep(3)
        continue

    results = data.get("results", [])
    if not results:
        break

    for item in results:
        all_papers.append({
            "title": item.get("title"),
            "url": item.get("id"),
            "year": item.get("publication_year"),
            "authors": item.get("authorships", []),
            "referenced_by_number": item.get("cited_by_count"),
            "type": item.get("type"),
            "references": item.get("referenced_works", []),
        })
        if len(all_papers) >= MAX_RESULTS:
            break

    # cursor paging
    next_cursor = data.get("meta", {}).get("next_cursor")
    if not next_cursor:
        break
    params["cursor"] = next_cursor

    # be polite
    time.sleep(0.2)

print(f"Downloaded: {len(all_papers)}")

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_papers, f, indent=2)

print("Saved:", OUTPUT_FILE)


Downloaded: 20000
Saved: ai_papers.json


In [2]:
import os
import json
import requests
from tqdm import tqdm
from pypdf import PdfReader
from io import BytesIO

INPUT_JSON = "ai_papers.json"       # <-- the file you already downloaded
OUTPUT_DIR = "papers_full"
os.makedirs(OUTPUT_DIR, exist_ok=True)

EMAIL = "s243873@student.dtu.dk"    # required by OpenAlex
HEADERS = {
    "User-Agent": f"DTU MSc Research (contact: {EMAIL})",
    "Accept": "application/json"
}

def fetch_full_metadata(openalex_url):
    try:
        r = requests.get(openalex_url, headers=HEADERS, params={"mailto": EMAIL}, timeout=30)

        # If response isn't JSON, log it
        if "application/json" not in r.headers.get("Content-Type", ""):
            print("\n⚠️ Non-JSON response from:", openalex_url)
            print("Response preview:", r.text[:200])
            return None

        return r.json()

    except Exception as e:
        print("\n❌ Error fetching:", openalex_url)
        print(e)
        return None

def extract_pdf_text(pdf_url):
    try:
        r = requests.get(pdf_url, timeout=30, headers=HEADERS)
        if r.status_code != 200:
            return None
        reader = PdfReader(BytesIO(r.content))
        return "\n".join([page.extract_text() or "" for page in reader.pages])
    except:
        return None

def decode_abstract(inv_index):
    if not inv_index:
        return None
    words = {}
    for word, positions in inv_index.items():
        for pos in positions:
            words[pos] = word
    return " ".join(words[pos] for pos in sorted(words))

def clean_text(text):
    if text is None:
        return None
    # remove invalid surrogate pairs
    return text.encode("utf-8", "surrogatepass").decode("utf-8", "ignore")


In [3]:
from concurrent.futures import ThreadPoolExecutor, as_completed

MAX_WORKERS = 5  # don't go higher, OpenAlex will rate limit

def process_paper(p):

    paper_id = p["url"].split("/")[-1]
    target_path = os.path.join(OUTPUT_DIR, f"{paper_id}.json")

    if os.path.exists(target_path) and os.path.getsize(target_path) > 100:
        print("Already downloaded:", target_path)
        return None

    api_url = p["url"].replace("https://openalex.org/", "https://api.openalex.org/works/")
    full = fetch_full_metadata(api_url)
    if not full:
        return None

    pdf_url = full.get("open_access", {}).get("oa_url")
    full_text = extract_pdf_text(pdf_url) if pdf_url else None
    abstract = decode_abstract(full.get("abstract_inverted_index"))

    if abstract is None and full_text is None:
        return None

    authors = [
        {
            "id": a["author"].get("id"),
            "name": a["author"].get("display_name"),
            "affiliations": [inst.get("display_name") for inst in a["institutions"]],
        }
        for a in full.get("authorships", [])
    ]

    return {
        "paper_id": full["id"].split("/")[-1],
        "data": {
            "title": full.get("title"),
            "url": full.get("id"),
            "year": full.get("publication_year"),
            "authors": authors,
            "references": full.get("referenced_works", []),
            "abstract": clean_text(abstract),
            "full_text": clean_text(full_text),
        },
    }

with open(INPUT_JSON, "r", encoding="utf-8") as f:
    papers = json.load(f)

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(process_paper, p) for p in papers]

    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
        result = future.result()
        if not result:
            continue

        paper_id = result["paper_id"]
        output = result["data"]

        with open(os.path.join(OUTPUT_DIR, f"{paper_id}.json"), "w", encoding="utf-8") as f:
            json.dump(output, f, indent=4, ensure_ascii=False)


invalid pdf header: b'{"ind'
EOF marker not found
Processing:  33%|███▎      | 6542/20000 [07:45<1:26:01,  2.61it/s]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  33%|███▎      | 6545/20000 [07:46<1:12:57,  3.07it/s]invalid pdf header: b'<!DOC'
EOF marker not found
Processing:  33%|███▎      | 6547/20000 [07:49<4:01:34,  1.08s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  33%|███▎      | 6556/20000 [07:57<2:14:56,  1.66it/s]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  33%|███▎      | 6558/20000 [07:58<1:28:11,  2.54it/s]invalid pdf header: b'\n\n\n<!'
EOF marker not found
invalid pdf header: b'{"ind'
EOF marker not found
Processing:  33%|███▎      | 6560/20000 [07:58<1:17:36,  2.89it/s]invalid pdf header: b'<!DOC'
EOF marker not found
invalid pdf header: b'{"ind'
EOF marker not found
Processing:  33%|███▎      | 6563/20000 [07:58<54:13,  4.13it/s]  invalid pdf header: b'\n\n\n\n<'
EOF marker not found
Processing:  33%|███▎      | 65


❌ Error fetching: https://api.openalex.org/works/W4362499835
HTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out. (read timeout=30)


Processing:  49%|████▊     | 9742/20000 [43:23<1:02:48,  2.72it/s]invalid pdf header: b'{"ins'
EOF marker not found
Processing:  49%|████▊     | 9747/20000 [43:26<1:20:01,  2.14it/s]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  49%|████▊     | 9748/20000 [43:27<1:18:32,  2.18it/s]invalid pdf header: b'{"ind'
EOF marker not found
invalid pdf header: b'{"ind'
EOF marker not found
Processing:  49%|████▉     | 9751/20000 [43:28<1:09:35,  2.45it/s]invalid pdf header: b'{"ind'
invalid pdf header: b'{"ind'
EOF marker not found
EOF marker not found
Processing:  49%|████▉     | 9754/20000 [43:31<1:50:07,  1.55it/s]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  49%|████▉     | 9755/20000 [43:32<2:42:40,  1.05it/s]invalid pdf header: b'{"ind'
EOF marker not found
invalid pdf header: b'{"ins'
Processing:  49%|████▉     | 9756/20000 [43:33<2:42:30,  1.05it/s]EOF marker not found
Processing:  49%|████▉     | 9758/20000 [43:34<1:59:24,  1.43it/s]invalid pdf header: b'


❌ Error fetching: https://api.openalex.org/works/W4385574294
HTTPSConnectionPool(host='api.openalex.org', port=443): Max retries exceeded with url: /works/W4385574294?mailto=s243873%40student.dtu.dk (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001FDC2C4C2F0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

❌ Error fetching: https://api.openalex.org/works/W4293047303
HTTPSConnectionPool(host='api.openalex.org', port=443): Max retries exceeded with url: /works/W4293047303?mailto=s243873%40student.dtu.dk (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001FDC6710680>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

❌ Error fetching: https://api.openalex.org/works/W4210798363
HTTPSConnectionPool(host='api.openalex.org', port=443): Max retries exceeded with url: /works/W4210798363?mailto=s243873%40student.dtu.dk (Caused by NewConnectionError('<urllib3.connection.HTTPS

Object 91 0 not defined.
Processing:  54%|█████▍    | 10886/20000 [1:45:35<288:45:46, 114.06s/it] invalid pdf header: b'\n<!DO'
EOF marker not found
Processing:  54%|█████▍    | 10889/20000 [1:45:35<159:57:18, 63.20s/it] invalid pdf header: b'{"ind'
EOF marker not found
Processing:  54%|█████▍    | 10891/20000 [1:45:36<104:46:44, 41.41s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  54%|█████▍    | 10892/20000 [1:45:36<84:05:42, 33.24s/it] invalid pdf header: b'{"ind'
EOF marker not found
Processing:  54%|█████▍    | 10894/20000 [1:45:38<50:44:34, 20.06s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  54%|█████▍    | 10895/20000 [1:45:38<38:02:15, 15.04s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  55%|█████▍    | 10902/20000 [1:45:44<5:28:10,  2.16s/it] invalid pdf header: b'\r\n\r\n\r'
EOF marker not found
Processing:  55%|█████▍    | 10905/20000 [1:45:48<3:30:41,  1.39s/it]invalid pdf header: b'{"ind'
EOF marker not found
Pro


❌ Error fetching: https://api.openalex.org/works/W4403536413
HTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out. (read timeout=30)


invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16628/20000 [3:37:37<1:47:11,  1.91s/it]


❌ Error fetching: https://api.openalex.org/works/W2953334586
HTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out. (read timeout=30)


invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16629/20000 [3:37:41<2:29:26,  2.66s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16630/20000 [3:37:43<2:06:46,  2.26s/it]invalid pdf header: b'<!DOC'
EOF marker not found
Processing:  83%|████████▎ | 16634/20000 [3:37:57<2:21:22,  2.52s/it]


❌ Error fetching: https://api.openalex.org/works/W4381848602
HTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out. (read timeout=30)


Processing:  83%|████████▎ | 16635/20000 [3:38:05<3:45:57,  4.03s/it]invalid pdf header: b'<!DOC'
EOF marker not found
Processing:  83%|████████▎ | 16638/20000 [3:38:10<2:05:41,  2.24s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16639/20000 [3:38:14<2:32:46,  2.73s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16640/20000 [3:38:16<2:30:42,  2.69s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16642/20000 [3:38:17<1:32:36,  1.65s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16644/20000 [3:38:19<1:11:34,  1.28s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16645/20000 [3:38:20<59:59,  1.07s/it]  invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16650/20000 [3:38:26<1:00:24,  1.08s/it]invalid pdf header: b'\n\n\n\n<'
EOF marker not found
Processing:  83%|████████▎ | 16651/20000 [3:


❌ Error fetching: https://api.openalex.org/works/W2261300951
HTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out. (read timeout=30)


Processing:  83%|████████▎ | 16667/20000 [3:39:10<2:49:00,  3.04s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16668/20000 [3:39:13<2:45:03,  2.97s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16669/20000 [3:39:15<2:26:31,  2.64s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16672/20000 [3:39:18<1:31:00,  1.64s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16673/20000 [3:39:18<1:12:14,  1.30s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16677/20000 [3:39:21<49:32,  1.12it/s]  invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16680/20000 [3:39:25<1:16:30,  1.38s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16683/20000 [3:39:29<1:05:23,  1.18s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  83%|████████▎ | 16684/20000 [3:39:3


❌ Error fetching: https://api.openalex.org/works/W4399362479
HTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out. (read timeout=30)


Processing:  84%|████████▎ | 16701/20000 [3:40:05<1:21:10,  1.48s/it]invalid pdf header: b'<!DOC'
EOF marker not found
Processing:  84%|████████▎ | 16702/20000 [3:40:08<1:49:31,  1.99s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  84%|████████▎ | 16703/20000 [3:40:09<1:26:25,  1.57s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  84%|████████▎ | 16705/20000 [3:40:12<1:36:25,  1.76s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  84%|████████▎ | 16706/20000 [3:40:13<1:14:29,  1.36s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  84%|████████▎ | 16709/20000 [3:40:15<57:34,  1.05s/it]  invalid pdf header: b'{"ind'
EOF marker not found
Processing:  84%|████████▎ | 16711/20000 [3:40:17<1:01:22,  1.12s/it]invalid pdf header: b'{"ind'
EOF marker not found
Processing:  84%|████████▎ | 16715/20000 [3:40:18<30:58,  1.77it/s]  invalid pdf header: b'<!doc'
EOF marker not found
Processing:  84%|████████▎ | 16717/20000 [3:40:1