Concept: Measure the LLM research ecosystem using networks and NLP.
We will quantify what topics exist, how they evolve, who drives them, and which papers are ahead of the curve by combining citation and author networks with abstract/full-text analysis.

Why is it interesting? It turns a hyped space into a measurable structure. It can surface non-obvious leaders and emerging subfields.

Datasets: We queried 20k papers from OpenAlex, filtering them by keywords and year of publication.

Networks to analyze: 
- citation networks: paper->paper
- author collaboration network: authors->authors

Research questions or hypotheses:

- Evolution of LLMs in the 21st century. 
- What are the most hyped topics and why is it that way?  
- Are there other hidden directions this field is going into, that can be determined in advance?
- The most impactful/cited papers are the ones that bring the most value
- There are specific people in the AI community who write the right stuff and are the real innovators in the field



In [11]:
import os
import json
import requests
from tqdm import tqdm
import time

# Download up to MAX_RESULTS OpenAlex works whose title matches the LLM-related
# keywords below, page by page via cursor pagination, and save a trimmed record
# for each work to OUTPUT_FILE.

# SEARCH_QUERY = "(large language model OR language model OR pretrained language model OR foundation model OR transformer OR LLM OR agentic AI OR AI agent OR tool use OR retrieval augmented generation OR retrieval-augmented OR model context protocol)"
OUTPUT_FILE = "ai_papers.json"

MAX_RESULTS = 20000
PER_PAGE = 200

EMAIL = "s243873@student.dtu.dk"  # <-- REAL email, not example.com

BASE_URL = "https://api.openalex.org/works"

# Stop retrying a page after this many failures in a row, so a persistent
# error (bad filter, blocked client, network down) cannot loop forever.
MAX_CONSECUTIVE_FAILURES = 5

params = {
    "filter": (
        "from_publication_date:2000-01-01,"
        "open_access.is_oa:true,"
        "referenced_works_count:>0,"
        "title.search:(large language model|language model|pretrained language model|foundation model|transformer|LLM|agentic AI|AI agent|tool use|retrieval augmented generation|retrieval-augmented|model context protocol)"
    ),
    "per-page": PER_PAGE,
    "cursor": "*",
    "mailto": EMAIL
}


headers = {
    "User-Agent": f"DTU MSc research (contact: {EMAIL})",
    "Accept": "application/json",
}


def collect_institutions(authorships):
    """Return the distinct institutions found across a work's authorships.

    Institutions are de-duplicated by their "id" (falling back to
    "display_name") and returned in first-seen order.
    """
    distinct = {}
    for authorship in authorships or []:
        for inst in authorship.get("institutions") or []:
            key = inst.get("id") or inst.get("display_name")
            if key is not None and key not in distinct:
                distinct[key] = inst
    return list(distinct.values())


all_papers = []

# Sanity probe: tiny request first so you see the real 403 message if any
probe = requests.get(f"{BASE_URL}?sample=1", headers=headers, params={"mailto": EMAIL}, timeout=20)
probe.raise_for_status()

consecutive_failures = 0

while len(all_papers) < MAX_RESULTS:
    try:
        r = requests.get(BASE_URL, params=params, headers=headers, timeout=60)
        if r.status_code == 403:
            # Print server message to understand why (often user-agent/mailto)
            print("403 body:", r.text[:500])
        r.raise_for_status()
        data = r.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException also covers timeouts and connection errors;
        # ValueError covers a body that is not valid JSON.
        consecutive_failures += 1
        print("HTTP error:", e)
        failed_response = getattr(e, "response", None)
        print("Body preview:", getattr(failed_response, "text", "")[:500])
        if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
            print(f"Giving up after {consecutive_failures} consecutive failures.")
            break
        # brief, growing backoff; if it's a WAF hiccup, try again
        time.sleep(3 * consecutive_failures)
        continue

    consecutive_failures = 0

    results = data.get("results", [])
    if not results:
        break

    for item in results:
        authorships = item.get("authorships", [])
        all_papers.append({
            "title": item.get("title"),
            "url": item.get("id"),
            "year": item.get("publication_year"),
            "authors": authorships,
            "referenced_by_number": item.get("cited_by_count"),
            "type": item.get("type"),
            "references": item.get("referenced_works", []),
            # NOTE(review): OpenAlex appears to list institutions per authorship
            # rather than as a top-level work field; confirm against the Work
            # object docs. A top-level value is still preferred if present.
            "institutions": item.get("institutions") or collect_institutions(authorships),
        })
        if len(all_papers) >= MAX_RESULTS:
            break

    # cursor paging
    next_cursor = data.get("meta", {}).get("next_cursor")
    if not next_cursor:
        break
    params["cursor"] = next_cursor

    # be polite
    time.sleep(0.2)

print(f"Downloaded: {len(all_papers)}")

# Whatever was collected is saved, even if the loop stopped early.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_papers, f, indent=2)

print("Saved:", OUTPUT_FILE)


Downloaded: 20000
Saved: ai_papers.json


In [2]:
import os
import json
import requests
from tqdm import tqdm
from pypdf import PdfReader
from io import BytesIO

# Paper index to read; the loading code later in this notebook opens it with json.load.
INPUT_JSON = "ai_papers.json"       # <-- the file you already downloaded
# Directory that receives the per-paper "<paper_id>.json" files; created up front.
OUTPUT_DIR = "papers_full"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Contact address sent as the "mailto" query parameter and inside the User-Agent.
EMAIL = "s243873@student.dtu.dk"    # required by OpenAlex
# Headers attached to the HTTP requests issued by the fetch helpers below.
HEADERS = {
    "User-Agent": f"DTU MSc Research (contact: {EMAIL})",
    "Accept": "application/json"
}

def fetch_full_metadata(openalex_url):
    """Fetch a single OpenAlex record and return the decoded JSON.

    Args:
        openalex_url: URL of the record on the OpenAlex API.

    Returns:
        The decoded JSON payload, or None when the request fails, the server
        answers with an error status, or the response is not JSON. Failures
        are printed, never raised, so a worker thread can simply skip the paper.
    """
    try:
        r = requests.get(openalex_url, headers=HEADERS, params={"mailto": EMAIL}, timeout=30)

        # An error status can still carry a JSON body (e.g. a 404 error object);
        # without this check it would be returned as if it were a work record.
        if not r.ok:
            print("\n❌ Error fetching:", openalex_url)
            print(f"HTTP {r.status_code}:", r.text[:200])
            return None

        # If response isn't JSON, log it
        if "application/json" not in r.headers.get("Content-Type", ""):
            print("\n⚠️ Non-JSON response from:", openalex_url)
            print("Response preview:", r.text[:200])
            return None

        return r.json()

    except Exception as e:
        # Deliberately broad: this is the best-effort boundary for a worker
        # thread, and every failure is logged before the paper is skipped.
        print("\n❌ Error fetching:", openalex_url)
        print(e)
        return None

def extract_pdf_text(pdf_url):
    """Download a PDF and return the text of all its pages joined by newlines.

    Args:
        pdf_url: URL expected to serve a PDF document.

    Returns:
        The extracted text (pages with no extractable text contribute an empty
        string), or None when the download does not return HTTP 200 or the
        document cannot be fetched or parsed.
    """
    try:
        r = requests.get(pdf_url, timeout=30, headers=HEADERS)
        if r.status_code != 200:
            return None
        reader = PdfReader(BytesIO(r.content))
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception:
        # Best effort: network and PDF-parsing failures just mean "no full text".
        # Unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit
        # propagate so the run can still be interrupted.
        return None

def decode_abstract(inv_index):
    """Rebuild plain text from an inverted index mapping word -> positions.

    Returns None for an empty or missing index. Words are placed at each of
    their positions and joined with single spaces in ascending position order;
    if two words claim the same position, the one iterated last wins.
    """
    if not inv_index:
        return None

    word_at = {
        position: token
        for token, positions in inv_index.items()
        for position in positions
    }
    ordered_tokens = [token for _, token in sorted(word_at.items())]
    return " ".join(ordered_tokens)

def clean_text(text):
    """Return *text* with surrogate code points removed; None passes through.

    The string is encoded to UTF-8 with surrogates allowed, then decoded while
    ignoring the byte sequences that are not valid UTF-8.
    """
    if text is not None:
        encoded = text.encode("utf-8", "surrogatepass")
        return encoded.decode("utf-8", "ignore")
    return None


In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Number of threads in the pool that runs process_paper concurrently.
MAX_WORKERS = 5  # don't go higher, OpenAlex will rate limit

def process_paper(p):
    """Fetch metadata, abstract and full text for one paper from the index.

    Args:
        p: A paper record with a "url" entry holding the OpenAlex work URL.

    Returns:
        {"paper_id": ..., "data": {...}} ready to be written to disk, or None
        when the paper is already saved, its metadata cannot be fetched, or
        neither an abstract nor any full text is available.
    """
    paper_id = p["url"].split("/")[-1]
    target_path = os.path.join(OUTPUT_DIR, f"{paper_id}.json")

    # Files of 100 bytes or less are treated as incomplete and fetched again.
    if os.path.exists(target_path) and os.path.getsize(target_path) > 100:
        print("Already downloaded:", target_path)
        return None

    api_url = p["url"].replace("https://openalex.org/", "https://api.openalex.org/works/")
    full = fetch_full_metadata(api_url)
    # A payload without an "id" is not a usable work record (e.g. an error body).
    if not full or not full.get("id"):
        return None

    # `or {}` / `or []` guard against fields that are present but null.
    pdf_url = (full.get("open_access") or {}).get("oa_url")
    full_text = extract_pdf_text(pdf_url) if pdf_url else None
    abstract = decode_abstract(full.get("abstract_inverted_index"))

    if abstract is None and full_text is None:
        return None

    authors = []
    for authorship in full.get("authorships") or []:
        author = authorship.get("author") or {}
        authors.append({
            "id": author.get("id"),
            "name": author.get("display_name"),
            "affiliations": [
                inst.get("display_name")
                for inst in authorship.get("institutions") or []
            ],
        })

    return {
        "paper_id": full["id"].split("/")[-1],
        "data": {
            "title": full.get("title"),
            "url": full.get("id"),
            "year": full.get("publication_year"),
            "authors": authors,
            "references": full.get("referenced_works", []),
            "abstract": clean_text(abstract),
            "full_text": clean_text(full_text),
        },
    }

# Load the paper index, process every paper on a thread pool, and write one
# JSON file per successfully processed paper into OUTPUT_DIR.
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    papers = json.load(f)

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Map each future back to its paper so a failure can be reported by URL.
    futures = {executor.submit(process_paper, p): p for p in papers}

    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
        try:
            result = future.result()
        except Exception as e:
            # One failing paper must not abort the whole run; log it and move on.
            failed = futures[future]
            failed_url = failed.get("url") if isinstance(failed, dict) else failed
            print("\n❌ Error processing:", failed_url)
            print(e)
            continue

        # None means "nothing to write" (already saved, fetch failed, no text).
        if not result:
            continue

        paper_id = result["paper_id"]
        output = result["data"]

        with open(os.path.join(OUTPUT_DIR, f"{paper_id}.json"), "w", encoding="utf-8") as out:
            json.dump(output, out, indent=4, ensure_ascii=False)


### Adding more data to the json papers

In [None]:
# ### Cleaning step
# import os
# import json
# from pathlib import Path

# INPUT_DIR = Path("papers_full")
# FIELDS_TO_REMOVE = ["topic", "subfield", "field", "domain"]

# def clean_json_file(file_path: Path):
#     try:
#         with file_path.open("r", encoding="utf-8") as f:
#             data = json.load(f)
#     except Exception as e:
#         print(f"Skipping (cannot read): {file_path.name}   Error: {e}")
#         return False

#     original = data.copy()

#     # remove unwanted fields
#     for key in FIELDS_TO_REMOVE:
#         if key in data:
#             data.pop(key)

#     # if nothing was removed, skip rewriting
#     if data == original:
#         return False

#     try:
#         with file_path.open("w", encoding="utf-8") as f:
#             json.dump(data, f, indent=2, ensure_ascii=False)
#         return True
#     except Exception as e:
#         print(f"Failed writing file {file_path.name}: {e}")
#         return False


# def main():
#     json_files = list(INPUT_DIR.glob("*.json"))
#     print(f"Scanning {len(json_files)} JSON files...")

#     removed_count = 0
#     for file in json_files:
#         if clean_json_file(file):
#             removed_count += 1
#             print(f"✓ Cleaned: {file.name}")
#         else:
#             print(f"• No change: {file.name}")

#     print(f"\nDone. Updated {removed_count} JSON files.")

# if __name__ == "__main__":
#     main()



Scanning 17891 JSON files...
✓ Cleaned: W1021950341.json
✓ Cleaned: W1043896296.json
✓ Cleaned: W1056559112.json
✓ Cleaned: W107376459.json
✓ Cleaned: W1111699446.json
✓ Cleaned: W1114923635.json
✓ Cleaned: W114000222.json
✓ Cleaned: W1162638878.json
✓ Cleaned: W120300249.json
✓ Cleaned: W1274133664.json
✓ Cleaned: W1276242696.json
✓ Cleaned: W128136426.json
✓ Cleaned: W128830652.json
✓ Cleaned: W129513086.json
✓ Cleaned: W137288188.json
✓ Cleaned: W140271714.json
✓ Cleaned: W14090282.json
✓ Cleaned: W14230040.json
✓ Cleaned: W142909730.json
✓ Cleaned: W144947839.json
✓ Cleaned: W1464370620.json
✓ Cleaned: W1479894935.json
✓ Cleaned: W1480579386.json
✓ Cleaned: W1481200330.json
✓ Cleaned: W1483146833.json
✓ Cleaned: W1483277231.json
✓ Cleaned: W1483642010.json
✓ Cleaned: W1485069053.json
✓ Cleaned: W1485156179.json
✓ Cleaned: W1485237120.json
✓ Cleaned: W1485633403.json
✓ Cleaned: W1486723856.json
✓ Cleaned: W1490175560.json
✓ Cleaned: W1490716443.json
✓ Cleaned: W1491181815.json
✓ Cle

In [5]:
import os
import json
import time
import math
import random
import concurrent.futures as cf
from pathlib import Path

import requests

# ---- config ---------------------------------------------------------------

# Directory whose *.json files are scanned and updated in place by main().
INPUT_DIR = Path("papers_full")  # folder with your per-paper JSONs
# Contact address sent as the "mailto" query parameter and inside the User-Agent.
EMAIL = "s243873@student.dtu.dk"

# Concurrency: keep this modest; OpenAlex is friendly but rate-limited.
# Used as the thread-pool size in main().
MAX_WORKERS = 5

# When False, files for which needs_update() finds nothing missing are skipped;
# set to True to re-fetch and rewrite every file.
FORCE = False

# Headers and query parameters attached to every request made by http_get_json().
HEADERS = {
    "User-Agent": f"DTU MSc Research (contact: {EMAIL})",
    "Accept": "application/json",
}
PARAMS = {"mailto": EMAIL}

# Backoff settings for transient errors / 429s
# MAX_RETRIES is the number of attempts per URL; the wait after attempt k is
# BASE_SLEEP * 2**(k - 1) plus a small random jitter.
MAX_RETRIES = 5
BASE_SLEEP = 1.5  # seconds

# ---- helpers --------------------------------------------------------------

def http_get_json(url: str, *, session: requests.Session) -> dict | None:
    """GET *url* and return the decoded JSON body, retrying with backoff.

    Args:
        url: Address to fetch; HEADERS and PARAMS are sent with every attempt.
        session: Session used to issue the request.

    Returns:
        The decoded JSON payload, or None when the response is not JSON, the
        body cannot be decoded, or all MAX_RETRIES attempts fail.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = session.get(url, headers=HEADERS, params=PARAMS, timeout=30)
            # 429/502/503/504 are treated as rate/availability signals: skip
            # straight to the backoff below and try again.
            if r.status_code not in (429, 502, 503, 504):
                r.raise_for_status()
                if "application/json" not in r.headers.get("Content-Type", ""):
                    # Not JSON; give up on this item
                    return None
                return r.json()
        except requests.RequestException:
            # Network error or HTTP error status: fall through to the backoff.
            pass
        except ValueError:
            # Body advertised as JSON but undecodable (raised as a plain
            # ValueError by older requests releases); give up on this item.
            return None

        # Exponential backoff with jitter, skipped after the final attempt so a
        # failed URL does not hold its worker for one more full backoff period.
        if attempt < MAX_RETRIES:
            time.sleep(BASE_SLEEP * (2 ** (attempt - 1)) + random.uniform(0, 0.4))
    return None


def pick_top_concept(concepts: list[dict]) -> dict | None:
    """Pick the highest-score concept from the work’s concepts."""
    if not concepts:
        return None
    return max(concepts, key=lambda c: (c.get("score") or 0.0))


def concept_levels(concept_json: dict) -> dict:
    """
    Map levels 0, 1 and 2 to the display name of the first ancestor found at
    that level in concept_json["ancestors"].

    Levels without a matching ancestor stay None; ancestors at any other level
    (or without a level) are ignored. The ancestors may come in any order.
    """
    levels = dict.fromkeys((0, 1, 2))
    for ancestor in concept_json.get("ancestors") or []:
        level = ancestor.get("level")
        if level not in levels:
            continue
        # Only the first non-None name seen for a level is kept.
        if levels[level] is None:
            levels[level] = ancestor.get("display_name")
    return levels

def extract_classification_from_work(work_json: dict):
    """
    Derive a topic and a ranked concept list from a work's "concepts".

    Args:
        work_json: Work record; its "concepts" entry is a list of dicts with
            "display_name" and "score".

    Returns:
        (topic, concepts) where
        - topic: display name of the highest-score concept, or None when the
          work has no concepts
        - concepts: list of {"name": ..., "score": ...} dicts sorted by score,
          highest first (empty when the work has no concepts)
    """
    concepts = work_json.get("concepts") or []
    if not concepts:
        return (None, [])

    # Sort by score descending. `or 0` (rather than a .get default) also covers
    # a score that is present but null, which would otherwise make the
    # comparison raise TypeError; pick_top_concept treats scores the same way.
    sorted_concepts = sorted(concepts, key=lambda c: c.get("score") or 0, reverse=True)

    topic = sorted_concepts[0].get("display_name")
    concept_list = [
        {"name": c.get("display_name"), "score": c.get("score")}
        for c in sorted_concepts
    ]

    return topic, concept_list



def needs_update(payload: dict) -> bool:
    """Tell whether a per-paper JSON still lacks its classification data.

    Returns True when FORCE is set, or when any key written by
    update_one_json ("topic", "concepts") is absent from *payload*.
    """
    if FORCE:
        return True
    # Check exactly the keys update_one_json writes. Checking keys that are
    # never written would make every file look outdated on every run.
    written_keys = ("topic", "concepts")
    return any(key not in payload for key in written_keys)


def update_one_json(json_path: Path) -> tuple[str, bool, str | None]:
    """
    Add classification data to one per-paper JSON file, in place.

    The file is read, checked with needs_update(), and - if an update is due -
    the work is fetched again and "topic" and "concepts" are stored in it.

    Returns (filename, updated?, error_message); error_message is None unless
    reading, fetching or writing failed, or the file has no usable "url".
    """
    fname = json_path.name

    try:
        raw = json_path.read_text(encoding="utf-8")
        data = json.loads(raw)
    except Exception as e:
        return (fname, False, f"read_error: {e}")

    if not isinstance(data, dict):
        return (fname, False, "malformed_json")

    if not needs_update(data):
        return (fname, False, None)

    work_url = data.get("url")
    if not work_url:
        return (fname, False, "missing_url")

    # Point the web URL at the API host before fetching.
    api_url = work_url.replace("https://openalex.org/", "https://api.openalex.org/")

    session = requests.Session()
    try:
        work_json = http_get_json(api_url, session=session)
    finally:
        session.close()

    if not work_json:
        return (fname, False, "work_fetch_failed")

    topic, concepts = extract_classification_from_work(work_json)
    data["topic"] = topic
    data["concepts"] = concepts

    # Serialise and write back; any failure is reported, not raised.
    try:
        serialised = json.dumps(data, ensure_ascii=False, indent=2)
        json_path.write_text(serialised, encoding="utf-8")
    except Exception as e:
        return (fname, False, f"write_error: {e}")
    return (fname, True, None)


# ---- main -----------------------------------------------------------------

def main():
    """Run update_one_json over every JSON file in INPUT_DIR on a thread pool.

    Prints one status line per file (in sorted file order) and a final
    summary of how many files were updated or failed.
    """
    candidates = [entry for entry in INPUT_DIR.glob("*.json") if entry.is_file()]
    candidates.sort()
    if not candidates:
        print(f"No JSON files found in {INPUT_DIR}")
        return

    tally = {"updated": 0, "errors": 0}

    with cf.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        for outcome in pool.map(update_one_json, candidates):
            fname, did_update, err = outcome
            if err:
                tally["errors"] += 1
                status_line = f"✗ {fname}: {err}"
            elif did_update:
                tally["updated"] += 1
                status_line = f"✓ {fname}: updated"
            else:
                status_line = f"• {fname}: skipped"
            print(status_line)

    print(f"\nDone. Updated: {tally['updated']}, Errors: {tally['errors']}, Total: {len(candidates)}")

if __name__ == "__main__":
    main()


• W1021950341.json: skipped
• W1043896296.json: skipped
• W1056559112.json: skipped
• W107376459.json: skipped
• W1111699446.json: skipped
• W1114923635.json: skipped
• W114000222.json: skipped
• W1162638878.json: skipped
• W120300249.json: skipped
• W1274133664.json: skipped
• W1276242696.json: skipped
• W128136426.json: skipped
• W128830652.json: skipped
• W129513086.json: skipped
• W137288188.json: skipped
• W140271714.json: skipped
• W14090282.json: skipped
• W14230040.json: skipped
• W142909730.json: skipped
• W144947839.json: skipped
• W1464370620.json: skipped
• W1479894935.json: skipped
• W1480579386.json: skipped
• W1481200330.json: skipped
• W1483146833.json: skipped
• W1483277231.json: skipped
• W1483642010.json: skipped
• W1485069053.json: skipped
• W1485156179.json: skipped
• W1485237120.json: skipped
• W1485633403.json: skipped
• W1486723856.json: skipped
• W1490175560.json: skipped
• W1490716443.json: skipped
• W1491181815.json: skipped
• W1491259378.json: skipped
• W149

KeyboardInterrupt: 

### Appending institutions to the individual json papers

In [None]:
import json
from pathlib import Path

# Index file read by main(); the "institutions" value of each record is the
# data copied into the per-paper files.
AI_PAPERS_FILE = Path("ai_papers.json")
# Directory whose *.json files main() updates in place.
PAPERS_DIR = Path("papers_full")

def extract_id(url: str | None) -> str | None:
    if not url:
        return None
    return url.strip().split("/")[-1]

def main():
    """Copy "institutions" from ai_papers.json into each per-paper JSON file.

    Records are matched by exact URL first, then by the ID taken from the URL
    (or from the file name when the per-paper file has no URL). Files without
    a match are counted and left untouched.

    Raises:
        ValueError: if ai_papers.json does not contain a list.
    """
    # 1) Index the institutions of every record by URL and by ID
    print(f"Loading {AI_PAPERS_FILE}...")
    ai_data = json.loads(AI_PAPERS_FILE.read_text(encoding="utf-8"))

    if not isinstance(ai_data, list):
        raise ValueError("ai_papers.json must be a list of paper objects")

    inst_by_url = {}
    inst_by_id = {}

    for record in ai_data:
        record_url = record.get("url")
        record_inst = record.get("institutions", [])
        if not record_url:
            continue
        inst_by_url[record_url] = record_inst
        record_id = extract_id(record_url)
        if record_id:
            inst_by_id[record_id] = record_inst

    print(f"Indexed {len(inst_by_url)} papers from ai_papers.json")

    # 2) Visit every per-paper file and set its "institutions"
    paper_files = sorted(p for p in PAPERS_DIR.glob("*.json") if p.is_file())
    if not paper_files:
        print(f"No JSON files found in {PAPERS_DIR}")
        return

    counts = {"updated": 0, "missing": 0}

    for path in paper_files:
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except Exception as e:
            print(f"✗ {path.name}: read_error: {e}")
            continue

        if not isinstance(data, dict):
            print(f"✗ {path.name}: malformed_json")
            continue

        url = data.get("url")
        # Without a URL, fall back to the file name as the ID.
        pid = extract_id(url) if url else path.stem

        # Exact URL match wins; otherwise try the ID.
        if url and url in inst_by_url:
            institutions = inst_by_url[url]
        elif pid and pid in inst_by_id:
            institutions = inst_by_id[pid]
        else:
            institutions = None

        if institutions is None:
            counts["missing"] += 1
            continue

        data["institutions"] = institutions

        try:
            path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
        except Exception as e:
            print(f"✗ {path.name}: write_error: {e}")
        else:
            counts["updated"] += 1

    print(f"\nDone. Updated: {counts['updated']}, No match: {counts['missing']}, Total files: {len(paper_files)}")

if __name__ == "__main__":
    main()
