In [1]:
import json
from pathlib import Path
import os
from dotenv import load_dotenv

load_dotenv()
INPUT_FILE = Path(os.getenv("ICCS_FINAL"))

def audit_unique_authors():
    unique_ids = set()
    total_authorships = 0
    missing_id_count = 0
    
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                rec = json.loads(line)
                # authorships contains the link between work and person
                authorships = rec.get('openalex_work', {}).get('authorships', [])
                
                for auth in authorships:
                    total_authorships += 1
                    a_id_raw = auth.get('author', {}).get('id')
                    
                    if a_id_raw:
                        # NORMALIZATION: "https://openalex.org/A123" -> "A123"
                        clean_id = str(a_id_raw).split('/')[-1].strip().upper()
                        unique_ids.add(clean_id)
                    else:
                        missing_id_count += 1
            except: continue

    print(f"Total Authorships (Seats) : {total_authorships:,}")
    print(f"Unique Author IDs (People): {len(unique_ids):,}")
    print(f"Missing Author IDs        : {missing_id_count}")
    
    return unique_ids

if __name__ == "__main__":
    gold_ids = audit_unique_authors()

Total Authorships (Seats) : 28,010
Unique Author IDs (People): 17,208
Missing Author IDs        : 153


In [None]:
import os
import json
import time
import requests
import logging
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

load_dotenv()

# Logger configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='author_profile.log'
)

# Configuration
INPUT_FILE = Path(os.getenv("ICCS_FINAL", "iccs_final.jsonl")) # Fallback added for testing
OUTPUT_FILE = Path(r"D:\ITMO Big Data & ML School\semester 3\RI3\notebooks\data\processed\iccs_author_profiles.jsonl")
EMAIL = os.getenv("OPENALEX_EMAIL", "nehalsonu4@gmail.com")
BATCH_SIZE = 50  # OpenAlex max filter size

# Session management
session = requests.Session()
retries = Retry(
    total=5,
    backoff_factor=0.5, # Reduced backoff for faster recovery
    status_forcelist=[429, 500, 502, 503, 504]
)
adapter = HTTPAdapter(pool_connections=20, pool_maxsize=20, max_retries=retries)
session.mount("https://", adapter)

def normalize_id(raw_id: str) -> str:
    """Standardizes author identifiers."""
    if not raw_id:
        return None
    return str(raw_id).split('/')[-1].strip().upper()

def get_unique_author_ids(path: Path) -> list:
    """Extracts unique author IDs from the source file."""
    authors = set()
    if not path.exists():
        logging.error(f"Input file not found: {path}")
        return []
        
    try:
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    rec = json.loads(line)
                    # Handle both direct list and openalex_work wrapper
                    authorships = rec.get('openalex_work', {}).get('authorships', []) or rec.get('authorships', [])
                    for auth in authorships:
                        a_id = normalize_id(auth.get('author', {}).get('id'))
                        if a_id:
                            authors.add(a_id)
                except json.JSONDecodeError:
                    continue
    except Exception as e:
        logging.error(f"Error reading {path}: {e}")
    return list(authors)

def get_already_fetched_ids(path: Path) -> set:
    """Retrieves IDs from existing output."""
    fetched = set()
    if path.exists():
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    data = json.loads(line)
                    a_id = normalize_id(data.get('id'))
                    if a_id:
                        fetched.add(a_id)
                except:
                    continue
    return fetched

def fetch_author_batch(batch_ids: list) -> list:
    """Retrieves up to 50 author profiles in a single request."""
    if not batch_ids:
        return []
        
    # Join IDs with pipe for OR filter
    ids_filter = "|".join(batch_ids)
    url = "https://api.openalex.org/authors"
    
    params = {
        "filter": f"openalex:{ids_filter}",
        "per-page": 50, # Critical: must match batch size
        "mailto": EMAIL
    }

    try:
        response = session.get(url, params=params, timeout=30)
        if response.status_code == 200:
            data = response.json()
            return data.get('results', [])
        elif response.status_code == 429:
            logging.warning("Rate limit hit. Sleeping...")
            time.sleep(2)
            return fetch_author_batch(batch_ids) # Simple recursive retry
        else:
            logging.error(f"Batch failed {response.status_code}: {response.text}")
            return []
    except Exception as e:
        logging.error(f"Request failed for batch: {e}")
        return []

def chunked_iterable(iterable, size):
    """Yield successive n-sized chunks from iterable."""
    for i in range(0, len(iterable), size):
        yield iterable[i:i + size]

def main():
    all_ids = get_unique_author_ids(INPUT_FILE)
    fetched_ids = get_already_fetched_ids(OUTPUT_FILE)
    
    remaining_ids = list(set(all_ids) - fetched_ids)
    
    print(f"Status: {len(all_ids)} Total | {len(fetched_ids)} Fetched | {len(remaining_ids)} Remaining")
    
    if not remaining_ids:
        print("Data is synchronized. No fetching required.")
        return

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    
    # Create batches
    batches = list(chunked_iterable(remaining_ids, BATCH_SIZE))
    
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as f_out:
        with ThreadPoolExecutor(max_workers=5) as executor:
            # We map over batches, not individual IDs
            futures = {executor.submit(fetch_author_batch, batch): batch for batch in batches}
            
            for future in tqdm(as_completed(futures), total=len(batches), desc="Batch Fetching", unit="batch"):
                results = future.result()
                if results:
                    for profile in results:
                        f_out.write(json.dumps(profile, ensure_ascii=False) + "\n")
                        
    print("Fetching complete.")

if __name__ == "__main__":
    main()


Status: 17208 Total | 10 Fetched | 17198 Remaining


Batch Fetching: 100%|██████████| 344/344 [22:24<00:00,  3.91s/batch]  


Fetching complete.


In [3]:
import os
import json
import time
import requests
import logging
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

load_dotenv()

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='author_cleanup.log' # New log file for this run
)

# Files
INPUT_FILE = Path(os.getenv("ICCS_FINAL", "iccs_final.jsonl"))
OUTPUT_FILE = Path(r"D:\ITMO Big Data & ML School\semester 3\RI3\notebooks\data\processed\iccs_author_profiles.jsonl")
EMAIL = os.getenv("OPENALEX_EMAIL", "nehalsonu4@gmail.com")

# Robust Session
session = requests.Session()
retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retries)
session.mount("https://", adapter)

def normalize_id(raw_id):
    if not raw_id: return None
    return str(raw_id).split('/')[-1].strip().upper()

def get_ids(path, is_output=False):
    ids = set()
    if not path.exists(): return ids
    try:
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    data = json.loads(line)
                    # Input file has structure inside 'authorships', Output is flat
                    if is_output:
                        val = data.get('id')
                    else:
                        # Scan all possible authors in the input line
                        authorships = data.get('openalex_work', {}).get('authorships', []) or data.get('authorships', [])
                        for auth in authorships:
                            a_id = normalize_id(auth.get('author', {}).get('id'))
                            if a_id: ids.add(a_id)
                        continue # Skip the direct val check for input
                    
                    if val: ids.add(normalize_id(val))
                except: continue
    except Exception as e:
        print(f"Error reading {path}: {e}")
    return ids

def main():
    print("Reading files...")
    all_ids = get_ids(INPUT_FILE, is_output=False)
    fetched_ids = get_ids(OUTPUT_FILE, is_output=True)
    
    # Calculate strictly what is missing
    remaining = list(all_ids - fetched_ids)
    print(f"Total: {len(all_ids)} | Fetched: {len(fetched_ids)} | Remaining: {len(remaining)}")
    
    if not remaining:
        print("Done!")
        return

    # Open in Append mode
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
        for auth_id in tqdm(remaining, desc="Final Cleanup"):
            url = f"https://api.openalex.org/authors/{auth_id}?mailto={EMAIL}"
            
            try:
                # 1. Polite delay
                time.sleep(0.6) 
                
                # 2. Request
                resp = session.get(url, timeout=10)
                
                if resp.status_code == 200:
                    data = resp.json()
                    f.write(json.dumps(data) + "\n")
                    f.flush() # Save immediately
                elif resp.status_code == 404:
                    logging.warning(f"Author {auth_id} not found (404)")
                elif resp.status_code == 429:
                    print("Hit 429. Sleeping 10s...")
                    time.sleep(10)
                else:
                    logging.error(f"Failed {auth_id}: {resp.status_code}")
                    
            except Exception as e:
                logging.error(f"Error fetching {auth_id}: {e}")

if __name__ == "__main__":
    main()


Reading files...
Total: 17208 | Fetched: 16057 | Remaining: 1151


Final Cleanup: 100%|██████████| 1151/1151 [16:07<00:00,  1.19it/s]


Fetching all papers of these aurthors using openalex 

In [11]:
import json
from pathlib import Path

# Path to your library of 17,207 authors
AUTHOR_PROFILES_PATH = Path(r"D:\ITMO Big Data & ML School\semester 3\RI3\notebooks\data\processed\iccs_author_profiles.jsonl")

def run_combined_audit():
    total_authors = 0
    total_expected_works = 0
    more_than_one = 0
    era_works_total = 0
    
    if not AUTHOR_PROFILES_PATH.exists():
        print(f"Error: File not found at {AUTHOR_PROFILES_PATH}")
        return

    with open(AUTHOR_PROFILES_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line)
                total_authors += 1
                
                # Part 1: General Distribution
                count = data.get('works_count', 0)
                total_expected_works += count
                if count >= 1:
                    more_than_one += 1
                
                # Part 2: Era-Specific (2001-2025) Audit [cite: 6, 67]
                yearly_stats = data.get('counts_by_year', [])
                for entry in yearly_stats:
                    year = entry.get('year')
                    if 2001 <= year <= 2025:
                        era_works_total += entry.get('works_count', 0)
                        
            except Exception as e:
                continue

    print(f"AUTHOR WORK DISTRIBUTION (OVERALL)")
    print("="*45)
    print(f"Total Authors Analyzed    : {total_authors:,}")
    print(f"Authors with >=1 Work      : {more_than_one:,}")
    print(f"Total Papers (All Time)   : {total_expected_works:,}")
    print(f"Percentage of Prolific    : {(more_than_one/total_authors)*100:.1f}%")
    
    print(f"\nERA-SPECIFIC AUDIT (2001-2025) [cite: 17, 67]")
    print("="*45)
    print(f"Estimated Era Papers      : {era_works_total:,}")
    reduction = ((total_expected_works - era_works_total) / total_expected_works) * 100 if total_expected_works > 0 else 0
    print(f"Reduction from Total      : {reduction:.1f}%")

if __name__ == "__main__":
    run_combined_audit()

AUTHOR WORK DISTRIBUTION (OVERALL)
Total Authors Analyzed    : 17,207
Authors with >=1 Work      : 17,207
Total Papers (All Time)   : 2,077,990
Percentage of Prolific    : 100.0%

ERA-SPECIFIC AUDIT (2001-2025) [cite: 17, 67]
Estimated Era Papers      : 1,885,872
Reduction from Total      : 9.2%


### Fetch ICCS Author Works

In [None]:
import asyncio
import math
import random
from pathlib import Path
from time import perf_counter
from urllib.parse import quote
import os
from dotenv import load_dotenv
import aiohttp
import pyarrow as pa
import pyarrow.parquet as pq
load_dotenv()

INPUT_FILE = Path(r"D:\ITMO Big Data & ML School\semester 3\RI3\Data\processed\pro_oa_iccs\iccs_author_profiles.jsonl")
OUT_DIR = Path(r"D:\ITMO Big Data & ML School\semester 3\RI3\data\openalex_works_full_parquet")
OUT_DIR.mkdir(parents=True, exist_ok=True)

EMAIL = os.getenv("OA_EMAIL")
API_KEY = os.getenv("OA_API_KEY")


BATCH_SIZE = 500
RESUME = True

FETCH_WORKERS = 15
CONN_LIMIT_PER_HOST = 15
MAX_RETRIES = 8

PER_PAGE = 200
Q_OUT_MAX = FETCH_WORKERS * 4

# CHECKS
if not API_KEY or API_KEY.strip() in {"", "PASTE_YOUR_REAL_OPENALEX_API_KEY_HERE"}:
    raise ValueError("API_KEY is empty/placeholder. Paste OpenAlex api_key.")
API_KEY = API_KEY.strip()
EMAIL = EMAIL.strip()

try:
    import orjson
except ImportError:
    orjson = None

def loads_line(line: str):
    if orjson:
        return orjson.loads(line)
    import json
    return json.loads(line)

def dumps_json(obj) -> str:
    # Store exact API object as JSON string in Parquet
    if orjson:
        return orjson.dumps(obj).decode("utf-8")
    import json
    return json.dumps(obj, ensure_ascii=False)


# PARQUET SCHEMA
# Keep the full work payload in one column as JSON text.
# This preserves the API structure exactly.
PARQUET_SCHEMA = pa.schema([
    ("author_id", pa.string()),
    ("work_id", pa.string()),
    ("work_json", pa.large_string()),
])

def to_rows(author_id: str, works: list[dict]) -> list[dict]:
    rows = []
    for w in works:
        rows.append({
            "author_id": author_id,
            "work_id": w.get("id"),
            "work_json": dumps_json(w),
        })
    return rows

# URL HELPERS
def add_param(url: str, key: str, value: str) -> str:
    v = quote(str(value), safe="")
    sep = "&" if "?" in url else "?"
    return f"{url}{sep}{key}={v}"

def build_works_url(works_api_url: str) -> str:
    """
    Build a URL that returns FULL Work objects (no select=...).
    Use cursor paging for robustness. :contentReference[oaicite:2]{index=2}
    """
    url = works_api_url
    url = add_param(url, "mailto", EMAIL)
    url = add_param(url, "api_key", API_KEY)
    url = add_param(url, "per-page", str(PER_PAGE))
    # cursor paging start
    url = add_param(url, "cursor", "*")
    return url

# HTTP FETCH WITH RETRIES (hard fail on auth)
async def fetch_json(session: aiohttp.ClientSession, url: str) -> dict | None:
    for attempt in range(MAX_RETRIES):
        try:
            async with session.get(url) as r:
                if r.status == 200:
                    return await r.json()

                if r.status in (401, 403):
                    txt = await r.text()
                    raise RuntimeError(f"AUTH ERROR {r.status}: {txt[:800]}")

                if r.status == 429:
                    delay = min(60, (2 ** attempt)) + random.random()
                    await asyncio.sleep(delay)
                    continue

                if 500 <= r.status < 600:
                    delay = min(30, (2 ** attempt)) + random.random()
                    await asyncio.sleep(delay)
                    continue

                return None

        except (aiohttp.ClientError, asyncio.TimeoutError):
            delay = min(30, (2 ** attempt)) + random.random()
            await asyncio.sleep(delay)

    return None

# FETCH ALL WORKS FOR ONE AUTHOR (cursor paging)
async def fetch_author_works_full(session: aiohttp.ClientSession, author: dict) -> dict | None:
    author_id = author.get("id")
    works_api_url = author.get("works_api_url")
    if not works_api_url:
        return None

    base = build_works_url(works_api_url)
    works = []

    cursor = "*"
    while True:
        url = add_param(base.split("&cursor=")[0], "cursor", cursor) if "&cursor=" in base else add_param(base, "cursor", cursor)
        data = await fetch_json(session, url)
        if not data:
            break

        results = data.get("results") or []
        if not results:
            break

        works.extend(results)

        # Cursor paging: next_cursor in meta :contentReference[oaicite:3]{index=3}
        meta = data.get("meta") or {}
        next_cursor = meta.get("next_cursor")
        if not next_cursor or next_cursor == cursor:
            break
        cursor = next_cursor

    if not works:
        return None

    return {"author_id": author_id, "works": works}

# WORKERS + WRITER
async def fetch_worker(session: aiohttp.ClientSession, q_in: asyncio.Queue, q_out: asyncio.Queue):
    while True:
        author = await q_in.get()
        if author is None:
            q_in.task_done()
            break
        try:
            result = await fetch_author_works_full(session, author)
            if result and result.get("works"):
                await q_out.put(result)
        finally:
            q_in.task_done()

async def parquet_writer_worker(q_out: asyncio.Queue, out_path: Path) -> int:
    writer = None
    rows_written = 0
    try:
        while True:
            item = await q_out.get()
            if item is None:
                q_out.task_done()
                break

            aid = item["author_id"]
            rows = to_rows(aid, item["works"])
            if rows:
                table = pa.Table.from_pylist(rows, schema=PARQUET_SCHEMA)
                if writer is None:
                    writer = pq.ParquetWriter(
                        str(out_path),
                        PARQUET_SCHEMA,
                        compression="zstd",
                        use_dictionary=True,
                    )
                writer.write_table(table)
                rows_written += len(rows)

            q_out.task_done()
    finally:
        if writer is not None:
            writer.close()
    return rows_written


# RUN ONE BATCH (atomic write: .tmp then rename)
async def run_batch(batch_idx: int, authors_batch: list[dict]) -> int:
    out_path = OUT_DIR / f"part-{batch_idx:05d}.parquet"
    tmp_path = OUT_DIR / f"part-{batch_idx:05d}.parquet.tmp"

    if RESUME and out_path.exists():
        print(f"[batch {batch_idx:05d}] exists -> skipping")
        return 0

    # If a previous run crashed mid-batch, remove tmp
    if tmp_path.exists():
        tmp_path.unlink()

    timeout = aiohttp.ClientTimeout(total=120, connect=10, sock_connect=10, sock_read=120)
    connector = aiohttp.TCPConnector(limit_per_host=CONN_LIMIT_PER_HOST, limit=0, ttl_dns_cache=300)
    headers = {
        "Accept": "application/json",
        "User-Agent": f"openalex-full-works-fetch (mailto:{EMAIL})",
    }

    q_in = asyncio.Queue()  # unbounded => producer never blocks
    q_out = asyncio.Queue(maxsize=Q_OUT_MAX)

    async with aiohttp.ClientSession(timeout=timeout, connector=connector, headers=headers) as session:
        # sanity: fail immediately if auth wrong
        test = await fetch_json(session, add_param(build_works_url(authors_batch[0]["works_api_url"]), "cursor", "*"))
        print(f"[batch {batch_idx:05d}] sanity ok: page results={len((test or {}).get('results') or [])}")

        writer_task = asyncio.create_task(parquet_writer_worker(q_out, tmp_path))
        workers = [asyncio.create_task(fetch_worker(session, q_in, q_out)) for _ in range(FETCH_WORKERS)]

        try:
            for a in authors_batch:
                q_in.put_nowait(a)
            for _ in workers:
                q_in.put_nowait(None)

            await q_in.join()
            await q_out.put(None)
            await q_out.join()

            await asyncio.gather(*workers)
            rows_written = await writer_task

            # atomic commit
            tmp_path.rename(out_path)
            return rows_written

        finally:
            # ensure cleanup on interruption
            for t in workers:
                t.cancel()
            writer_task.cancel()
            await asyncio.gather(*workers, return_exceptions=True)
            await asyncio.gather(writer_task, return_exceptions=True)

# MAIN
async def main():
    authors = []
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        for line in f:
            try:
                p = loads_line(line)
                if p.get("works_count", 0) > 0 and p.get("works_api_url"):
                    authors.append(p)
            except Exception:
                continue

    n = len(authors)
    nb = math.ceil(n / BATCH_SIZE)
    print(f"Authors: {n:,} | Batch size: {BATCH_SIZE} | Batches: {nb}")

    t0 = perf_counter()
    total_rows = 0

    for b in range(nb):
        s = b * BATCH_SIZE
        e = min((b + 1) * BATCH_SIZE, n)
        batch = authors[s:e]

        bt0 = perf_counter()
        rows_written = await run_batch(b, batch)
        dt = perf_counter() - bt0

        total_rows += rows_written
        print(f"[batch {b:05d}] authors {s:,}-{e-1:,} | rows {rows_written:,} | {dt/60:.1f} min")

    print(f"Done. Total rows written (new): {total_rows:,}. Total time: {(perf_counter()-t0)/60:.1f} min")
    print(f"Parquet parts in: {OUT_DIR.resolve()}")


await main()
#   asyncio.run(main())


Note: you may need to restart the kernel to use updated packages.
Authors: 17,207 | Batch size: 500 | Batches: 35
[batch 00000] sanity ok: page results=191
[batch 00000] authors 0-499 | rows 61,653 | 1.1 min
[batch 00001] sanity ok: page results=8
[batch 00001] authors 500-999 | rows 65,985 | 1.7 min
[batch 00002] sanity ok: page results=12
[batch 00002] authors 1,000-1,499 | rows 49,846 | 0.9 min
[batch 00003] sanity ok: page results=6
[batch 00003] authors 1,500-1,999 | rows 58,159 | 1.1 min
[batch 00004] sanity ok: page results=10
[batch 00004] authors 2,000-2,499 | rows 55,091 | 1.0 min
[batch 00005] sanity ok: page results=20
[batch 00005] authors 2,500-2,999 | rows 54,305 | 1.0 min
[batch 00006] sanity ok: page results=8
[batch 00006] authors 3,000-3,499 | rows 60,915 | 1.2 min
[batch 00007] sanity ok: page results=7
[batch 00007] authors 3,500-3,999 | rows 54,849 | 1.1 min
[batch 00008] sanity ok: page results=7
[batch 00008] authors 4,000-4,499 | rows 57,003 | 1.1 min
[batch 00

Exploring the parqueet

In [1]:
import duckdb
from pathlib import Path

PARQUET_DIR = Path(r"D:\ITMO Big Data & ML School\semester 3\RI3\data\openalex_works_full_parquet")

con = duckdb.connect()

# --- 1. Basic counts ---
stats = con.execute(f"""
    SELECT 
        COUNT(*) as total_rows,
        COUNT(DISTINCT author_id) as unique_authors,
        COUNT(DISTINCT work_id) as unique_works
    FROM read_parquet('{PARQUET_DIR}/*.parquet')
""").fetchone()

print(f"Total rows:      {stats[0]:,}")
print(f"Unique authors:  {stats[1]:,}")
print(f"Unique works:    {stats[2]:,}")

# --- 2. Look at one work_json to understand the structure ---
sample = con.execute(f"""
    SELECT work_json 
    FROM read_parquet('{PARQUET_DIR}/*.parquet')
    LIMIT 1
""").fetchone()[0]

import json
work = json.loads(sample)
print("\n--- Fields in work_json ---")
for key in work.keys():
    val = work[key]
    if isinstance(val, list):
        print(f"  {key}: list[{len(val)}]")
    elif isinstance(val, dict):
        print(f"  {key}: dict with keys {list(val.keys())}")
    elif isinstance(val, str) and len(val) > 80:
        print(f"  {key}: '{val[:80]}...'")
    else:
        print(f"  {key}: {val}")

# --- 3. Check what 'authorships' looks like inside work_json ---
print("\n--- Sample authorships ---")
for authorship in work.get("authorships", [])[:3]:
    print(json.dumps(authorship, indent=2))

Total rows:      2,024,311
Unique authors:  17,207
Unique works:    1,730,232

--- Fields in work_json ---
  id: https://openalex.org/W2567026504
  doi: https://doi.org/10.1016/j.petrol.2016.12.023
  title: An efficient two-scale hybrid embedded fracture model for shale gas simulation
  display_name: An efficient two-scale hybrid embedded fracture model for shale gas simulation
  publication_year: 2016
  publication_date: 2016-12-27
  ids: dict with keys ['openalex', 'doi', 'mag']
  language: en
  primary_location: dict with keys ['id', 'is_oa', 'landing_page_url', 'pdf_url', 'source', 'license', 'license_id', 'version', 'is_accepted', 'is_published', 'raw_source_name', 'raw_type']
  type: article
  indexed_in: list[1]
  open_access: dict with keys ['is_oa', 'oa_status', 'oa_url', 'any_repository_has_fulltext']
  authorships: list[2]
  institutions: list[0]
  countries_distinct_count: 1
  institutions_distinct_count: 2
  corresponding_author_ids: list[1]
  corresponding_institution_ids

====================================

#### Trajectories
Finding ICCS Conference papers in JOCS Journal or vice versa

In [None]:
import pandas as pd
import json
from difflib import SequenceMatcher

def load_jsonl(path):
    data = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            rec = json.loads(line)
            w = rec.get('openalex_work', {})
            # Extract basic fields
            authors = [a['author']['id'] for a in w.get('authorships', []) if 'author' in a]
            data.append({
                'id': w.get('id'),
                'title': w.get('display_name'),
                'year': w.get('publication_year'),
                'authors': set(authors),
                'citations': w.get('cited_by_count')
            })
    return pd.DataFrame(data)

# Paths (Aapke wale)
iccs_path = r"D:\ITMO Big Data & ML School\semester 3\RI3\Data\processed\pro_oa_iccs\iccs_final_complete_authors.jsonl"
jocs_path = r"D:\ITMO Big Data & ML School\semester 3\RI3\Data\processed\pro_oa_jocs\jocs_final_complete_authors.jsonl"

print("Loading data...")
iccs_df = load_jsonl(iccs_path)
jocs_df = load_jsonl(jocs_path)
print(f"Loaded {len(iccs_df)} ICCS papers and {len(jocs_df)} JoCS papers.")

# 2. Matching Logic
matches = []
for idx, iccs in iccs_df.iterrows():
    # Only look at JoCS papers published AFTER ICCS paper (up to 3 years)
    candidates = jocs_df[
        (jocs_df['year'] >= iccs['year']) & 
        (jocs_df['year'] <= iccs['year'] + 3)
    ]
    
    for _, jocs in candidates.iterrows():
        # Fast Check: Author Overlap
        if not iccs['authors'].intersection(jocs['authors']):
            continue
            
        # Slow Check: Title Similarity
        sim = SequenceMatcher(None, str(iccs['title']).lower(), str(jocs['title']).lower()).ratio()
        
        if sim > 0.6: # Threshold
            matches.append({
                'iccs_id': iccs['id'],
                'jocs_id': jocs['id'],
                'similarity': sim,
                'jocs_citations': jocs['citations']
            })

match_df = pd.DataFrame(matches).sort_values('similarity', ascending=False).drop_duplicates('iccs_id')
print(f"Found {len(match_df)} ICCS papers that went to JoCS.")


Loading data...
Loaded 8302 ICCS papers and 2077 JoCS papers.
Found 134 ICCS papers that went to JoCS.
