DOWNLOADING THE PDFS FOR ICCS & JOCS

# ICCS

ICCS 2001-2009

In [None]:
import os, re, json, time, requests
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from dotenv import load_dotenv


# Read the .env file so the paths below can be configured per machine.
load_dotenv()

# Fail early with a readable message; Path(None) would otherwise raise an
# opaque TypeError when one of these variables is not set.
_missing_env = [name for name in ("DBLP_DIR", "ICCS_PDF_DIR", "DATA_DIR") if not os.getenv(name)]
if _missing_env:
    raise RuntimeError(f"Missing environment variable(s): {', '.join(_missing_env)}")

DBLP_DIR = Path(os.getenv("DBLP_DIR"))
INPUT_JSONL = DBLP_DIR / "interim" / "iccs" / "iccs_dblp_dois.jsonl"
OUTDIR = Path(os.getenv("ICCS_PDF_DIR"))
LOG_PATH = Path(os.getenv("DATA_DIR")) / "logs" / "iccs_download_status.csv"
WORKERS = 4  # number of concurrent download threads

OUTDIR.mkdir(parents=True, exist_ok=True)
LOG_PATH.parent.mkdir(parents=True, exist_ok=True)


print("Loading data...")
df = pd.read_json(INPUT_JSONL, lines=True)
df['year'] = pd.to_numeric(df['year'], errors='coerce')
# Focus on ICCS Springer years (2001-2009). Unparsable years are NaN and
# fail both comparisons, so they drop out here as well.
df = df[(df['year'] >= 2001) & (df['year'] <= 2009)].dropna(subset=['doi']).copy()
# to_numeric leaves a float column whenever a year was missing; cast back to
# int so file and folder names read "2001", not "2001.0".
df['year'] = df['year'].astype(int)
# Normalise DOIs before de-duplicating: the status log stores stripped DOIs,
# so the resume check below must compare against stripped values too.
df['doi'] = df['doi'].astype(str).str.strip()
df = df[df['doi'] != ""].drop_duplicates(subset=['doi'])

# Resume: drop every DOI the status log already records as downloaded.
# The size check avoids pandas' EmptyDataError on a zero-byte log file.
if LOG_PATH.exists() and LOG_PATH.stat().st_size > 0:
    log_df = pd.read_csv(LOG_PATH)
    done_dois = set(log_df.loc[log_df['outcome'] == 'downloaded', 'doi'].astype(str).str.strip())
    df = df[~df['doi'].isin(done_dois)]

# One (doi, year) record per paper still to fetch.
tasks = df[['doi', 'year']].to_records(index=False)
print(f"Tasks to process: {len(tasks)}")

# DOWNLOAD
def download_paper(doi, year):
    """Download one paper's PDF into ``OUTDIR/<year>/`` and report the outcome.

    Args:
        doi: DOI string; surrounding whitespace is ignored.
        year: Publication year. Coerced to ``int`` so a float or NumPy value
            such as ``2001.0`` does not leak into directory and file names.

    Returns:
        A dict with ``doi`` and ``outcome`` (``"skipped"``, ``"downloaded"``
        or ``"failed"``). A download also carries the ``url`` that worked; a
        failure carries an ``error`` string describing the last attempt.
    """
    doi = doi.strip()
    year = int(year)
    # Filename: '/' and ':' are replaced because they are not safe in file names.
    fname = f"{year}_{doi.replace('/', '_').replace(':', '_')}.pdf"
    fpath = OUTDIR / str(year) / fname
    fpath.parent.mkdir(parents=True, exist_ok=True)

    # Skip if a file larger than 30000 bytes is already present (smaller files
    # are fetched again -- presumably to weed out stubs; confirm the threshold).
    if fpath.exists() and fpath.stat().st_size > 30000:
        return {"doi": doi, "outcome": "skipped"}

    # Write to a sibling ".part" file and rename it into place afterwards, so
    # an interrupted write never leaves a truncated file under the final name.
    # A leftover ".part" file is harmless: the next attempt overwrites it.
    tmp_path = fpath.with_name(fpath.name + ".part")

    # Try Springer Direct first, then DOI resolver
    urls = [f"https://link.springer.com/content/pdf/{doi}.pdf", f"https://doi.org/{doi}"]

    last_error = ""
    for url in urls:
        try:
            r = requests.get(url, timeout=30, headers={"User-Agent": "Mozilla/5.0"}, allow_redirects=True)
            if r.status_code != 200:
                last_error = f"HTTP {r.status_code} from {url}"
            elif r.content[:4] != b"%PDF":
                # A 200 response can still be an HTML landing or login page.
                last_error = f"non-PDF response from {url}"
            else:
                tmp_path.write_bytes(r.content)
                tmp_path.replace(fpath)
                return {"doi": doi, "outcome": "downloaded", "url": url}
        except Exception as exc:
            # Best effort: any failure moves on to the next URL, but the
            # reason is kept instead of being discarded.
            last_error = f"{type(exc).__name__}: {exc}"
    return {"doi": doi, "outcome": "failed", "error": last_error}

# EXECUTION
import csv  # local to this cell: only needed for writing the status log

# Every outcome dict is kept in memory and also appended to the status log,
# which the resume step reads on the next run.
results = []
with ThreadPoolExecutor(max_workers=WORKERS) as executor, \
        open(LOG_PATH, 'a', encoding='utf-8', newline='') as log_file:
    # Open the log once for the whole run. csv.writer quotes fields that
    # contain commas, so the file stays parseable by pd.read_csv.
    log_writer = csv.writer(log_file, lineterminator="\n")
    if log_file.tell() == 0:
        log_writer.writerow(["doi", "outcome", "url"])

    future_to_doi = {executor.submit(download_paper, t[0], t[1]): t[0] for t in tasks}

    for future in tqdm(as_completed(future_to_doi), total=len(tasks), desc="Downloading"):
        try:
            res = future.result()
        except Exception as exc:
            # One crashing worker must not abort the whole run; record the
            # paper as failed and carry on with the rest.
            failed_doi = str(future_to_doi[future]).strip()
            print(f" [!] Unexpected error for {failed_doi}: {exc}")
            res = {"doi": failed_doi, "outcome": "failed"}
        results.append(res)

        # Log, flushing after each row so progress survives an interruption.
        log_writer.writerow([res['doi'], res['outcome'], res.get('url', '')])
        log_file.flush()

print(f"Finished. Check your folder: {OUTDIR.absolute()}")


ICCS 2010-2017

In [None]:
load_dotenv()

ELS_API_KEY = os.getenv("ELS_API_KEY")
PDF_DIR = os.getenv("ICCS_PDF_DIR")

# Explicit checks rather than assert: asserts are stripped under `python -O`,
# and DBLP_DIR was previously not checked at all (Path(None) -> TypeError).
for _name, _value in (
    ("ELS_API_KEY", ELS_API_KEY),
    ("ICCS_PDF_DIR", PDF_DIR),
    ("DBLP_DIR", os.getenv("DBLP_DIR")),
):
    if not _value:
        raise RuntimeError(f"{_name} missing")

DBLP_DIR = Path(os.getenv("DBLP_DIR"))
IN_PATH = DBLP_DIR / "interim" / "iccs" / "iccs_dblp_dois.jsonl"
BASE_OUT_DIR = Path(PDF_DIR)
BASE_OUT_DIR.mkdir(parents=True, exist_ok=True)

BASE_URL = "https://api.elsevier.com/content/article/doi/"
session = requests.Session()

print("Scanning input file to calculate total tasks")
tasks = []
with IN_PATH.open("r", encoding="utf-8") as f:
    for line in f:
        try:
            rec = json.loads(line)
            year = int(rec.get("year", 0))
        except (ValueError, TypeError, AttributeError):
            # Blank or malformed line, non-object record, or unparsable year.
            continue
        doi = rec.get("doi_normalized")
        # Only records from 2010-2017 with a "10.1016/j.procs." DOI are kept.
        if isinstance(doi, str) and 2010 <= year <= 2017 and doi.startswith("10.1016/j.procs."):
            tasks.append(rec)

print(f"Total papers to process: {len(tasks)}")

# Sequential download with a progress bar.
for rec in tqdm(tasks, desc="Downloading ICCS Papers"):
    doi = rec.get("doi_normalized")
    # Normalise through int so a value like 2010.0 still yields "2010".
    year = str(int(rec.get("year")))

    # Folder: one sub-directory per year.
    year_dir = BASE_OUT_DIR / year
    year_dir.mkdir(parents=True, exist_ok=True)

    fname = f"{year}_{doi.replace('/', '_')}.pdf"
    fpath = year_dir / fname

    if fpath.exists():
        continue

    try:
        # No stream=True: the body is needed in full straight away, and a
        # non-streamed request always releases its connection to the pool.
        r = session.get(
            BASE_URL + doi,
            headers={"X-ELS-APIKey": ELS_API_KEY, "Accept": "application/pdf"},
            timeout=60,
        )
    except requests.exceptions.RequestException as e:
        # ChunkedEncodingError is a RequestException subclass, so it is
        # covered here as well.
        print(f" [!] Network Error: {doi} - {e}")
        time.sleep(5)
        continue

    if r.status_code != 200:
        print(f" [!] Failed {doi}: HTTP {r.status_code}")
    elif r.content[:4] == b"%PDF":
        fpath.write_bytes(r.content)
    else:
        # Previously silent: a 200 response whose body is not a PDF.
        print(f" [!] Failed {doi}: response is not a PDF")

    # Pause between requests.
    time.sleep(1.5)

print(f"All downloads finished. Files are in: {BASE_OUT_DIR.absolute()}")


ICCS 2018-2025

In [None]:
from urllib.parse import urljoin

from bs4 import BeautifulSoup

PDF_DIR = os.getenv("ICCS_PDF_DIR")
# Fail with a readable message instead of the TypeError raised by Path(None).
for _name, _value in (("ICCS_PDF_DIR", PDF_DIR), ("DBLP_DIR", os.getenv("DBLP_DIR"))):
    if not _value:
        raise RuntimeError(f"{_name} missing")

DBLP_DIR = Path(os.getenv("DBLP_DIR"))
IN_PATH = DBLP_DIR / "interim" / "iccs" / "iccs_dblp_dois.jsonl"
BASE_OUT_DIR = Path(PDF_DIR)
BASE_OUT_DIR.mkdir(parents=True, exist_ok=True)

session = requests.Session()

# Links to paper PDFs on an archive page; compiled once, not once per DOI.
PDF_HREF_RE = re.compile(r'papers/.*\.pdf')

# Group the 2018-2025 records by year so each archive page is fetched once.
tasks_by_year = {}
with IN_PATH.open("r", encoding="utf-8") as f:
    for line in f:
        try:
            rec = json.loads(line)
            year = int(rec.get("year", 0))
        except (ValueError, TypeError, AttributeError):
            # Blank or malformed line, non-object record, or unparsable year.
            continue
        if 2018 <= year <= 2025:
            tasks_by_year.setdefault(year, []).append(rec)

for year in sorted(tasks_by_year):
    year_dir = BASE_OUT_DIR / str(year)
    year_dir.mkdir(parents=True, exist_ok=True)

    archive_url = f'https://www.iccs-meeting.org/archive/iccs{year}/'
    print(f"\nICCS {year}...")

    try:
        resp = session.get(archive_url, timeout=30)
        # Without this, a 404 or 500 error page would be parsed as if it were
        # the archive index and every DOI reported as "No PDF".
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
    except Exception as e:
        print(f"Page load failed: {e}")
        continue

    for rec in tqdm(tasks_by_year[year], desc=f"ICCS {year}"):
        doi = rec.get("doi_normalized", "")
        if not doi:
            continue

        fname = f"{year}_{doi.replace('/', '_')}.pdf"
        fpath = year_dir / fname
        if fpath.exists():
            continue

        # Find text nodes that mention the DOI, then look for a PDF link in
        # the enclosing div/p/tr/td, falling back to that element's parent.
        doi_re = re.compile(re.escape(doi))
        pdf_url = None
        for text_node in soup.find_all(string=doi_re):
            container = text_node.find_parent(['div', 'p', 'tr', 'td'])
            if container is None:
                continue
            pdf_link = container.find('a', href=PDF_HREF_RE)
            if pdf_link is None and container.parent is not None:
                pdf_link = container.parent.find('a', href=PDF_HREF_RE)
            if pdf_link is not None:
                pdf_url = pdf_link['href']
                break

        if pdf_url:
            # urljoin resolves relative, root-relative ("/archive/...") and
            # absolute hrefs correctly; plain concatenation only handled the
            # first and last of these.
            pdf_url = urljoin(archive_url, pdf_url)

            try:
                r = session.get(pdf_url, timeout=60)
                if r.status_code == 200 and r.content[:4] == b'%PDF':
                    fpath.write_bytes(r.content)
                else:
                    print(f"Download failed for {doi}: HTTP {r.status_code} from {pdf_url}")
            except (requests.exceptions.RequestException, OSError) as e:
                # Still best effort (the loop continues), but no longer silent.
                print(f"Download failed for {doi}: {e}")
        else:
            print(f"No PDF for {doi}")

        time.sleep(0.3)

print(f"Complete! Files in: {BASE_OUT_DIR}")


# JOCS 2010-2025

In [None]:
load_dotenv()

ELS_API_KEY = os.getenv("ELS_API_KEY")
PDF_DIR = os.getenv("JOCS_PDF_DIR")

# Explicit checks rather than assert: asserts are stripped under `python -O`,
# and DBLP_DIR was previously not checked at all (Path(None) -> TypeError).
for _name, _value in (
    ("ELS_API_KEY", ELS_API_KEY),
    ("JOCS_PDF_DIR", PDF_DIR),
    ("DBLP_DIR", os.getenv("DBLP_DIR")),
):
    if not _value:
        raise RuntimeError(f"{_name} missing")

DBLP_DIR = Path(os.getenv("DBLP_DIR"))
IN_PATH = DBLP_DIR / "interim" / "jocs" / "jocs_dblp_dois.jsonl"
OUT_DIR = Path(PDF_DIR)
OUT_DIR.mkdir(parents=True, exist_ok=True)

BASE_URL = "https://api.elsevier.com/content/article/doi/"

# One entry per attempted download: {"doi", "success", and "status"/"error" on failure}.
results = []
session = requests.Session()

with IN_PATH.open("r", encoding="utf-8") as f:
    for i, line in enumerate(f, 1):
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline

        try:
            rec = json.loads(line)
            raw_year = rec.get("year")
            # skip if year missing
            if not raw_year:
                continue
            year = int(raw_year)
        except (ValueError, TypeError, AttributeError):
            # A single malformed record must not abort the whole run.
            print(f"[{i}] skipping malformed record")
            continue

        # skip if year >= 2026
        if year >= 2026:
            continue

        doi = rec.get("doi_normalized")
        if not isinstance(doi, str) or not doi.startswith("10.1016/j.jocs."):
            continue

        fname = f"{year}_{doi.replace('/', '_')}.pdf"
        fpath = OUT_DIR / fname

        if fpath.exists():
            continue

        print(f"[{i}] downloading {doi}")

        try:
            r = session.get(
                BASE_URL + doi,
                headers={
                    "X-ELS-APIKey": ELS_API_KEY,
                    "Accept": "application/pdf"
                },
                timeout=60
            )
        except requests.exceptions.RequestException as e:
            # Timeouts and connection resets are recorded and skipped, as in
            # the ICCS Elsevier cell, instead of crashing the loop.
            results.append({"doi": doi, "success": False, "error": str(e)})
            print(f"  failed ({e})")
            time.sleep(5)
            continue

        if r.status_code == 200 and r.content[:4] == b"%PDF":
            fpath.write_bytes(r.content)
            results.append({"doi": doi, "success": True})
            print("  saved")
        else:
            results.append({"doi": doi, "success": False, "status": r.status_code})
            print("  failed")

        # Pause between requests.
        time.sleep(3)
