In [5]:
import pandas as pd
import openreview
import json
import openreview.api as openreview_api
import shutil

from collections import OrderedDict
from typing import Any, Callable, Iterable, List, Dict, Optional
import openreview.api as openreview_api

In [6]:
def convert_2_json(item):
    """Recursively turn ``item`` into plain, JSON-friendly containers.

    * Objects exposing ``to_json()`` become a fresh ``dict`` built from that
      call; when the object also has a non-``None`` ``details`` attribute it is
      attached under the ``"details"`` key (as-is, without further conversion).
    * Lists and dicts are rebuilt with every element / value converted.
    * Anything else is returned unchanged.
    """
    if hasattr(item, "to_json"):
        as_dict = dict(item.to_json())
        extra = getattr(item, "details", None)
        if extra is not None:
            as_dict["details"] = extra
        return as_dict

    if isinstance(item, list):
        return list(map(convert_2_json, item))

    if isinstance(item, dict):
        converted = {}
        for key, value in item.items():
            converted[key] = convert_2_json(value)
        return converted

    return item

In [7]:
def save_json(data: Any, file_name: str, convert: bool = True) -> None:
    """Write ``data`` to ``<file_name>.json`` as JSON indented by 4 spaces.

    Args:
        data: Value to store.
        file_name: Target path *without* the ``.json`` suffix (it is appended).
        convert: When true (default) ``data`` is first passed through
            ``convert_2_json``; when false it is dumped as given.
    """
    if convert:
        payload = convert_2_json(data)
    else:
        payload = data

    target_path = f"{file_name}.json"
    with open(target_path, "w") as out_file:
        json.dump(payload, out_file, indent=4)


In [None]:
# Exploration cell: list the groups of one venue and dump the first one to
# "ggggg.json" for manual inspection.
# client = openreview.Client(baseurl="https://api.openreview.net")
client = openreview_api.OpenReviewClient(baseurl="https://api2.openreview.net")
# groups = client.get_groups(id="ICLR.cc/2021")
groups = client.get_all_groups(id="NeurIPS.cc/2023/Conference")
print(len(groups))
# Only touch groups[0] when something came back; indexing an empty result
# would abort the cell with an IndexError.
if groups:
    print(groups[0])
    save_json(groups[0],"ggggg")
else:
    print("No groups returned; nothing to save.")

In [None]:
# Exploration cell: fetch every submission note of one invitation (with its
# direct replies and revision info) and dump a single sample note to disk.
invitation_url = 'ICLR.cc/2024/Conference/-/Submission'
# API 2
client = openreview_api.OpenReviewClient(baseurl="https://api2.openreview.net")
notes = client.get_all_notes(invitation = invitation_url, details="directReplies,revisions")
if len(notes) == 0:
    # API 1 -- fall back when API 2 returned nothing for this invitation
    client = openreview.Client(baseurl="https://api.openreview.net")
    notes = client.get_all_notes(invitation = invitation_url, details="directReplies,revisions")

print(len(notes))
# notes[50] only exists when more than 50 notes were returned; skip the sample
# dump instead of failing with an IndexError on small (or empty) venues.
if len(notes) > 50:
    save_json(notes[50],"skandfsajn")
else:
    print("Fewer than 51 notes returned; sample note not saved.")
# save_json(notes[50],"../raw out/NeurIPS_2025_paper_50_raw")

Getting V2 Notes: 100%|█████████▉| 5533/5539 [00:13<00:00, 422.22it/s]

5539





In [None]:


# Preview cell: flatten and clean ONE paper from the previously fetched `notes`
# and write it to "new_paper.json".
papers = convert_2_json(notes)
venue = "iclr_2024"
index = 200

# For the venues listed here the content fields are read through an extra
# ":value" path segment and the invitation is read from "invitations";
# every other venue uses the bare field and "invitation".
uses_value_wrappers = venue in ["iclr_2024","iclr_2025","neurips_2023","neurips_2024","neurips_2025"]
value_suffix = ":value" if uses_value_wrappers else ""

# Output key -> colon-separated lookup path understood by getAttr.
field_paths = {
    "id": "id",
    "title": "content:title" + value_suffix,
    "pdf_url": "content:pdf" + value_suffix,
    "has_revisions": "details:revisions",
    "authors": "content:authors" + value_suffix,
    "created_date": "cdate",
    "original_paper_id": "original",
    "reviews": "details:directReplies",
    "invitation": "invitations" if uses_value_wrappers else "invitation",
}
new_paper = getAttr(papers[index], **field_paths)


new_paper = paperCleaner(new_paper,venue)
save_json(new_paper,"new_paper")

In [None]:
def get_pdf_markdown():
    '''
        !!! This is a SAFETY return, DO NOT run this function
        I REPEAT, DO NOT run this function on local

        The function returns ``None`` immediately; everything after the first
        ``return`` is intentionally unreachable and kept only for reference.

        The disabled body would: read ``pdf_url`` from the global ``new_paper``,
        download that PDF, convert it with MarkItDown, store the converted text
        under ``new_paper["pdf"]`` and re-save ``new_paper`` to "new_paper.json".
    '''
    return
    from markitdown import MarkItDown
    from io import BytesIO
    import requests

    pdf_url = new_paper.get("pdf_url")
    if not pdf_url:
        return
    # The URL must be passed explicitly: a bare requests.get() raises TypeError.
    # A timeout keeps a stalled download from hanging the kernel forever.
    response = requests.get(pdf_url, timeout=60)
    # Fail loudly on HTTP errors instead of converting an error page.
    response.raise_for_status()
    pdf_bytes = response.content

    md = MarkItDown()

    # NOTE(review): confirm the installed MarkItDown version accepts a binary
    # stream together with the `mime` keyword here.
    result = md.convert(BytesIO(pdf_bytes), mime="application/pdf")

    new_paper["pdf"] = result.text_content
    save_json(new_paper,"new_paper")

In [3]:
import os

def _format_size(size_bytes):
    """Render a byte count with binary units: B, KB, MB or GB (2 decimals above B)."""
    if size_bytes < 1024:
        return f"{size_bytes} B"
    if size_bytes < 1024 ** 2:
        return f"{size_bytes / 1024:.2f} KB"
    if size_bytes < 1024 ** 3:
        return f"{size_bytes / (1024 ** 2):.2f} MB"
    return f"{size_bytes / (1024 ** 3):.2f} GB"


def print_results(directory_path):
    """Print ``<name> — <human-readable size>`` for every file in a directory.

    Args:
        directory_path: Directory to list. If it is not an existing directory
            an error line is printed and nothing else happens.

    Only regular files directly inside the directory are reported;
    sub-directories are skipped. Entries are printed in sorted name order so
    the report is identical from run to run (``os.listdir`` itself returns
    names in arbitrary order).
    """
    # Check if directory exists
    if not os.path.isdir(directory_path):
        print(f"Error: '{directory_path}' is not a valid directory.")
        return

    for filename in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)

        # Skip folders; only print files
        if os.path.isfile(file_path):
            readable = _format_size(os.path.getsize(file_path))
            print(f"{filename} — {readable}")

In [8]:
def getAttr(paper,**attrs:str):
    """Pick values out of a nested mapping using colon-separated key paths.

    Each keyword argument maps an output key to a path such as
    ``"content:title:value"``. The path is followed one ``.get()`` at a time
    starting from ``paper``. As soon as the current node is falsy, that falsy
    node itself (``None``, ``{}``, ``""`` ...) becomes the result for the key.

    Returns:
        dict: one entry per keyword argument, in the order they were given.
    """
    extracted = {}
    for out_key, path_spec in attrs.items():
        current = paper
        for segment in path_spec.split(":"):
            # Stop descending once there is nothing (truthy) left to look into.
            if not current:
                break
            current = current.get(segment)
        extracted[out_key] = current
    return extracted

# Venue keys for which review content fields are wrapped as {"value": ...} and
# the invitation of a reply is stored as a list under "invitations".
_VALUE_WRAPPED_VENUES = ("iclr_2024", "iclr_2025", "neurips_2023", "neurips_2024", "neurips_2025")

# Venue keys (besides the value-wrapped ones) whose meta reviews are excluded
# from the collected reviews.
_META_EXCLUDED_LEGACY_VENUES = ("iclr_2019", "neurips_2022")


def _strip_non_ascii(text):
    """Drop every non-ASCII character and every newline from ``text``."""
    return ''.join(c for c in text if c.isascii() and c != '\n')


def _unwrap_value(field):
    """Return ``field["value"]`` for a ``{"value": ...}`` wrapper, else ``field`` itself."""
    if isinstance(field, dict) and "value" in field:
        return field["value"]
    return field


def _format_wrapped_content(content, do_clean):
    """Render value-wrapped review content as ``key:value`` lines joined by newlines."""
    lines = []
    for key, field in content.items():
        if isinstance(field, list):
            # Separator differs between the cleaned and raw variants; kept
            # as-is so existing output does not change.
            separator = "," if do_clean else ", "
            text = separator.join(str(_unwrap_value(entry)) for entry in field)
        else:
            text = str(_unwrap_value(field))
        lines.append(f"{key}:{_strip_non_ascii(text) if do_clean else text}")
    return "\n".join(lines)


def _format_plain_content(content, do_clean):
    """Render un-wrapped review content as ``key:value`` lines joined by newlines."""
    lines = []
    for key, value in content.items():
        if not do_clean:
            text = value  # formatted by the f-string below
        elif isinstance(value, list):
            # str() keeps lists with non-string members from breaking the join.
            text = _strip_non_ascii(','.join(str(entry) for entry in value))
        else:
            # Non-string scalars (e.g. an integer rating) are stringified
            # first instead of raising inside the character filter.
            text = _strip_non_ascii(value if isinstance(value, str) else str(value))
        lines.append(f"{key}:{text}")
    return "\n".join(lines)


def paperCleaner(paper, specialCase = '',doCleanString = True):
    """Turn a flattened paper record into its final, export-ready form.

    Args:
        paper: dict holding at least ``reviews`` (list of reply dicts, or
            ``None``), and optionally ``original_paper_id`` and ``pdf_url``.
            Neither ``paper`` nor its reply dicts are modified.
        specialCase: venue key (e.g. ``"iclr_2019"``, ``"neurips_2024"``) that
            selects where the decision lives, how review content is laid out
            and whether meta reviews are skipped.
        doCleanString: when true, non-ASCII characters and newlines are removed
            from every review field value.

    Returns:
        dict: shallow copy of ``paper`` where
            * ``reviews`` is a list of ``{"date", "review"}`` dicts built from
              the replies whose invitation mentions "review",
            * ``decision`` is the first truthy decision found among the replies
              (or ``None``),
            * ``has_revisions`` is ``True`` iff ``original_paper_id`` is truthy,
            * ``pdf_url`` is prefixed with ``https://openreview.net`` (or ``None``).
    """
    cp_paper = paper.copy()

    is_value_wrapped = specialCase in _VALUE_WRAPPED_VENUES
    skips_meta_reviews = is_value_wrapped or specialCase in _META_EXCLUDED_LEGACY_VENUES

    # Where the decision is stored depends on the venue.
    if specialCase == "iclr_2019":
        decision_path = "content:recommendation"
    elif is_value_wrapped:
        decision_path = "content:decision:value"
    else:
        decision_path = "content:decision"

    reviews = []
    decision = None
    # `reviews` is None when the note carried no reply details; treat as empty.
    for reviewItem in paper.get("reviews") or []:
        # Decisions: keep the first truthy one.
        if not decision:
            decision = getAttr(reviewItem, decision = decision_path)["decision"]

        # Some replies have "invitations" (a list) and others "invitation"
        # (a string). Normalise both to one lowercase string WITHOUT writing
        # back into the caller's reply dict.
        raw_invitation = reviewItem.get("invitations") or reviewItem.get("invitation") or ""
        if isinstance(raw_invitation, str):
            invitation_text = raw_invitation.lower()
        else:
            invitation_text = ",".join(str(part).lower() for part in raw_invitation)

        if "review" not in invitation_text:
            continue
        if skips_meta_reviews and "meta" in invitation_text:
            continue

        content = reviewItem["content"]
        if is_value_wrapped:
            result = _format_wrapped_content(content, doCleanString)
        else:
            result = _format_plain_content(content, doCleanString)

        date = reviewItem.get("cdate") or reviewItem.get("tcdate")
        reviews.append({"date": date,"review": result})

    # Remove-then-insert keeps "reviews" at the same position in the output
    # dict (after the remaining input keys) as before.
    cp_paper.pop("reviews", None)

    cp_paper["reviews"] = reviews
    cp_paper["decision"] = decision
    # NOTE(review): this replaces whatever `has_revisions` the caller supplied
    # with a flag derived from `original_paper_id` -- confirm that is intended.
    cp_paper["has_revisions"] = bool(cp_paper.get("original_paper_id"))
    if cp_paper.get("pdf_url"):
        cp_paper["pdf_url"] = "https://openreview.net" + cp_paper["pdf_url"]
    else:
        cp_paper["pdf_url"] = None
    return cp_paper

In [None]:
# Venues to export. Each key has the form "<venue>_<year>"; the export loop
# splits it on "_" for its progress message and upper-cases it for the output
# file name. Each value is the invitation id passed to `get_all_notes` to fetch
# that venue's notes.
venue_invitations_url = {
  'iclr_2016': 'ICLR.cc/2016/workshop/-/submission',
  'iclr_2017': 'ICLR.cc/2017/conference/-/submission',
  'iclr_2018': 'ICLR.cc/2018/Conference/-/Blind_Submission',
  'iclr_2019': 'ICLR.cc/2019/Conference/-/Blind_Submission',
  'iclr_2020': 'ICLR.cc/2020/Conference/-/Blind_Submission',
  'iclr_2021': 'ICLR.cc/2021/Conference/-/Blind_Submission',
  'iclr_2022': 'ICLR.cc/2022/Conference/-/Blind_Submission',
  'iclr_2023': 'ICLR.cc/2023/Conference/-/Blind_Submission',
  'iclr_2024': 'ICLR.cc/2024/Conference/-/Submission',
  'iclr_2025': 'ICLR.cc/2025/Conference/-/Submission',
  'neurips_2019': 'NeurIPS.cc/2019/Reproducibility_Challenge/-/Blind_Report',
  'neurips_2021': 'NeurIPS.cc/2021/Conference/-/Blind_Submission',
  'neurips_2022': 'NeurIPS.cc/2022/Conference/-/Blind_Submission',
  'neurips_2023': 'NeurIPS.cc/2023/Conference/-/Submission',
  'neurips_2024': 'NeurIPS.cc/2024/Conference/-/Submission',
  'neurips_2025': 'NeurIPS.cc/2025/Conference/-/Submission'
}


# Start from an empty "out" directory so files from an earlier run never mix
# with this export.
if os.path.exists("out") and os.path.isdir("out"):
    shutil.rmtree("out")
os.makedirs("out")

for venueName, invite_url in venue_invitations_url.items():
    [venue_name , venue_year] = venueName.split("_")
    print(f"> Retriving papers from venue {venue_name.upper()} year {venue_year}")
    # API 2
    client = openreview_api.OpenReviewClient(baseurl="https://api2.openreview.net")
    notes = client.get_all_notes(invitation = invite_url, details="directReplies,revisions")
    if len(notes) == 0:
        # API 1 -- fall back when API 2 returned nothing for this invitation
        client = openreview.Client(baseurl="https://api.openreview.net")
        notes = client.get_all_notes(invitation = invite_url, details="directReplies,revisions")
    # Sample dump of one raw note (overwritten on every venue). Guarded so that
    # a venue with 50 or fewer notes does not abort the whole export with an
    # IndexError. NOTE(review): looks like a debugging leftover -- consider removing.
    if len(notes) > 50:
        save_json(notes[50],"skandfsajn")
    papers = convert_2_json(notes)

    # The lookup paths depend only on the venue, so build them once per venue
    # instead of re-deciding for every paper. For the venues listed here the
    # content fields are read through an extra ":value" segment and the
    # invitation comes from "invitations".
    uses_value_wrappers = venueName in ["iclr_2024","iclr_2025","neurips_2023","neurips_2024","neurips_2025"]
    value_suffix = ":value" if uses_value_wrappers else ""
    field_paths = {
        "id": "id",
        "title": "content:title" + value_suffix,
        "pdf_url": "content:pdf" + value_suffix,
        "has_revisions": "details:revisions",
        "authors": "content:authors" + value_suffix,
        "created_date": "cdate",
        "original_paper_id": "original",
        "reviews": "details:directReplies",
        "invitation": "invitations" if uses_value_wrappers else "invitation",
    }

    # One JSON object per line (JSON Lines), one file per venue.
    with open(f"out/{venueName.upper()}.jsonl","w") as f:
        for paper in papers:
            new_paper = getAttr(paper, **field_paths)
            new_paper = paperCleaner(new_paper,venueName)
            f.write(json.dumps(new_paper) + "\n")
print("---------------------")
print("\n✓ All Jobs Done ✓")
print("=========== File Sizes Generated =========")
print_results("out")

> Retriving papers from venue ICLR year 2016
> Retriving papers from venue ICLR year 2017
> Retriving papers from venue ICLR year 2018
> Retriving papers from venue ICLR year 2019


Getting V1 Notes: 100%|█████████▉| 1417/1419 [00:00<00:00, 2149.92it/s]


> Retriving papers from venue ICLR year 2020


Getting V1 Notes: 100%|█████████▉| 2210/2213 [00:01<00:00, 1256.28it/s]


> Retriving papers from venue ICLR year 2021


Getting V1 Notes: 100%|█████████▉| 2591/2594 [00:03<00:00, 832.01it/s]


> Retriving papers from venue ICLR year 2022


Getting V1 Notes: 100%|█████████▉| 2614/2617 [00:03<00:00, 686.49it/s]


> Retriving papers from venue ICLR year 2023


Getting V1 Notes: 100%|█████████▉| 3789/3793 [00:05<00:00, 667.14it/s]


> Retriving papers from venue ICLR year 2024


Getting V2 Notes: 100%|█████████▉| 7396/7404 [00:13<00:00, 554.27it/s]


> Retriving papers from venue ICLR year 2025


Getting V2 Notes: 100%|█████████▉| 11660/11672 [00:23<00:00, 500.43it/s]


> Retriving papers from venue NEURIPS year 2019
> Retriving papers from venue NEURIPS year 2021


Getting V1 Notes: 100%|█████████▉| 2765/2768 [00:03<00:00, 757.71it/s]


> Retriving papers from venue NEURIPS year 2022


Getting V1 Notes: 100%|█████████▉| 2821/2824 [00:03<00:00, 712.97it/s]


> Retriving papers from venue NEURIPS year 2023


Getting V2 Notes: 100%|█████████▉| 3391/3395 [00:07<00:00, 450.84it/s]


> Retriving papers from venue NEURIPS year 2024


Getting V2 Notes: 100%|█████████▉| 4231/4236 [00:07<00:00, 599.57it/s]


> Retriving papers from venue NEURIPS year 2025


Getting V2 Notes: 100%|█████████▉| 5533/5539 [00:11<00:00, 485.90it/s]


All Job Done 
NEURIPS_2025.jsonl — 80.99 MB
ICLR_2024.jsonl — 91.48 MB
ICLR_2019.jsonl — 12.07 MB
ICLR_2020.jsonl — 20.45 MB
NEURIPS_2023.jsonl — 52.86 MB
ICLR_2022.jsonl — 39.81 MB
NEURIPS_2021.jsonl — 39.64 MB
ICLR_2018.jsonl — 7.24 MB
ICLR_2025.jsonl — 149.20 MB
NEURIPS_2024.jsonl — 49.90 MB
NEURIPS_2019.jsonl — 683.81 KB
ICLR_2023.jsonl — 54.62 MB
NEURIPS_2022.jsonl — 36.17 MB
ICLR_2021.jsonl — 32.09 MB
ICLR_2016.jsonl — 353.92 KB
ICLR_2017.jsonl — 3.84 MB
