In [3]:
import json
from io import BytesIO
from pathlib import Path
from pprint import pprint
from typing import Any, Callable, Iterable, List, Dict, Optional

import openreview
import openreview.api as openreview_api
import pymupdf
import requests
from markitdown import MarkItDown

In [4]:
# Convert any data type to json object
def convert_2_json(item):
    """Turn ``item`` into plain data that ``json.dump`` can handle.

    Objects exposing a ``to_json()`` method are replaced by a shallow copy of
    the mapping that method returns; when the object also carries a non-None
    ``details`` attribute, it is attached under the ``"details"`` key.
    Lists and dicts are walked recursively. Anything else is returned as-is.
    """
    if not hasattr(item, "to_json"):
        # Plain containers: descend so nested convertible objects are handled.
        if isinstance(item, list):
            return [convert_2_json(element) for element in item]
        if isinstance(item, dict):
            return {key: convert_2_json(value) for key, value in item.items()}
        # Scalars and unknown types pass through unchanged.
        return item

    # Copy so the mapping returned by to_json() is never mutated.
    serialized = dict(item.to_json())
    extra = getattr(item, "details", None)
    if extra is not None:
        serialized["details"] = extra
    return serialized

# Save json file (convert to json format first if necessary)
def save_json(data: Any, file_name: str, convert: bool = True) -> None:
    """Write ``data`` to ``<file_name>.json`` with 4-space indentation.

    Args:
        data: The value to store.
        file_name: Target path without the ``.json`` suffix (it is appended).
        convert: When true, ``data`` is first passed through
            ``convert_2_json``; when false it is dumped exactly as given.
    """
    payload = data
    if convert:
        payload = convert_2_json(data)

    target_path = f"{file_name}.json"
    with open(target_path, "w") as out_file:
        json.dump(payload, out_file, indent=4)

In [5]:
def get_paper_url(paper_id: str, isV2=False) -> Optional[str]:
    """Build the public PDF URL of an OpenReview paper.

    Args:
        paper_id: Identifier of the note to look up.
        isV2: When truthy, no lookup is performed and ``None`` is returned
            (the V2 path is not implemented here).

    Returns:
        ``"https://openreview.net"`` followed by the note's
        ``content["pdf"]`` value, or ``None`` when ``isV2`` is set.

    Raises:
        KeyError: If the fetched note has no ``content``/``pdf`` entry.
    """
    if isV2:
        return None

    client = openreview.Client(baseurl="https://api.openreview.net")
    paper = convert_2_json(client.get_note(paper_id))

    # Pull the value out first: reusing double quotes inside a double-quoted
    # f-string is only valid syntax from Python 3.12 onwards.
    pdf_path = paper["content"]["pdf"]
    return f"https://openreview.net{pdf_path}"

In [6]:
# Save actual PDF File
def save_pdf(url: str, file_path, timeout: float = 30.0) -> None:
    """Download ``url`` and store the response body at ``file_path``.

    Args:
        url: Address of the file to download.
        file_path: Destination path; missing parent directories are created.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for bad status codes

    target = Path(file_path)
    # Make sure the output folder exists so the write cannot fail on a
    # fresh checkout.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(response.content)

# Save Markdown File
def get_pdf_markdown(pdf_url: str, file_path, timeout: float = 30.0):
    """Download a PDF and save its text content to ``file_path``.

    The PDF is first converted with MarkItDown. If that yields no text, the
    plain text of every page is extracted with PyMuPDF instead. In both cases
    the text is written to ``file_path`` as UTF-8.

    Args:
        pdf_url: Address of the PDF to download.
        file_path: Destination of the text file; missing parent directories
            are created.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``None`` when MarkItDown produced the text; the extracted text when
        the PyMuPDF fallback was used (kept for backward compatibility).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(pdf_url, timeout=timeout)
    # Fail early instead of feeding an HTTP error page to the PDF converters.
    response.raise_for_status()
    pdf_bytes = response.content

    target = Path(file_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    md = MarkItDown()
    result = md.convert(BytesIO(pdf_bytes), mime="application/pdf")
    if result and result.text_content:
        target.write_text(result.text_content, encoding="utf-8")
        return None

    print("MarkItDown returned no text; falling back to PyMuPDF text extraction.")
    with pymupdf.open(stream=pdf_bytes, filetype="pdf") as doc:
        fallback_text = "\n".join(page.get_text() for page in doc)

    # Persist the fallback result too; previously it was only returned, so
    # callers that ignored the return value ended up without any output file.
    target.write_text(fallback_text, encoding="utf-8")
    return fallback_text

In [11]:
# Download one OpenReview paper and store it as both a PDF and a text file.
paperID = "ryykVe-0W"
url = get_paper_url(paperID)
fileName = f"paper_{paperID}"

# Create the output folder up front so this cell also works on a fresh
# checkout where ./out does not exist yet.
Path("./out").mkdir(parents=True, exist_ok=True)

save_pdf(url,f"./out/{fileName}.pdf")
get_pdf_markdown(url,f"./out/{fileName}.md")
print(url)

https://openreview.net/pdf/918f910202a69f08ada9f06eef31e21178166564.pdf
