In [2]:

import os
import sys
import json
from tqdm import tqdm
import argparse
import requests
from bs4 import BeautifulSoup

base_url = "https://papers.nips.cc"
default_headers = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36",
}


def get_conference_url(year):
    return f"{base_url}/paper/{year}"


def fetch_url(url):
    try:
        response = requests.get(url, headers=default_headers)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        sys.exit(1)


def get_paper_paths(year):
    url = get_conference_url(year)
    page_content = fetch_url(url)
    soup = BeautifulSoup(page_content, "html.parser")

    paper_ids, abstract_paths, metadata_paths, pdf_paths = [], [], [], []

    for li in soup.find("div", class_='container-fluid').find_all("li"):
        paper_temp_url = li.a.get('href')
        paper_id = paper_temp_url.split("/")[-1].split("-")[0]
        paper_ids.append(paper_id)
        
        abstract_paths.append(f"{base_url}{paper_temp_url}")
        metadata_paths.append(f"{base_url}{paper_temp_url.replace('Abstract', 'Metadata').replace('hash', 'file').replace('.html', '.json')}")
        pdf_paths.append(f"{base_url}{paper_temp_url.replace('Abstract', 'Paper').replace('hash', 'file').replace('.html', '.pdf')}")

    return paper_ids, abstract_paths, metadata_paths, pdf_paths




In [3]:
paper_ids, abstract_paths, metadata_paths, pdf_paths = get_paper_paths(2020)    

In [16]:
pdf_paths

['https://papers.nips.cc/paper_files/paper/2020/file/0004d0b59e19461ff126e3a08a814c33-Paper.pdf',
 'https://papers.nips.cc/paper_files/paper/2020/file/00482b9bed15a272730fcb590ffebddd-Paper.pdf',
 'https://papers.nips.cc/paper_files/paper/2020/file/0060ef47b12160b9198302ebdb144dcf-Paper.pdf',
 'https://papers.nips.cc/paper_files/paper/2020/file/007ff380ee5ac49ffc34442f5c2a2b86-Paper.pdf',
 'https://papers.nips.cc/paper_files/paper/2020/file/0084ae4bc24c0795d1e6a4f58444d39b-Paper.pdf',
 'https://papers.nips.cc/paper_files/paper/2020/file/00a03ec6533ca7f5c644d198d815329c-Paper.pdf',
 'https://papers.nips.cc/paper_files/paper/2020/file/00ac8ed3b4327bdd4ebbebcb2ba10a00-Paper.pdf',
 'https://papers.nips.cc/paper_files/paper/2020/file/00e26af6ac3b1c1c49d7c3d79c60d000-Paper.pdf',
 'https://papers.nips.cc/paper_files/paper/2020/file/012a91467f210472fab4e11359bbfef6-Paper.pdf',
 'https://papers.nips.cc/paper_files/paper/2020/file/012d9fe15b2493f21902cd55603382ec-Paper.pdf',
 'https://papers.nip

In [21]:
def download_pdf(pdf_url, output_dir):
    response = requests.get(pdf_url, headers=default_headers, stream=True)
    response.raise_for_status()
    with open(os.path.join(output_dir, 'pd.pdf'), 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

In [23]:
pdf_paths[0]

'https://papers.nips.cc/paper_files/paper/2020/file/0004d0b59e19461ff126e3a08a814c33-Paper.pdf'

In [24]:
import requests

file_url = "http://medianet.edmond-de-rothschild.fr/edram/pdf/kiid_fr0010172767_en_20200120_20200128_1954.pdf"

headers = {
    "User-Agent": "PostmanRuntime/7.20.1",
    "Accept": "*/*",
    "Cache-Control": "no-cache",
    "Postman-Token": "8eb5df70-4da6-4ba1-a9dd-e68880316cd9,30ac79fa-969b-4a24-8035-26ad1a2650e1",
    "Host": "medianet.edmond-de-rothschild.fr",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "cache-control": "no-cache",
}

r = requests.get(file_url, file_url, headers=headers)

with open("python.pdf", "wb") as pdf:
    pdf.write(r.content)

In [25]:
download_pdf(pdf_paths[0], "../data")

In [13]:
import os
import sys
import json
import argparse
import asyncio
import aiohttp
from bs4 import BeautifulSoup

base_url = "https://papers.nips.cc"
default_headers = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36",
}


def get_conference_url(year):
    return f"{base_url}/paper/{year}"


async def fetch(session, url):
    try:
        async with session.get(url, headers=default_headers) as response:
            response.raise_for_status()
            return await response.text()
    except aiohttp.ClientError as e:
        print(f"Error fetching {url}: {e}")
        sys.exit(1)


async def fetch_all(session, urls):
    tasks = [fetch(session, url) for url in urls]
    return await asyncio.gather(*tasks)


async def get_paper_paths(year):
    url = get_conference_url(year)
    
    async with aiohttp.ClientSession() as session:
        page_content = await fetch(session, url)
        soup = BeautifulSoup(page_content, "html.parser")
        
        paper_ids, abstract_paths, metadata_paths, pdf_paths = [], [], [], []
        
        base_path = base_url + "/paper"
        
        for li in soup.find("div", class_='container-fluid').find_all("li"):
            paper_temp_url = li.a.get('href')
            paper_id = paper_temp_url.split("/")[-1].split("-")[0]
            
            paper_ids.append(paper_id)
            abstract_paths.append(f"{base_url}{paper_temp_url}")
            
            paper_base_url = f"{base_url}{paper_temp_url.rsplit('.', 1)[0]}"
            metadata_paths.append(f"{paper_base_url.replace('Abstract', 'Metadata').replace('hash', 'file')}.json")
            pdf_paths.append(f"{paper_base_url.replace('Abstract', 'Paper').replace('hash', 'file')}.pdf")
        
        return paper_ids, abstract_paths, metadata_paths, pdf_paths



def get_args():
    parser = argparse.ArgumentParser(description='Download NIPS papers')
    parser.add_argument('--start_year', type=int, default=1987, help='start year')
    parser.add_argument('--end_year', type=int, default=2023, help='end year')
    parser.add_argument('--output_dir', type=str, default='data', help='output directory')
    return parser.parse_args()



def get_metadata(meta_path):
    response = requests.get(meta_path)
    response.raise_for_status()
    return response.json()

def get_pdf(pdf_path):
    response = requests.get(pdf_path)
    response.raise_for_status()
    return response.content


def get_args():
    parser = argparse.ArgumentParser(description='Download NIPS papers')
    parser.add_argument('--start_year', type=int, default=1987, help='start year')
    parser.add_argument('--end_year', type=int, default=2023, help='end year')
    parser.add_argument('--output_dir', type=str, default='data', help='output directory')
    return parser.parse_args()


def main():
    start_year = 2020
    end_year = 2020
    for year in range(start_year, end_year + 1):


        year_dir = os.path.join('../data', str(year))

        paper_ids, abstract_paths, metadata_paths, pdf_paths = await get_paper_paths(year)

        for paper_id, abstract_path, metadata_path, pdf_path in zip(paper_ids, abstract_paths, metadata_paths, pdf_paths):
            print(f"Downloading {paper_id}")
            
            # save directory for the paper
            save_dir = os.path.join(year_dir, paper_id)

            metadata = get_metadata(metadata_path)
            pdf = get_pdf(pdf_path)

            with open(os.path.join(save_dir, "metadata.json"), "w") as f:
                json.dump(metadata, f, indent=4)

            with open(os.path.join(save_dir, "paper.pdf"), "wb") as f:
                f.write(pdf)


SyntaxError: 'await' outside async function (4231259847.py, line 96)

In [38]:
def get_latest_conference_year():
    url = 'https://papers.nips.cc/'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    years = []
    for item in soup.find_all('li'):
        try:
            year = int(item.text.split()[-1].strip('()'))
            years.append(year)
        except (ValueError, IndexError):
            continue

    return max(years) if years else 2023

In [37]:
get_latest_conference_year()

2023