In [1]:
import pathlib
from pathlib import Path
import requests
from datetime import datetime
import urllib.request as libreq
import string

import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from tqdm import tqdm

from objects import Entry
from utils import create_q,downloader, check_col_downloaded_value

import warnings
# Suppress every warning for the rest of the kernel session. No category or
# module filter is given, so this also hides deprecation notices emitted by
# the libraries imported above.
warnings.filterwarnings("ignore")

In [2]:
# Base endpoint of the arXiv API; the search expression gets appended to it
url = 'http://export.arxiv.org/api/query?search_query='

# Container for downloaded papers, empty at start-up
papers = []

# Absolute path of the directory the notebook is executed from
os_path = Path('.').resolve()

# Timestamped folder name, so that each run stores its papers separately
save_path = f"Papers-{datetime.today():%Y-%m-%d %H-%M-%S}"

# Destination folder of this run, located under the working directory
full_path = os_path / save_path

# Make sure the destination exists (no error if it is already there)
full_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Search terms handed to create_q; the entry at each position is paired with
# the field prefix at the same position in `options` below.
qs = [
    ('Interpolation',),
    [('Interpolation',)],
    ['cs.CV'],
    (
        'KDD OR ICML OR NeurIPS OR ICLR OR COLT OR CIKM OR AAAI OR UAI'
        ' OR IJCAI OR VLDB OR AISTATS',
    ),
]

# arXiv field prefixes, one per entry of `qs`
options = [
    'abs',
    'ti',
    'cat',
    'co',
]

In [4]:
# Module-level counters filled in by main(); starts out empty
log = dict()

# Define the main function for fetching and processing papers
def main(url, download_path, start=1, max=100):
    """Query the arXiv API and download every matching paper not yet recorded.

    The search expression is built from the module-level ``qs`` and ``options``
    via ``create_q``. Each returned entry is counted in the module-level ``log``
    dictionary, stored under ``download_path/<publication date>/<categories>``,
    and recorded in ``downloaded.csv`` in the current working directory.

    Args:
        url: Base URL of the arXiv API, ending in ``search_query=``.
        download_path: ``pathlib.Path`` of the root folder for this run.
        start: Currently ignored; the request always uses ``start=0``. Kept so
            existing callers do not break.
        max: Maximum number of results requested from the API.

    Returns:
        None. Results are side effects: files on disk, ``downloaded.csv`` and
        the ``log`` dictionary.
    """
    # Load the record of earlier runs; on a first run there is no file yet, so
    # start from an empty table with the columns written further below.
    csv_path = Path('downloaded.csv')
    if csv_path.exists():
        df = pd.read_csv(csv_path)
    else:
        df = pd.DataFrame(columns=['file_name', 'path', 'tag', 'downloaded'])

    # Build the request URL once so the printed URL is exactly the one fetched.
    # NOTE(review): q[:-4] presumably trims a trailing connector left by
    # create_q - confirm against utils.create_q.
    q = create_q(qs, options).replace(' ', '+')
    q_options = f'&start=0&max_results={max}&sortBy=lastUpdatedDate&sortOrder=descending'
    request_url = f'{url}{q[:-4]}{q_options}'
    print(request_url)

    # Fetch the feed; the handle gets its own name so it does not shadow `url`
    with libreq.urlopen(request_url) as response:
        r = response.read()

    # The API answers with an Atom (XML) feed; every paper is one <entry>
    soup = BeautifulSoup(r, 'lxml')
    entries = soup.find_all('entry')

    for entry in tqdm(entries, leave=False):
        e = Entry(entry)
        e.scrape()
        data = e.get_info()
        comp_cat = '_'.join(data['category'])
        # File name: title without punctuation/newlines, spaces turned into dashes
        title = data['title'].translate(str.maketrans('', '', string.punctuation)).replace('\n', '').replace(' ', '-') + '.pdf'
        pup_date = data['published'][0]

        # Count the entry per date, per date+category and per category
        for key in (pup_date, pup_date + ' ' + comp_cat, comp_cat):
            log[key] = log.get(key, 0) + 1

        # One folder per publication date, one sub-folder per category set;
        # parents=True creates both levels in a single call.
        download_folder = download_path / pup_date / comp_cat
        Path(download_folder).mkdir(parents=True, exist_ok=True)

        # Check if the entry has been downloaded before
        is_downloaded = check_col_downloaded_value(df, comp_cat)

        # Evaluate the "needs processing" condition once; it guards both the
        # download and the bookkeeping.
        if title in df['file_name'].tolist() and is_downloaded:
            continue

        # Download only when the entry actually exposes a PDF link
        downloaded = False
        if 'link' in data and 'pdf' in data['link'][0]:
            downloader(data['link'][0]['pdf'], download_folder, title)
            downloaded = True

        new_row = {'file_name': title, 'path': download_folder, 'tag': comp_cat, 'downloaded': downloaded}
        df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

        # Persist after every entry so an interrupted run keeps its progress
        df.to_csv('downloaded.csv', index=False)

# Run the fetch-and-download routine against the arXiv endpoint
main(url, full_path, max=339)

# Write the counters, ordered by key, to a CSV file with one row per key
log_df = pd.DataFrame([{key: log[key] for key in sorted(log)}]).transpose()
log_df.to_csv('log.csv')

http://export.arxiv.org/api/query?search_query=(abs:Interpolation)+OR+(ti:Interpolation)+AND+(cat:cs.CV)+AND+(co:KDD+OR+ICML+OR+NeurIPS+OR+ICLR+OR+COLT+OR+CIKM+OR+AAAI+OR+UAI+OR+IJCAI+OR+VLDB+OR+AISTATS)&start=0&max_results=339&sortBy=lastUpdatedDate&sortOrder=descending


  0%|                                                                                 | 0/23 [00:00<?, ?it/s]

  0%|          | 0.00/4.58M [00:00<?, ?B/s]

  0%|          | 0/4579299 [00:00<?, ?it/s]

 96%|████████████████████████████████████████████████████████████████████▊   | 22/23 [00:10<00:00,  2.08it/s]

  0%|          | 0.00/8.02M [00:00<?, ?B/s]

  0%|          | 0/8021165 [00:00<?, ?it/s]

                                                                                                             