In [28]:
import urllib.request
import urllib.parse
import feedparser
import sqltables.sqlite3
from tqdm.notebook import tqdm

In [29]:
start_date = "2023-01-01"

In [37]:
db = sqltables.sqlite3.Database("submissions.sqlite3")
if "submissions" in list(db.tables):
    submissions = db.open_table("submissions")
else:
    submissions = db.create_table(name="submissions", column_names=["arxiv_id", "date", "title", "authors", "url", "abstract", "categories"])

In [18]:
submissions.view("select max(arxiv_id) from _")

|max\(arxiv\_id\)|
|-|
|\'2301\.03580v1\'|


In [19]:
processed_submissions = set(row.arxiv_id for row in submissions)
len(processed_submissions)

25324

In [20]:
def fetch_entries(search_query, start_date, batch_size = 128):
    base_url = "http://export.arxiv.org/api/query"
    query_params = {
        "search_query": search_query,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
        "max_results": batch_size,
        "start": 0
    }
    hit_start_date = False
    while not hit_start_date:
        query_str = urllib.parse.urlencode(query_params)
        with urllib.request.urlopen(f"{base_url}?{query_str}") as fh:
            feed = feedparser.parse(fh)
        for entry in feed["entries"]:
            if entry["published"] < start_date:
                hit_start_date = True
                break
            yield entry
        query_params["start"] += batch_size

In [34]:
categories = ["cs.LG", "math.PR", "math.OC"]
rows = []
# it = fetch_entries("cat:cs.LG", start_date=start_date)
it = (entry 
      for cat in categories 
      for entry in fetch_entries(f"cat:{cat}", start_date=start_date))
for entry in tqdm(it):
    arxiv_id = entry["id"].replace("http://arxiv.org/abs/", "")
    if arxiv_id in processed_submissions:
        continue    
    date = entry["published"]
    title = entry["title"]
    url = entry["id"]
    abstract = entry["summary"]
    authors = ", ".join(x["name"] for x in entry["authors"])
    categories = ",".join([t["term"].strip() for t in entry["tags"] if "term" in t])
    rows.append([arxiv_id, date, title, authors, url, abstract, categories])
len(rows)

0it [00:00, ?it/s]

3870

In [38]:
submissions.insert(rows)

In [39]:
submissions.view("select count(*) from _")

|count\(\*\)|
|-|
|32291|


In [40]:
submissions.view("select count(distinct arxiv_id) from _")

|count\(distinct arxiv\_id\)|
|-|
|29225|


In [41]:
db.close()