In [1]:
import urllib
import arxiv
import requests
import json
import csv
import pandas as pd
from collections import Counter, defaultdict
import numpy as np # for array manipulation
import matplotlib.pyplot as plt # for data visualization
%matplotlib inline 
import datetime
from tqdm import tqdm

In [None]:
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

def search_arxiv_by_category(category, max_results=10):
    """
    Search arXiv papers by category
    
    Args:
        category (str): arXiv category (e.g., 'cs.AI', 'physics.comp-ph')
        max_results (int): Maximum number of results to return
        
    Returns:
        list: List of dictionaries containing paper details
    """
    base_url = 'http://export.arxiv.org/api/query?'
    
    # Construct the query
    search_query = f'cat:{category}'
    
    # Parameters for the API request
    params = {
        'search_query': search_query,
        'start': 0,
        'max_results': max_results,
        'sortBy': 'submittedDate',
        'sortOrder': 'descending'
    }
    
    # Construct the full URL
    query_url = base_url + urllib.parse.urlencode(params)
    
    # Make the request
    response = urllib.request.urlopen(query_url)
    data = response.read().decode('utf-8')
    
    # Parse the XML response
    root = ET.fromstring(data)
    
    # Define the XML namespace
    namespace = {'arxiv': 'http://www.w3.org/2005/Atom'}
    
    # Extract paper information
    papers = []
    for entry in root.findall('arxiv:entry', namespace):
        paper = {
            'title': entry.find('arxiv:title', namespace).text.strip(),
            'authors': [author.find('arxiv:name', namespace).text for author in entry.findall('arxiv:author', namespace)],
            'published': entry.find('arxiv:published', namespace).text,
            'summary': entry.find('arxiv:summary', namespace).text.strip(),
            'link': entry.find('arxiv:id', namespace).text
        }
        papers.append(paper)
    
    return papers, entry


categories = ["cs.AI", "cs.DB", "cs.CE", "cs.CC", "cs.CV", "cs.DC", "cs.DS", "cs.GT", "cs.LG", "cs.MA", "cs.NE", "cs.SE"]
# categories = ["cs.*"]

papers = []
for category in categories:
    print(f"\nFetching papers from category: {category}")
    paper_results = search_arxiv_by_category(category, max_results=3000)
    papers.extend([p for p in paper_results])
    print(f"papser length: {len(papers)} | search result: {len(paper_results)}")


Fetching papers from category: cs.AI
papser length: 2 | search result: 2

Fetching papers from category: cs.DB
papser length: 4 | search result: 2

Fetching papers from category: cs.CE
papser length: 6 | search result: 2

Fetching papers from category: cs.CC
papser length: 8 | search result: 2

Fetching papers from category: cs.CV
papser length: 10 | search result: 2

Fetching papers from category: cs.DC
papser length: 12 | search result: 2

Fetching papers from category: cs.DS
papser length: 14 | search result: 2

Fetching papers from category: cs.GT
papser length: 16 | search result: 2

Fetching papers from category: cs.LG
papser length: 18 | search result: 2

Fetching papers from category: cs.MA
papser length: 20 | search result: 2

Fetching papers from category: cs.NE
papser length: 22 | search result: 2

Fetching papers from category: cs.SE
papser length: 24 | search result: 2


In [7]:
paper_result, entry = paper_results

In [8]:
entry

<Element '{http://www.w3.org/2005/Atom}entry' at 0x16a0d3830>

In [4]:
paper_results[0]

{'title': 'Textured Gaussians for Enhanced 3D Scene Appearance Modeling',
 'authors': ['Brian Chao',
  'Hung-Yu Tseng',
  'Lorenzo Porzi',
  'Chen Gao',
  'Tuotuo Li',
  'Qinbo Li',
  'Ayush Saraf',
  'Jia-Bin Huang',
  'Johannes Kopf',
  'Gordon Wetzstein',
  'Changil Kim'],
 'published': '2024-11-27T18:59:59Z',
 'summary': '3D Gaussian Splatting (3DGS) has recently emerged as a state-of-the-art 3D\nreconstruction and rendering technique due to its high-quality results and fast\ntraining and rendering time. However, pixels covered by the same Gaussian are\nalways shaded in the same color up to a Gaussian falloff scaling factor.\nFurthermore, the finest geometric detail any individual Gaussian can represent\nis a simple ellipsoid. These properties of 3DGS greatly limit the expressivity\nof individual Gaussian primitives. To address these issues, we draw inspiration\nfrom texture and alpha mapping in traditional graphics and integrate it with\n3DGS. Specifically, we propose a new genera

In [15]:
paper_results[0]

{'title': 'Textured Gaussians for Enhanced 3D Scene Appearance Modeling',
 'authors': ['Brian Chao',
  'Hung-Yu Tseng',
  'Lorenzo Porzi',
  'Chen Gao',
  'Tuotuo Li',
  'Qinbo Li',
  'Ayush Saraf',
  'Jia-Bin Huang',
  'Johannes Kopf',
  'Gordon Wetzstein',
  'Changil Kim'],
 'published': '2024-11-27T18:59:59Z',
 'summary': '3D Gaussian Splatting (3DGS) has recently emerged as a state-of-the-art 3D\nreconstruction and rendering technique due to its high-quality results and fast\ntraining and rendering time. However, pixels covered by the same Gaussian are\nalways shaded in the same color up to a Gaussian falloff scaling factor.\nFurthermore, the finest geometric detail any individual Gaussian can represent\nis a simple ellipsoid. These properties of 3DGS greatly limit the expressivity\nof individual Gaussian primitives. To address these issues, we draw inspiration\nfrom texture and alpha mapping in traditional graphics and integrate it with\n3DGS. Specifically, we propose a new genera

In [3]:
paper_dict = {
    "title": [],
    "authors": [],
    "abstract": [],
    "published": [],
    "link": []
}

for p in tqdm(papers):
    paper_dict["title"].append(p["title"])
    paper_dict["authors"].append(p["authors"])
    paper_dict["abstract"].append(p["summary"])
    paper_dict["published"].append(p["published"])
    paper_dict["link"].append(p["link"])

paper_df = pd.DataFrame(paper_dict)

paper_df.head()

100%|██████████| 13573/13573 [00:00<00:00, 447863.62it/s]


Unnamed: 0,title,authors,abstract,published,link
0,Cross-modal Information Flow in Multimodal Lar...,"[Zhi Zhang, Srishti Yadav, Fengze Han, Ekateri...",The recent advancements in auto-regressive mul...,2024-11-27T18:59:26Z,http://arxiv.org/abs/2411.18620v1
1,Diffusion Self-Distillation for Zero-Shot Cust...,"[Shengqu Cai, Eric Chan, Yunzhi Zhang, Leonida...",Text-to-image diffusion models produce impress...,2024-11-27T18:58:52Z,http://arxiv.org/abs/2411.18616v1
2,Proactive Gradient Conflict Mitigation in Mult...,"[Zhi Zhang, Jiayi Shen, Congfeng Cao, Gaole Da...",Advancing towards generalist agents necessitat...,2024-11-27T18:58:22Z,http://arxiv.org/abs/2411.18615v1
3,Robust Offline Reinforcement Learning with Lin...,"[Cheng Tang, Zhishuai Liu, Pan Xu]",The Distributionally Robust Markov Decision Pr...,2024-11-27T18:57:03Z,http://arxiv.org/abs/2411.18612v1
4,Automated Literature Review Using NLP Techniqu...,"[Nurshat Fateh Ali, Md. Mahdi Mohtasim, Shakil...",This research presents and compares multiple a...,2024-11-27T18:27:07Z,http://arxiv.org/abs/2411.18583v1


In [4]:
paper_df.to_csv("./paper_df_arxiv_large.csv", index=False)

In [5]:
df_tmp = pd.read_csv('/Users/11488608/Documents/master_degree/dsde/paper_search/scripts/paper_df_arxiv_large.csv')

df_tmp.shape

(13573, 5)