In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import networkx as nx

# Parameters
start = 0               # offset of the first search result to request
size = 200              # number of results requested per page
total_journals = 2000   # paging stops once `start` reaches this offset
request_timeout = 30    # seconds to wait for a response before giving up on a page

articles = []

# Class string used to locate the label elements inspected for the date
# (presumably the bold "Submitted" / "v1" labels in the arXiv listing markup
# -- TODO confirm against the live page).
LABEL_CLASS = 'has-text-black-bis has-text-weight-semibold'


def _text_of(result, css_class):
    """Return the stripped text of the first element with *css_class*, or None."""
    elem = result.find(attrs={'class': css_class})
    return elem.get_text(strip=True) if elem is not None else None


def _extract_date(result):
    """Return the submission date text for one search result, or None.

    The v1 submission date is preferred when the result carries a "v1" label;
    otherwise the text that follows the "Submitted" label is used.
    """
    for label in result.find_all(attrs={'class': LABEL_CLASS}):
        if label.get_text(strip=True) != "v1":
            continue
        # The date is the text node right after the label element, not the
        # label's own text. NOTE(review): assumes that text looks like
        # "submitted <date>;" -- the leading word is dropped so both branches
        # return the same shape.
        following = label.next_sibling
        if isinstance(following, str):
            text = following.strip()
            if text.lower().startswith("submitted"):
                text = text[len("submitted"):].strip()
            if text:
                return text
        break

    submitted = result.find(string="Submitted")
    if submitted is not None:
        # `string=True` replaces the deprecated `text=True` argument.
        following = submitted.find_next(string=True)
        if following is not None:
            return following.strip()
    return None


def _parse_result(result):
    """Convert one 'arxiv-result' element into a dict describing the article."""
    keyword_elems = result.find_all(
        attrs={'class': 'tag is-small is-link tooltip is-tooltip-top'}
    )
    keywords = [kw.get('data-tooltip', '').strip() for kw in keyword_elems]

    # A result without an authors element yields an empty list instead of
    # raising AttributeError on None.
    author_head = result.find(attrs={'class': 'authors'})
    if author_head is not None:
        authors = [a.get_text(strip=True) for a in author_head.find_all("a", href=True)]
    else:
        authors = []

    pdf_elem = result.find("a", string="pdf")

    return {
        'title': _text_of(result, 'title is-5 mathjax'),
        'keywords': keywords,
        'authors': authors,
        'abstract': _text_of(result, 'abstract-full has-text-grey-dark mathjax'),
        'date': _extract_date(result),
        'pdf': pdf_elem.get('href') if pdf_elem is not None else None,
    }


## Loop pages
while start < total_journals:
    url = f"https://arxiv.org/search/?query=a&searchtype=all&abstracts=show&order=-announced_date_first&size={size}&start={start}"
    try:
        response = requests.get(url, timeout=request_timeout)
    except requests.RequestException as exc:
        # Stop paging but keep whatever was collected so far, mirroring the
        # handling of a non-200 response below.
        print(f"Failed to retrieve data for start={start}. Request error: {exc}")
        break

    if response.status_code != 200:
        print(f"Failed to retrieve data for start={start}. HTTP Status Code: {response.status_code}")
        break

    soup = BeautifulSoup(response.text, "html.parser")
    results = soup.find_all(attrs={'class': 'arxiv-result'})
    articles.extend(_parse_result(result) for result in results)

    # Advance to the next page (`size` results per request)
    start += size

# Create df
# The columns are named explicitly (same keys, same order as the scraped
# article dicts) so that an empty `articles` list still produces a frame --
# and a CSV header -- with the expected schema instead of a column-less one.
article_columns = ['title', 'keywords', 'authors', 'abstract', 'date', 'pdf']
df = pd.DataFrame(articles, columns=article_columns)
display(df)

# Write the table to disk without the index column.
# NOTE: CSV has no list type, so list-valued cells are stored as text and
# come back as strings if the file is read again.
csv_path = "arxiv_articles.csv"
df.to_csv(csv_path, index=False)

# Undirected graph of authors (nx.DiGraph() would be the directed alternative).
G = nx.Graph()

for _, record in df.iterrows():
    # Register every author of this article as a node named "Author_<name>"
    # carrying type="author". Adding a node that already exists leaves the
    # graph unchanged, so no membership check is needed. No edges are created.
    G.add_nodes_from(
        (f"Author_{author}" for author in record['authors']),
        type="author",
    )

# Write the graph out in GML format, reporting success or the failure reason.
output_gml = "authors_graph.gml"
try:
    nx.write_gml(G, output_gml)
except Exception as e:
    print(f"Failed to save graph: {e}")
else:
    print(f"Graph saved to {output_gml}")



  date = submitted_elem.find_next(text=True).strip()


Unnamed: 0,title,keywords,authors,abstract,date,pdf
0,QUEEN: QUantized Efficient ENcoding of Dynamic...,[Computer Vision and Pattern Recognition],"[Sharath Girish, Tianye Li, Amrita Mazumdar, A...",Online free-viewpoint video (FVV) streaming is...,"5 December, 2024;",https://arxiv.org/pdf/2412.04469
1,NVILA: Efficient Frontier Visual Language Models,[Computer Vision and Pattern Recognition],"[Zhijian Liu, Ligeng Zhu, Baifeng Shi, Zhuoyan...",Visual language models (VLMs) have made signif...,"5 December, 2024;",https://arxiv.org/pdf/2412.04468
2,UnZipLoRA: Separating Content and Style from a...,[Computer Vision and Pattern Recognition],"[Chang Liu, Viraj Shah, Aiyu Cui, Svetlana Laz...","This paper introduces UnZipLoRA, a method for ...","5 December, 2024;",https://arxiv.org/pdf/2412.04465
3,DualPM: Dual Posed-Canonical Point Maps for 3D...,[Computer Vision and Pattern Recognition],"[Ben Kaye, Tomas Jakab, Shangzhe Wu, Christian...",The choice of data representation is a key fac...,"5 December, 2024;",https://arxiv.org/pdf/2412.04464
4,"MegaSaM: Accurate, Fast, and Robust Structure ...",[Computer Vision and Pattern Recognition],"[Zhengqi Li, Richard Tucker, Forrester Cole, Q...","We present a system that allows for accurate, ...","5 December, 2024;",https://arxiv.org/pdf/2412.04463
...,...,...,...,...,...,...
1995,SCoTT: Wireless-Aware Path Planning with Visio...,[Machine Learning],"[Aladin Djuhera, Vlad C. Andrei, Amin Seffo, H...",Path planning is a complex problem for many pr...,"27 November, 2024;",https://arxiv.org/pdf/2411.18212
1996,Nature of metallic and insulating domains in t...,[Strongly Correlated Electrons],"[M. Straub, F. Petocchi, C. Witteveen, F. B. K...",We study the electronic structure of bulk 1T-T...,"27 November, 2024;",https://arxiv.org/pdf/2411.18205
1997,The ViCTORIA project: description of a multi-f...,[Cosmology and Nongalactic Astrophysics],"[F. de Gasperin, H. W. Edler, A. Boselli, P. S...",The Virgo cluster is the closest richest nearb...,"27 November, 2024;",https://arxiv.org/pdf/2411.18204
1998,"The Galaxy Activity, Torus, and Outflow Survey...",[Astrophysics of Galaxies],"[R. Poitevineau, F. Combes, S. Garcia-Burillo,...",The detailed feeding and feedback mechanisms o...,"27 November, 2024;",https://arxiv.org/pdf/2411.18200


Graph saved to authors_graph.gml
