# Web scraping of article abstracts from new submissions on arXiv

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd

### We are gonna choose theoretical high energy physics articles 😃

In [2]:
url="https://arxiv.org/list/hep-th/new"  ##New theoretical high energy physics articles
response = requests.get(url)

In [3]:
soup = BeautifulSoup(response.text)
articles = soup.find_all("div",attrs={"class":"meta"})

In [22]:
##Collecting the info in a dictionary
articleInfo = []

for article in articles:
   
    title= article.find("div",attrs={"class":"list-title mathjax"}).text.replace("Title:\n","").strip() #title
    abstract = article.find("p").text.replace("\n","").strip() #abstract
    authors=[article.find('div',attrs={"class":"list-authors"}).text] #authors list
    subjects=re.sub(r"(Subjects:|\n)","",article.find('div',attrs={"class":"list-subjects"}).text) #article subjects

    arxivArticle= {
        "Title": title,
        "Abstract": abstract,
        "Authors": authors,
        "Subjects": subjects
    }

    articleInfo.append(arxivArticle)

In [23]:
final_info=pd.DataFrame(articleInfo)

In [24]:
final_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Title     84 non-null     object
 1   Abstract  84 non-null     object
 2   Authors   84 non-null     object
 3   Subjects  84 non-null     object
dtypes: object(4)
memory usage: 2.8+ KB


In [25]:
final_info.head()

Unnamed: 0,Title,Abstract,Authors,Subjects
0,"Algebras, Entanglement Islands, and Observers",Some recent work has postulated the existence ...,"[Hao Geng, Yikun Jiang, Jiuci Xu]",High Energy Physics - Theory (hep-th); General...
1,Generalized False Vacuum Skyrme model,We propose a generalization of the False Vacuu...,"[L. A. Ferreira, L. R. Livramento]",High Energy Physics - Theory (hep-th); Mathema...
2,Compton amplitude and Contact term(s) in the S...,"In gauge theories, contact terms play an impor...","[Aakash Kumar, Arnab Rudra, Rahul Shaw]",High Energy Physics - Theory (hep-th); High En...
3,Can Non-Relativistic Strings Propagate Without...,We present a minimal and dynamically consisten...,"[Partha Nandi, Sk. Moinuddin, Abdus Sattar]",High Energy Physics - Theory (hep-th); General...
4,On the Renormalization Group flow of distribut...,Renormalization Group flows relate the values ...,"[Astrid Eichhorn, Aaron Held]",High Energy Physics - Theory (hep-th); High En...


### With a search query

In [10]:
query = input("What do you want to search for on Arxiv.org? use + as separator: \n")
query_url=f"https://arxiv.org/search/?query={query}&searchtype=all&abstracts=show&order=-announced_date_first&size=50"

What do you want to search for on Arxiv.org? use + as separator: 
 dark+matter


In [18]:
response_query = requests.get(query_url)
soup_query = BeautifulSoup(response_query.text)
articles_50 = soup_query.find_all("li",attrs={"class":"arxiv-result"})

In [20]:
searchInfo=[]

for article in articles_50:
    Title = re.sub(r"\n","",article.find("p",attrs={"class":"title"}).text).strip()
    abstract = article.find("p",attrs={"class":"abstract"})
    FullAbstract = re.sub(r"(\n|△ Less)","",abstract.find("span",{"class":"abstract-full"}).text).strip()
    pdfURL = article.div.p.span.a['href'],
    authors = article.find("p",{"class":"authors"}).text.split()[1:]


    arxivArticle_50= {
        "Topic": Title,
        "Abstract": FullAbstract,
        "PDF": pdfURL,
        "Authors": authors

    }


    searchInfo.append(arxivArticle_50)

In [26]:
Query_result_df=pd.DataFrame(searchInfo)

In [27]:
Query_result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Topic     50 non-null     object
 1   Abstract  50 non-null     object
 2   PDF       50 non-null     object
 3   Authors   50 non-null     object
dtypes: object(4)
memory usage: 1.7+ KB


In [28]:
Query_result_df.head()

Unnamed: 0,Topic,Abstract,PDF,Authors
0,Cosmological Remapping for Efficient Generatio...,We present a novel application of cosmological...,"(https://arxiv.org/pdf/2506.14588,)","[Rahima, Mokeddem,, Bruno, B., Bizarria,, Jiaj..."
1,The power of SKA to Constrain cosmological gra...,The inspirals of supermaissive black hole bina...,"(https://arxiv.org/pdf/2506.14366,)","[Chengjie, Fu,, Jing, Liu]"
2,Enhancing photon-axion conversion probability ...,"In particle physics, axions and axion-like par...","(https://arxiv.org/pdf/2506.14354,)","[Taiki, Ikeda,, Sugumi, Kanno,, Jiro, Soda]"
3,On Geometrization of Classical Fields II (MES:...,The study of arXiv:2502.01174 geometrization o...,"(https://arxiv.org/pdf/2506.14313,)","[V., I., Noskov]"
4,The dynamics of background evolution and struc...,"The Baryon Acoustic Oscillation (BAO) feature,...","(https://arxiv.org/pdf/2506.14275,)","[Pankaj, Chavan,, Tapomoy, Guha, Sarkar,, Anja..."
