<a href="https://colab.research.google.com/github/arnav-chauhan-kgpian/Detect-and-Segment-Oil-Spills-Using-Computer-Vision/blob/main/Text_Mining_Project_Scraping_ArXiv_Articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# category and total number of articles
ARXIV_FIELDS = {
    # Computer Science
    "cs": 10000
}

In [2]:
ARXIV_BASE_URL = "https://arxiv.org/list/"
ARXIV_ABS_URL = "https://arxiv.org/abs/"

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
def generate_arxiv_pages(fields):
    pages = {}

    for category, total_articles in fields.items():
        # Generating URLs based on the category and total number of articles
        category_pages = []
        for skip in range(0, total_articles, 25):
            page = f"{ARXIV_BASE_URL}{category}/23?skip={skip}&show=25"
            category_pages.append(page)

        pages[category] = category_pages

    return pages

In [5]:
arxiv_pages = generate_arxiv_pages(ARXIV_FIELDS)

In [6]:
total_pages = 0
for category, category_pages in arxiv_pages.items():
    print(f"{category}:")
    length = len(category_pages)
    print(length)
    total_pages += length
print("total pages: ", total_pages)

cs:
400
total pages:  400


In [7]:
def extract_ids(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    dl_tags = soup.find_all('dl')
    extracted_ids = []
    for dl_tag in dl_tags:
        dt_tags = dl_tag.find_all('dt')
        for dt_tag in dt_tags:
            # Iterate over all children of dt_tag
            for child in dt_tag.children:
                if getattr(child, 'name', None) == 'a' and child.get('title', '').lower() == 'abstract':
                    href_attr = child.get('href')
                    if href_attr:
                        id_value = href_attr.split('/')[-1]
                        extracted_ids.append(id_value)
                    break  # Only one abstract link per dt
    return extracted_ids

In [8]:
def scrape_articles_ids(page_url):
    try:
        # Send an HTTP GET request to the URL
        response = requests.get(page_url)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Extract Ids
            return extract_ids(response.content)
        else:
            print(f"Failed to retrieve HTML content. Status code: {response.status_code}")
            return None

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [9]:
def get_articles_ids(pages_urls):
    articles_ids = {}
    for category, pages_urls in arxiv_pages.items():
        ids = []
        total_pages = len(pages_urls)
        print(f"\nProcessing category: {category} ({total_pages} pages)")
        try:
            for i, page_url in enumerate(pages_urls, 1):
                page_ids = scrape_articles_ids(page_url)
                if page_ids:
                    ids += page_ids
                print(f"  Page {i}/{total_pages} for {category}: {len(ids)} IDs collected so far")
            articles_ids[category] = ids
        except Exception as e:
            print(f"An error occurred in category '{category}': {e}")

        print(f"Finished {category}: {len(ids)} IDs collected.")

    return articles_ids

In [10]:
articles_ids = get_articles_ids(arxiv_pages)


Processing category: cs (400 pages)
  Page 1/400 for cs: 50 IDs collected so far
  Page 2/400 for cs: 100 IDs collected so far
  Page 3/400 for cs: 150 IDs collected so far
  Page 4/400 for cs: 200 IDs collected so far
  Page 5/400 for cs: 250 IDs collected so far
  Page 6/400 for cs: 300 IDs collected so far
  Page 7/400 for cs: 350 IDs collected so far
  Page 8/400 for cs: 400 IDs collected so far
  Page 9/400 for cs: 450 IDs collected so far
  Page 10/400 for cs: 500 IDs collected so far
  Page 11/400 for cs: 550 IDs collected so far
  Page 12/400 for cs: 600 IDs collected so far
  Page 13/400 for cs: 650 IDs collected so far
  Page 14/400 for cs: 700 IDs collected so far
  Page 15/400 for cs: 750 IDs collected so far
  Page 16/400 for cs: 800 IDs collected so far
  Page 17/400 for cs: 850 IDs collected so far
  Page 18/400 for cs: 900 IDs collected so far
  Page 19/400 for cs: 950 IDs collected so far
  Page 20/400 for cs: 1000 IDs collected so far
  Page 21/400 for cs: 1050 IDs c

In [11]:
total_ids = 0
for _, ids in articles_ids.items():
  total_ids += len(ids)

print(f"total number of articles: {total_ids}")

total number of articles: 20000


In [12]:
def extract_summary(html_content):
    # Parse HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all blockquote tags containing blockquote tags
    blockquote = soup.find_all('blockquote')

    content_without_span = ''.join(blockquote[0].find_all(string=True, recursive=False)).strip()

    return content_without_span

In [13]:
def scrape_article(id):
    article_url = ARXIV_ABS_URL + id
    try:
        # Send an HTTP GET request to the URL
        response = requests.get(article_url)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Extract Ids
            return extract_summary(response.content)
        else:
            print(f"Failed to retrieve HTML content. Status code: {response.status_code}")
            return None

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [14]:
def extract_title(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    h1_tag = soup.find('h1', class_='title mathjax')
    if h1_tag:
        title_text = h1_tag.get_text(strip=True).replace('Title:', '').strip()
        return title_text
    return None

In [15]:
def scrape_title(id):
    article_url = ARXIV_ABS_URL + id
    try:
        response = requests.get(article_url)
        if response.status_code == 200:
            return extract_title(response.content)
        else:
            print(f"Failed to retrieve HTML content. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


In [18]:
def create_dfs(articles_ids):
    list_dfs = []
    for category, ids in articles_ids.items():
        df_dicts_list = []
        total = len(ids)
        print(f"\nProcessing category: {category} ({total} articles)")
        for idx, id in enumerate(ids, 1):
            content = scrape_article(id)  # abstract
            title = scrape_title(id)      # title
            df_dicts_list.append({"id": id, "title": title, "abstract": content})
            if idx % 10 == 0 or idx == total:
                print(f"  {idx}/{total} articles processed for {category}")
        category_df = pd.DataFrame(df_dicts_list)
        list_dfs.append(category_df)
        print(f"Done for {category} category with shape: {category_df.shape}")
    return list_dfs

In [19]:
list_articles_dfs = create_dfs(articles_ids)


Processing category: cs (20000 articles)
  10/20000 articles processed for cs
  20/20000 articles processed for cs
  30/20000 articles processed for cs
  40/20000 articles processed for cs
  50/20000 articles processed for cs
  60/20000 articles processed for cs
  70/20000 articles processed for cs
  80/20000 articles processed for cs
  90/20000 articles processed for cs
  100/20000 articles processed for cs
  110/20000 articles processed for cs
  120/20000 articles processed for cs
  130/20000 articles processed for cs
  140/20000 articles processed for cs
  150/20000 articles processed for cs
  160/20000 articles processed for cs
  170/20000 articles processed for cs
  180/20000 articles processed for cs
  190/20000 articles processed for cs
  200/20000 articles processed for cs
  210/20000 articles processed for cs
  220/20000 articles processed for cs
  230/20000 articles processed for cs
  240/20000 articles processed for cs
  250/20000 articles processed for cs
  260/20000 artic

In [21]:
final_df = pd.concat(list_articles_dfs, ignore_index=True)
print(final_df.shape)
final_df.head()

(20000, 3)


Unnamed: 0,id,title,abstract
0,2301.00001,NFTrig,NFTrig is a web-based application created for ...
1,2301.00002,Evaluating Alternative Glyph Design for Showin...,We present experimental results to explore a f...
2,2301.00003,Emotion in Cognitive Architecture: Emergent Pr...,This document presents endeavors to represent ...
3,2301.00005,Intrinsic Motivation in Dynamical Control Systems,Biological systems often choose actions withou...
4,2301.00006,Recovering Top-Two Answers and Confusion Proba...,Crowdsourcing has emerged as an effective plat...


In [22]:
final_df.tail()

Unnamed: 0,id,title,abstract
19995,2301.0013,Accuracy-Guaranteed Collaborative DNN Inferenc...,Collaboration among industrial Internet of Thi...
19996,2301.00131,Guided Hybrid Quantization for Object detectio...,"Considering the computation complexity, we pro..."
19997,2301.00134,Exploring the Use of Data-Driven Approaches fo...,The Internet of Things (IoT) is a system that ...
19998,2301.00135,TeViS:Translating Text Synopses to Video Story...,A video storyboard is a roadmap for video crea...
19999,2301.00136,Power of Decision Trees with Monotone Queries,"In this paper, we initiate study of the comput..."


In [23]:
final_df.to_csv('arxiv_dataset.csv', index=False)