In [1]:
"""Data Extraction from arxiv api"""
import os.path
from tqdm import tqdm
import argparse
from datetime import datetime, timedelta
from typing import Dict, List, Tuple
import requests
import feedparser
import json # import json instead of pandas
import fitz  # this is pymupdf

In [32]:
STANDARD_SEARCH_QUERY:str = "cat:cs.CV OR cat:cs.AI OR cat:cs.LG OR cat:cs.CL OR cat:cs.NE OR cat:stat.ML OR cat:cs.IR"


class ArxivParser:
    """Extract & Parse data from the Arxiv API"""
    base_url = "http://export.arxiv.org/api/query"

    def __init__(self, data_path="../data/"):
        self.extracted_data: List[Dict[str, str]] = [] # create an empty list instead of a dataframe

        if not os.path.exists(data_path):
            os.makedirs(data_path)
        self.data_path = data_path

    def get_results(
        self,
        max_results: int = 5,
        days: int = 60,
        search_query: str = STANDARD_SEARCH_QUERY,
    ) -> List[Dict[str, str]]: # return a list of dictionaries instead of a dataframe
        # Construct the url with the query parameters
        params = {
            "search_query": search_query,
            "start": 0,
            "max_results": max_results,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }
        url = self.base_url + "?" + requests.compat.urlencode(params)

        # Send a GET request to the api endpoint
        response = requests.get(url)
        # Parse the response
        entries = feedparser.parse(response.text).entries

        downloaded_data: List[Dict[str, str]] = [] # create an empty list instead of a dictionary

        # Loop through the entries
        for entry in tqdm(entries):
            published_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
            current_date = datetime.now()
            date_diff = (current_date - published_date).days

            # Check if the date difference is less than or equal to the days parameter
            if date_diff <= days:
                id = entry.id
                title = entry.title
                link = entry.link
                summary = entry.summary

                # Get the pdf link by replacing the "abs" with "pdf" in the link
                pdf_link = link.replace("abs", "pdf")
                # Get the pdf content by sending a GET request to the pdf link and opening it with fitz
                pdf_content = requests.get(pdf_link).content
                pdf_file = fitz.open(stream=pdf_content, filetype="pdf")
                # Extract the text from the pdf file
                pdf_text = ""
                for page in pdf_file:
                    pdf_text += page.get_text()
                # Store the extracted data in a dictionary and append it to the list
                downloaded_data.append({
                    "id": id,
                    "title": title,
                    "published_date": published_date,
                    "pdf_link": pdf_link,
                    "summary": summary,
                    "pdf_text": pdf_text,
                })
        # Extend the extracted data list with the downloaded data list
        self.extracted_data.extend(downloaded_data)
        # Return the list as it is
        return self.extracted_data


    def store_data(
        self,
        save_file_name: str = "master_data.json",
        max_results: int = 10,
        days: int = 60,
    ) -> None:
        self.extracted_data = self.get_results(max_results, days)
    
        assert len(self.extracted_data) > 0, "Got no results with the search query"
        # Convert the published_date to a string format
        for data in self.extracted_data:
            data["published_date"] = data["published_date"].strftime("%Y-%m-%d")
        # Save the list of dictionaries as a json file
        save_location = os.path.join(self.data_path, save_file_name)
        with open(save_location, "w") as f:
            json.dump(self.extracted_data, f, indent=4)

    def get_stored_data(self):
        # Return the self.extracted_data attribute

        assert len(self.extracted_data) != 0, "Please store data first"
        return self.extracted_data

In [33]:
a=ArxivParser()

In [34]:
a

<__main__.ArxivParser at 0x106bc14d0>

In [35]:
a.get_results()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:10<00:00,  2.11s/it]


[{'id': 'http://arxiv.org/abs/2402.03312v1',
  'title': 'Test-Time Adaptation for Depth Completion',
  'published_date': datetime.datetime(2024, 2, 5, 18, 59, 52),
  'pdf_link': 'http://arxiv.org/pdf/2402.03312v1',
  'summary': 'It is common to observe performance degradation when transferring models\ntrained on some (source) datasets to target testing data due to a domain gap\nbetween them. Existing methods for bridging this gap, such as domain adaptation\n(DA), may require the source data on which the model was trained (often not\navailable), while others, i.e., source-free DA, require many passes through the\ntesting data. We propose an online test-time adaptation method for depth\ncompletion, the task of inferring a dense depth map from a single image and\nassociated sparse depth map, that closes the performance gap in a single pass.\nWe first present a study on how the domain shift in each data modality affects\nmodel performance. Based on our observations that the sparse depth mo

In [36]:
a.store_data()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:16<00:00,  1.66s/it]


In [37]:
res=a.get_stored_data()

In [38]:
type(res)

list

In [39]:
len(res)

15

In [40]:
import pandas as pd

In [42]:
pd.read_json("../data/master_data.json")

Unnamed: 0,id,title,published_date,pdf_link,summary,pdf_text
0,http://arxiv.org/abs/2402.03312v1,Test-Time Adaptation for Depth Completion,2024-02-05,http://arxiv.org/pdf/2402.03312v1,It is common to observe performance degradatio...,Test-Time Adaptation for Depth Completion\nHyo...
1,http://arxiv.org/abs/2402.03311v1,HASSOD: Hierarchical Adaptive Self-Supervised ...,2024-02-05,http://arxiv.org/pdf/2402.03311v1,The human visual perception system demonstrate...,HASSOD: Hierarchical Adaptive Self-Supervised\...
2,http://arxiv.org/abs/2402.03310v1,V-IRL: Grounding Virtual Intelligence in Real ...,2024-02-05,http://arxiv.org/pdf/2402.03310v1,There is a sensory gulf between the Earth that...,V-IRL: Grounding Virtual Intelligence in Real ...
3,http://arxiv.org/abs/2402.03309v1,AONeuS: A Neural Rendering Framework for Acous...,2024-02-05,http://arxiv.org/pdf/2402.03309v1,Underwater perception and 3D surface reconstru...,AONeuS: A Neural Rendering Framework for Acous...
4,http://arxiv.org/abs/2402.03307v1,4D Gaussian Splatting: Towards Efficient Novel...,2024-02-05,http://arxiv.org/pdf/2402.03307v1,We consider the problem of novel view synthesi...,4D Gaussian Splatting:\nTowards Efficient Nove...
5,http://arxiv.org/abs/2402.03312v1,Test-Time Adaptation for Depth Completion,2024-02-05,http://arxiv.org/pdf/2402.03312v1,It is common to observe performance degradatio...,Test-Time Adaptation for Depth Completion\nHyo...
6,http://arxiv.org/abs/2402.03311v1,HASSOD: Hierarchical Adaptive Self-Supervised ...,2024-02-05,http://arxiv.org/pdf/2402.03311v1,The human visual perception system demonstrate...,HASSOD: Hierarchical Adaptive Self-Supervised\...
7,http://arxiv.org/abs/2402.03310v1,V-IRL: Grounding Virtual Intelligence in Real ...,2024-02-05,http://arxiv.org/pdf/2402.03310v1,There is a sensory gulf between the Earth that...,V-IRL: Grounding Virtual Intelligence in Real ...
8,http://arxiv.org/abs/2402.03309v1,AONeuS: A Neural Rendering Framework for Acous...,2024-02-05,http://arxiv.org/pdf/2402.03309v1,Underwater perception and 3D surface reconstru...,AONeuS: A Neural Rendering Framework for Acous...
9,http://arxiv.org/abs/2402.03307v1,4D Gaussian Splatting: Towards Efficient Novel...,2024-02-05,http://arxiv.org/pdf/2402.03307v1,We consider the problem of novel view synthesi...,4D Gaussian Splatting:\nTowards Efficient Nove...
