In [2]:
!pip install pandas arxivscraper requests

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import arxivscraper
import pandas as pd
import json
import os
import time
import boto3
import requests
import logging
import random

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [4]:
## CATEGORIES IN ARXIV 
#    cats = ['cs', 'stat', 'econ', 'eess', 'math', 'physics', 'physics:astro-ph', 
#            'physics:cond-mat', 'physics:gr-qc', 'physics:hep-ex', 'physics:hep-lat', 
#            'physics:hep-ph', 'physics:hep-th', 'physics:math-ph', 'physics:nlin',
#            'physics:nucl-ex', 'physics:nucl-th', 'physics:physics', 'physics:quant-ph',
#            'q-bio', 'q-fin']


In [5]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def scrape_ai(start_date, end_date, max_articles=20):
    folder = "ARXIV"
    if not os.path.exists(folder):
        os.makedirs(folder)
    
    categories = ['cs', 'stat', 'econ']
    for category in categories:
        scraper = arxivscraper.Scraper(category=category, date_from=start_date, date_until=end_date)
        output = scraper.scrape()
        
        cols = ('id', 'title', 'abstract', 'doi', 'created', 'url', 'authors')
        df = pd.DataFrame(output, columns=cols)
        
        # Limit the number of articles to max_articles
        df = df.head(3)
        
        json_data = df.to_json(orient='records')
        formatted_json = json.loads(json_data)
        
        with open(f'ARXIV/arxiv_data_{category}_{start_date}_{end_date}.json', 'w') as file:
            json.dump(formatted_json, file, indent=4)

def fetch_references(arxiv_id, created_date, authors, max_retries=3, base_delay=3):
    try:
        year = created_date[:4]
        number = arxiv_id.replace('arXiv:', '').replace('/', '').replace('.', '')
        initial = ''
        if authors:
            initial = authors[0].split()[-1][0]
        bibcode = f"{year}arXiv{number}{initial}"
        ads_api_token = "f8XsbXMlFWBC2umi8tCdJP9VoUV0ojEGW5AxtxdE"
        headers = {"Authorization": f"Bearer {ads_api_token}"}
        query = f"bibcode:{bibcode}"
        ads_url = f"https://api.adsabs.harvard.edu/v1/search/query?q={query}&fl=bibcode,title,author,year,reference"
        
        retries = 0
        while retries < max_retries:
            response = requests.get(ads_url, headers=headers)
            if response.status_code == 200:
                try:
                    docs = response.json().get('response', {}).get('docs', [])
                    if docs:
                        return [doc.get('reference', []) for doc in docs]
                    return []
                except json.JSONDecodeError:
                    logging.error("Failed to decode JSON response")
                    return []
            elif response.status_code == 429:
                retries += 1
                delay = base_delay * (1 ** (retries - 1))
                delay += random.uniform(0, 1)
                logging.warning(f"Rate limit exceeded. Retrying in {delay:.2f} seconds... (Retry {retries}/{max_retries})")
                time.sleep(delay)
            else:
                logging.error(f"Failed to fetch references with status code {response.status_code} for query {query}")
                logging.error(f"Response Body: {response.text}")
                return []
        
        logging.error(f"Max retries exceeded for query {query}")
        return []
    except IndexError as e:
        logging.error(f"Error processing arXiv ID {arxiv_id}: {e}")
        return []

In [6]:
scrape_ai(start_date = '2024-04-17', end_date = '2024-04-17')

fetching up to  1000 records...
fetching is completed in 6.0 seconds.
Total number of records 750
fetching up to  1000 records...
Got 503. Retrying after 30 seconds.
fetching up to  1000 records...
fetching is completed in 33.3 seconds.
Total number of records 72
fetching up to  1000 records...
Got 503. Retrying after 30 seconds.
fetching up to  1000 records...
fetching is completed in 33.0 seconds.
Total number of records 12


In [7]:
def merge_and_fetch_references(file_paths, output_file, references_file):
    # Load existing data from merged.json
    if os.path.exists(output_file):
        with open(output_file, 'r') as file:
            merged_data = json.load(file)
    else:
        merged_data = []

    # Load existing data from references.json
    if os.path.exists(references_file):
        with open(references_file, 'r') as file:
            references_data = json.load(file)
    else:
        references_data = []

    for path in file_paths:
        logging.info(f"Processing file: {path}")
        with open(path, 'r') as file:
            data = json.load(file)
            for entry in data:
                arxiv_id = entry['id']
                created_date = entry['created']
                authors = entry.get('authors', [])
                logging.info(f"Fetching references for arXiv ID: {arxiv_id}, Year: {created_date[:4]}")
                try:
                    references = fetch_references(arxiv_id, created_date, authors)
                    if references:
                        logging.info(f"Found references for {arxiv_id}")
                    else:
                        logging.warning(f"No references found for {arxiv_id}")
                    entry['references'] = {str(i): ref for i, ref in enumerate(references)}
                    arxiv_references = [ref for ref in references if any(item.startswith("arXiv:") for item in ref)]
                    if arxiv_references:
                        references_entry = {
                            "source": [arxiv_id, f"https://arxiv.org/abs/{arxiv_id}"]
                        }
                        for i, ref in enumerate(arxiv_references):
                            target_id = ref[0].split(":")[-1].split("v")[0]
                            references_entry[f"target_{i+1:02d}"] = [
                                target_id,
                                f"https://arxiv.org/abs/{target_id}"
                            ]
                        references_data.append(references_entry)
                except Exception as e:
                    logging.error(f"Error fetching references for arXiv ID: {arxiv_id}. Error: {str(e)}")
                    entry['references'] = {}  # Leave references blank if not retrievable
            merged_data.extend(data)
        logging.info(f"Finished processing file: {path}")

    with open(output_file, 'w') as outfile:
        json.dump(merged_data, outfile, indent=4)
    logging.info(f"Merged data saved to {output_file}")

    with open(references_file, 'w') as outfile:
        json.dump(references_data, outfile, indent=4)
    logging.info(f"References data saved to {references_file}")

In [8]:
files = [os.path.join('ARXIV', file) for file in os.listdir('ARXIV') if file.endswith('.json')]
output_file = 'ARXIV/merged.json'
references_file = 'ARXIV/references.json'
merge_and_fetch_references(files, output_file, references_file)

2024-04-17 18:43:52,450 - INFO - Processing file: ARXIV/arxiv_data_stat_2024-04-08_2024-04-08.json
2024-04-17 18:43:52,454 - INFO - Fetching references for arXiv ID: 1801.04064, Year: 2018
2024-04-17 18:43:55,631 - ERROR - Max retries exceeded for query bibcode:2018arXiv180104064s
2024-04-17 18:43:55,638 - INFO - Fetching references for arXiv ID: 1808.02933, Year: 2018
2024-04-17 18:43:58,815 - ERROR - Max retries exceeded for query bibcode:2018arXiv180802933u
2024-04-17 18:43:58,819 - INFO - Fetching references for arXiv ID: 1907.01136, Year: 2019
2024-04-17 18:44:01,996 - ERROR - Max retries exceeded for query bibcode:2019arXiv190701136c
2024-04-17 18:44:01,999 - INFO - Finished processing file: ARXIV/arxiv_data_stat_2024-04-08_2024-04-08.json
2024-04-17 18:44:02,000 - INFO - Processing file: ARXIV/arxiv_data_stat_2024-04-17_2024-04-17.json
2024-04-17 18:44:02,001 - INFO - Fetching references for arXiv ID: 1710.10345, Year: 2017
2024-04-17 18:44:05,341 - ERROR - Max retries exceeded 

JSONDecodeError: Expecting value: line 1 column 1 (char 0)