In [2]:
!pip install pandas arxivscraper requests

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import arxivscraper
import pandas as pd
import json
import os
import time
import boto3
import requests
import logging

In [4]:
## CATEGORIES IN ARXIV 
#    cats = ['cs', 'stat', 'econ', 'eess', 'math', 'physics', 'physics:astro-ph', 
#            'physics:cond-mat', 'physics:gr-qc', 'physics:hep-ex', 'physics:hep-lat', 
#            'physics:hep-ph', 'physics:hep-th', 'physics:math-ph', 'physics:nlin',
#            'physics:nucl-ex', 'physics:nucl-th', 'physics:physics', 'physics:quant-ph',
#            'q-bio', 'q-fin']


In [5]:
logging.basicConfig(level=logging.INFO)

def scrape_ai(start_date, end_date):
    folder = "ARXIV"
    if not os.path.exists(folder):
        os.makedirs(folder)
    
    categories = ['cs', 'stat', 'econ']
    for category in categories:
        scraper = arxivscraper.Scraper(category=category, date_from=start_date, date_until=end_date)
        output = scraper.scrape()
        
        cols = ('id', 'title', 'abstract', 'doi', 'created', 'url', 'authors')
        df = pd.DataFrame(output, columns=cols)
        json_data = df.to_json(orient='records')
        formatted_json = json.loads(json_data)
    
        with open(f'ARXIV/arxiv_data_{category}_{start_date}_{end_date}.json', 'w') as file:
            json.dump(formatted_json, file, indent=4)

def fetch_references(arxiv_id, created_date, authors):
    try:
        year = created_date[:4]
        number = arxiv_id.replace('arXiv:', '').replace('/', '').replace('.', '')
        initial = ''
        if authors:
            initial = authors[0].split()[-1][0]
        bibcode = f"{year}arXiv{number}{initial}"
        ads_api_token = "KYhVwN1WVnupN6wteeHCfVjtxAAegFktq1jVDqRj"
        headers = {"Authorization": f"Bearer {ads_api_token}"}
        query = f"bibcode:{bibcode}"
        ads_url = f"https://api.adsabs.harvard.edu/v1/search/query?q={query}&fl=bibcode,title,author,year,reference"
        
        response = requests.get(ads_url, headers=headers)
        if response.status_code == 200:
            try:
                docs = response.json().get('response', {}).get('docs', [])
                if docs:
                    return [doc.get('reference', []) for doc in docs]
                return []
            except json.JSONDecodeError:
                logging.error("Failed to decode JSON response")
                return []
        else:
            logging.error(f"Failed to fetch references with status code {response.status_code} for query {query}")
            logging.error(f"Response Body: {response.text}")
            return []
    except IndexError as e:
        logging.error(f"Error processing arXiv ID {arxiv_id}: {e}")
        return []

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [6]:
scrape_ai(start_date = '2024-04-08', end_date = '2024-04-08') 

fetching up to  1000 records...
fetching is completed in 4.1 seconds.
Total number of records 517
fetching up to  1000 records...
Got 503. Retrying after 30 seconds.
fetching up to  1000 records...
fetching is completed in 33.3 seconds.
Total number of records 57
fetching up to  1000 records...
Got 503. Retrying after 30 seconds.
fetching up to  1000 records...
fetching is completed in 33.1 seconds.
Total number of records 18


In [7]:
def merge_and_fetch_references(file_paths, output_file):
    merged_data = []
    for path in file_paths:
        logging.info(f"Processing file: {path}")
        with open(path, 'r') as file:
            data = json.load(file)
            for entry in data:
                arxiv_id = entry['id']
                created_date = entry['created']
                authors = entry.get('authors', [])
                logging.info(f"Fetching references for arXiv ID: {arxiv_id}, Year: {created_date[:4]}")
                references = fetch_references(arxiv_id, created_date, authors)
                if references:
                    logging.info(f"Found references for {arxiv_id}")
                else:
                    logging.warning(f"No references found for {arxiv_id}")
                entry['references'] = references
            merged_data.extend(data)
        logging.info(f"Finished processing file: {path}")

    with open(output_file, 'w') as outfile:
        json.dump(merged_data, outfile, indent=4)
        logging.info(f"Merged data saved to {output_file}")

files = [os.path.join('ARXIV', file) for file in os.listdir('ARXIV') if file.endswith('.json')]
output_file = 'ARXIV/merged.json'
merge_and_fetch_references(files, output_file)


INFO:root:Processing file: ARXIV/arxiv_data_stat_2024-04-08_2024-04-08.json
INFO:root:Fetching references for arXiv ID: 1801.04064, Year: 2018
INFO:root:Found references for 1801.04064
INFO:root:Fetching references for arXiv ID: 1808.02933, Year: 2018
INFO:root:Found references for 1808.02933
INFO:root:Fetching references for arXiv ID: 1907.01136, Year: 2019
INFO:root:Found references for 1907.01136
INFO:root:Fetching references for arXiv ID: 1907.02652, Year: 2019
INFO:root:Fetching references for arXiv ID: 2012.09561, Year: 2020
INFO:root:Found references for 2012.09561
INFO:root:Fetching references for arXiv ID: 2102.00618, Year: 2021
INFO:root:Found references for 2102.00618
INFO:root:Fetching references for arXiv ID: 2112.04389, Year: 2021
INFO:root:Found references for 2112.04389
INFO:root:Fetching references for arXiv ID: 2206.01012, Year: 2022
INFO:root:Found references for 2206.01012
INFO:root:Fetching references for arXiv ID: 2207.14800, Year: 2022
INFO:root:Found references 