In [1]:
!pip install pandas requests



In [2]:
import os
import sys

# arxivscraper is imported from a local scripts directory rather than from
# site-packages. The location can be overridden with ARXIVSCRAPER_DIR so the
# notebook is not tied to one machine; the default keeps the original path.
# The insert happens before the remaining imports, as in the original cell.
sys.path.insert(0, os.environ.get('ARXIVSCRAPER_DIR', 'D:/Developers/Scripts'))

import json
import logging
import re
import time

import pandas as pd
import requests

from arxivscraper import Scraper

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [3]:
def generate_bibcode(arxiv_id, created_date, authors):
    """Build the two candidate bibcodes used to look up an arXiv record.

    Both candidates have the shape ``{year}arXiv{number}{initial}``:

    * ``year`` is the first four characters of ``created_date``;
    * ``number`` is ``arxiv_id`` with any ``arXiv:`` prefix and ``/`` removed;
    * ``initial`` is the upper-cased first character of the last
      whitespace-separated token of the first author, or ``''`` when no
      usable author name is available.

    Args:
        arxiv_id: arXiv identifier string, e.g. ``'1406.0134'``.
        created_date: Creation date string whose first four characters are the year.
        authors: Sequence of author name strings; may be empty or ``None``.

    Returns:
        A ``(bibcode_without_dot, bibcode_with_dot)`` tuple. The first variant
        has every ``.`` stripped from the number, the second keeps it.
    """
    year = created_date[:4]
    number = arxiv_id.replace('arXiv:', '').replace('/', '')

    # A blank or whitespace-only first author has no tokens, so indexing the
    # split result directly would raise IndexError; fall back to no initial.
    first_author = authors[0] if authors else ''
    name_tokens = first_author.split() if isinstance(first_author, str) else []
    initial = name_tokens[-1][0].upper() if name_tokens else ''

    bibcode_without_dot = f"{year}arXiv{number.replace('.', '')}{initial}"
    bibcode_with_dot = f"{year}arXiv{number}{initial}"
    logging.info(f"Generated bibcodes for arXiv ID {arxiv_id}: {bibcode_without_dot}, {bibcode_with_dot}")
    return bibcode_without_dot, bibcode_with_dot

def fetch_references(bibcodes, api_token=None, batch_size=2000, timeout=30):
    """Fetch the reference lists of the given bibcodes from the ADS bigquery endpoint.

    Args:
        bibcodes: Iterable of bibcode strings to look up.
        api_token: ADS API token. When omitted, the ``ADS_API_TOKEN``
            environment variable is used. The token is deliberately not
            stored in the notebook.
        batch_size: Maximum number of bibcodes sent per request. The same
            number is passed as ``rows`` so that the response is not cut off
            at the API's default page size.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        Dict mapping each bibcode returned by the API to its list of
        references (an empty list when the document has no ``reference``
        field). Batches that do not return HTTP 200 are logged and contribute
        nothing, so a fully failed lookup yields ``{}``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If no API token is configured.
    """
    bibcodes = list(bibcodes)
    if not bibcodes:
        # Nothing to look up; avoid spending an API request on an empty query.
        return {}
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    token = api_token or os.environ.get('ADS_API_TOKEN')
    if not token:
        raise RuntimeError(
            "ADS API token not configured: set the ADS_API_TOKEN environment "
            "variable or pass api_token=..."
        )

    url = "https://api.adsabs.harvard.edu/v1/search/bigquery"
    headers = {"Authorization": f"Bearer {token}"}

    references = {}
    for start in range(0, len(bibcodes), batch_size):
        batch = bibcodes[start:start + batch_size]
        # 'rows' must be set explicitly, otherwise only the first page of
        # matching documents is returned regardless of how many were requested.
        params = {
            'q': '*:*',
            'fl': 'bibcode,reference',
            'wt': 'json',
            'fq': '{!bitset}',
            'rows': len(batch),
        }
        data = 'bibcode\n' + '\n'.join(batch)

        response = requests.post(url, headers=headers, params=params, data=data, timeout=timeout)
        if response.status_code != 200:
            logging.error(f"Failed to fetch references with status code {response.status_code}")
            logging.error(f"API response content: {response.text}")
            continue

        for doc in response.json().get('response', {}).get('docs', []):
            references[doc['bibcode']] = doc.get('reference', [])

    return references

In [4]:
def scrape_ai(start_date, end_date, max_limit=20, categories=('cs', 'stat', 'econ'), folder="ARXIV"):
    """Scrape arXiv metadata per category and write one JSON file per category.

    For every category a ``Scraper`` is run over the given date range and the
    result is saved as ``arxiv_data_{category}_{start_date}_{end_date}.json``
    inside ``folder``.

    Args:
        start_date: First date of the range, passed to ``Scraper`` as ``date_from``.
        end_date: Last date of the range, passed to ``Scraper`` as ``date_until``.
        max_limit: Passed to ``Scraper`` as ``max_records``.
        categories: Iterable of arXiv category names to scrape.
        folder: Output directory; created if it does not exist.

    Returns:
        None. The results are written to disk.
    """
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(folder, exist_ok=True)

    cols = ('id', 'title', 'abstract', 'doi', 'created', 'url', 'authors')
    for category in categories:
        scraper = Scraper(category=category, date_from=start_date, date_until=end_date, max_records=max_limit)
        output = scraper.scrape()

        df = pd.DataFrame(output, columns=cols)
        # Round-trip through pandas' JSON encoder so the records are plain
        # JSON-compatible Python objects before being pretty-printed below.
        formatted_json = json.loads(df.to_json(orient='records'))

        out_path = os.path.join(folder, f'arxiv_data_{category}_{start_date}_{end_date}.json')
        with open(out_path, 'w', encoding='utf-8') as file:
            json.dump(formatted_json, file, indent=4)

In [5]:
# Scrape a single day: the range starts and ends on the same date.
scrape_ai(
    start_date='2024-04-17',
    end_date='2024-04-17',
    max_limit=1000,
)

Fetching is completed in 6.7 seconds.
Total number of records: 750
Fetching is completed in 8.8 seconds.
Total number of records: 72
Fetching is completed in 8.2 seconds.
Total number of records: 12


In [6]:
def merge_and_fetch_references(file_paths, output_file, references_file):
    """Merge scraped arXiv JSON files and attach references fetched by bibcode.

    Every entry of every input file is collected into one list. Two candidate
    bibcodes are generated per entry with ``generate_bibcode`` and all of them
    are looked up through ``fetch_references``. The references found under
    either candidate are stored on the entry as ``entry['references']``.

    Args:
        file_paths: Iterable of paths to JSON files, each holding a list of
            entries with at least ``'id'`` and ``'created'`` keys.
        output_file: Path the merged entries (with references) are written to.
        references_file: Path a ``{arxiv_id: references}`` mapping is written to.

    Returns:
        None. Both results are written to disk as indented JSON.
    """
    # A previous run leaves its outputs next to the inputs. Reading them back
    # would duplicate entries (merged file) or fail on a dict (references file).
    own_outputs = {
        os.path.normcase(os.path.abspath(output_file)),
        os.path.normcase(os.path.abspath(references_file)),
    }

    merged_data = []
    for path in file_paths:
        if os.path.normcase(os.path.abspath(path)) in own_outputs:
            logging.info(f"Skipping previously generated output file: {path}")
            continue
        with open(path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        if not isinstance(data, list):
            logging.warning(f"Skipping {path}: expected a JSON list of entries")
            continue
        merged_data.extend(data)

    # Generate each entry's bibcode pair exactly once and reuse it below.
    bibcode_pairs = [
        generate_bibcode(entry['id'], entry['created'], entry.get('authors', []))
        for entry in merged_data
    ]
    all_bibcodes = [bibcode for pair in bibcode_pairs for bibcode in pair]

    references = fetch_references(all_bibcodes)

    arxiv_references = {}
    for entry, (bibcode_without_dot, bibcode_with_dot) in zip(merged_data, bibcode_pairs):
        fetched_references = references.get(bibcode_without_dot, []) + references.get(bibcode_with_dot, [])
        entry['references'] = fetched_references
        arxiv_references[entry['id']] = fetched_references
        if fetched_references:
            logging.info(f"References found for bibcodes: {bibcode_without_dot}, {bibcode_with_dot}")
        else:
            logging.warning(f"No references found for bibcodes: {bibcode_without_dot}, {bibcode_with_dot}")

    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(merged_data, outfile, indent=4)
    with open(references_file, 'w', encoding='utf-8') as outfile:
        json.dump(arxiv_references, outfile, indent=4)

In [7]:
def fix_references_json(file_path):
    """Rewrite a references JSON file so each reference is an ``[arxiv_id, url]`` pair.

    The file must contain a ``{arxiv_id: [reference, ...]}`` mapping. String
    references containing ``'arXiv'`` are searched for an arXiv number, either
    dotted (``1406.0134``) or as a bare run of 8-9 digits (``171010345``,
    which is split after the fourth digit). Matching references become
    ``[arxiv_id, "https://arxiv.org/abs/{arxiv_id}"]``; all other string
    references are dropped.

    The file is rewritten in place. Entries that already are two-element
    lists are kept unchanged, so running the function again on its own output
    does not erase the data.

    Args:
        file_path: Path of the JSON file to read and overwrite.

    Returns:
        None.
    """
    arxiv_number = re.compile(r'(?:arXiv)?(\d{4,5}\.\d{4,5}|\d{8,9})(?=[a-zA-Z]|$)')

    with open(file_path, 'r') as file:
        arxiv_references = json.load(file)

    fixed_arxiv_references = {}
    for arxiv_id, references in arxiv_references.items():
        fixed_references = []
        for ref in references or []:
            if isinstance(ref, (list, tuple)):
                # Already an [id, url] pair from an earlier run: keep it as is.
                if len(ref) == 2:
                    fixed_references.append(list(ref))
                continue
            if not isinstance(ref, str) or 'arXiv' not in ref:
                continue
            match = arxiv_number.search(ref)
            if not match:
                continue
            fixed_arxiv_id = match.group(1)
            if '.' not in fixed_arxiv_id:
                # Undotted form: the first four digits precede the dot.
                fixed_arxiv_id = f"{fixed_arxiv_id[:4]}.{fixed_arxiv_id[4:]}"
            reference_url = f"https://arxiv.org/abs/{fixed_arxiv_id}"
            fixed_references.append([fixed_arxiv_id, reference_url])
        # The original arXiv id stays the key.
        fixed_arxiv_references[arxiv_id] = fixed_references

    with open(file_path, 'w') as file:
        json.dump(fixed_arxiv_references, file, indent=4)

In [8]:
# Only the per-category scrape outputs (arxiv_data_*.json) are inputs.
# merged.json and references.json from an earlier run live in the same folder
# and must not be fed back into the merge. Sorting keeps the order stable.
files = sorted(
    os.path.join('ARXIV', name)
    for name in os.listdir('ARXIV')
    if name.startswith('arxiv_data_') and name.endswith('.json')
)
output_file = 'ARXIV/merged.json'
references_file = 'ARXIV/references.json'

In [9]:
# Merge the scraped files and write the merged entries plus the id -> references map.
merge_and_fetch_references(
    file_paths=files,
    output_file=output_file,
    references_file=references_file,
)

2024-04-17 18:09:03,769 - INFO - Generated bibcodes for arXiv ID 1406.0134: 2014arXiv14060134K, 2014arXiv1406.0134K
2024-04-17 18:09:03,770 - INFO - Generated bibcodes for arXiv ID 1710.10345: 2017arXiv171010345S, 2017arXiv1710.10345S
2024-04-17 18:09:03,771 - INFO - Generated bibcodes for arXiv ID 1902.01353: 2019arXiv190201353K, 2019arXiv1902.01353K
2024-04-17 18:09:03,771 - INFO - Generated bibcodes for arXiv ID 1905.05655: 2019arXiv190505655A, 2019arXiv1905.05655A
2024-04-17 18:09:03,772 - INFO - Generated bibcodes for arXiv ID 2003.05813: 2020arXiv200305813D, 2020arXiv2003.05813D
2024-04-17 18:09:03,773 - INFO - Generated bibcodes for arXiv ID 2004.05704: 2020arXiv200405704S, 2020arXiv2004.05704S
2024-04-17 18:09:03,773 - INFO - Generated bibcodes for arXiv ID 2007.07879: 2020arXiv200707879L, 2020arXiv2007.07879L
2024-04-17 18:09:03,773 - INFO - Generated bibcodes for arXiv ID 2010.10824: 2020arXiv201010824H, 2020arXiv2010.10824H
2024-04-17 18:09:03,774 - INFO - Generated bibcodes

In [10]:
# Normalise the references file in place.
fix_references_json(file_path=references_file)