In [2]:
#script

In [3]:
import sys
import os
import pandas as pd
import json
import time
import requests
import logging
import re

sys.path.append(os.getcwd())

from arxivscraper import Scraper

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [4]:
def generate_bibcode(arxiv_id, created_date, authors):
    year = created_date[:4]
    number = arxiv_id.replace('arXiv:', '').replace('/', '')
    initial = authors[0].split()[-1][0].upper() if authors else ''
    bibcode_without_dot = f"{year}arXiv{number.replace('.', '')}{initial}"
    bibcode_with_dot = f"{year}arXiv{number}{initial}"
    logging.info(f"Generated bibcodes for arXiv ID {arxiv_id}: {bibcode_without_dot}, {bibcode_with_dot}")
    return bibcode_without_dot, bibcode_with_dot

def fetch_references(bibcodes):
    ads_api_token = "f8XsbXMlFWBC2umi8tCdJP9VoUV0ojEGW5AxtxdE"
    headers = {"Authorization": f"Bearer {ads_api_token}"}
    data = 'bibcode\n' + '\n'.join(bibcodes)
    url = "https://api.adsabs.harvard.edu/v1/search/bigquery"
    params = {'q': '*:*', 'fl': 'bibcode,reference', 'wt': 'json', 'fq': '{!bitset}'}

    response = requests.post(url, headers=headers, params=params, data=data)
    if response.status_code == 200:
        return {doc['bibcode']: doc.get('reference', []) for doc in response.json().get('response', {}).get('docs', [])}
    else:
        logging.error(f"Failed to fetch references with status code {response.status_code}")
        logging.error(f"API response content: {response.text}")
        return {}

In [5]:
def scrape_ai(start_date, end_date, max_limit):
    folder = "ARXIV"
    if not os.path.exists(folder):
        os.makedirs(folder)

    categories = ['cs', 'stat', 'econ']
    for category in categories:
        scraper = Scraper(category=category, date_from=start_date, date_until=end_date, max_records=max_limit)
        output = scraper.scrape()

        cols = ('id', 'title', 'abstract', 'doi', 'created', 'url', 'authors')
        df = pd.DataFrame(output, columns=cols)
        json_data = df.to_json(orient='records')
        formatted_json = json.loads(json_data)

        with open(f'{folder}/arxiv_data_{category}_{start_date}_{end_date}.json', 'w') as file:
            json.dump(formatted_json, file, indent=4)

In [6]:
scrape_ai(start_date='2024-04-22', end_date='2024-04-22', max_limit=100)

Fetching is completed in 4.3 seconds.
Total number of records: 100
Fetching is completed in 8.3 seconds.
Total number of records: 69
Fetching is completed in 8.0 seconds.
Total number of records: 16


In [7]:
def merge_and_fetch_references(file_paths, output_file, references_file):
    merged_data = []
    arxiv_references = {}

    for path in file_paths:
        with open(path, 'r') as file:
            data = json.load(file)
            print("Data loaded:", data)
            print("Data type:", type(data))
            if not isinstance(data, list):
                logging.error(f"Invalid data format in file {path}: Expected a list, got {type(data)}")
                continue
            for entry in data:
                if not isinstance(entry, dict):
                    logging.error(f"Invalid entry format in file {path}: Expected a dictionary, got {type(entry)}")
                    continue
                bibcode_without_dot, bibcode_with_dot = generate_bibcode(entry['id'], entry['created'], entry.get('authors', []))
                merged_data.append(entry)
                references = entry.get('references', [])
                arxiv_refs = [ref.split(":")[-1] for ref in references if ref.startswith("arXiv:")]
                arxiv_references[entry['id']] = arxiv_refs
                if arxiv_refs:
                    logging.info(f"References found for arXiv ID: {entry['id']}")
                else:
                    logging.warning(f"No references found for arXiv ID: {entry['id']}")


    all_bibcodes = []
    for entry in merged_data:
        bibcode_without_dot, bibcode_with_dot = generate_bibcode(entry['id'], entry['created'], entry.get('authors', []))
        all_bibcodes.append(bibcode_without_dot)
        all_bibcodes.append(bibcode_with_dot)

    references = fetch_references(all_bibcodes)

    for entry in merged_data:
        bibcode_without_dot, bibcode_with_dot = generate_bibcode(entry['id'], entry['created'], entry.get('authors', []))
        fetched_references = references.get(bibcode_without_dot, []) + references.get(bibcode_with_dot, [])
        entry['references'] = fetched_references
        arxiv_references[entry['id']] = fetched_references
        if entry['references']:
            logging.info(f"References found for bibcodes: {bibcode_without_dot}, {bibcode_with_dot}")
        else:
            logging.warning(f"No references found for bibcodes: {bibcode_without_dot}, {bibcode_with_dot}")

    with open(output_file, 'w') as outfile:
        json.dump(merged_data, outfile, indent=4)
    with open(references_file, 'w') as outfile:
        json.dump(arxiv_references, outfile, indent=4)

In [8]:
def fix_references_json(file_path):
    with open(file_path, 'r') as file:
        arxiv_references = json.load(file)

    fixed_arxiv_references = {}
    for arxiv_id, references in arxiv_references.items():
        fixed_references = []
        for ref in references:
            if 'arXiv' in ref:
                match = re.search(r'(?:arXiv)?(\d{4,5}\.\d{4,5}|\d{8,9})(?=[a-zA-Z]|$)', ref)
                if match:
                    fixed_arxiv_id = match.group(1)
                    if '.' not in fixed_arxiv_id:
                        fixed_arxiv_id = f"{fixed_arxiv_id[:4]}.{fixed_arxiv_id[4:]}"
                    reference_url = f"https://arxiv.org/abs/{fixed_arxiv_id}"
                    fixed_references.append([fixed_arxiv_id, reference_url])
        fixed_arxiv_references[arxiv_id] = fixed_references

    with open(file_path, 'w') as file:
        json.dump(fixed_arxiv_references, file, indent=4)

In [9]:
files = [os.path.join('ARXIV', file) for file in os.listdir('ARXIV') if file.endswith('.json')]
output_file = 'ARXIV/merged.json'
references_file = 'ARXIV/references.json'

In [10]:
merge_and_fetch_references(files, output_file, references_file)

2024-04-22 15:46:28,569 - INFO - Generated bibcodes for arXiv ID 1711.08265: 2017arXiv171108265L, 2017arXiv1711.08265L
2024-04-22 15:46:28,570 - INFO - Generated bibcodes for arXiv ID 1907.05325: 2019arXiv190705325M, 2019arXiv1907.05325M
2024-04-22 15:46:28,571 - INFO - Generated bibcodes for arXiv ID 2101.01157: 2021arXiv210101157A, 2021arXiv2101.01157A
2024-04-22 15:46:28,573 - INFO - Generated bibcodes for arXiv ID 2110.15517: 2021arXiv211015517H, 2021arXiv2110.15517H
2024-04-22 15:46:28,575 - INFO - Generated bibcodes for arXiv ID 2111.04652: 2021arXiv211104652M, 2021arXiv2111.04652M
2024-04-22 15:46:28,577 - INFO - Generated bibcodes for arXiv ID 2201.09648: 2022arXiv220109648P, 2022arXiv2201.09648P
2024-04-22 15:46:28,579 - INFO - Generated bibcodes for arXiv ID 2210.02171: 2022arXiv221002171C, 2022arXiv2210.02171C
2024-04-22 15:46:28,580 - INFO - Generated bibcodes for arXiv ID 2210.16655: 2022arXiv221016655J, 2022arXiv2210.16655J
2024-04-22 15:46:28,582 - INFO - Generated bibco

Data loaded: [{'id': '1711.08265', 'title': 'sparse variable selection on high dimensional heterogeneous data with   tree structured responses', 'abstract': 'we consider the problem of sparse variable selection on high dimension heterogeneous data sets, which has been taking on renewed interest recently due to the growth of biological and medical data sets with complex, non-i.i.d. structures and huge quantities of response variables. the heterogeneity is likely to confound the association between explanatory variables and responses, resulting in enormous false discoveries when lasso or its variants are na\\"ively applied. therefore, developing effective confounder correction methods is a growing heat point among researchers. however, ordinarily employing recent confounder correction methods will result in undesirable performance due to the ignorance of the convoluted interdependency among response variables. to fully improve current variable selection methods, we introduce a model, the

2024-04-22 15:46:28,776 - INFO - Generated bibcodes for arXiv ID 2304.09779: 2023arXiv230409779S, 2023arXiv2304.09779S
2024-04-22 15:46:28,784 - INFO - Generated bibcodes for arXiv ID 2304.10286: 2023arXiv230410286P, 2023arXiv2304.10286P
2024-04-22 15:46:28,785 - INFO - Generated bibcodes for arXiv ID 2304.13029: 2023arXiv230413029M, 2023arXiv2304.13029M
2024-04-22 15:46:28,789 - INFO - Generated bibcodes for arXiv ID 2305.03803: 2023arXiv230503803H, 2023arXiv2305.03803H
2024-04-22 15:46:28,791 - INFO - Generated bibcodes for arXiv ID 2305.07877: 2023arXiv230507877G, 2023arXiv2305.07877G
2024-04-22 15:46:28,794 - INFO - Generated bibcodes for arXiv ID 2305.18453: 2023arXiv230518453D, 2023arXiv2305.18453D
2024-04-22 15:46:28,795 - INFO - Generated bibcodes for arXiv ID 2306.03027: 2023arXiv230603027G, 2023arXiv2306.03027G
2024-04-22 15:46:28,797 - INFO - Generated bibcodes for arXiv ID 2306.06449: 2023arXiv230606449B, 2023arXiv2306.06449B
2024-04-22 15:46:28,801 - INFO - Generated bibco

Data loaded: [{'id': '2110.15517', 'title': 'cp factor model for dynamic tensors', 'abstract': 'observations in various applications are frequently represented as a time series of multidimensional arrays, called tensor time series, preserving the inherent multidimensional structure. in this paper, we present a factor model approach, in a form similar to tensor cp decomposition, to the analysis of high-dimensional dynamic tensor time series. as the loading vectors are uniquely defined but not necessarily orthogonal, it is significantly different from the existing tensor factor models based on tucker-type tensor decomposition. the model structure allows for a set of uncorrelated one-dimensional latent dynamic factor processes, making it much more convenient to study the underlying dynamics of the time series. a new high order projection estimator is proposed for such a factor model, utilizing the special structure and the idea of the higher order orthogonal iteration procedures commonly 

2024-04-22 15:46:29,231 - INFO - Generated bibcodes for arXiv ID 1711.08265: 2017arXiv171108265L, 2017arXiv1711.08265L
2024-04-22 15:46:29,236 - INFO - Generated bibcodes for arXiv ID 1907.05325: 2019arXiv190705325M, 2019arXiv1907.05325M
2024-04-22 15:46:29,238 - INFO - Generated bibcodes for arXiv ID 2101.01157: 2021arXiv210101157A, 2021arXiv2101.01157A
2024-04-22 15:46:29,240 - INFO - Generated bibcodes for arXiv ID 2110.15517: 2021arXiv211015517H, 2021arXiv2110.15517H
2024-04-22 15:46:29,241 - INFO - Generated bibcodes for arXiv ID 2111.04652: 2021arXiv211104652M, 2021arXiv2111.04652M
2024-04-22 15:46:29,242 - INFO - Generated bibcodes for arXiv ID 2201.09648: 2022arXiv220109648P, 2022arXiv2201.09648P
2024-04-22 15:46:29,243 - INFO - Generated bibcodes for arXiv ID 2210.02171: 2022arXiv221002171C, 2022arXiv2210.02171C
2024-04-22 15:46:29,244 - INFO - Generated bibcodes for arXiv ID 2210.16655: 2022arXiv221016655J, 2022arXiv2210.16655J
2024-04-22 15:46:29,248 - INFO - Generated bibco

In [11]:
fix_references_json(references_file)