In [None]:
import os
# Directory the notebook was started from; the cells below chdir back to it
# after writing their output so relative paths stay predictable.
main_run_dir = os.getcwd()

def switch_folder_dirs(work_dir, previous_dir):
    """Create ``work_dir`` under ``previous_dir`` if needed and chdir into it.

    Args:
        work_dir: Directory to switch into. It is combined with
            ``previous_dir`` via ``os.path.join``, so an absolute ``work_dir``
            is used unchanged.
        previous_dir: Base directory that ``work_dir`` is resolved against.

    Raises:
        OSError: If the directory cannot be created or entered.
    """
    json_dir = os.path.join(previous_dir, work_dir)
    # exist_ok=True avoids the exists()/makedirs() race, and a genuine creation
    # failure is reported here instead of being swallowed and resurfacing as a
    # less informative error from os.chdir().
    os.makedirs(json_dir, exist_ok=True)
    os.chdir(json_dir)

## chemrxiv

In [None]:
import requests
import json
from datetime import datetime

# Download ChemRxiv metadata page by page and store it as one JSON file in
# the 'chemrxiv' sub-folder of the run directory.
switch_folder_dirs(os.path.join(main_run_dir, 'chemrxiv'),main_run_dir)
start=datetime.now()
print("Get metadate from ChemRxiv")

# Paging state: `skip` is the offset of the next page, `limit` the page size.
skip = 0
limit = 50

# Seconds to wait for the server before giving up on one request; without a
# timeout a stalled connection would block this cell indefinitely.
request_timeout = 60

# Output file. It does not change between pages, so it is set once here.
# (An earlier variant of the query, without sort/searchDateTo, was saved as
# "chemrxiv_metadata_oringal.json".)
file_name="chemrxiv_metadata_oringal_asc.json"

# Initialize an empty list to store the results
results = []

# Loop through the pages until all results have been retrieved.
# ChemRxiv was announced on Aug. 14, 2017, hence the searchDateFrom bound.
while True:
    # Build the search query with the current offset and limit values.
    # sort=PUBLISHED_DATE_ASC requests ascending publication date
    # (presumably oldest first -- confirm against the ChemRxiv API docs).
    url_chemrxiv = f"https://chemrxiv.org/engage/chemrxiv/public-api/v1/items?sort=PUBLISHED_DATE_ASC&searchDateFrom=2017-08-14T00:00:00.000Z&searchDateTo=2023-02-16T00:00:00.000Z&limit={limit}&skip={skip}"

    try:
        response = requests.get(url_chemrxiv, timeout=request_timeout)
    except requests.RequestException as exc:
        # Stop paging but fall through to the write below, so that what has
        # been collected so far is not lost.
        print("Error: request failed at skip", skip, "-", repr(exc))
        break
    if response.status_code != 200:
        print("Error: request failed with status code", response.status_code, "at skip", skip)
        break

    # Parse the JSON data and append the items to the results list
    data = response.json()
    results.extend(data["itemHits"])

    # Update the offset and check if there are more results to retrieve
    skip += limit
    if skip >= data["totalCount"]:
        break
    if skip%2000==0:
        # Progress report every 2000 records.
        print(skip, data["totalCount"])

print('finish requesting chemrxiv ....')
# Write the results to a JSON file (also reached after an error, in which
# case the file holds the records retrieved before the failure).
with open(file_name, "w") as f:
    json.dump(results, f, indent=4)

print('finish writing to chemrxiv json ....')
print(datetime.now()-start)
os.chdir(main_run_dir)

## medrxiv

In [None]:
import requests
import json
from datetime import datetime

# Download medRxiv metadata page by page and store it as one JSON file in
# the 'medrxiv' sub-folder of the run directory.
switch_folder_dirs(os.path.join(main_run_dir, 'medrxiv'),main_run_dir)


start=datetime.now()
print("Get metadate from medrxiv")

# Build the API endpoint URL
server = "medrxiv"
# medRxiv was announced on Jun. 01, 2019, hence the start date.
start_date = "2019-06-01"
#end_date = datetime.today().strftime('%Y-%m-%d')
end_date="2023-02-16"
# `cursor` is the offset of the next page; it is advanced by `max_limit`
# after every successful request.
cursor = 0
max_limit=100

# Seconds to wait for the server before giving up on one request; without a
# timeout a stalled connection would block this cell indefinitely.
request_timeout = 60

# Output file. It does not change between pages, so it is set once here.
# Alternative endpoints that were tried, with the file names used for them:
#   https://api.biorxiv.org/details/{server}/{start_date}/{end_date}/{cursor}
#       -> 'medrxiv_metadata_oringal_bio.json'
#   https://api.biorxiv.org/pubs/{server}/{start_date}/{end_date}/{cursor}
#       -> 'medrxiv_metadata_oringal_pubs.json' (less entries)
file_name='medrxiv_metadata_oringal.json'

# Initialize an empty list to store the results
results = []

# Loop through the pages until all results have been retrieved
while True:
    # Build the search query with the current cursor value
    url_medrxiv = f"https://api.medrxiv.org/details/{server}/{start_date}/{end_date}/{cursor}"

    # Make the API call and check the status code
    try:
        response = requests.get(url_medrxiv, timeout=request_timeout)
    except requests.RequestException as exc:
        # Stop paging but fall through to the write below, so that what has
        # been collected so far is not lost.
        print("Error: request failed at cursor", cursor, "-", repr(exc))
        break
    if response.status_code != 200:
        print("Error: request failed with status code", response.status_code, "at cursor", cursor)
        break

    # Parse the JSON data and append the items to the results list
    data = response.json()
    results.extend(data["collection"])

    # Update the offset and check if there are more results to retrieve
    cursor += max_limit
    if "total" not in data["messages"][0]:
        # No total reported: nothing to page against, so stop.
        break
    # int() keeps the comparison valid should the API report the total as a
    # numeric string rather than a number.
    total = int(data["messages"][0]["total"])
    if cursor >= total:
        break
    if cursor%2000==0:
        # Progress report every 2000 records.
        print(cursor, total)

print('finish requesting medrxiv ....')
# Write the results to a JSON file (also reached after an error, in which
# case the file holds the records retrieved before the failure).
with open(file_name, "w") as f:
    json.dump(results, f, indent=4)

print('finish writing to medrxiv json ....')
print(datetime.now()-start)
os.chdir(main_run_dir)

## bioRxiv 
#### The download may stop partway because of too many requests. If it stops, save the results collected so far, then restart the request from the cursor where it stopped.

In [None]:
import requests
import json
from datetime import datetime

# Download bioRxiv metadata page by page into the 'biorxiv' sub-folder of the
# run directory, writing periodic checkpoints because the run is long and the
# server may stop answering partway through.
switch_folder_dirs(os.path.join(main_run_dir, 'biorxiv'),main_run_dir)

print("Get metadate from bioRxiv")

start=datetime.now()

# Build the API endpoint URL
server = "biorxiv"
# bioRxiv was announced on Nov. 01, 2013, hence the start date.
start_date = "2013-11-01"
#end_date = datetime.today().strftime('%Y-%m-%d')
end_date="2023-02-16"
# `cursor` is the offset of the next page; it is advanced by `max_limit`
# after every successful request.
cursor = 0
max_limit=100

# Seconds to wait for the server before giving up on one request; without a
# timeout a stalled connection would block this cell indefinitely.
request_timeout = 60

# Initialize an empty list to store the results
results = []

print(end_date)
# Loop through the pages until all results have been retrieved
while True:
    # Build the search query with the current cursor value
    url_biorxiv = f"https://api.biorxiv.org/details/{server}/{start_date}/{end_date}/{cursor}"

    # Make the API call and check the status code
    try:
        response = requests.get(url_biorxiv, timeout=request_timeout)
    except requests.RequestException as exc:
        # `cursor` still points at the page that failed, i.e. the value to
        # restart from. `results` stays in memory and is written below.
        print("Error: request failed at cursor", cursor, "-", repr(exc))
        break

    if response.status_code != 200:
        print("Error: request failed with status code", response.status_code, "at cursor", cursor)
        break

    # Parse the JSON data and append the items to the results list
    data = response.json()
    results.extend(data["collection"])

    # Update the offset and check if there are more results to retrieve
    cursor += max_limit
    if "total" not in data["messages"][0]:
        # No total reported: nothing to page against, so stop.
        break
    # int() keeps the comparisons valid should the API report the total as a
    # numeric string rather than a number.
    total = int(data["messages"][0]["total"])
    if cursor >= total:
        break

    # The original had an `if cursor%3000==0:` whose only body was a
    # commented-out print, which is a syntax error; that empty branch is
    # dropped here.
    if cursor%50000==0:
        # Progress report plus checkpoint every 50000 records.
        print(cursor, total)
        with open("biorxiv_metadata_oringal.json", "w") as f:
            json.dump(results, f, indent=4)
    if cursor >= total - total%50000:
        # Past the last multiple of 50000 no further periodic checkpoint
        # would fire, so the results are re-saved after every page.
        with open("biorxiv_metadata_cursor_last.json", "w") as f:
            json.dump(results, f, indent=4)


print('finish requesting ....')
# Write the results to a JSON file (also reached after an error, in which
# case the file holds the records retrieved before the failure).
with open("biorxiv_metadata_oringal_all.json", "w") as f:
    json.dump(results, f, indent=4)

print('finish writing to biorxiv json if no stop....')
print(datetime.now()-start)

os.chdir(main_run_dir)

### If the download stopped before finishing, store the results collected so far

In [None]:
# Save whatever the interrupted bioRxiv download left in `results`.
# This cell relies on `results` and `start` still being in the kernel from
# the download cell that stopped early.
os.chdir(main_run_dir)
switch_folder_dirs(os.path.join(main_run_dir, 'biorxiv'), main_run_dir)
print('finish requesting ....')

# The number in the file name is the cursor at which the download stopped
# (247700 in the author's run).
partial_name = "biorxiv_metadata_oringal_247700.json"
with open(partial_name, "w") as partial_file:
    json.dump(results, partial_file, indent=4)

print('finish part writing to biorxiv json ....')
print(datetime.now() - start)
os.chdir(main_run_dir)

### Then restart from the cursor where the previous run stopped

In [None]:
import requests
import json
from datetime import datetime

# Resume the bioRxiv download from the cursor at which the previous run
# stopped and store the remaining records in their own JSON file.
switch_folder_dirs(os.path.join(main_run_dir, 'biorxiv'),main_run_dir)
print("Get metadate from bioRxiv")

start=datetime.now()

# Build the API endpoint URL
server = "biorxiv"
# bioRxiv was announced on Nov. 01, 2013, hence the start date.
start_date = "2013-11-01"
#end_date = datetime.today().strftime('%Y-%m-%d') #'2023-02-16'
end_date="2023-02-16"
# try to get the rest

# Set this to the cursor at which the interrupted run stopped.
cursor = 247700  ## this
max_limit=100

# Seconds to wait for the server before giving up on one request; without a
# timeout a stalled connection would block this cell indefinitely.
request_timeout = 60

# Initialize an empty list to store the results
results = []

print(end_date)
# Loop through the pages until all results have been retrieved
while True:
    # Build the search query with the current cursor value
    url_biorxiv = f"https://api.biorxiv.org/details/{server}/{start_date}/{end_date}/{cursor}"

    # Make the API call and check the status code
    try:
        response = requests.get(url_biorxiv, timeout=request_timeout)
    except requests.RequestException as exc:
        # `cursor` still points at the page that failed, i.e. the value to
        # restart from. `results` stays in memory and is written below.
        print("Error: request failed at cursor", cursor, "-", repr(exc))
        break

    if response.status_code != 200:
        print("Error: request failed with status code", response.status_code, "at cursor", cursor)
        break

    # Parse the JSON data and append the items to the results list
    data = response.json()
    results.extend(data["collection"])

    # Update the offset and check if there are more results to retrieve
    cursor += max_limit
    if "total" not in data["messages"][0]:
        # No total reported: nothing to page against, so stop.
        break
    # int() keeps the comparison valid should the API report the total as a
    # numeric string rather than a number.
    total = int(data["messages"][0]["total"])
    if cursor >= total:
        break

    if cursor%3000==0:
        # Progress report plus checkpoint every 3000 records; the checkpoint
        # uses the same file as the final write below.
        print(cursor, total)
        with open("biorxiv_metadata_oringal_rest.json", "w") as f:
            json.dump(results, f, indent=4)


print('finish requesting ....')
# Write the results to a JSON file (also reached after an error, in which
# case the file holds the records retrieved before the failure).
with open("biorxiv_metadata_oringal_rest.json", "w") as f:
    json.dump(results, f, indent=4)

print('finish rest writing to biorxiv json ....')
print(datetime.now()-start)
os.chdir(main_run_dir)

### Combine all partial files into one

In [None]:
# Work inside the 'biorxiv' sub-folder so the relative file names used below
# resolve to the files written by the download cells.
switch_folder_dirs(os.path.join(main_run_dir, 'biorxiv'),main_run_dir)
def merge_JsonFiles(filename, output_name='biorxiv_metadata_final_all.json'):
    """Concatenate several JSON array files into a single JSON file.

    Args:
        filename: Iterable of paths, each pointing at a file whose top-level
            JSON value is an array. A single path (``str`` or
            ``os.PathLike``) is accepted as well and treated as a one-element
            list. Arrays are concatenated in the order given.
        output_name: Path of the merged file. The default is the name this
            function always wrote to, so existing calls behave as before.

    Raises:
        ValueError: If one of the input files does not contain a JSON array.
        OSError: If a file cannot be read or the output cannot be written.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(filename, (str, os.PathLike)):
        filename = [filename]

    result = []
    for f1 in filename:
        with open(f1, 'r', encoding='utf-8') as infile:
            records = json.load(infile)
        # list.extend() on a dict would silently add only its keys, so
        # anything but an array is rejected.
        if not isinstance(records, list):
            raise ValueError(f"{f1} does not contain a JSON array")
        result.extend(records)

    with open(output_name, 'w', encoding='utf-8') as output_file:
        json.dump(result, output_file)
    
# Inputs in download order: the records saved when the first run stopped,
# followed by the records fetched after the restart.
files = [
    'biorxiv_metadata_oringal_247700.json',
    'biorxiv_metadata_oringal_rest.json',
]
merge_JsonFiles(files)

# Go back to the directory the notebook was started from.
os.chdir(main_run_dir)
