In [3]:
import yaml
import pandas as pd
import json
from thirdparty.xploreapi import XPLORE

# Read the local YAML configuration that stores the Xplore credentials.
with open('./config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Both credentials live under the `xplore_api` section of the config.
xplore_section = config['xplore_api']
api_key, auth_token = xplore_section['api_key'], xplore_section['auth_token']

In [None]:
def get_all_dois(journal_issn, api_key=api_key):
    """Query the Xplore client by ISSN and persist the response as a CSV.

    Parameters
    ----------
    journal_issn : str
        ISSN passed to the client's ``issn`` filter; also used as the
        prefix of the output file name.
    api_key : str, optional
        Key used to construct the ``XPLORE`` client. Defaults to the key
        loaded from the configuration file.

    Returns
    -------
    pandas.DataFrame
        The JSON response loaded into a DataFrame. The same frame is
        written to ``../journal-meta/<journal_issn>_doi.csv``.
    """
    # Configure the request: filter by ISSN, ask for raw JSON text back.
    client = XPLORE(api_key)
    client.issn(journal_issn)
    client.dataType('json')
    client.dataFormat('raw')

    raw_response = client.callAPI()
    records = json.loads(raw_response)
    title_doi_dataframe = pd.DataFrame(records)

    output_path = f"../journal-meta/{journal_issn}_doi.csv"
    title_doi_dataframe.to_csv(output_path, index=False)
    return title_doi_dataframe

In [5]:
# Replace with a valid article DOI or EID for testing
# article_doi = "10.1016/j.trc.2023.104311"
def retrieve_full_text(article_doi, save_folder, api_key = api_key, auth_token = auth_token):
    """Request one article through the Xplore client and save the response to disk.

    Parameters
    ----------
    article_doi : str
        DOI of the article to request. It is also used (with ``/`` replaced
        by ``_``) as the name of the output file.
    save_folder : str
        Directory in which ``<doi>.txt`` is written. It must already exist,
        because the file is opened directly for writing.
    api_key : str, optional
        Key used to construct the ``XPLORE`` client. Defaults to the key
        loaded from the configuration file.
    auth_token : str, optional
        Token handed to ``setAuthToken``. Defaults to the token loaded from
        the configuration file.

    Returns
    -------
    The value returned by ``callAPI()``; the same value is written to
    ``<save_folder>/<doi>.txt``.
    """
    query = XPLORE(api_key)
    # Pass the actual token value. The previous code passed the literal
    # string 'auth_token', so the configured token was never sent.
    query.setAuthToken(auth_token)
    query.dataType('json')
    query.dataFormat('raw')
    query.doi(article_doi)
    # NOTE(review): the literal 'article number' is kept from the original
    # call; confirm against the client whether an actual article number is
    # expected here.
    query.fullTextRequest('article number')

    data = query.callAPI()

    # A DOI contains '/', which cannot appear in a file name.
    unique_id = article_doi.replace('/', '_')
    with open(f'{save_folder}/{unique_id}.txt', 'w', encoding='utf-8') as file:
        file.write(data)
    return data

In [None]:
# For every journal metadata CSV, download the full text of each article
# dated 2019 or later and write an enriched per-journal CSV.
import os

folder = '../journal-meta'
full_text_folder = '../journal-full-text'
MIN_YEAR = 2019  # articles dated before this year are skipped

# Metadata files are named "<journal>_doi.csv"; the part before the first
# underscore identifies the journal.
csv_files = [file for file in os.listdir(folder) if file.endswith('.csv')]
for csv_file in csv_files:
    journal_info = csv_file.split('_')[0]

    # One sub-folder per journal for the downloaded text files.
    save_folder = os.path.join(full_text_folder, journal_info)
    os.makedirs(save_folder, exist_ok=True)

    article_info = pd.read_csv(os.path.join(folder, csv_file))

    # Cast to text first: read_csv yields a non-string column when every
    # date is missing or purely numeric, and `.str` would then raise.
    date_text = article_info['date'].astype(str)
    article_info['year'] = date_text.str.extract(r'(\d{4})', expand=False)
    # get the month from the format of YYYY-MM-DD
    article_info['month'] = date_text.str.extract(r'-(\d{2})-', expand=False)

    # Compare years numerically; rows without a parsable year become NaN
    # and are excluded. `.copy()` makes the selection an independent frame
    # so the column assignment below does not write into a slice.
    year_number = pd.to_numeric(article_info['year'], errors='coerce')
    article = article_info[year_number >= MIN_YEAR].copy()

    # A list comprehension (rather than DataFrame.apply) also works when the
    # selection is empty. Rows without a DOI are left as None instead of
    # crashing inside retrieve_full_text.
    article['abstract'] = [
        retrieve_full_text(doi, save_folder) if pd.notna(doi) else None
        for doi in article['doi']
    ]
    article.to_csv(os.path.join(full_text_folder, journal_info + '.csv'), index=False)