In [59]:
import requests
import json
import csv
import os
import pandas as pd

In [60]:
author_name = "A_LUQUE"
output_path = "../results"
generate_individual_files = False
include_timestamp = True
global_json_file = os.path.join(output_path, f'{author_name}_all_articles_extensive_data.json')

In [61]:
def get_orcid_articles(orcid_id):
    # Replace 'YOUR_ACCESS_TOKEN' with an actual ORCID API access token if needed.
    headers = {
        'Accept': 'application/json',
        #'Authorization': 'Bearer YOUR_ACCESS_TOKEN',
    }
    
    # Construct the URL to access the ORCID record
    url = f'https://pub.orcid.org/v3.0/{orcid_id}/works'
    
    # Make the request to the ORCID API
    response = requests.get(url, headers=headers)
    
    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()
        dois = []
        
        # Loop through the returned works and extract the DOIs
        for work in data.get('group', []):
            for work_summary in work.get('work-summary', []):
                doi = work_summary.get('external-ids', {}).get('external-id', [])
                for id in doi:
                    if id.get('external-id-type') == 'doi':
                        dois.append(id.get('external-id-value'))
        
        return dois
    else:
        print(f'Failed to retrieve data for ORCID ID {orcid_id}. Status code: {response.status_code}')
        return []


def get_crossref_articles(orcid_id):
    """
    Retrieves a list of DOIs for publications associated with a given ORCID ID from CrossRef.

    Parameters:
    orcid_id (str): The ORCID ID of the author.

    Returns:
    list: A list of DOIs for the author's publications.
    """
    # Base URL for CrossRef API
    crossref_api_url = "https://api.crossref.org/works"
    # Parameters for the API request, filtering by ORCID ID
    params = {
        'filter': f'orcid:{orcid_id}',
        'rows': 1000  # Adjust the number of results as needed
    }
    
    # Perform the API request
    response = requests.get(crossref_api_url, params=params)
    
    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()
        # Extract DOIs from the items in the response
        dois = [item['DOI'] for item in data['message']['items']]
        return dois
    else:
        print(f"Error fetching data: HTTP {response.status_code}")
        return []


def get_combined_dois(orcid_id):
    orcid_dois = get_orcid_articles(orcid_id)
    crossref_dois = get_crossref_articles(orcid_id)
    combined_dois = orcid_dois + crossref_dois
    unique_dois = list(set(combined_dois))
    
    # Print summary
    print(f"DOIs for ORCID ID:{orcid_id}\n")
    print(f"Number of DOIs from ORCID: {len(orcid_dois)}")
    print(f"Number of DOIs from CrossRef: {len(crossref_dois)}")
    print(f"Total number of unique DOIs: {len(unique_dois)}")
    
    return unique_dois






In [62]:
# Example usage get_combined_dois
orcid_id = "0000-0002-5817-4914" # Replace with the actual ORCID ID
query_dois = get_combined_dois(orcid_id)
print(query_dois)

DOIs for ORCID ID:0000-0002-5817-4914

Number of DOIs from ORCID: 47
Number of DOIs from CrossRef: 17
Total number of unique DOIs: 39
['10.1038/s41467-019-12367-3', '10.1101/2023.04.20.537752', '10.1016/j.bpj.2016.04.024', '10.1128/mSystems.00353-20', '10.1080/10511970.2021.1881847', '10.1101/495481', '10.1111/1462-2920.15640', '10.1093/nar/gku491', '10.1063/1.4712304', '10.1088/1478-3975/9/3/036003', '10.1038/nmicrobiol.2017.64', '10.1038/nature17193', '10.1128/mBio.02207-17', '10.1101/2023.03.05.531146', '10.1093/bioinformatics/btad761', '10.1186/s12915-023-01571-9', '10.3390/microorganisms8121944', '10.1128/msystems.00353-20', '10.1099/mgen.0.001100', '10.1073/pnas.0915122107', '10.1016/j.bpj.2010.02.051', '10.1186/s12864-020-6523-2', '10.1007/978-94-007-6552-8_19', '10.1101/2023.12.27.573307', '10.1101/2023.02.27.529640', '10.1101/2020.04.22.056689', '10.20944/preprints202011.0024.v1', '10.3390/v14050973', '10.1101/327031', '10.1016/j.sbi.2015.04.002', '10.1038/s41598-019-52794-2',

In [63]:

def fetch_article_data(doi):
    """
    Fetches article data from CrossRef using DOI.
    """
    url = f"https://api.crossref.org/works/{doi}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        return data
    else:
        print(f"Failed to fetch data for DOI: {doi}")
        return None

def extract_author_data(article_data):
    """
    Extracts author data from CrossRef article data.
    """
    authors = article_data['message']['author']
    author_data = []
    for author in authors:
        author_info = {
            'given_name': author.get('given', ''),
            'family_name': author.get('family', ''),
            'affiliation': []
        }
        if 'affiliation' in author:
            if isinstance(author['affiliation'], list):
                for affiliation in author['affiliation']:
                    if isinstance(affiliation, str):
                        author_info['affiliation'].append(affiliation)
                    elif isinstance(affiliation, dict) and 'name' in affiliation:
                        author_info['affiliation'].append(affiliation['name'])
            elif isinstance(author['affiliation'], str):
                author_info['affiliation'].append(author['affiliation'])
        author_data.append(author_info)
    return author_data

def save_article_data_to_json(article_data, author_data, doi):
    """
    Saves article data and author data to a JSON file.
    """
    article_message = article_data.get('message', {})
    
    title = article_message.get('title', [''])[0]
    
    container_title = article_message.get('container-title', [])
    journal = container_title[0] if container_title else ''
    
    publication_date = ''
    if 'published-online' in article_message:
        date_parts = article_message['published-online'].get('date-parts', [[]])
        if date_parts:
            publication_date = date_parts[0][0]

    filename = f'article_data_{doi.replace("/", "_")}.json'
    
    data_to_save = {
        'article': {
            'title': title,
            'journal': journal,
            'doi': doi,
            'publication_date': publication_date
        },
        'authors': author_data
    }

    with open(filename, 'w') as f:
        json.dump(data_to_save, f, indent=4)

In [64]:
def generate_global_json_file(dois, generate_individual_files):
    all_articles_data = []

    for doi in dois:
        article_data = fetch_article_data(doi)
        if article_data:
            author_data = extract_author_data(article_data)
            print(f"Authors data for DOI {doi}:")
            for author in author_data:
                print(f"Name: {author['given_name']} {author['family_name']}")
                print(f"Affiliation(s): {', '.join(author['affiliation']) if author['affiliation'] else 'Not available'}")
                print()
            print()
            if generate_individual_files:
                save_article_data_to_json(article_data, author_data, doi) # This line execution depends on the value of the variable generate_individual_files
            if 'message' in article_data and 'title' in article_data['message'] and 'container-title' in article_data['message'] and 'published-online' in article_data['message']:
                all_articles_data.append({
                    'doi': doi,
                    'article_data': {
                        'title': article_data['message']['title'][0],
                        'journal': article_data['message']['container-title'][0],
                        'doi': doi,
                        'publication_date': article_data['message']['published-online']['date-parts'][0][0]
                    },
                    'author_data': author_data
                })

    with open(global_json_file, 'w') as f:
        json.dump(all_articles_data, f, indent=4)


def generate_global_df(global_json_file):
    # Read the contents of the global_json_file
    with open(global_json_file, 'r') as f:
        data = json.load(f)

    # Create an empty list to store the rows
    rows = []

    # Iterate over the data and create a row for each article+author combination
    for article_data in data:
        doi = article_data['doi']
        title = article_data['article_data']['title']
        journal = article_data['article_data']['journal']
        publication_date = article_data['article_data']['publication_date']
        authors = article_data['author_data']

        for author in authors:
            given_name = author['given_name']
            family_name = author['family_name']
            affiliation = ', '.join(author['affiliation']) if author['affiliation'] else 'Not available'

            row = {
                'DOI': doi,
                'Title': title,
                'Journal': journal,
                'Publication Date': publication_date,
                'Given Name': given_name,
                'Family Name': family_name,
                'Affiliation': affiliation
            }

            rows.append(row)

    # Create the dataframe
    df = pd.DataFrame(rows)

    return df


In [66]:
# Usage example:
generate_global_json_file(query_dois, generate_individual_files)
global_df = generate_global_df(global_json_file)
print(global_df)

csv_file = global_json_file.replace('.json', '.csv')
global_df.to_csv(csv_file, index=False)



Authors data for DOI 10.1038/s41467-019-12367-3:
Name: Reidun Twarock
Affiliation(s): Not available

Name: Antoni Luque
Affiliation(s): Not available


Authors data for DOI 10.1101/2023.04.20.537752:
Name: Jody C. McKerral
Affiliation(s): Not available

Name: Bhavya Papudeshi
Affiliation(s): Not available

Name: Laura K. Inglis
Affiliation(s): Not available

Name: Michael J. Roach
Affiliation(s): Not available

Name: Przemyslaw Decewicz
Affiliation(s): Not available

Name: Katelyn McNair
Affiliation(s): Not available

Name: Antoni Luque
Affiliation(s): Not available

Name: Elizabeth A. Dinsdale
Affiliation(s): Not available

Name: Robert A. Edwards
Affiliation(s): Not available


Authors data for DOI 10.1016/j.bpj.2016.04.024:
Name: Antoni Luque
Affiliation(s): Not available

Name: Gungor Ozer
Affiliation(s): Not available

Name: Tamar Schlick
Affiliation(s): Not available


Authors data for DOI 10.1128/mSystems.00353-20:
Name: Antoni Luque
Affiliation(s): Department of Mathematics and