In [1]:
import requests
import json
import csv
import os

csv_file = 'Cynthia_Silveira.csv' # Must contain a column named 'DOI'
generate_individual_files = False
include_timestamp = True


In [3]:
with open(csv_file, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    query_dois = []
    for row in reader:
        # Extract the DOI from the CSV file
        doi = row['DOI']
        if doi:
            query_dois.append(doi)

for doi in query_dois:
    print(doi)

10.1093/bioinformatics/btad761
10.1101/2023.11.03.565530
10.1101/2023.04.20.537752
10.1186/s12915-023-01571-9
10.1101/2023.12.27.573307
10.1016/j.csbj.2021.12.032
10.1111/1462-2920.15640
10.7717/peerj.11213
10.1080/10511970.2021.1881847
10.3389/fphy.2021.594306
10.3390/microorganisms8121944
10.1128/msystems.00353-20
10.1101/2020.04.08.028340
10.1186/s12864-020-6523-2
10.25891/14f6-by82
10.21203/rs.3.rs-3040647/v1
10.1016/j.scitotenv.2023.164465
10.1016/j.scitotenv.2023.164465
10.1038/s43247-023-00796-4
10.1186/s12915-023-01571-9
10.1186/s40168-023-01547-5
10.1007/s00338-022-02272-5
10.1186/s40793-022-00401-9
10.1038/s43017-021-00214-3
10.7717/peerj.11213
10.1111/1462-2920.15640
10.3390/microorganisms9061115
10.3389/fmicb.2021.637430
10.3390/ijms222112050
10.3389/fmars.2021.627724
10.1186/s12864-020-6523-2
10.1128/msystems.00353-20
10.1073/pnas.1915455117
10.1101/2020.04.08.028340
10.1038/s41467-018-08286-4
10.7554/eLife.49114
10.1016/j.jembe.2018.02.008
10.1038/ncomms15955
10.1038/nmic

In [4]:

def fetch_article_data(doi):
    """
    Fetches article data from CrossRef using DOI.
    """
    url = f"https://api.crossref.org/works/{doi}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        return data
    else:
        print(f"Failed to fetch data for DOI: {doi}")
        return None

def extract_author_data(article_data):
    """
    Extracts author data from CrossRef article data.
    """
    authors = article_data['message']['author']
    author_data = []
    for author in authors:
        author_info = {
            'given_name': author.get('given', ''),
            'family_name': author.get('family', ''),
            'affiliation': []
        }
        if 'affiliation' in author:
            if isinstance(author['affiliation'], list):
                for affiliation in author['affiliation']:
                    if isinstance(affiliation, str):
                        author_info['affiliation'].append(affiliation)
                    elif isinstance(affiliation, dict) and 'name' in affiliation:
                        author_info['affiliation'].append(affiliation['name'])
            elif isinstance(author['affiliation'], str):
                author_info['affiliation'].append(author['affiliation'])
        author_data.append(author_info)
    return author_data

def save_article_data_to_json(article_data, author_data, doi):
    """
    Saves article data and author data to a JSON file.
    """
    article_message = article_data.get('message', {})
    
    title = article_message.get('title', [''])[0]
    
    container_title = article_message.get('container-title', [])
    journal = container_title[0] if container_title else ''
    
    publication_date = ''
    if 'published-online' in article_message:
        date_parts = article_message['published-online'].get('date-parts', [[]])
        if date_parts:
            publication_date = date_parts[0][0]

    filename = f'article_data_{doi.replace("/", "_")}.json'
    
    data_to_save = {
        'article': {
            'title': title,
            'journal': journal,
            'doi': doi,
            'publication_date': publication_date
        },
        'authors': author_data
    }

    with open(filename, 'w') as f:
        json.dump(data_to_save, f, indent=4)

In [5]:

def main():
    dois = query_dois

    all_articles_data = []

    for doi in dois:
        article_data = fetch_article_data(doi)
        if article_data:
            author_data = extract_author_data(article_data)
            print(f"Authors data for DOI {doi}:")
            for author in author_data:
                print(f"Name: {author['given_name']} {author['family_name']}")
                print(f"Affiliation(s): {', '.join(author['affiliation']) if author['affiliation'] else 'Not available'}")
                print()
            print()
            if generate_individual_files:
                save_article_data_to_json(article_data, author_data, doi) # This line execution depends on the value of the variable generate_individual_files
            if 'message' in article_data and 'title' in article_data['message'] and 'container-title' in article_data['message'] and 'published-online' in article_data['message']:
                all_articles_data.append({
                    'doi': doi,
                    'article_data': {
                        'title': article_data['message']['title'][0],
                        'journal': article_data['message']['container-title'][0],
                        'doi': doi,
                        'publication_date': article_data['message']['published-online']['date-parts'][0][0]
                    },
                    'author_data': author_data
                })

    global_json_file = f'{csv_file.replace(".csv", "_all_articles_extensive_data.json")}'
    with open(global_json_file, 'w') as f:
        json.dump(all_articles_data, f, indent=4)
if __name__ == "__main__":
    main()


Authors data for DOI 10.1093/bioinformatics/btad761:
Name: Colin Brown
Affiliation(s): Viral Information Institute, San Diego State University , San Diego, CA 92116, United States, Department of Physics, San Diego State University , San Diego, CA 92116, United States

Name: Anuradha Agarwal
Affiliation(s): Viral Information Institute, San Diego State University , San Diego, CA 92116, United States, Computational Science Research Center, San Diego State University , San Diego, CA 92116, United States

Name: Antoni Luque
Affiliation(s): Viral Information Institute, San Diego State University , San Diego, CA 92116, United States, Computational Science Research Center, San Diego State University , San Diego, CA 92116, United States, Department of Mathematics and Statistics, San Diego State University , San Diego, CA 92116, United States, Department of Biology, University of Miami , Coral Gables, FL 33146, United States


Authors data for DOI 10.1101/2023.11.03.565530:
Name: Jennifer M. Pod

In [7]:
import pandas as pd
import json

# Read the contents of the global_json_file
with open(f'{csv_file.replace(".csv", "_all_articles_extensive_data.json")}', 'r') as f:
    data = json.load(f)

# Create an empty list to store the rows
rows = []

# Iterate over the data and create a row for each article+author combination
for article_data in data:
    doi = article_data['doi']
    title = article_data['article_data']['title']
    journal = article_data['article_data']['journal']
    publication_date = article_data['article_data']['publication_date']
    authors = article_data['author_data']
    
    for author in authors:
        given_name = author['given_name']
        family_name = author['family_name']
        affiliation = ', '.join(author['affiliation']) if author['affiliation'] else 'Not available'
        
        row = {
            'DOI': doi,
            'Title': title,
            'Journal': journal,
            'Publication Date': publication_date,
            'Given Name': given_name,
            'Family Name': family_name,
            'Affiliation': affiliation
        }
        
        rows.append(row)

# Create the dataframe
df = pd.DataFrame(rows)

# Print the dataframe
print(df)


                                DOI  \
0    10.1093/bioinformatics/btad761   
1    10.1093/bioinformatics/btad761   
2    10.1093/bioinformatics/btad761   
3        10.1186/s12915-023-01571-9   
4        10.1186/s12915-023-01571-9   
..                              ...   
487       10.1007/s00792-008-0162-x   
488       10.1007/s00792-008-0162-x   
489       10.1007/s00792-008-0162-x   
490       10.1007/s00792-008-0162-x   
491       10.1007/s00792-008-0162-x   

                                                 Title         Journal  \
0    pyCapsid: identifying dominant dynamics and qu...  Bioinformatics   
1    pyCapsid: identifying dominant dynamics and qu...  Bioinformatics   
2    pyCapsid: identifying dominant dynamics and qu...  Bioinformatics   
3              Viral predation pressure on coral reefs     BMC Biology   
4              Viral predation pressure on coral reefs     BMC Biology   
..                                                 ...             ...   
487  Prokaryo

In [8]:
import datetime

# Get the current year, month, and date
current_year = datetime.datetime.now().year
current_month = datetime.datetime.now().month
current_day = datetime.datetime.now().day

# Create the output file name with the specific suffix
if include_timestamp:
    output_file = f'{csv_file.replace(".csv", f"_coauthors_{current_year}-{current_month}-{current_day}.csv")}'
else:
    output_file = f'{csv_file.replace(".csv", "_coauthors.csv")}'

# Save the dataframe to the output CSV file
df.to_csv(output_file, index=False)
