Make a python script that can automate the process for creating publication entries. If possible, the script should get the author names, article title, journal name, publication date, and abstract and compile it into a templated markdown file for insertion into the publications page.

For the publications: we can stick with pubs starting from 2018 or 2019 since that's when the lab started up. There will be a relatively large number of publications here, so it would probably be ideal if this could be automated to some degree (like a script that could automatically parse a google scholar entry or DOI into a properly formatted markdown file that matches the template for the wowchemy publications page.

In [35]:
import os
import requests
import sys
import json

name_to_account = {
    'Tod A. Pascal': 'admin',
}

def replace_with_dictionary(array, dictionary):
    # Iterate over each string in the array
    for i in range(len(array)):
        # If the string is a key in the dictionary, replace it with the corresponding value
        if array[i] in dictionary:
            array[i] = dictionary[array[i]]
    return array

def fetch_data(doi):
    """Fetch publication data from CrossRef API."""
    url = f"https://api.crossref.org/works/{doi}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()['message']
    else:
        print("Failed to fetch data.")
        sys.exit(1)

def create_files(pub_data, dir_name):
    """Create directory and files based on publication data."""
    # Ensure valid filenames
    directory = "out"
    valid_dir_name = os.path.join(directory, dir_name.replace("/", "_").replace("\\", "_").replace('"', '').replace(':', '_').replace('?', ''))
    os.makedirs(valid_dir_name, exist_ok=True)

    # Extracting and formatting author list to include only first names
    authors_source = [f"{author.get('given', '')} {author.get('family', '')}" for author in pub_data.get('author', [])]
    authors = authors_source.copy()
    authors = replace_with_dictionary(authors, name_to_account)
    authors_md = "\n- ".join([""] + authors)  # Markdown list format
    
    # Format publication date correctly
    pub_date_parts = pub_data.get('published-print', pub_data.get('published-online', {'date-parts': [[0]]}))['date-parts'][0]
    if len(pub_date_parts) == 3:  # Full date available
        formatted_pub_date = f"{pub_date_parts[0]:04d}-{pub_date_parts[1]:02d}-{pub_date_parts[2]:02d}T00:00:00Z"
    elif len(pub_date_parts) == 2:  # Only year and month available
        formatted_pub_date = f"{pub_date_parts[0]:04d}-{pub_date_parts[1]:02d}-01T00:00:00Z"
    else:  # Only year available
        formatted_pub_date = f"{pub_date_parts[0]:04d}-01-01T00:00:00Z"

    # Edge Case during parsing:
    publication = pub_data.get('container-title', [''])
    if len(publication) == 0:
        publication = ['']
    # Index.md content
    index_content = f"""---
title: "{pub_data.get('title', [''])[0]}"
authors:{authors_md}
date: "{formatted_pub_date}"
doi: "{pub_data.get('DOI', '')}"
abstract: "{pub_data.get('abstract', '').replace('\n', ' ')}"
url_pdf: "{pub_data.get('URL', '')}"
publication: "{publication[0]}"
publication_types: ["article-journal"]
---

Add the publication's full text or supplementary notes here.
"""
    with open(os.path.join(valid_dir_name, "index.md"), "w", encoding='utf-8') as f:
        f.write(index_content)

    # Prepare full names for citation
    bib_authors = ", ".join(authors_source)
    bib_content = f"""@article{{{pub_data.get('DOI', '').replace('/', '_')},
  title = {{{pub_data.get('title', [''])[0]}}},
  author= {{{bib_authors}}},
  journal = {{{publication[0]}}},
  year    = {pub_date_parts[0]},
  volume  = {pub_data.get('volume', '')},
  number  = {pub_data.get('issue', '')},
  doi     = {{{pub_data.get('DOI', '')}}},
  url     = {{{pub_data.get('URL', '')}}}
}}
"""
    with open(os.path.join(valid_dir_name, "cite.bib"), "w", encoding='utf-8') as f:
        f.write(bib_content)

    print(f"Files generated in directory: {valid_dir_name}")




# if __name__ == "__main__":
#     if len(sys.argv) != 2:
#         print("Usage: python fetch_pub_data.py <DOI>")
#         sys.exit(1)
    
#     doi = sys.argv[1]
#     pub_data = fetch_data(doi)

#     # Use title for directory name, or DOI if title is unavailable
#     dir_name = pub_data.get('DOI').split('/')[1]
#     create_files(pub_data, dir_name)

In [37]:
doi = "10.1021/bi901283p"
pub_data = fetch_data(doi)
dir_name = pub_data.get('DOI').split('/')[1]
create_files(pub_data, dir_name)

Files generated in directory: out/bi901283p


Script to fetch all DOIs by an author

In [14]:
# import requests

# def find_papers_by_author(author_name):
#     # Construct the CrossRef API URL for querying papers by author
#     url = f"https://api.crossref.org/works?query.author={author_name}"
    
#     # Make a GET request to the CrossRef API
#     response = requests.get(url)
    
#     # Check if the request was successful (status code 200)
#     if response.status_code == 200:
#         # Parse the JSON response
#         data = response.json()
        
#         # Extract DOIs from the API response
#         dois = [item['DOI'] for item in data['message']['items']]
        
#         return dois
#     else:
#         # If request was not successful, print error message
#         print("Error fetching papers:", response.status_code)
#         return None

# # Example usage:
# author_name = "Tod A. Pascal"
# paper_dois = find_papers_by_author(author_name)
# if paper_dois:
#     print("Papers by", author_name + ":")
#     for doi in paper_dois:
#         print("- DOI:", doi)


Papers by Tod A. Pascal:
- DOI: 10.1038/s41467-023-37857-3
- DOI: 10.21203/rs.3.rs-1683269/v1
- DOI: 10.1021/jp410861h
- DOI: 10.1021/jp309693d
- DOI: 10.1021/acs.jpclett.1c02609
- DOI: 10.1016/j.cmpb.2011.04.006
- DOI: 10.1039/c6cp03940e
- DOI: 10.1039/c4cp05316h
- DOI: 10.1149/ma2022-01381691mtgabs
- DOI: 10.1063/5.0054314
- DOI: 10.1039/d2nr05732h
- DOI: 10.1021/jz200453u
- DOI: 10.1007/s40262-012-0014-9
- DOI: 10.1063/1.3456543
- DOI: 10.1073/pnas.1108073108
- DOI: 10.1149/ma2014-02/1/17
- DOI: 10.1149/ma2017-01/43/1984
- DOI: 10.1016/j.commatsci.2012.12.024
- DOI: 10.2139/ssrn.236033
- DOI: 10.1149/ma2016-03/2/422


In [20]:
for doi in paper_dois:
    pub_data = fetch_data(doi)
    dir_name = pub_data.get('DOI').split('/')[1]
    create_files(pub_data, dir_name)

Files generated in directory: s41467-023-37857-3
Files generated in directory: rs.3.rs-1683269
Files generated in directory: jp410861h
Files generated in directory: jp309693d
Files generated in directory: acs.jpclett.1c02609
Files generated in directory: j.cmpb.2011.04.006
Files generated in directory: c6cp03940e
Files generated in directory: c4cp05316h
Files generated in directory: ma2022-01381691mtgabs
Files generated in directory: 5.0054314
Files generated in directory: d2nr05732h
Files generated in directory: jz200453u
Files generated in directory: s40262-012-0014-9
Files generated in directory: 1.3456543
Files generated in directory: pnas.1108073108
Files generated in directory: ma2014-02
Files generated in directory: ma2017-01
Files generated in directory: j.commatsci.2012.12.024
Files generated in directory: ssrn.236033
Files generated in directory: ma2016-03


In [1]:
import os

def find_main_versions(directory):
    main_versions = {}
    for filename in os.listdir(directory):
        if filename.endswith(".pdf") and filename.startswith(tuple(str(num) for num in range(10))):
            if "_si" not in filename and "_SM" not in filename and "-si" not in filename and "-mmc1" not in filename and ".sapp" not in filename:
                paper_number = filename.split(".")[0]
                if paper_number not in main_versions:
                    main_versions[paper_number] = filename
    return main_versions

# Example usage:
directory = "published"
main_versions = find_main_versions(directory)
print("Main versions of papers:")
for paper_number, filename in main_versions.items():
    print(f"Paper {paper_number}: {filename}")

Main versions of papers:
Paper 004: 004.jbc-8829-36.pdf
Paper 013: 013.nl104227t.pdf
Paper 003: 003.NanoSciNanoTech.707.pdf
Paper 059: 059.s41467-021-23603-0.pdf
Paper 034: 034.c5cp02951a.pdf
Paper 044: 044.jacs.7b11891.pdf
Paper 055: 055.s41560-021-00783-z.pdf
Paper 079: 079.acs.jpcb.2c08843.ms.pdf
Paper 032: 032.nature14327-s1.pdf
Paper 068: 068.acsenergylett.1c02723.pdf
Paper 085: 085.acsami.3c07224.si.pdf
Paper 069: 069.d1ee03422g.pdf
Paper 012: 012.jz200453u.pdf
Paper 054: 054.1-s2.0-S1369702120303382-main.pdf
Paper 087: 087.jacs.3c05093.si.pdf
Paper 051: 051.acs.jpclett.9b01835.pdf
Paper 080: 080.acs.jpclett.2c03942.si.1.pdf
Paper 067: 067.1-s2.0-S2666386421004537-main.pdf
Paper 008: 008.JChemPhys_133_134114.pdf
Paper 046: 046.J. Electrochem. Soc.-2018-Wang-A3487-95.pdf
Paper 061: 061.acs.nanolett.1c01502.pdf
Paper 070: 070.acs.nanolett.2c00047.si.pdf
Paper 083: 083.acs.nanolett.3c01825.si.pdf
Paper 048: 048.jacs.8b09743.pdf
Paper 040: 040.nlE7b00249.pdf
Paper 006: 006.BioChem.bi

In [17]:
import os

def find_main_and_supplementary_versions(directory):
    main_versions = {}
    supplementary_versions = {}
    for paper_number in range(1, 89):  # Loop from 1 to 88
        padded_number = str(paper_number).zfill(3)  # Zero-pad the number to ensure three digits
        main_file = None
        sup_file1 = None
        sup_file2 = None
        count = 0
        for filename in os.listdir(directory):
            if filename.startswith(padded_number) and filename.endswith(".pdf"):
                count += 1
                if not any(keyword in filename for keyword in ["_si", "_SM", "-si", "-mmc1", ".sapp", "-s1", ".SM", ".si", "_sm", "_Sm", "_SI", "-SI", "-sm", "_ESM"]):
                    main_file = filename
                    # break  # Found main version, exit loop
                else:
                    if sup_file1:
                        sup_file2 = filename
                    else:
                        sup_file1 = filename
        if main_file:
            main_versions[padded_number] = main_file
        if sup_file1:
            if sup_file2:
                supplementary_versions[padded_number] = sup_file1 + ";" + sup_file2
                if count != 3:
                    raise ValueError(f"File number does not checkout 3 for {padded_number}")
            else:
                supplementary_versions[padded_number] = sup_file1
                if count != 2:
                    raise ValueError(f"File number does not checkout 2 for {padded_number}")
        else:
            if count != 1:
                raise ValueError(f"File number does not checkout 1 for {padded_number}")
    return main_versions, supplementary_versions

# Example usage:
directory = "published"
main_versions, supplementary_versions = find_main_and_supplementary_versions(directory)
print("Main versions of papers:")
for paper_number, filename in main_versions.items():
    print(f"Paper {paper_number}: {filename}")

print("Sup versions of papers:")
for paper_number, file_path in supplementary_versions.items():
    print(f"Paper {paper_number}: {file_path}")

Main versions of papers:
Paper 001: 001.nar-6047.full.pdf
Paper 002: 002.BioPhysJ-2006-1463.pdf
Paper 003: 003.NanoSciNanoTech.707.pdf
Paper 004: 004.jbc-8829-36.pdf
Paper 005: 005.jbc-15835-46.pdf
Paper 006: 006.BioChem.bi901283p.pdf
Paper 007: 007.cphc_201000528.pdf
Paper 008: 008.JChemPhys_133_134114.pdf
Paper 009: 009.jbc-37753-61.pdf
Paper 010: 010.jz101391r.pdf
Paper 011: 011.c0cp01549k.pdf
Paper 012: 012.jz200453u.pdf
Paper 013: 013.nl104227t.pdf
Paper 014: 014.ct200211b.pdf
Paper 015: 015.jz200760n.pdf
Paper 016: 016.pnas.1108073108.pdf
Paper 017: 017.jp209541e.pdf
Paper 018: 018.jz201612y.pdf
Paper 019: 019.jz3000036.pdf
Paper 020: 020.jp301610b.pdf
Paper 021: 021.jp306473u.pdf
Paper 022: 022.jp309693d.pdf
Paper 023: 023.025248JCP.pdf
Paper 024: 024.jp310422q.pdf
Paper 025: 025.commmat_4952.pdf
Paper 026: 026.066401JCP.pdf
Paper 027: 027.jz500260s.pdf
Paper 028: 028.jecs.A1100-6.pdf
Paper 029: 029.jp410861h.pdf
Paper 030: 030.science.831.full.pdf
Paper 031: 031.C4CP05316H.pdf


In [74]:
import pdfplumber
import re

def extract_dois_from_pdf(pdf_file_path):
    dois = []
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            # Use regex to find DOI patterns
            doi_matches = re.findall(r'\b(10\.\d{4,}(?:\.\d+)*\/[-._;()\/:A-Z0-9]+)\b', text, re.IGNORECASE) 
            for match in doi_matches:
                dois.append(match)
    return dois

In [64]:
import os
import requests
import sys
import json
import shutil

name_to_account = {
    'Tod A. Pascal': 'admin',
}

def replace_with_dictionary(array, dictionary):
    # Iterate over each string in the array
    for i in range(len(array)):
        # If the string is a key in the dictionary, replace it with the corresponding value
        if array[i] in dictionary:
            array[i] = dictionary[array[i]]
    return array

def fetch_data(doi):
    """Fetch publication data from CrossRef API."""
    url = f"https://api.crossref.org/works/{doi}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()['message']
    else:
        print("Failed to fetch data.")
        sys.exit(1)

def create_files(pub_data, dir_name, source_dir, main_file, sup_file1, sup_file2):
    """Create directory and files based on publication data."""
    # Ensure valid filenames
    directory = "out"
    sub_directory = dir_name.replace("/", "_").replace("\\", "_").replace('"', '').replace(':', '_').replace('?', '')
    valid_dir_name = os.path.join(directory, sub_directory)
    os.makedirs(valid_dir_name, exist_ok=True)

    # Extracting and formatting author list to include only first names
    authors_source = [f"{author.get('given', '')} {author.get('family', '')}" for author in pub_data.get('author', [])]
    authors = authors_source.copy()
    authors = replace_with_dictionary(authors, name_to_account)
    authors_md = "\n- ".join([""] + authors)  # Markdown list format
    
    # Format publication date correctly
    pub_date_parts = pub_data.get('published-print', pub_data.get('published-online', {'date-parts': [[0]]}))['date-parts'][0]
    if len(pub_date_parts) == 3:  # Full date available
        formatted_pub_date = f"{pub_date_parts[0]:04d}-{pub_date_parts[1]:02d}-{pub_date_parts[2]:02d}T00:00:00Z"
    elif len(pub_date_parts) == 2:  # Only year and month available
        formatted_pub_date = f"{pub_date_parts[0]:04d}-{pub_date_parts[1]:02d}-01T00:00:00Z"
    else:  # Only year available
        formatted_pub_date = f"{pub_date_parts[0]:04d}-01-01T00:00:00Z"

    main_link = ""
    if main_file:
        shutil.copy2(os.path.join(source_dir, main_file), valid_dir_name)
        main_link = f"""- name: Main Paper
  url: "publication/{sub_directory}/{main_file}" """
    sup_link1 = ""
    if sup_file1:
        shutil.copy2(os.path.join(source_dir, sup_file1), valid_dir_name)
        sup_link1 = f"""- name: Supporting Material
  url: "publication/{sub_directory}/{sup_file1}" """
    sup_link2 = ""
    if sup_file2:
        shutil.copy2(os.path.join(source_dir, sup_file2), valid_dir_name)
        sup_link2 = f"""- name: Supporting Material 2
  url: "publication/{sub_directory}/{sup_file2}" """
    
    # Edge Case during parsing:
    publication = pub_data.get('container-title', [''])
    if len(publication) == 0:
        publication = ['']
    # Index.md content
    index_content = f"""---
title: "{pub_data.get('title', [''])[0]}"
authors:{authors_md}
date: "{formatted_pub_date}"
doi: "{pub_data.get('DOI', '')}"
abstract: "{pub_data.get('abstract', '').replace('\n', ' ')}"
links:
{main_link}
{sup_link1}
{sup_link2}
publication: "{publication[0]}"
publication_types: ["article-journal"]
---

Add the publication's full text or supplementary notes here.
"""
    with open(os.path.join(valid_dir_name, "index.md"), "w", encoding='utf-8') as f:
        f.write(index_content)

    # Prepare full names for citation
    bib_authors = ", ".join(authors_source)
    bib_content = f"""@article{{{pub_data.get('DOI', '').replace('/', '_')},
  title = {{{pub_data.get('title', [''])[0]}}},
  author= {{{bib_authors}}},
  journal = {{{publication[0]}}},
  year    = {pub_date_parts[0]},
  volume  = {pub_data.get('volume', '')},
  number  = {pub_data.get('issue', '')},
  doi     = {{{pub_data.get('DOI', '')}}},
  url     = {{{pub_data.get('URL', '')}}}
}}
"""
    with open(os.path.join(valid_dir_name, "cite.bib"), "w", encoding='utf-8') as f:
        f.write(bib_content)

    print(f"Files generated in directory: {valid_dir_name}")


In [83]:
from collections import Counter

def most_common_string(strings):
    # Count occurrences of each string
    counts = Counter(strings)
    # Find the most common string
    most_common = max(counts, key=counts.get)
    return most_common

In [80]:
import os

def create_files_with_main_and_supplementary_versions(directory):
    main_versions = {}
    supplementary_versions = {}
    for paper_number in range(1, 89):  # Loop from 1 to 88
        padded_number = str(paper_number).zfill(3)  # Zero-pad the number to ensure three digits
        main_file = None
        sup_file1 = None
        sup_file2 = None
        count = 0
        for filename in os.listdir(directory):
            if filename.startswith(padded_number) and filename.endswith(".pdf"):
                count += 1
                if not any(keyword in filename for keyword in ["_si", "_SM", "-si", "-mmc1", ".sapp", "-s1", ".SM", ".si", "_sm", "_Sm", "_SI", "-SI", "-sm", "_ESM"]):
                    main_file = filename
                    # break  # Found main version, exit loop
                else:
                    if sup_file1:
                        sup_file2 = filename
                    else:
                        sup_file1 = filename
        if main_file:
            main_versions[padded_number] = main_file
        if sup_file1:
            if sup_file2:
                supplementary_versions[padded_number] = sup_file1 + ";" + sup_file2
                if count != 3:
                    raise ValueError(f"File number does not checkout 3 for {padded_number}")
            else:
                supplementary_versions[padded_number] = sup_file1
                if count != 2:
                    raise ValueError(f"File number does not checkout 2 for {padded_number}")
        else:
            if count != 1:
                raise ValueError(f"File number does not checkout 1 for {padded_number}")

        dois = extract_dois_from_pdf(os.path.join(directory, main_file))
        print(dois)
        if len(dois) < 1:
            print(f"No DOI possible for paper {padded_number}")
            continue
        doi = most_common_string(dois)
        pub_data = fetch_data(doi)
        
        # Use title for directory name, or DOI if title is unavailable
        dir_name = pub_data.get('DOI').split('/')[1]
        create_files(pub_data, dir_name, directory, main_file, sup_file1, sup_file2)
    
    return main_versions, supplementary_versions


In [81]:
import os

def create_files_with_main_and_supplementary_versions_test(directory):
    main_versions = {}
    supplementary_versions = {}
    for paper_number in range(1, 2):  # Loop from 1 to 88
        padded_number = str(paper_number).zfill(3)  # Zero-pad the number to ensure three digits
        main_file = None
        sup_file1 = None
        sup_file2 = None
        count = 0
        for filename in os.listdir(directory):
            if filename.startswith(padded_number) and filename.endswith(".pdf"):
                count += 1
                if not any(keyword in filename for keyword in ["_si", "_SM", "-si", "-mmc1", ".sapp", "-s1", ".SM", ".si", "_sm", "_Sm", "_SI", "-SI", "-sm", "_ESM"]):
                    main_file = filename
                    # break  # Found main version, exit loop
                else:
                    if sup_file1:
                        sup_file2 = filename
                    else:
                        sup_file1 = filename
        if main_file:
            main_versions[padded_number] = main_file
        if sup_file1:
            if sup_file2:
                supplementary_versions[padded_number] = sup_file1 + ";" + sup_file2
                if count != 3:
                    raise ValueError(f"File number does not checkout 3 for {padded_number}")
            else:
                supplementary_versions[padded_number] = sup_file1
                if count != 2:
                    raise ValueError(f"File number does not checkout 2 for {padded_number}")
        else:
            if count != 1:
                raise ValueError(f"File number does not checkout 1 for {padded_number}")

        dois = extract_dois_from_pdf(os.path.join(directory, main_file))
        print(dois)
        if len(dois) < 1:
            print(f"No DOI possible for paper {padded_number}")
            continue
        doi = most_common_string(dois)
        pub_data = fetch_data(doi)
        
        # Use title for directory name, or DOI if title is unavailable
        dir_name = pub_data.get('DOI').split('/')[1]
        create_files(pub_data, dir_name, directory, main_file, sup_file1, sup_file2)
    
    return main_versions, supplementary_versions


In [84]:
# Example usage:
directory = "published"
main_versions, supplementary_versions = create_files_with_main_and_supplementary_versions(directory)
print("Main versions of papers:")
for paper_number, filename in main_versions.items():
    print(f"Paper {paper_number}: {filename}")

print("Sup versions of papers:")
for paper_number, file_path in supplementary_versions.items():
    print(f"Paper {paper_number}: {file_path}")

['10.1093/nar/gkh931']
Files generated in directory: out/nar
['10.1529/biophysj.105.064733']
Files generated in directory: out/biophysj.105.064733
['10.1166/jnn.2007.704']
Files generated in directory: out/jnn.2007.704
[]
No DOI possible for paper 004
[]
No DOI possible for paper 005
['10.1021/bi901283p']
Files generated in directory: out/bi901283p
['10.1002/cphc.201000528', '10.1002/cphc.201000528']
Files generated in directory: out/cphc.201000528
['10.1063/1.3456543', '10.1063/1.3456543', '10.1063/1.3456543(cid:5', '10.1063/1.3456543']
Files generated in directory: out/1.3456543
[]
No DOI possible for paper 009
['10.1021/jz101391r', '10.1021/jz101391r', '10.1021/jz101391r', '10.1021/jz101391r', '10.1021/jz101391r', '10.1021/jz101391r', '10.1021/jz101391r', '10.1021/jz101391r', '10.1021/jz101391r']
Files generated in directory: out/jz101391r
['10.1039/c0cp01549k']
Files generated in directory: out/c0cp01549k
['10.1021/jz200453u', '10.1021/jz200453u', '10.1021/jz200453u', '10.1021/jz20

In [58]:
directory = "test"
main_versions, supplementary_versions = create_files_with_main_and_supplementary_versions_test(directory)
print("Main versions of papers:")
for paper_number, filename in main_versions.items():
    print(f"Paper {paper_number}: {filename}")

print("Sup versions of papers:")
for paper_number, file_path in supplementary_versions.items():
    print(f"Paper {paper_number}: {file_path}")

['10.1021/bi901283p']
Files generated in directory: out/bi901283p
Main versions of papers:
Paper 001: 001.BioChem.bi901283p.pdf
Sup versions of papers:
Paper 001: 001.BioChem.bi901283p_Sm.pdf
