# Publications markdown generator for academicpages

Takes a set of BibTeX publication entries and converts them for use with [academicpages.github.io](https://academicpages.github.io). This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)). 

The core python code is also in `pubsFromBibs.py`. 
Run either one from the `markdown_generator` folder after updating the `publist` dictionary with:
* bib file names
* specific venue keys based on your bib file preferences
* any specific pre-text for specific files
* Collection Name (future feature)

* TODO: Make this work with other citation databases.
* TODO: Merge this with the existing TSV parsing solution.

In [2]:
from pybtex.database.input import bibtex
import pybtex.database.input.bibtex 
from time import strptime
import string
import html
import os
import re

In [3]:
# TODO: support distinct collection types instead of a single catch-all
# "publications" collection (this also needs changes to the site template).
#
# One entry per BibTeX source. Each entry holds:
#   "file"          - BibTeX file that is parsed for this source
#   "venuekey"      - name of the BibTeX field that carries the venue
#   "venue-pretext" - text meant to precede the venue name
#   "collection"    - collection name and permalink prefix written to each page
publist = {}

# Example of a second source for conference proceedings (disabled):
# publist["proceeding"] = {
#     "file" : "proceedings.bib",
#     "venuekey": "booktitle",
#     "venue-pretext": "In the proceedings of ",
#     "collection" : {"name":"publications",
#                     "permalink":"/publication/"}
# }

publist["journal"] = {
    "file": "pubs.bib",
    "venuekey": "journal",
    "venue-pretext": "",
    "collection": {
        "name": "publications",
        "permalink": "/publication/",
    },
}

In [4]:
# Characters that are replaced by HTML entities, and the entity used for each.
html_escape_table = dict([
    ("&", "&amp;"),
    ('"', "&quot;"),
    ("'", "&apos;"),
])


def html_escape(text):
    """Return ``text`` with ``&``, ``"`` and ``'`` replaced by HTML entities.

    Every other character is copied through unchanged.
    """
    escaped_chars = []
    for char in text:
        escaped_chars.append(html_escape_table.get(char, char))
    return "".join(escaped_chars)

In [6]:
# Lookup from the raw BibTeX `journal` value to the venue name shown on the site.
pubkey = dict([
    ('\\apj', 'The Astrophysical Journal'),
    ('Research in Astronomy and Astrophysics', 'Research in Astronomy and Astrophysics'),
    ('Progress in Astronomy', 'Progress in Astronomy'),
])

In [8]:
# Show the configuration of the journal source. The literal key is used so
# this cell also runs on a fresh kernel: `pubsource` is only bound once the
# generator loop further down has executed.
publist["journal"]

{'file': 'pubs.bib',
 'venuekey': 'journal',
 'venue-pretext': '',
 'collection': {'name': 'publications', 'permalink': '/publication/'}}

In [12]:
import re

def format_author(author_str):
    """Format one author as the surname followed by given-name initials.

    The input must look like ``"Given Names {Surname}"``, i.e. the surname is
    wrapped in braces. Hyphens and dots in the given names are treated as
    separators, so ``"Jian-Min {Wang}"`` becomes ``"Wang J M"``.

    Returns:
        The formatted name, just the surname when there are no given names
        (e.g. ``"{LIGO}"``), or ``None`` when no braced surname is found.
    """
    match = re.match(r'^\s*(.*?)\s*{\s*(.+?)\s*}\s*$', author_str)
    if not match:
        return None
    given_names = re.sub(r'[-.]', ' ', match.group(1).strip())
    surname = match.group(2).strip()
    initials = [part[0].upper() for part in re.split(r'\s+', given_names) if part]
    if not initials:
        # Avoid the dangling space that "surname + ' ' + ''" would produce.
        return surname
    return f"{surname} {' '.join(initials)}"


def _find_title_span(text):
    """Return (open, close) indexes of the outermost double quotes, or None.

    None is returned when ``text`` holds fewer than two double quotes.
    """
    open_idx = text.find('"')
    close_idx = text.rfind('"')
    if open_idx == -1 or close_idx == open_idx:
        return None
    return open_idx, close_idx


def format_citation(input_str):
    """Condense ``authors, "title" journal, year, volume`` into a short citation.

    The title is dropped, authors are rewritten with ``format_author`` and
    truncated to the first three followed by ``et al.``.

    The title delimiters are looked up among the literal double quotes first,
    so ``&quot;`` entities inside the title are never mistaken for the closing
    quote. Only when the input has no literal pair are ``&quot;`` entities
    unescaped and used as delimiters.

    Returns:
        The formatted citation, or the string ``"Invalid format"`` when the
        quoted title or the ``journal, year`` part cannot be found.
    """
    text = input_str
    span = _find_title_span(text)
    if span is None:
        # Fully HTML-escaped input: the delimiters themselves are entities.
        text = input_str.replace('&quot;', '"')
        span = _find_title_span(text)
        if span is None:
            return "Invalid format"
    title_open, title_close = span

    # Split off the author list and whatever follows the title.
    authors_part = text[:title_open].replace('&quot;', '"').strip(' ,')
    remaining_str = text[title_close + 1:].replace('&quot;', '"').strip()

    # Extract the journal, the four-digit year and the optional volume info.
    journal_info = re.match(r'^([^,]+),\s*(\d{4})(.*)$', remaining_str)
    if not journal_info:
        return "Invalid format"

    journal = journal_info.group(1).strip()
    year = journal_info.group(2)
    vol_info = journal_info.group(3).strip(' ,.')

    # Format every author; entries that format_author rejects are skipped.
    formatted_authors = []
    for author in authors_part.split(','):
        author = author.strip()
        if not author:
            continue
        formatted = format_author(author)
        if formatted:
            formatted_authors.append(formatted)

    # Keep at most three authors.
    if len(formatted_authors) > 3:
        authors_str = ', '.join(formatted_authors[:3]) + ', et al.'
    else:
        authors_str = ', '.join(formatted_authors)

    # Assemble the citation; no leading space when there are no authors.
    citation = f"{journal}, {year}"
    if authors_str:
        citation = f"{authors_str} {citation}"
    if vol_info:
        citation += f", {vol_info}"
    return citation

# Example usage: format one sample citation string and print the result.
input_str = (
    ' Zijian {Zhang},  Bin {Luo},  W. {Brandt},  Pu {Du},  Chen {Hu},'
    '  Jian {Huang},  Xingting {Pu},  Jian-Min {Wang},  Weimin {Yi},'
    ' "XMM-Newton Observations of Two Archival X-Ray Weak Type 1 Quasars:'
    ' Obscuration Induced X-Ray Weakness and Variability."'
    ' The Astrophysical Journal, 2023, 954(2): 159.'
)
print(format_citation(input_str))

Zhang Z, Luo B, Brandt W, et al. The Astrophysical Journal, 2023, 954(2): 159


In [13]:
def _month_number(raw_month, default="01"):
    """Return a BibTeX month field as a zero-padded two-digit string.

    Numeric months in 1..12 are zero-padded; other text is parsed from its
    first three characters as an abbreviated month name (``%b``). Anything
    that cannot be interpreted yields ``default`` instead of raising, so one
    malformed entry does not abort the whole run.
    """
    month_text = str(raw_month).strip()
    if month_text.isdecimal():
        month_value = int(month_text)
        return "{:02d}".format(month_value) if 1 <= month_value <= 12 else default
    try:
        return "{:02d}".format(strptime(month_text[:3], '%b').tm_mon)
    except ValueError:
        return default


def _yaml_single_quoted(value):
    """Wrap ``value`` in single quotes for YAML, doubling embedded single quotes."""
    return "'" + str(value).replace("'", "''") + "'"


# Write one markdown page per BibTeX entry of every configured source.
for pubsource in publist:
    source_cfg = publist[pubsource]
    parser = bibtex.Parser()
    bibdata = parser.parse_file(source_cfg["file"])

    #loop through the individual references in a given bibtex file
    for bib_id in bibdata.entries:
        #reset default date
        pub_year = "1900"
        pub_month = "01"
        pub_day = "01"

        entry = bibdata.entries[bib_id]
        b = entry.fields

        try:
            pub_year = f'{b["year"]}'

            if "month" in b.keys():
                pub_month = _month_number(b["month"])
            if "day" in b.keys():
                day_text = str(b["day"]).strip()
                # Zero-pad numeric days so the date reads YYYY-MM-DD.
                pub_day = day_text.zfill(2) if day_text.isdecimal() else day_text

            pub_date = pub_year+"-"+pub_month+"-"+pub_day

            #strip out {} as needed (some bibtex entries that maintain formatting)
            plain_title = b["title"].replace("{", "").replace("}","").replace("\\","")
            clean_title = plain_title.replace(" ","-")

            url_slug = re.sub("\\[.*\\]|[^a-zA-Z0-9_-]", "", clean_title)
            url_slug = url_slug.replace("--","-")

            md_filename = (str(pub_date) + "-" + url_slug + ".md").replace("--","-")
            html_filename = (str(pub_date) + "-" + url_slug).replace("--","-")

            #Build Citation from text
            citation = ""

            #citation authors - todo - add highlighting for primary author?
            authors = entry.persons["author"]
            for author in authors:
                # Authors such as collaborations may have no given names.
                first_name = author.first_names[0] if author.first_names else ""
                last_name = author.last_names[0] if author.last_names else ""
                citation = citation+" "+first_name+" "+last_name+", "

            #citation title
            citation = citation + "\"" + html_escape(plain_title) + ".\""

            #add venue logic depending on citation type
            # The venue field name comes from the source configuration. Values
            # missing from pubkey fall back to the cleaned field text instead
            # of skipping the entry with a misleading "missing field" warning.
            venue_raw = b[source_cfg["venuekey"]]
            if venue_raw not in pubkey:
                print(f'NOTE venue {venue_raw!r} of entry {bib_id} is not in pubkey; using the cleaned field text')
            venue_fallback = venue_raw.replace("{", "").replace("}","").replace("\\","")
            venue = source_cfg["venue-pretext"] + pubkey.get(venue_raw, venue_fallback)

            citation = citation + " " + html_escape(venue)
            citation = citation + ", " + pub_year + "."


            ## YAML variables
            md = "---\ntitle: \""   + html_escape(plain_title) + '"\n'

            md += """collection: """ +  source_cfg["collection"]["name"]

            # The category depends on whether the first author's first given
            # name equals the hard-coded value below.
            lead_given_names = authors[0].first_names if authors else []
            if lead_given_names and lead_given_names[0] == 'Zijian':
                md += """\ncategory: firstauthor"""
            else:
                md += """\ncategory: coauthor"""

            md += """\npermalink: """ + source_cfg["collection"]["permalink"]  + html_filename

            note = False
            if "note" in b.keys():
                if len(str(b["note"])) > 5:
                    md += "\nexcerpt: '" + html_escape(b["note"]) + "'"
                    note = True

            md += "\ndate: " + str(pub_date)

            md += "\nvenue: '" + html_escape(venue) + "'"

            url = False  # only read by the disabled link block below
            if "url" in b.keys():
                if len(str(b["url"])) > 5:
                    md += "\npaperurl: " + _yaml_single_quoted(b["url"])
                    url = True

            # Author names are not HTML-escaped, so an apostrophe (O'Brien)
            # must be doubled to keep the single-quoted YAML value valid.
            md += "\ncitation: " + _yaml_single_quoted(format_citation(citation))   #html_escape(citation)
            # adsurl is optional, like url: entries without it are still written.
            if "adsurl" in b.keys():
                md += "\nads_link: " + _yaml_single_quoted(b["adsurl"])

            md += "\n---"


            ## Markdown description for individual page
            if note:
                md += "\n" + html_escape(b["note"]) + "\n"

            # if url:
            #     md += "\n[Access paper here](" + b["url"] + "){:target=\"_blank\"}\n" 
            # else:
            #     md += "\nUse [Google Scholar](https://scholar.google.com/scholar?q="+html.escape(clean_title.replace("-","+"))+"){:target=\"_blank\"} for full citation"

            md_filename = os.path.basename(md_filename)

            with open("../_publications/" + md_filename, 'w', encoding="utf-8") as f:
                f.write(md)
            print(f'SUCESSFULLY PARSED {bib_id}: \"', b["title"][:60],"..."*(len(b['title'])>60),"\"")
        # field may not exist for a reference
        except KeyError as e:
            # The missing field may be the title itself, so do not index it here.
            title_text = b["title"] if "title" in b.keys() else ""
            print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \"', title_text[:30],"..."*(len(title_text)>30),"\"")
            continue


SUCESSFULLY PARSED 2025ApJ...991..137Z: " {On the Variability Features of Active Galactic Nuclei in Li ... "
SUCESSFULLY PARSED 2025ApJ...988..204C: " {Estimating Bolometric Luminosities of Type 1 Quasars with S ... "
SUCESSFULLY PARSED 2025ApJ...987..198P: " {Lyman-break Galaxies in the Megaparsec-scale Environments a ... "
SUCESSFULLY PARSED 2025ApJ...985..119Z: " {Analysis of Multi-epoch JWST Images of {\ensuremath{\sim}}3 ... "
SUCESSFULLY PARSED 2025ApJ...983...36Z: " {On the Extremely X-Ray Variable Active Galactic Nuclei in t ... "
SUCESSFULLY PARSED 2025ApJ...979..107H: " {Photometric Selection of Type 1 Quasars in the XMM-LSS Fiel ... "
SUCESSFULLY PARSED 2024ApJ...976...99S: " {GTC Optical/Near-infrared Upper Limits and NICER X-Ray Anal ... "
SUCESSFULLY PARSED 2023RAA....23k5023Z: " {GRB 220408B: A Three-episode Burst from a Precessing Jet}  "
SUCESSFULLY PARSED 2023ApJ...954..159Z: " {XMM-Newton Observations of Two Archival X-Ray Weak Type 1 Q ... "
SUCESSFULLY PARSED 2021A

In [3]:
import os
# NOTE(review): this cell is unrelated to the publications generator. In the
# recorded output below, the command prints a traceback that passes through
# argparse and exits with status 256 — confirm the intended CRDS arguments.
crds_sync_command = 'crds sync --contexts latest'
os.system(crds_sync_command)

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/zijianzhang/miniconda3/envs/jwstpy311/lib/python3.11/site-packages/crds/sync.py", line 697, in <module>
    sys.exit(SyncScript()())
             ^^^^^^^^^^^^
  File "/Users/zijianzhang/miniconda3/envs/jwstpy311/lib/python3.11/site-packages/crds/core/cmdline.py", line 681, in __init__
    super(ContextsScript, self).__init__(*args, **keys)
  File "/Users/zijianzhang/miniconda3/envs/jwstpy311/lib/python3.11/site-packages/crds/core/cmdline.py", line 137, in __init__
    self.args = self.parser.parse_args(argv[1:])
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/zijianzhang/miniconda3/envs/jwstpy311/lib/python3.11/argparse.py", line 1869, in parse_args
    args, argv = self.parse_known_args(args, namespace)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/zijianzhang/miniconda3/envs/jwstpy311/lib/py

256

Zhang Z, Luo B, Brandt W, et al. The Astrophysical Journal, 2023, 954(2): 159
