In [15]:
import csv

In [2]:
# Load the author spreadsheet (TSV export) and split its rows into:
#   authors      - rows whose 'Include?' column is 'Author'
#   contributors - rows marked 'Author' or 'Acknowledgements' (so authors are included)
# Every kept row gains two derived keys:
#   'affiliations' - set of the ';'-separated, stripped entries of 'Merged Affiliation'
#   'normname'     - the comma-separated parts of 'Author, as printed' in reverse
#                    order, joined by spaces ('Wagner, Alex H' -> 'Alex H Wagner')
#
# newline='' is how the csv module expects its files to be opened (otherwise
# newlines embedded in quoted fields are not interpreted correctly); the explicit
# encoding keeps the parse independent of the platform's default locale.
with open('VRS Authors - Sheet1.tsv', 'r', newline='', encoding='utf-8') as f:
    dictreader = csv.DictReader(f, delimiter='\t')
    authors = []
    contributors = []
    for record in dictreader:
        role = record['Include?']
        if role not in ('Author', 'Acknowledgements'):
            continue
        record['affiliations'] = {
            part.strip() for part in record['Merged Affiliation'].split(';')
        }
        name_parts = [part.strip() for part in record['Author, as printed'].split(',')]
        record['normname'] = ' '.join(reversed(name_parts))
        if role == 'Author':
            authors.append(record)
        contributors.append(record)

In [3]:
# Union of every author's affiliation set.
affiliations = set().union(*(author['affiliations'] for author in authors))

In [4]:
# Display the combined set of author affiliations for a visual check.
affiliations

{'Baylor College of Medicine, Houston, TX 77030, USA',
 'Bioinformatics Division, Walter and Eliza Hall Institute of Medical Research, Melbourne, Australia',
 'Center for Genomic Medicine, Massachusetts General Hospital, Cambridge, MA 02142, USA',
 'Department of Artificial Intelligence and Informatics, Mayo Clinic, Rochester, MN 55905, USA',
 'Department of Biomedical Informatics, Harvard Medical School, Boston MA 02115, USA',
 'Department of Medical Biology, University of Melbourne, Melbourne, Australia',
 "Department of Medicine, Brigham and Women's Hospital, Boston, MA 02115, USA",
 'Department of Pediatrics, The Ohio State University College of Medicine, Columbus, OH 43210, USA',
 'Essex Management LLC and National Cancer Institute, Rockville, MD 20850, USA',
 'European Molecular Biology Laboratory, European Bioinformatics Institute, Wellcome Genome Campus, Hinxton, Cambridge, CB10 1SD, United Kingdom',
 'Genomics England, London EC1M 6BQ, UK',
 'Gesinger Health, Danville, PA 1782

In [5]:
# Build the manuscript author line: each author's normalised name immediately
# followed by the comma-separated indices of their affiliations, with authors
# separated by ', '.
#
# Affiliations are numbered from 1 in order of first appearance. They are walked
# in the order they are written in 'Merged Affiliation' rather than by iterating
# the per-author set: set iteration order for strings varies between interpreter
# runs, which made the numbering of a multi-affiliation author change from one
# kernel restart to the next.
affiliation_dict = dict()    # affiliation text -> assigned index
affiliation_list = list()    # affiliation texts in index order
pointer = 1                  # next unused index
author_entries = []
for author in authors:
    # dict.fromkeys de-duplicates while keeping the spreadsheet order.
    ordered_affiliations = dict.fromkeys(
        x.strip() for x in author['Merged Affiliation'].split(';')
    )
    author_indices = set()
    for affiliation in ordered_affiliations:
        if affiliation not in affiliation_dict:
            affiliation_dict[affiliation] = pointer
            affiliation_list.append(affiliation)
            pointer += 1
        author_indices.add(affiliation_dict[affiliation])
    author_entries.append(
        author['normname'] + ','.join(str(x) for x in sorted(author_indices))
    )

out_string = ', '.join(author_entries)

In [6]:
# Display the assembled author line (names followed by their affiliation indices).
out_string

'Alex H Wagner1,2, Lawrence Babb3, Gil Alterovitz4,5, Michael Baudis6, Matthew Brush7, Daniel L Cameron8,9, Melissa Cline10, Malachi Griffith11, Obi Lee Griffith11, Sarah E Hunt12, David Kreda13, Jennifer M Lee14, Stephanie Li15, Javier Lopez16, Eric Moyer17, Tristan Nelson18, Ronak Y Patel19, Kevin Riehle19, Peter N Robinson20, Shawn Rynearson21, Helen Schuilenburg12, Kirill Tsukanov12, Brian Walsh7, Melissa Konopko15, Heidi L Rehm3,22, Andrew D Yates12, Robert R Freimuth23, Reece K Hart3,24'

In [7]:
# Print every affiliation prefixed by its 1-based index (no separator between them).
for pointer, affiliation in enumerate(affiliation_list, start=1):
    print(f"{pointer}{affiliation}")

1The Steve and Cindy Rasmussen Institute for Genomic Medicine, Nationwide Children's Hospital, Columbus, OH 43215, USA
2Department of Pediatrics, The Ohio State University College of Medicine, Columbus, OH 43210, USA
3Medical and Population Genetics, Broad Institute of MIT and Harvard, Cambridge, MA 02142, USA
4Harvard Medical School, Boston, MA 02115, USA
5Department of Medicine, Brigham and Women's Hospital, Boston, MA 02115, USA
6University of Zurich and Swiss Institute of Bioinformatics, Zurich, Switzerland
7Oregon Health & Science University, Portland, OR 97239, USA
8Bioinformatics Division, Walter and Eliza Hall Institute of Medical Research, Melbourne, Australia
9Department of Medical Biology, University of Melbourne, Melbourne, Australia
10UC Santa Cruz Genomics Institute, Santa Cruz, CA 95060, USA
11Washington University School of Medicine, St. Louis, MO 63108, USA
12European Molecular Biology Laboratory, European Bioinformatics Institute, Wellcome Genome Campus, Hinxton, Camb

In [8]:
from collections import Counter

# Tally how often each 'Initials' value occurs among the authors.
c = Counter(author['Initials'] for author in authors)

In [9]:
# Display the tally; a count above 1 marks initials shared by several authors.
c

Counter({'AHW': 1,
         'LB': 1,
         'GA': 1,
         'MB': 2,
         'DLC': 1,
         'MC': 1,
         'MG': 1,
         'OLG': 1,
         'SEH': 1,
         'DK': 1,
         'JML': 1,
         'SL': 1,
         'JL': 1,
         'EM': 1,
         'TN': 1,
         'RYP': 1,
         'KR': 1,
         'PNR': 1,
         'SR': 1,
         'HS': 1,
         'KT': 1,
         'BW': 1,
         'MK': 1,
         'HLR': 1,
         'ADY': 1,
         'RRF': 1,
         'RKH': 1})

In [10]:
# Two authors share the initials 'MB'; give each a distinct form so the
# contribution statements stay unambiguous. Records are matched by normalised
# name instead of list position, so a change in spreadsheet row order cannot
# relabel the wrong people, and re-running the cell is harmless.
initials_overrides = {'Michael Baudis': 'MiB', 'Matthew Brush': 'MaB'}
for author in authors:
    override = initials_overrides.get(author['normname'])
    if override is not None:
        author['Initials'] = override

# Fail loudly if a name no longer matches any record.
unmatched = set(initials_overrides) - {author['normname'] for author in authors}
assert not unmatched, f"no author record found for: {sorted(unmatched)}"

In [11]:
# List the keys available on an author record (spreadsheet columns plus derived keys).
authors[0].keys()

dict_keys(['Author, as printed', 'Order', 'Include?', 'Conceptualization', 'Methodology', 'Software', 'Validation', 'Formal analysis', 'Investigation', 'Resources', 'Data Curation', 'Writing - Original Draft', 'Writing - Review & Editing', 'Visualization', 'Supervision', 'Project administration', 'Funding acquisition', 'Affiliation', 'Address', 'Merged Affiliation', 'Email', 'Funding', 'ORCID', 'Comments', 'Role in VRS (edit if I got wrong!)', 'Initials', 'Funding statement', 'affiliations', 'normname'])

In [13]:
# Contribution roles, in the order their statements are emitted. Each entry is
# also the name of a spreadsheet column on the author records.
credits = [
    'Conceptualization',
    'Methodology',
    'Software',
    'Validation',
    'Formal analysis',
    'Investigation',
    'Resources',
    'Data Curation',
    'Writing - Original Draft',
    'Writing - Review & Editing',
    'Visualization',
    'Supervision',
    'Project administration',
    'Funding acquisition',
]

In [14]:
def _credit_sentence(initials, credit):
    """Return '<initials> contributed to <credit>.' for a list of initials.

    One name is used as is, two are joined with ' and ', and three or more are
    comma-separated with ', and ' before the last. Returns None for an empty
    list so the caller can skip roles nobody is credited with.
    """
    if not initials:
        return None
    if len(initials) == 1:
        names = initials[0]
    elif len(initials) == 2:
        names = f"{initials[0]} and {initials[1]}"
    else:
        names = f"{', '.join(initials[:-1])}, and {initials[-1]}"
    return f"{names} contributed to {credit}."


# One sentence per role, listing the initials of every author whose column for
# that role holds the string 'TRUE'.
credit_statements = list()
for credit in credits:
    credit_authors = [author['Initials'] for author in authors if author[credit] == 'TRUE']
    credit_str = _credit_sentence(credit_authors, credit)
    # A role with no authors previously fell through every branch: the first
    # such role raised NameError, a later one re-appended the previous sentence.
    if credit_str is not None:
        credit_statements.append(credit_str)
print(' '.join(credit_statements))

AHW, LB, GA, MiB, MaB, DLC, MC, OLG, SEH, DK, JML, JL, EM, TN, RYP, KR, SR, HS, KT, ADY, RRF, and RKH contributed to Conceptualization. AHW, LB, GA, DLC, JML, TN, SR, KT, RRF, and RKH contributed to Methodology. AHW, LB, MC, EM, TN, RYP, KR, SR, BW, and RKH contributed to Software. RYP, KR, BW, and RKH contributed to Validation. AHW, LB, DLC, and RKH contributed to Formal analysis. AHW, LB, GA, EM, TN, RRF, and RKH contributed to Investigation. AHW, LB, GA, MiB, MC, RYP, and KR contributed to Resources. AHW, LB, MiB, and RKH contributed to Data Curation. AHW and RKH contributed to Writing - Original Draft. AHW, LB, GA, MiB, MaB, MC, MG, OLG, SEH, DK, JML, SL, JL, EM, TN, KR, PNR, HS, KT, MK, HLR, ADY, RRF, and RKH contributed to Writing - Review & Editing. AHW, LB, SL, RRF, and RKH contributed to Visualization. LB, MiB, MG, OLG, MK, HLR, ADY, RRF, and RKH contributed to Supervision. AHW, LB, MiB, MK, HLR, ADY, RRF, and RKH contributed to Project administration. AHW, LB, OLG, HLR, ADY, 

## For bioRxiv

In [17]:
# Export the author list as a tab-separated file with the column names below.
# Columns that are never assigned for a row ('Suffix', 'Home Page URL',
# 'Collaborative Group/Consortium', and 'Corresponding Author' for everyone but
# the corresponding author) are written as empty strings by DictWriter.
header = ['Email', 'Institution', 'First Name', 'Middle Name(s)/Initial(s)',
          'Last Name', 'Suffix', 'Corresponding Author', 'Home Page URL',
          'Collaborative Group/Consortium', 'ORCiD']
# newline='' stops the platform from translating the '\r\n' row terminator the
# csv writer emits (which yields '\r\r\n' on Windows); the explicit encoding
# keeps the output independent of the platform's default locale.
with open('biorxiv_authors.tsv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, header, delimiter='\t')
    writer.writeheader()
    for author in authors:
        # First token is the first name, last token the last name, and anything
        # in between goes to the middle-name column.
        name_components = author['normname'].split(' ')
        record = {
            'Email': author['Email'],
            'Institution': author['Merged Affiliation'],
            'First Name': name_components[0],
            'Middle Name(s)/Initial(s)': ' '.join(name_components[1:-1]),
            'Last Name': name_components[-1],
            'ORCiD': author['ORCID'],
        }
        if author['normname'] == 'Alex H Wagner':
            record['Corresponding Author'] = 'True'
        writer.writerow(record)

In [18]:
# Inspect the first author record.
# NOTE(review): the rendered output includes the author's email address; clear
# this cell's output before sharing the notebook.
authors[0]

{'Author, as printed': 'Wagner, Alex H',
 'Order': '1',
 'Include?': 'Author',
 'Conceptualization': 'TRUE',
 'Methodology': 'TRUE',
 'Software': 'TRUE',
 'Validation': 'FALSE',
 'Formal analysis': 'TRUE',
 'Investigation': 'TRUE',
 'Resources': 'TRUE',
 'Data Curation': 'TRUE',
 'Writing - Original Draft': 'TRUE',
 'Writing - Review & Editing': 'TRUE',
 'Visualization': 'TRUE',
 'Supervision': 'FALSE',
 'Project administration': 'TRUE',
 'Funding acquisition': 'TRUE',
 'Affiliation': "The Steve and Cindy Rasmussen Institute for Genomic Medicine, Nationwide Children's Hospital, Columbus, OH 43215, USA Department of Pediatrics, The Ohio State University College of Medicine, Columbus, OH 43210, USA",
 'Address': '<see affiliations>',
 'Merged Affiliation': "The Steve and Cindy Rasmussen Institute for Genomic Medicine, Nationwide Children's Hospital, Columbus, OH 43215, USA; Department of Pediatrics, The Ohio State University College of Medicine, Columbus, OH 43210, USA",
 'Email': 'Alex.

## For contributors doc

In [19]:
# Union of every contributor's affiliations. Along the way, print the printed
# name of any contributor whose affiliation set contains an empty string.
affiliations = set()
for contributor in contributors:
    contributor_affiliations = contributor['affiliations']
    if '' in contributor_affiliations:
        print(contributor['Author, as printed'])
    affiliations.update(contributor_affiliations)

In [20]:
# Display the combined set of contributor affiliations for a visual check.
affiliations

{'Baylor College of Medicine, Houston, TX 77030, USA',
 'Bioinformatics Division, Walter and Eliza Hall Institute of Medical Research, Melbourne, Australia',
 'Center for Genomic Medicine, Massachusetts General Hospital, Cambridge, MA 02142, USA',
 'Center for Individualized Medicine, Division of Digital Health Sciences, Mayo Clinic, Rochester, MN 55905, USA',
 'Department of Biomedical Informatics, Harvard Medical School, Boston MA 02115, USA',
 'Department of Human Genetics, University of Utah, Salt Lake City, UT 84112, USA',
 'Department of Medical Biology, University of Melbourne, Melbourne, Australia',
 "Department of Medicine, Brigham and Women's Hospital, Boston, MA 02115, USA",
 'Department of Pediatrics, The Ohio State University College of Medicine, Columbus, OH 43210, USA',
 'Essex Management LLC and National Cancer Institute, Rockville, MD 20850, USA',
 'European Molecular Biology Laboratory, European Bioinformatics Institute, Wellcome Genome Campus, Hinxton, Cambridge, CB1

In [21]:
# Table order: the last contributor record first, then the first two records,
# then everyone else sorted by 'Author, as printed'.
# NOTE(review): presumably this puts the lead/senior authors on top - confirm.
contributors_reorder = [contributors[-1]] + contributors[:2]
contributors_reorder += sorted(contributors[2:-1], key=lambda x: x['Author, as printed'])

# Build the markdown table body: one '|<name> | <refs> |' row per contributor,
# where <refs> is a run of '[[n](#n)]' links to the numbered affiliations.
#
# Affiliations are numbered from 1 in order of first appearance. They are walked
# in the order they are written in 'Merged Affiliation' rather than by iterating
# the per-contributor set: set iteration order for strings varies between
# interpreter runs, which made the numbering unstable across kernel restarts.
affiliation_dict = dict()    # affiliation text -> assigned index
affiliation_list = list()    # affiliation texts in index order
pointer = 1                  # next unused index
table_rows = []
for contributor in contributors_reorder:
    # dict.fromkeys de-duplicates while keeping the spreadsheet order.
    ordered_affiliations = dict.fromkeys(
        x.strip() for x in contributor['Merged Affiliation'].split(';')
    )
    contributor_indices = set()
    for affiliation in ordered_affiliations:
        if affiliation not in affiliation_dict:
            affiliation_dict[affiliation] = pointer
            affiliation_list.append(affiliation)
            pointer += 1
        contributor_indices.add(affiliation_dict[affiliation])
    refs = ''.join(f'[[{x}](#{x})]' for x in sorted(contributor_indices))
    table_rows.append(f"|{contributor['normname']} | {refs} |")

out_string = '\n'.join(table_rows)

In [22]:
# Markdown table header (title row plus separator row) for the contributors table.
header = (
    "|Name             | Affiliation |\n"
    "|-----------------| --------|\n"
)

In [23]:
# Copy CONTRIBUTORS.md to CONTRIBUTORS.revised.md, replacing the lines that
# start with '|' (the existing table) with the freshly generated table, then
# append one '## <n>' section per affiliation as the target of the '#<n>' links.
# NOTE(review): absolute, user-specific paths - consider deriving them from a
# configurable repository root.
source_path = '/Users/ahw001/git/vrs/CONTRIBUTORS.md'
revised_path = '/Users/ahw001/git/vrs/CONTRIBUTORS.revised.md'
with open(source_path, 'r', encoding='utf-8') as f, \
     open(revised_path, 'w', encoding='utf-8') as f2:
    inserted = False
    for line in f:
        if line.startswith('|'):
            # Emit the new table once, at the position of the first old table
            # line; every other old table line is dropped.
            if not inserted:
                # out_string has no trailing newline; without one the next
                # copied line would be glued onto the last table row.
                f2.write(header + out_string + '\n')
                inserted = True
            continue
        f2.write(line)
    if not inserted:
        # The source had no table to replace: still emit the table so the
        # affiliation sections below have something referring to them.
        f2.write('\n' + header + out_string + '\n')
    for pointer, affiliation in enumerate(affiliation_list, start=1):
        f2.write(f'## {pointer}\n')
        f2.write(affiliation + '\n')