In [1]:
import pandas as pd

### Combine acknowledgement tables for all viruses and all strains used in this study.

Sequences from different databases have different columns, so the combined table allows a column to be blank wherever a database does not provide that field.

In [61]:
# Add Database/Virus columns to a SARS2 (GISAID) acknowledgment table and standardize its headers.
def add_database_column(virus):
    """Tag a GISAID acknowledgment table and normalize its column names on disk.

    Reads ``virus_specific_tables/acknowledgments_{virus}.tsv``, sets the
    ``Database`` column to ``'GISAID'`` and the ``Virus`` column to ``virus``,
    renames the lower-case source headers to the capitalized names used by the
    combined acknowledgment table, and overwrites the same file.

    Parameters
    ----------
    virus : str
        Identifier used both in the file name and as the ``Virus`` column value.

    Raises
    ------
    KeyError
        If the table has neither a ``gisaid_epi_isl`` nor an ``Accession`` column.
    """
    table_path = f'virus_specific_tables/acknowledgments_{virus}.tsv'
    ack_table = pd.read_csv(table_path, sep='\t')

    ack_table['Database'] = 'GISAID'
    ack_table['Virus'] = virus

    # Keys absent from the table are ignored by rename, so running this on an
    # already-converted file leaves the headers unchanged.
    ack_table = ack_table.rename(columns={
        "gisaid_epi_isl": "Accession",
        "strain": "Strain",
        "author": "Author",
        "originating_lab": "Originating_Lab",
        "submitting_lab": "Submitting_Lab",
        "genbank_accession": "Genbank_Accession",
    })

    # Accession is deliberately kept as a regular column: the file is written
    # with index=False, so moving it into the index would drop it from the
    # output. Only the "column must exist" check is retained.
    if 'Accession' not in ack_table.columns:
        raise KeyError(
            f"{table_path} has no 'gisaid_epi_isl' or 'Accession' column"
        )

    ack_table.to_csv(table_path, sep='\t', index=False)

In [62]:
# Rewrites virus_specific_tables/acknowledgments_sars2.tsv on disk, adding the
# Database ('GISAID') and Virus columns and capitalizing the headers.
add_database_column('sars2')

In [63]:
# Rewrites virus_specific_tables/acknowledgments_sars2_21L.tsv on disk, adding the
# Database ('GISAID') and Virus columns and capitalizing the headers.
add_database_column('sars2_21L')

In [70]:
# Combine the per-virus acknowledgment tables into a single TSV.
viruses = ['229e', 'adenovirusB_3', 'adenovirusB_7', 'denv1', 'denv2', 'denv3', 'denv4',
           'enterovirusd68', 'h1n1pdm', 'h3n2', 'hepatitisA_IA', 'hepatitisB_A2', 'hepatitisB_D3',
           'hpiv_1', 'hpiv_3', 'influenzaC', 'measles', 'mumps', 'nl63', 'norovirus', 'oc43_A',
           'rotavirusA', 'rsvA', 'rsvB', 'sars2_21L', 'sars2', 'vic', 'yam']

# Columns that lead the combined table, in output order. A column that no
# per-virus table provides is still emitted (blank).
expected_columns = ['Accession', 'Virus', 'Strain', 'Author', 'Publication_Title', 'Journal',
                    'Originating_Lab', 'Submitting_Lab', 'Genbank_Accession', 'Database']

# Read every table first and concatenate once: growing a frame with concat
# inside the loop copies all accumulated rows on each iteration.
tables = []
for v in viruses:
    table = pd.read_csv(f'virus_specific_tables/acknowledgments_{v}.tsv', sep='\t')
    tables.append(table)

# Outer join keeps columns that only some tables have; rows from tables
# lacking a column get a missing value there.
all_virus_acknowledgments = pd.concat(tables, ignore_index=True, join='outer')

# Put the expected columns first, followed by any extra columns in the order
# they were first encountered.
extra_columns = [col for col in all_virus_acknowledgments.columns if col not in expected_columns]
all_virus_acknowledgments = all_virus_acknowledgments.reindex(columns=expected_columns + extra_columns)

# fillna returns a new frame, so the result must be assigned for the blanks
# to take effect on the in-memory table.
all_virus_acknowledgments = all_virus_acknowledgments.fillna('')

all_virus_acknowledgments.to_csv('sequence_acknowledgments.tsv', sep='\t', index=False)
    