In [1]:
import pandas as pd
from Bio import SeqIO

In [7]:
def write_acknowledgements_csv(virus, genotype):
    """
    Write an acknowledgements table for all sequences that were used.

    Reads ``../results/metadata_{virus}_{genotype}.tsv`` (tab-separated) and
    takes its ``strain`` column as the set of accessions that were used.
    Every record in ``genbank_sequences.gb`` whose ``record.id`` is in that
    set contributes one row holding the authors, title and journal of the
    record's first reference (empty strings when the record has no
    reference). The table is written tab-separated, without an index column,
    to ``acknowledgments_{virus}_{genotype}.tsv`` in the current working
    directory. The header row is written even when no record matched.

    Parameters
    ----------
    virus : str
        Virus name used in the metadata and output file names.
    genotype : str
        Genotype used in the file names and in the ``Virus`` column.

    Returns
    -------
    None
    """
    columns = ['Accession', 'Virus', 'Author',
               'Publication_Title', 'Journal', 'Database']

    # read in the metadata file to get the strains/accession numbers; a set
    # keeps the per-record membership test below constant-time
    metadata = pd.read_csv(f'../results/metadata_{virus}_{genotype}.tsv', sep='\t')
    used_accessions = set(metadata['strain'])

    # read in the original sequences to get the author information
    author_info_by_sequence = []

    # the context manager guarantees the GenBank file is closed again
    with open("genbank_sequences.gb", "r") as genbank_handle:
        for record in SeqIO.parse(genbank_handle, "genbank"):
            # matched exactly as given in the file (version suffix included)
            accession = record.id
            if accession not in used_accessions:
                continue

            # some records carry no reference at all; keep them in the table
            # with empty author fields rather than aborting the whole export
            references = record.annotations.get('references') or []
            first_reference = references[0] if references else None

            author_info_by_sequence.append({
                # drop the version suffix, e.g. "AB123456.1" -> "AB123456"
                'Accession': accession.split('.')[0],
                # NOTE(review): the label is hard-coded and ignores `virus`;
                # confirm before reusing this function for another virus
                'Virus': f'ParvovirusB19-{genotype}',
                'Author': getattr(first_reference, 'authors', ''),
                'Publication_Title': getattr(first_reference, 'title', ''),
                'Journal': getattr(first_reference, 'journal', ''),
                'Database': 'Genbank',
            })

    # explicit columns keep the header stable even when nothing matched
    ack_df = pd.DataFrame(author_info_by_sequence, columns=columns)

    # save the acknowledgement table
    ack_df.to_csv(f'acknowledgments_{virus}_{genotype}.tsv', sep='\t', index=False)
    
    

In [8]:
# Export the acknowledgements table for parvovirus B19, genotype 1a.
write_acknowledgements_csv(virus='parvovirusB19', genotype='1a')