In [12]:
from Bio import SeqIO
from Bio import Entrez
import requests
import pandas as pd

In [43]:
def get_list_of_genbank_ids_used(alignment_file, chunk_size=200):
    """
    Collect the genbank ids used in this build, grouped into batches.

    Every record id in the FASTA alignment is treated as an accession. The
    ids are grouped, in file order, into batches of at most `chunk_size`,
    and each batch is joined with single spaces. Ex: "OQ593393 NC_006310"

    Parameters
    ----------
    alignment_file : str or handle
        FASTA alignment passed straight to SeqIO.parse; record ids are used
        as the accessions.
    chunk_size : int, optional
        Maximum number of accessions per batch. Defaults to 200 because,
        per the original author's note, that is as many as Entrez will
        return in one fetch.

    Returns
    -------
    list of str
        One space-separated string of accessions per batch. Empty list when
        the alignment contains no records. No batch carries leading or
        trailing whitespace.

    Raises
    ------
    ValueError
        If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    accessions = [record.id for record in SeqIO.parse(alignment_file, "fasta")]

    # Slice the id list directly instead of repeatedly splitting/re-joining one
    # big string: that avoids the stray trailing space (an empty id token) on
    # the final batch and keeps the work linear in the number of records.
    return [
        " ".join(accessions[start:start + chunk_size])
        for start in range(0, len(accessions), chunk_size)
    ]

In [69]:
def get_genbank_author_info(virus):
    """
    Build and save an acknowledgments table for every sequence in the build.

    For each genome segment, the accessions are read from
    '../results/aligned_fluC_{segment}.fasta', the matching GenBank records
    are fetched from the NCBI nucleotide database in batches, and one row per
    record is collected from the record's first listed reference.

    Parameters
    ----------
    virus : str
        Label written to the 'Virus' column and used in the output filename.

    Returns
    -------
    None
        The table is written to 'acknowledgments_{virus}.tsv' (tab-separated,
        no index column) in the current working directory.

    Notes
    -----
    A GenBank record with no reference block gets empty 'Author',
    'Publication_Title' and 'Journal' fields instead of aborting the run.
    """
    columns = ['Accession', 'Segment', 'Strain', 'Virus', 'Author',
               'Publication_Title', 'Journal', 'Database']

    # The contact email is constant, so set it once rather than per batch.
    Entrez.email = "kkistler@fredhutch.org"

    segments = ['HEF', 'M', 'NP', 'NS', 'P3', 'PB1', 'PB2']

    ack_info = []
    for segment in segments:
        # Accessions for this segment, already grouped into fetch-sized batches
        alignment_file = f'../results/aligned_fluC_{segment}.fasta'
        accession_list = get_list_of_genbank_ids_used(alignment_file)

        for chunk in accession_list:
            # Download the GenBank records for this batch
            handle = Entrez.efetch(db="nucleotide", id=chunk, rettype="gb", retmode="text")
            try:
                for record in SeqIO.parse(handle, "genbank"):
                    # Only the first reference is credited; some records
                    # have none, in which case the fields are left blank.
                    references = record.annotations.get("references") or []
                    first_ref = references[0] if references else None
                    ack_info.append({
                        # Drop the version suffix (text after the first '.')
                        'Accession': record.id.split('.')[0],
                        'Segment': segment,
                        'Strain': record.annotations['organism'],
                        'Virus': virus,
                        'Author': first_ref.authors if first_ref else '',
                        'Publication_Title': first_ref.title if first_ref else '',
                        'Journal': first_ref.journal if first_ref else '',
                        'Database': 'NCBI Virus, Genbank',
                    })
            finally:
                # Release the connection even if parsing a record fails
                handle.close()

    # Explicit columns keep the header present even when no rows were collected
    ack_df = pd.DataFrame(ack_info, columns=columns)

    # Save the acknowledgment table
    ack_df.to_csv(f'acknowledgments_{virus}.tsv', sep='\t', index=False)
    

In [70]:
# Generate the acknowledgments table for the influenza C build
get_genbank_author_info(virus='influenzaC')