In [3]:
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation

Because the Nextstrain build can't handle translation on a circular genome, adjust the reference coordinates so that the genome starts at the beginning of Large and Large no longer wraps around the "end" of the genome. With this approach, amino acid mutations can only be obtained for Large, not for the other proteins.

In [9]:
def _rotate_to_Large_start(seq, Large_location):
    """Return ``seq`` re-ordered so that it begins with the Large CDS.

    ``Large_location`` must be a two-part location whose first part runs up to
    the end of the linear sequence and whose second part continues from its
    beginning (the caller validates the part count). The result is the
    extracted Large sequence followed by the stretch between the end of Large
    and the point where Large starts again.
    """
    first_part, last_part = Large_location.parts
    Large_start = first_part.start
    Large_end = last_part.end
    # genomic sequence between the end of Large and where it starts again (circular genome)
    seq_after_LargeEnd_until_LargeStart = SeqFeature(FeatureLocation(Large_end, Large_start)).extract(seq)
    return Large_location.extract(seq) + seq_after_LargeEnd_until_LargeStart


def make_ref_and_data_files_relative_to_Large_start(subtype):
    """Re-anchor the reference and the aligned sequences of ``subtype`` at the start of Large.

    Reads ``reference_hepatitisB_{subtype}_orig.gb``, locates the CDS whose
    ``locus_tag`` is ``'Large'`` and rotates the genome so that Large begins at
    position 0 and no longer wraps around the end of the sequence.

    Files written:
      * ``reference_hepatitisB_{subtype}.gb`` -- rotated reference carrying a
        ``source`` feature and a single ``CDS`` feature (``gene='Large'``).
      * ``../data/hepatitisB_{subtype}.fasta`` -- every record of
        ``../data/aligned_hepatitisB_{subtype}.fasta`` rotated the same way,
        with its header replaced by the full description line found in
        ``../data/hepatitisB_{subtype}_original.fasta`` (matched on the text
        before the first ``|`` of the original record id).

    Parameters
    ----------
    subtype : str
        Subtype label used in the file names and to look up the reference
        accession; must be a key of ``ref_id_by_subtype`` (``'D3'`` or ``'A2'``).

    Raises
    ------
    ValueError
        If the reference has no CDS with ``locus_tag`` ``'Large'``, or if that
        CDS is not described by exactly two location parts.
    KeyError
        If ``subtype`` has no known reference accession, or an aligned record
        id is missing from the original data file.
    """
    ref_path = f'reference_hepatitisB_{subtype}_orig.gb'

    Large_location = None
    original_seq = None
    # Context manager so the reference handle is closed as soon as parsing ends.
    with open(ref_path, "r") as ref_handle:
        for ref_record in SeqIO.parse(ref_handle, "genbank"):
            for feature in ref_record.features:
                if feature.type != 'CDS':
                    continue
                # .get(): a CDS without a locus_tag is skipped instead of raising KeyError
                if feature.qualifiers.get('locus_tag', [None])[0] == 'Large':
                    # if several CDS match, the last one wins
                    Large_location = feature.location
                    original_seq = ref_record.seq

    if Large_location is None:
        raise ValueError(f"no CDS with locus_tag 'Large' found in {ref_path}")
    if len(Large_location.parts) != 2:
        # The rotation relies on Large being split in two by the end of the linear sequence.
        raise ValueError(
            f"expected the Large CDS in {ref_path} to have exactly two location parts, "
            f"found {len(Large_location.parts)}"
        )

    Large_seq = Large_location.extract(original_seq)

    #write new genbank file, where the genome starts from the beginning of Large
    genome_seq = Seq(_rotate_to_Large_start(original_seq, Large_location))

    ref_id_by_subtype= {'D3':'JF754591', 'A2':'JN182318'}
    new_reference = SeqRecord(genome_seq,
                              id=ref_id_by_subtype[subtype],
                              name=f'HepatitisB_{subtype}',
                              description=f'HepatitisB {subtype}, coordinates starting from the beginning of Large',
                              annotations={"molecule_type": "DNA"})

    # Add source annotation
    source_feature = SeqFeature(FeatureLocation(start=0, end=len(genome_seq)), type='source')
    new_reference.features.append(source_feature)

    # Add Large annotation (now a single contiguous interval starting at 0)
    Large_feature = SeqFeature(FeatureLocation(start=0, end=len(Large_seq)), type='CDS', qualifiers= {'gene':'Large'})
    new_reference.features.append(Large_feature)

    # Save as GenBank file; the with-block guarantees the output is flushed and closed
    with open(f'reference_hepatitisB_{subtype}.gb', 'w') as output_file:
        SeqIO.write(new_reference, output_file, 'genbank')

    #change all input sequences to same coordinates
    data_path = f"../data/aligned_hepatitisB_{subtype}.fasta"
    #get header line from original data file
    original_data = f"../data/hepatitisB_{subtype}_original.fasta"
    #make dict mapping accession to all other header info
    header_mapper = {}
    with open(original_data, "r") as original_handle:
        for original_record in SeqIO.parse(original_handle, "fasta"):
            header_mapper[original_record.id.split('|')[0]] = original_record.description

    new_records = []
    with open(data_path, "r") as data_handle:
        for aligned_record in SeqIO.parse(data_handle, "fasta"):
            new_coordinates_seq = _rotate_to_Large_start(aligned_record.seq, Large_location)
            header_info = header_mapper[aligned_record.id]
            new_records.append(SeqRecord(Seq(new_coordinates_seq), id=header_info, description=''))
    SeqIO.write(new_records, f"../data/hepatitisB_{subtype}.fasta", "fasta")

                

In [10]:
# Regenerate the Large-anchored reference and data files for subtype A2.
make_ref_and_data_files_relative_to_Large_start(subtype='A2')