In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

All strains were aligned to a full-genome reference. Look at the aligned sequences and find the strains in which the HN and L genes were sequenced (at least 80% coverage; each gene is checked independently). Write these gene-only sequences to FASTA files as input for the gene-only builds.

In [20]:
subtypes = ['2','4']

# A gene is kept for a strain when at most this fraction of its bases is 'N',
# i.e. the gene is at least 80% sequenced.
MAX_N_FRACTION = 0.2


def extract_gene_record(record, gene_loc, max_n_fraction=MAX_N_FRACTION):
    """Cut one gene out of an aligned genome record, if it is sufficiently sequenced.

    Parameters
    ----------
    record : SeqRecord
        Aligned full-genome record; its coordinates are assumed to match the
        reference the location was read from.
    gene_loc : feature location
        Location of the gene, as taken from a reference GenBank feature.
    max_n_fraction : float
        Largest allowed fraction of 'N' characters in the extracted gene.

    Returns
    -------
    SeqRecord or None
        A record holding only the gene sequence (same id and description as
        `record`), or None when the gene is empty or has too many 'N's.
    """
    gene_seq = gene_loc.extract(record.seq)
    # Guard against an empty extraction, which would otherwise divide by zero.
    if len(gene_seq) == 0:
        return None
    n_fraction = gene_seq.count('N') / len(gene_seq)
    if n_fraction > max_n_fraction:
        return None
    return SeqRecord(gene_seq, id=record.id, description=record.description)


for subtype in subtypes:
    # Get the location of HN and L from the reference annotation.
    # Reset for every subtype so that a reference lacking one of the genes
    # cannot silently reuse the coordinates found for the previous subtype.
    HN_loc = None
    L_loc = None
    reference_path = f'../config/reference_hpiv_{subtype}.gb'
    for seq_record in SeqIO.parse(reference_path, "genbank"):
        for feature in seq_record.features:
            if feature.type != 'CDS' or 'gene' not in feature.qualifiers:
                continue
            gene_name = feature.qualifiers['gene'][0]
            if gene_name == 'HN':
                HN_loc = feature.location
            elif gene_name == 'L':
                L_loc = feature.location

    if HN_loc is None or L_loc is None:
        missing_genes = [name for name, loc in (('HN', HN_loc), ('L', L_loc)) if loc is None]
        raise ValueError(
            f"No CDS feature found for gene(s) {missing_genes} in {reference_path}"
        )

    # Get sequences covering the HN and L genes; each gene is filtered independently.
    L_seq_records = []
    HN_seq_records = []

    # The context manager closes the alignment file once all records are read.
    with open(f'../results/aligned_hpiv_{subtype}.fasta', "r") as alignment_handle:
        for record in SeqIO.parse(alignment_handle, "fasta"):
            L_record = extract_gene_record(record, L_loc)
            if L_record is not None:
                L_seq_records.append(L_record)

            HN_record = extract_gene_record(record, HN_loc)
            if HN_record is not None:
                HN_seq_records.append(HN_record)

    SeqIO.write(HN_seq_records, f'hpiv_{subtype}_HN.fasta', "fasta")
    SeqIO.write(L_seq_records, f'hpiv_{subtype}_L.fasta', "fasta")