In [105]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio import AlignIO
from collections import Counter 

In [122]:
def _covered_region(sequence, start, end, max_n_fraction):
    """Return ``sequence[start:end]`` if it is sufficiently resolved, else ``None``.

    A region is kept when its count of ``N``/``n`` characters is strictly
    below ``max_n_fraction`` of the region length. An empty slice (sequence
    shorter than ``start``) is never kept.
    """
    region = sequence[start:end]
    region_length = len(region)
    if region_length == 0:
        return None
    # Count ambiguous bases case-insensitively so lowercase-masked alignments
    # are filtered the same way as uppercase ones.
    n_count = str(region).upper().count('N')
    if n_count < region_length * max_n_fraction:
        return region
    return None


def extract_spike(aligned_fasta, original_fasta, output_spike_fasta, output_he_fasta,
                  spike_coords=(23643, 27729), he_coords=(22354, 23629),
                  max_n_fraction=0.5):
    """Slice the Spike and HE regions out of a genome alignment and write them as FASTA.

    Each record of ``aligned_fasta`` is sliced at ``spike_coords`` and
    ``he_coords``; a slice is kept only if fewer than ``max_n_fraction`` of its
    characters are ``N``. The kept slices are then matched to the records of
    ``original_fasta`` (whose id, up to the first ``|``, must equal the aligned
    record id) so the output records carry the full original id.

    The number of kept Spike and HE sequences is printed, in that order.

    Args:
        aligned_fasta: Path to the aligned genome FASTA.
        original_fasta: Path to the FASTA whose record ids supply the output headers.
        output_spike_fasta: Path the Spike records are written to.
        output_he_fasta: Path the HE records are written to.
        spike_coords: ``(start, end)`` 0-based, end-exclusive slice bounds of
            Spike in alignment coordinates.
        he_coords: ``(start, end)`` slice bounds of HE, same convention.
        max_n_fraction: A region is discarded when its fraction of ``N``
            characters is greater than or equal to this value.

    NOTE(review): the default coordinates are the values that used to be
    hard-coded here and are applied to every alignment unless overridden;
    confirm they match the reference each alignment was built against.
    """
    spike_start, spike_end = spike_coords
    he_start, he_end = he_coords

    spike_sequences = {}
    he_sequences = {}
    with open(aligned_fasta, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            #Throw out sequences that don't cover Spike
            spike_nt = _covered_region(record.seq, spike_start, spike_end, max_n_fraction)
            if spike_nt is not None:
                spike_sequences[record.id] = spike_nt

            #Throw out sequences that don't cover HE
            he_nt = _covered_region(record.seq, he_start, he_end, max_n_fraction)
            if he_nt is not None:
                he_sequences[record.id] = he_nt

    print(len(spike_sequences))
    print(len(he_sequences))

    spike_entries = []
    he_entries = []
    with open(original_fasta, "r") as handle_2:
        for record in SeqIO.parse(handle_2, "fasta"):
            # The aligned records are keyed by the first '|'-delimited field
            # of the original header.
            gb_id = record.id.split('|')[0]
            if gb_id in spike_sequences:
                spike_entries.append(
                    SeqRecord(spike_sequences[gb_id], id=record.id, description=record.id))
            if gb_id in he_sequences:
                he_entries.append(
                    SeqRecord(he_sequences[gb_id], id=record.id, description=record.id))

    SeqIO.write(spike_entries, output_spike_fasta, "fasta")
    SeqIO.write(he_entries, output_he_fasta, "fasta")

In [127]:
# HKU1: slice Spike and HE out of the genome alignment and write them into
# seasonal-corona-beta/data.
# NOTE(review): no virus-specific coordinates are passed, so the same alignment
# columns are sliced as for every other virus in this notebook — confirm they
# match the reference this HKU1 alignment was built against.
extract_spike(
    aligned_fasta="../nextstrain/seasonal-corona-genome/results/aligned_hku1.fasta",
    original_fasta="../nextstrain/seasonal-corona-genome/data/hku1_datefix.fasta",
    output_spike_fasta="../nextstrain/seasonal-corona-beta/data/hku1_spike_genomealign.fasta",
    output_he_fasta="../nextstrain/seasonal-corona-beta/data/hku1_he_genomealign.fasta",
)


39
29


In [126]:
# OC43: slice Spike and HE out of the genome alignment and write them into
# seasonal-corona-beta/data.
# NOTE(review): no virus-specific coordinates are passed — confirm the fixed
# alignment columns used by extract_spike match the OC43 reference.
extract_spike(
    aligned_fasta="../nextstrain/seasonal-corona-genome/results/aligned_oc43.fasta",
    original_fasta="../nextstrain/seasonal-corona-genome/data/oc43_datefix.fasta",
    output_spike_fasta="../nextstrain/seasonal-corona-beta/data/oc43_spike_genomealign.fasta",
    output_he_fasta="../nextstrain/seasonal-corona-beta/data/oc43_he_genomealign.fasta",
)


322
69


In [125]:
# NL63: slice the Spike and HE windows out of the genome alignment and write
# them into seasonal-corona-alpha/data.
# NOTE(review): no virus-specific coordinates are passed, so the same alignment
# columns are sliced as for the beta-group viruses — confirm they match the
# NL63 reference, and confirm the "he" output is meaningful for this
# alpha-group virus (presumably it has no HE gene; verify).
extract_spike(
    aligned_fasta="../nextstrain/seasonal-corona-genome/results/aligned_nl63.fasta",
    original_fasta="../nextstrain/seasonal-corona-genome/data/nl63_datefix.fasta",
    output_spike_fasta="../nextstrain/seasonal-corona-alpha/data/nl63_spike_genomealign.fasta",
    output_he_fasta="../nextstrain/seasonal-corona-alpha/data/nl63_he_genomealign.fasta",
)

67
119


In [128]:
# 229E: slice the Spike and HE windows out of the genome alignment and write
# them into seasonal-corona-alpha/data.
# NOTE(review): no virus-specific coordinates are passed, so the same alignment
# columns are sliced as for the beta-group viruses — confirm they match the
# 229E reference, and confirm the "he" output is meaningful for this
# alpha-group virus (presumably it has no HE gene; verify).
extract_spike(
    aligned_fasta="../nextstrain/seasonal-corona-genome/results/aligned_229e.fasta",
    original_fasta="../nextstrain/seasonal-corona-genome/data/229e_datefix.fasta",
    output_spike_fasta="../nextstrain/seasonal-corona-alpha/data/229e_spike_genomealign.fasta",
    output_he_fasta="../nextstrain/seasonal-corona-alpha/data/229e_he_genomealign.fasta",
)

27
58
