In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [33]:
#get location for each gene
all_genes = ["ORF1ab", "RdRp", "Spike", "S1", "S2", "Protein4a", "Protein4b", "E", "M", "N"]
# '' is a placeholder meaning "no matching CDS found in the reference yet"
gene_locations = {g:'' for g in all_genes}

reference_file = '../config/reference_229e_genome.gb'

# Open the reference inside a `with` block so the handle is closed as soon as parsing is done
with open(reference_file, "r") as reference_handle:
    for record in SeqIO.parse(reference_handle, "genbank"):
        for feature in record.features:
            if feature.type != 'CDS':
                continue
            locus_tags = feature.qualifiers.get('locus_tag')
            # a CDS counts for a gene only when its locus_tag qualifier is exactly [gene]
            if locus_tags is not None and len(locus_tags) == 1 and locus_tags[0] in gene_locations:
                gene_locations[locus_tags[0]] = feature.location

# Fail here, with the gene names, rather than later with an opaque
# AttributeError when `.extract` is called on the '' placeholder
genes_without_location = [g for g in all_genes if gene_locations[g] == '']
if genes_without_location:
    raise ValueError(f"no CDS with a matching locus_tag in {reference_file} for: {genes_without_location}")


In [24]:
#alignment file will only have the accession number as id
#need to get date and country info from original fasta

# Maps accession (the text before the first '|' in the FASTA header) -> the full header text
record_info_by_accession = {}
# `with` guarantees the FASTA handle is closed once every record has been read
with open('229e_genome.fasta', "r") as genome_handle:
    for record in SeqIO.parse(genome_handle, "fasta"):
        accession = record.description.split('|')[0]
        record_info_by_accession[accession] = record.description

In [25]:
#make a sequence fasta file for each gene by extracting the sequences out of the alignment
#make a dictionary of all seq records for each gene
gene_records_from_genome_alignment = {g:[] for g in all_genes}

# NOTE(review): slicing with reference coordinates presumes the alignment columns
# line up with the reference genome positions -- confirm against the alignment step
with open('../results/aligned_229e_genome.fasta', "r") as alignment_handle:
    for record in SeqIO.parse(alignment_handle, "fasta"):
        # the full header depends only on the record, so look it up once rather than once per gene
        # (raises KeyError if the aligned id is absent from the original fasta)
        record_info = record_info_by_accession[record.id]
        for gene in all_genes:
            gene_record = SeqRecord(gene_locations[gene].extract(record.seq), id=record_info, description='')
            gene_records_from_genome_alignment[gene].append(gene_record)


In [26]:
#location of old gene-alignment files, by gene

# all old alignments live in one directory and share the '229e_' file prefix,
# so only the per-gene file stem is listed (there is no entry for "Spike")
old_gene_fasta_locations = {
    gene: "../../../seasonal-cov_eLifepaper/229e/data/229e_" + file_stem + ".fasta"
    for gene, file_stem in [
        ("ORF1ab", "replicase1ab"),
        ("RdRp", "rdrp"),
        ("S1", "s1"),
        ("S2", "s2"),
        ("Protein4a", "protein4a"),
        ("Protein4b", "protein4b"),
        ("E", "envelope"),
        ("M", "membrane"),
        ("N", "nucleocapsid"),
    ]
}


In [30]:
# accessions already covered by the genome fasta; a live view of the dict keys
already_seen = record_info_by_accession.keys()

gene_records_from_old_alignments = {g:[] for g in all_genes}

for gene in all_genes:
    # genes with no old gene-specific alignment simply keep their empty list
    if gene not in old_gene_fasta_locations:
        continue
    # `with` closes each old alignment file before the next gene is processed
    with open(old_gene_fasta_locations[gene], "r") as old_alignment_handle:
        for record in SeqIO.parse(old_alignment_handle, "fasta"):
            # split the '|'-delimited id once; fields used below:
            # 0 = strain name, 1 = accession, 3 = date, 5 = country
            fields = record.id.split('|')
            accession = fields[1]
            if accession in already_seen:
                continue
            #only take seq if fewer than 20% of its positions are 'N'
            #(i.e. it covers more than 80% of the gene)
            if record.seq.count('N') >= len(record.seq)*0.2:
                continue
            strain_name = fields[0]
            date = fields[3]
            country = fields[5]
            # re-order the header so the accession comes first
            new_record_info = '|'.join([accession, strain_name, date, country])
            gene_record = SeqRecord(record.seq, id=new_record_info, description='')
            gene_records_from_old_alignments[gene].append(gene_record)


In [31]:
#add the sequences from old gene-specific alignments to the new data
# NOTE: this extends the per-gene lists in place, so re-running the cell appends the old records again
for gene, old_records in gene_records_from_old_alignments.items():
    gene_records_from_genome_alignment[gene].extend(old_records)


In [32]:
# write one fasta per gene into the current working directory
for gene, records in gene_records_from_genome_alignment.items():
    output_path = f'229e_{gene}.fasta'
    SeqIO.write(records, output_path, "fasta")