In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [2]:
# Look up the location of each gene of interest in the reference genome.
# `gene_locations` maps gene name -> the location object of the CDS feature
# whose 'locus_tag' qualifier is exactly that gene name.
all_genes = ["ORF1ab", "RdRp", "HE", "Spike", "S1", "S2", "NS2", "E", "M", "N"]
gene_locations = {g:'' for g in all_genes}

reference_file = '../config/reference_oc43_genome.gb'

# Use a context manager so the reference file handle is closed once parsing
# is finished instead of lingering in the kernel.
with open(reference_file, "r") as reference_handle:
    for record in SeqIO.parse(reference_handle, "genbank"):
        for feature in record.features:
            if feature.type != 'CDS':
                continue
            # None when the CDS has no locus_tag, so it can never match below.
            locus_tags = feature.qualifiers.get('locus_tag')
            for gene in all_genes:
                # Only a qualifier holding exactly this one gene name matches.
                if locus_tags == [gene]:
                    gene_locations[gene] = feature.location

# Fail here, with a clear message, rather than later when `.extract` is called
# on the '' placeholder of a gene that was never found in the reference.
genes_without_location = [g for g in all_genes if isinstance(gene_locations[g], str)]
if genes_without_location:
    raise ValueError(
        f"No CDS with a matching locus_tag found in {reference_file} for: "
        + ", ".join(genes_without_location)
    )


In [3]:
# Lineage labels looped over by the cells below; each label is substituted
# into the input and output FASTA file names.
lineages = [
    "A",
    "B",
    "all",
]

In [4]:
#alignment file will only have the accession number as id
#need to get date and country info from original fasta
#do it for each lineage (A, B, and all)

# lineage -> {accession -> full description line of the original FASTA record}.
# The accession is the first '|'-separated field of the description.
record_info_by_accession = {lineage: {} for lineage in lineages}

for lineage in lineages:
    # Context manager closes each genome FASTA as soon as it has been read.
    with open(f'oc43_{lineage}_genome.fasta', "r") as genome_handle:
        for record in SeqIO.parse(genome_handle, "fasta"):
            accession = record.description.split('|')[0]
            record_info_by_accession[lineage][accession] = record.description

In [9]:
#make a sequence fasta file for each gene by extracting the sequences out of the alignment
#make a dictionary of all seq records for each gene
#do it for each lineage (A, B, and all)

# lineage -> {gene -> list of SeqRecords holding that gene's sequence}
gene_records_from_genome_alignment = {lineage: {g: [] for g in all_genes}
                                      for lineage in lineages}

for lineage in lineages:
    # Context manager closes each alignment file once it has been consumed.
    with open(f'../results/aligned_oc43_{lineage}_genome.fasta', "r") as alignment_handle:
        for record in SeqIO.parse(alignment_handle, "fasta"):
            # The full description is the same for every gene of this record,
            # so look it up once. Raises KeyError if the aligned record's id
            # is not among the accessions read from the original FASTA.
            record_info = record_info_by_accession[lineage][record.id]
            for gene in all_genes:
                # NOTE(review): slicing with reference-genome locations assumes the
                # alignment is in reference coordinates -- TODO confirm.
                gene_seq = gene_locations[gene].extract(record.seq)
                gene_record = SeqRecord(gene_seq, id=record_info, description='')
                gene_records_from_genome_alignment[lineage][gene].append(gene_record)


In [11]:
#location of old gene-alignment files, by gene

# Directory containing the old gene-specific FASTA files.
old_gene_fasta_dir = "../../../seasonal-cov_eLifepaper/oc43/separate_lineages/data"

# Gene name used in this notebook -> gene label used in the old file names.
# Genes absent from this table (NS2, E) have no old file listed.
old_gene_file_labels = {
    "ORF1ab": "replicase1ab",
    "RdRp": "rdrp",
    "S1": "s1",
    "S2": "s2",
    "Spike": "spike",
    "HE": "he",
    "M": "membrane",
    "N": "nucleocapsid",
}

# lineage -> {gene -> path of the old gene-specific FASTA file}
old_gene_fasta_locations_by_lineage = {}

for lineage in ['A', 'B']:
    # File names carry the lineage letter in lower case (oc43_a_..., oc43_b_...).
    old_gene_fasta_locations = {
        gene: f"{old_gene_fasta_dir}/oc43_{lineage.lower()}_{label}.fasta"
        for gene, label in old_gene_file_labels.items()
    }
    old_gene_fasta_locations_by_lineage[lineage] = old_gene_fasta_locations


In [17]:
# Collect, per lineage and gene, the records from the old gene-specific FASTA
# files whose accession is not already present in the new genome data.

# A record is kept only if strictly fewer than this fraction of its sequence
# is the character 'N' (i.e. it covers 75% or more of the gene).
max_n_fraction = 0.25

# lineage -> {gene -> list of SeqRecords taken from the old alignments}
gene_records_from_old_alignments_by_lineage = {}

for lineage in ['A', 'B']:
    # Accessions already available from the genome FASTA of this lineage.
    already_seen = record_info_by_accession[lineage].keys()

    gene_records_from_old_alignments = {g:[] for g in all_genes}

    for gene in all_genes:
        file_location = old_gene_fasta_locations_by_lineage[lineage].get(gene)
        if file_location is None:
            # No old file is listed for this gene; its list stays empty.
            continue

        # Context manager closes each old FASTA once it has been read.
        with open(file_location, "r") as old_fasta_handle:
            for record in SeqIO.parse(old_fasta_handle, "fasta"):
                # Old ids are '|'-separated; this cell reads field 0 as strain
                # name, 1 as accession, 3 as date and 5 as country.
                id_fields = record.id.split('|')
                accession = id_fields[1]
                if accession in already_seen:
                    continue

                gene_length = len(record.seq)
                #only take seq if it covers 75% or more of gene
                if record.seq.count('N') >= gene_length * max_n_fraction:
                    continue

                strain_name = id_fields[0]
                date = id_fields[3]
                country = id_fields[5]
                # Re-order the fields to accession|strain|date|country.
                new_record_info = '|'.join([accession, strain_name, date, country])
                gene_record = SeqRecord(record.seq, id=new_record_info, description='')
                gene_records_from_old_alignments[gene].append(gene_record)

    gene_records_from_old_alignments_by_lineage[lineage] = gene_records_from_old_alignments
                        
    


In [18]:
#add the sequences from old gene-specific alignments to the new data
# The target lists are extended in place, so a plain `+=` would append the old
# records again every time this cell is re-run. Skipping ids that are already
# in the target list makes the cell safe to re-run.
for lineage in ['A', 'B']:
    for gene, old_records in gene_records_from_old_alignments_by_lineage[lineage].items():
        merged_records = gene_records_from_genome_alignment[lineage][gene]
        # Snapshot taken before extending, so every old record is still added
        # on the first run even if two old records share an id.
        ids_already_present = {existing.id for existing in merged_records}
        merged_records.extend(
            old_record for old_record in old_records
            if old_record.id not in ids_already_present
        )


In [20]:
# Write one FASTA file per lineage and gene, named oc43_<lineage>_<gene>.fasta,
# using paths relative to the current working directory.
for lineage in lineages:
    records_by_gene = gene_records_from_genome_alignment[lineage]
    for k in records_by_gene:
        output_path = f'oc43_{lineage}_{k}.fasta'
        SeqIO.write(records_by_gene[k], output_path, "fasta")