In [44]:
import re
import json
from os import path
import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Align import AlignInfo

In [45]:
def edit_fasta_dates(cov, gene, separate_lineages=False):
    """Filter an aligned FASTA file down to records whose ID ends in a year.

    Reads ``../beast/<cov>/<gene>/aligned_<cov>_<gene>.fasta`` and keeps only
    the records whose ID's last ``/``-separated field starts with four digits.

    Parameters
    ----------
    cov : str
        Virus name used to build the alignment path (e.g. ``'hku1'``).
    gene : str
        Gene name used to build the alignment path (e.g. ``'spike'``).
    separate_lineages : bool, optional
        If False (default), the filtered records are written back over the
        input alignment file. If True, records are grouped by the clade
        ('A' or 'B') reported by ``separate_clades(cov, gene)`` and each group
        is written to its own file,
        ``../beast/<cov><a|b>/<gene>/aligned_<cov><a|b>_<gene>.fasta``;
        the input file is left untouched and records belonging to neither
        clade are dropped.
    """
    input_file_alignment = '../beast/'+str(cov)+'/'+str(gene)+'/aligned_'+str(cov)+'_'+str(gene)+'.fasta'

    # Raw string so that \d is a regex escape rather than an (invalid) string
    # escape. ``match`` anchors at the start only, so '2015' and '2015-x' both pass.
    year_check = re.compile(r'\d{4}')

    # Clade membership does not depend on the record being read, so load it
    # once and keep the strain names in sets for fast lookup.
    lineage_strains = {}
    if separate_lineages:
        clade_df = separate_clades(cov, gene)
        lineage_strains = {
            lineage: set(clade_df.loc[clade_df['clade'] == lineage, 'strain'])
            for lineage in ('A', 'B')
        }

    sequences = []
    # One bucket per lineage, so each output file only receives its own records.
    lineage_sequences = {lineage: [] for lineage in lineage_strains}

    with open(input_file_alignment, "r") as aligned_handle:
        for virus in SeqIO.parse(aligned_handle, "fasta"):
            year = virus.id.split('/')[-1]
            if not year_check.match(year):
                continue

            record = SeqRecord(virus.seq, id=virus.id, description=virus.description)
            if not separate_lineages:
                sequences.append(record)
                continue

            for lineage, strains in lineage_strains.items():
                if virus.id in strains:
                    lineage_sequences[lineage].append(record)

    if separate_lineages:
        for lineage, records in lineage_sequences.items():
            # Output directories and file names use the lower-case lineage letter.
            suffix = lineage.lower()
            output_file_alignment = '../beast/'+str(cov)+suffix+'/'+str(gene)+'/aligned_'+str(cov)+suffix+'_'+str(gene)+'.fasta'
            SeqIO.write(records, output_file_alignment, "fasta")
    else:
        # Overwrites the input alignment in place with the filtered records.
        SeqIO.write(sequences, input_file_alignment, "fasta")

In [48]:
# Run edit_fasta_dates with separate_lineages=True for every HKU1 gene.
covs = ['hku1']
genes = ['rdrp', 'spike', 's1', 's2']

for cov, gene in ((c, g) for c in covs for g in genes):
    edit_fasta_dates(cov, gene, separate_lineages=True)

In [43]:
# Run edit_fasta_dates (default: no lineage separation) for each virus/gene pair.
covs = ['229e', 'nl63', 'oc43']
genes = ['rdrp', 'spike', 's1', 's2']

for cov, gene in ((c, g) for c in covs for g in genes):
    edit_fasta_dates(cov, gene)

In [46]:
#split oc43 and hku1 into clades

def separate_clades(cov, gene):
    """Load the clade assignment of every strain for a virus/gene pair.

    Reads ``../<cov>/results/clades_<gene>.json`` if it exists, otherwise
    falls back to ``../<cov>/results/clades_full.json``. The JSON is expected
    to have a top-level ``'nodes'`` mapping from node name to a dict holding
    a ``'clade_membership'`` entry.

    Parameters
    ----------
    cov : str
        Virus name used to build the results path.
    gene : str
        Gene name used to pick the gene-specific clades file.

    Returns
    -------
    pandas.DataFrame
        One row per node whose name does not contain ``'NODE'``, with columns
        ``'clade'`` and ``'strain'``. The columns are present even when no
        such node is found, so callers can always filter on them.
    """
    results_dir = '../'+str(cov)+'/results/'

    clade_file = results_dir+'clades_'+str(gene)+'.json'
    if not path.exists(clade_file):
        # No gene-specific assignment available; use the full-genome one.
        clade_file = results_dir+'clades_full.json'

    with open(clade_file, "r") as clade_handle:
        clades = json.load(clade_handle)

    # Names containing 'NODE' are skipped (presumably internal tree nodes —
    # confirm against the file that produces them); the rest are strains.
    clade_lists = [
        {'clade': v['clade_membership'], 'strain': node}
        for node, v in clades['nodes'].items()
        if 'NODE' not in node
    ]

    # Explicit columns keep the schema stable when clade_lists is empty.
    clade_df = pd.DataFrame(clade_lists, columns=['clade', 'strain'])
    return clade_df