In [94]:
import os
from ete3 import Tree
from Bio import SeqIO

In [86]:
#Make directory structure for virus/gene
def make_directories(virus_path, gene_path):
    """Ensure the virus directory and the gene directory both exist.

    Args:
        virus_path: Directory holding all genes of one virus.
        gene_path: Directory for a single gene (normally inside
            ``virus_path``).

    Directories that already exist are left untouched. Missing parent
    directories are created as well, so the call also succeeds when the
    top-level output directory has not been created yet.
    """
    # exist_ok=True makes the call idempotent and avoids the
    # check-then-create race of os.path.exists() followed by os.mkdir().
    os.makedirs(virus_path, exist_ok=True)
    os.makedirs(gene_path, exist_ok=True)

In [141]:
#Excludes internal nodes and branch lengths
#Write tree into paml4.8/virus/gene directory
def edit_newick_for_paml(virus, gene, gene_path):
    """Write a PAML-ready copy of the nextstrain tree into ``gene_path``.

    The source tree is read from
    ``../../<virus>/results/tree_<virus>_<gene>.nwk`` with ete3 newick
    ``format=1`` and written to ``<gene_path>pamltree_<virus>_<gene>.nwk``
    with ``format=9``. Nothing happens if the output file already exists.

    Args:
        virus: Virus name used in the file names.
        gene: Gene name used in the file names.
        gene_path: Output directory prefix; concatenated directly with the
            file name, so it has to end with a path separator.
    """
    tag = '_'.join([str(virus), str(gene)])
    destination = gene_path + 'pamltree_' + tag + '.nwk'

    # An existing output is never regenerated.
    if os.path.exists(destination):
        return

    source = '../../' + str(virus) + '/results/tree_' + tag + '.nwk'
    Tree(source, format=1).write(format=9, outfile=destination)

In [142]:
#Remove reference sequence from aligned sequence file
def edit_aligned_for_paml(virus, gene, gene_path):
    """Write the alignment without the reference sequence into ``gene_path``.

    Records are read from
    ``../../<virus>/results/aligned_<virus>_<gene>.fasta``; every record
    whose id contains the reference accession of ``virus`` is dropped and
    the rest is written, in FASTA format, to
    ``<gene_path>pamlaligned_<virus>_<gene>.fasta``. Nothing happens if the
    output file already exists.

    Args:
        virus: Virus name; also the key into the reference accession table.
        gene: Gene name used in the file names.
        gene_path: Output directory prefix; concatenated directly with the
            file name, so it has to end with a path separator.
    """
    tag = str(virus) + '_' + str(gene)
    destination = gene_path + 'pamlaligned_' + tag + '.fasta'

    # An existing output is never regenerated.
    if os.path.exists(destination):
        return

    # Accession of the reference sequence for each supported virus.
    ref_seqs = {'oc43': 'AY391777.1', '229e':'AF304460.1', 'nl63':'AY567487.2', 'hku1':'NC_006577.2'}
    source = '../../' + str(virus) + '/results/aligned_' + tag + '.fasta'

    kept_records = [
        entry
        for entry in SeqIO.parse(source, 'fasta')
        if ref_seqs[virus] not in entry.id
    ]
    SeqIO.write(kept_records, destination, "fasta")

In [143]:
#Edit codeml control file for virus, gene
def edit_control_file(virus, gene, gene_path):
    """Write a virus/gene specific ``codeml.ctl`` into ``gene_path``.

    The template ``paml4.8/codeml.ctl`` (relative to the current working
    directory) is copied line by line. The ``seqfile``, ``treefile`` and
    ``outfile`` settings are replaced so that they point at the files
    named after ``virus`` and ``gene``; all other lines are copied
    unchanged. Nothing happens if the output file already exists.

    Args:
        virus: Virus name used in the generated file names.
        gene: Gene name used in the generated file names.
        gene_path: Output directory prefix; concatenated directly with
            ``codeml.ctl``, so it has to end with a path separator.
    """
    paml_control = gene_path + 'codeml.ctl'
    if os.path.exists(paml_control):
        return

    codeml_control = 'paml4.8/codeml.ctl'
    label = str(virus) + '_' + str(gene)

    # Replacement line for each setting, keyed by the setting name.
    replacements = {
        'seqfile': '      seqfile = pamlaligned_' + label + '.fasta * sequence data filename\n',
        'treefile': '     treefile = pamltree_' + label + '.nwk      * tree structure file name\n',
        'outfile': '      outfile = ' + label + '           * main result file name\n',
    }

    with open(codeml_control, 'r') as handle:
        # Match on the setting name (the text before '=') rather than on a
        # substring of the whole line, so a line that merely mentions
        # e.g. "outfile" in its trailing comment is not overwritten.
        new_control = [
            replacements.get(line.partition('=')[0].strip(), line)
            for line in handle
        ]

    with open(paml_control, 'w') as edit_handle:
        edit_handle.writelines(new_control)

In [136]:
#Set up all necessary files for codeml
def codeml_setup(virus, gene):
    """Prepare every input file codeml needs for one virus/gene pair.

    Creates ``paml4.8/<virus>/<gene>/`` if necessary, then writes the
    PAML tree, the PAML alignment and the control file into it.

    Args:
        virus: Virus name.
        gene: Gene name.
    """
    virus_path = 'paml4.8/' + str(virus) + '/'
    gene_path = virus_path + str(gene) + '/'

    make_directories(virus_path, gene_path)

    # Each step writes one file into gene_path, in this order.
    for prepare in (edit_newick_for_paml, edit_aligned_for_paml, edit_control_file):
        prepare(virus, gene, gene_path)

In [147]:
# Viruses and genes for which codeml input files are generated
viruses = ['oc43']
genes = ['spike', 's1', 's2', 'replicase1ab']

# Visit every (virus, gene) combination, genes varying fastest
for virus, gene in ((v, g) for v in viruses for g in genes):
    codeml_setup(virus, gene)