# Goal
Programmatically run PAMl analysis with different input alignment, tree and options, and collect the results for output. This helps with reproducibility and makes documenting the analysis easier.

# Approach
Use `Biopython`'s integration for PAML.

## Setup environments

In [1]:
from Bio.Phylo.PAML import codeml
from scipy.stats import chi2
import os
import csv

In [2]:
# store the script directory
script_dir = os.path.abspath('')
print(script_dir)

/Users/bhe2/Documents/work/current/C037-Cand-auris-adhesin/02-case-studies/09-natural-selection/script


## Helper functions

In [78]:
def get_ll(res, m = 0):
    """
    extract the log likelihood score
    default to NSsite = 0
    """
    return(res['NSsites'][m]['lnL'])

def get_np(res, m = 0):
    """
    extract the log likelihood score
    default to NSsite = 0
    """
    par = res['NSsites'][m]['parameters']['parameter list']
    return(len(par.split(' ')))

In [8]:
# function to extract the useful information from each model fit
def get_site_model_fit(res):
    out = [] # list of lists to store the model output
    codon = res['codon model'] # codon model used
    for m, fit in res['NSsites'].items():
        # model number
        M = f'M{m}'
        # get the model description
        desc = dict.get(fit, 'description')
        # get the number of parameters
        par = dict.get(fit, 'parameters')
        l = len(par['parameter list'].split(' '))
        # get the log likelihood values
        lnL = dict.get(fit, 'lnL')
        # get the kappa estimate
        k = dict.get(par, 'kappa')
        # get the model-specific set of parameters
        omega = ''
        # separately deal with the different site models
        if m == 0:
            omega = f'w={par["omega"]:.3f}'
        elif m < 7:
            site_class = dict.get(par, 'site classes')
            #print(site_class)
            for i, v in site_class.items():
                omega += f'p{i}={v["proportion"]:.3f}, w{i}={float(v["omega"]):.3f}; '
        elif m == 7:
            omega = f'p={par["p"]:.3f}, q={par["q"]:.3f}'
        elif m == 8:
            omega = f'p0={par["p0"]:.3f}, p={par["p"]:.3f}, q={par["q"]:.3f}; p1={1-par["p0"]:.3f}, w={par["w"]:.3f}'
            if par['w'] == 1.0: # model M8a
                M = 'M8a'
                
        # print the results to the screen
        # print(f'{M}: {desc} has {l} parameters, lnL = {lnL:.3f}, kappa = {k:.3f}, \n\tsite classes: {omega}')
        # append the results as a row
        out.append([codon, M, l, lnL, k, omega, desc])
    return(out)


In [79]:
# define a few helper functions
def print_branch_test(res1, res0, m = 0):
    """
    extract and print the most important branch test result parameters, including
    codonmodel, lnL, npar, omega_tree
    also perform a log likelihood ratio test comparing the alternative with the null model
    designated as res1 and res0
    default to NSsites = 0
    """
    # get the lnL and npar for both models
    lnL1 = get_ll(res1, m); npar1 = get_np(res1, m)
    lnL0 = get_ll(res0, m); npar0 = get_np(res0, m)
    dlnL = lnL1 - lnL0; dnpar = npar1 - npar0
    llr_P = 1 - chi2.cdf(x = 2*dlnL, df = dnpar)
    # get the parameter lists for both models
    res1_fit = res1["NSsites"][m]; res1_par = res1_fit["parameters"]
    res0_fit = res0["NSsites"][m]; res0_par = res0_fit["parameters"]
    # print useful model output
    print(f'Main result (codon model = {res1["codon model"]})\n\
    lnL: {lnL1}; np: {npar1}; kappa: {res1_par["kappa"]:.3f}; w: {res1_par["omega"]}')
    # omega tree: {res1_fit["omega tree"]}')
    print(f'Null model (codon model = {res0["codon model"]})\n\
    lnL: {lnL0}; np: {npar0}; kappa: {res0_par["kappa"]:.3f}; w: {res0_par["omega"]}\n\
    2*dlnL = {2*dlnL:.3f}, df = {dnpar}, LLR P-value = {llr_P:.2e}')

# p1-414
## Site models

### M0,1,2,7,8 (codonfreq=1)

In [10]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/site-cf1/"
os.chdir(working_dir)

In [151]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [152]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [0,1,2,7,8],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [153]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [154]:
res = cml.run()

### M8a (codonfreq=1)

In [106]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/site-M8a-cf1/"
os.chdir(working_dir)

FileNotFoundError: [Errno 2] No such file or directory: '../output/paml/B8441-OG-part/p697-981/site-M8a-cf1/'

In [12]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414.nwk"
cml.out_file = "mlc"

In [157]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [8], # M8a
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1,
    omega = 1,     # fix omega to be 1 as null model
    cleandata = 1,
    fix_blength = 0
)

In [158]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [159]:
res = cml.run()

### M0,1,2,7,8 (codonfreq=2)

In [16]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/site-cf2/"
os.chdir(working_dir)

In [17]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414.nwk"
cml.out_file = "mlc"

In [162]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [0,1,2,7,8],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [163]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [164]:
res = cml.run()

### M8a (codonfreq=2)

In [21]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/site-M8a-cf2/"
os.chdir(working_dir)

In [22]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414.nwk"
cml.out_file = "mlc"

In [167]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [8], # M8a
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1, # fix omega = 1 as null
    omega = 1,
    cleandata = 1,
    fix_blength = 0
)

In [168]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [169]:
res = cml.run()

### Site model test result

In [170]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/"
os.chdir(working_dir)


In [114]:
# read site model results into a dictionary
site_res = {
    'site-cf1': codeml.read('site-cf1/mlc'),
    'site-cf2': codeml.read('site-cf2/mlc'),
    'site-M8a-cf1': codeml.read('site-M8a-cf1/mlc'),
    'site-M8a-cf2': codeml.read('site-M8a-cf2/mlc')
}

In [172]:
# process the data and write the output to a csv file
data_rows = []
for t in site_res1.values():
    data_rows += get_site_model_fit(t)
fields = ['codonfreq', 'model', 'npar', 'lnL', 'kappa', 'omega', 'description']
with open("20220729-p697-981-site-model-summary-table.tsv", "w") as f:
    write = csv.writer(f, delimiter = "\t")
    write.writerow(fields)
    write.writerows(data_rows)

No evidence was found to support positive selection by any of the tests.

## Branch models

### one ratio (codonfreq = 1)

In [48]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/branch-1r-cf1/"
os.chdir(working_dir)

In [49]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414.nwk"
cml.out_file = "mlc"

In [50]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1,
    model = 0,
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [51]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [52]:
res = cml.run()

### free ratio (codonfreq = 1)

In [53]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/branch-freer-cf1/"
os.chdir(working_dir)

In [54]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414.nwk"
cml.out_file = "mlc"

In [55]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1,
    clock = 0,
    model = 1,
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [56]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [57]:
res = cml.run()

### one ratio (codonfreq = 2)

In [None]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/branch-1r-cf2/"
os.chdir(working_dir)

In [40]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414.nwk"
cml.out_file = "mlc"

In [84]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2,
    model = 0,
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [85]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [86]:
res = cml.run()

### free ratio (codonfreq = 2)

In [None]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/branch-freer-cf2/"
os.chdir(working_dir)

In [89]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414.nwk"
cml.out_file = "mlc"

In [90]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2,
    clock = 0,
    model = 1,
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [91]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [92]:
res = cml.run()

### Free vs one-omega models test result
First, we reimport all the model results

In [64]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/"
os.chdir(working_dir)
branch_res = {
    'branch-1r-cf1': codeml.read('branch-1r-cf1/mlc'),
    'branch-1r-cf2': codeml.read('branch-1r-cf2/mlc'),
    'branch-freer-cf1': codeml.read('branch-freer-cf1/mlc'),
    'branch-freer-cf2': codeml.read('branch-freer-cf2/mlc')
}

In [80]:
dl_cf1 = get_ll(branch_res['branch-freer-cf1']) - get_ll(branch_res['branch-1r-cf1'])
df_cf1 = get_np(branch_res['branch-freer-cf1']) - get_np(branch_res['branch-1r-cf1'])
p_cf1 = 1 - chi2.cdf(2*dl_cf1, df = df_cf1)
print("Testing one ratio vs free ratio models, with CodonFreq=1")
print(f'df={df_cf1}, 2*∆lnL={2*dl_cf1:.2f}, p={p_cf1:.2e}')

Testing one ratio vs free ratio models, with CodonFreq=1
df=14, 2*∆lnL=55.73, p=6.47e-07


In [81]:
dl_cf2 = get_ll(branch_res['branch-freer-cf2']) - get_ll(branch_res['branch-1r-cf2'])
df_cf2 = get_np(branch_res['branch-freer-cf2']) - get_np(branch_res['branch-1r-cf2'])
p_cf2 = 1 - chi2.cdf(2*dl_cf2, df = df_cf2)
# p_cf2 = 1.427e-02 # calculated with the chi2 program from PAML package
print("Testing one ratio vs free ratio models, with CodonFreq=1")
print(f'df={df_cf2}, 2*∆lnL={2*dl_cf2:.2f}, p={p_cf2:.2e}')

Testing one ratio vs free ratio models, with CodonFreq=1
df=14, 2*∆lnL=27.99, p=1.43e-02


there is thus strong evidence for non-equal dN/dS along the branches. But this test is almost always significant because the null hypothesis of homogeneous dN/dS across the entire tree is not realistic under most cases.

I plotted the free ratio model estimates on the gene tree using the `ggtree` package in R:
![free ratio estimates](../output/figure/20220727-B8441-OG-p1-414-branch-freer-tree.png)

By comparing the results obtained with CodonFreq=2 or CodonFreq=1, I see that two of the three branches identified as having elevated dN/dS by the F3x4 model was also implicated by the F1x4 model. Given Ziheng's [suggestion](https://groups.google.com/g/pamlsoftware/c/i7-NFSgnhq8/m/80rWE37kBgAJ), I decided to focus on the F3x4 model result and test the three branches together and separately (labeled as ω1, ω2 and ω3, respectively).

My next goal is to separately test whether there are statistical support for these two branches having a dN/dS > 1.

### F1x4, two ratio (w0, w1=w2=w3=w4)

In [150]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/site-cf1/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [59]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414-2r_all-cf1.nwk"
cml.out_file = "mlc"

In [60]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4 codon
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [0], # one site class 
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [61]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [62]:
res = cml.run()

In [83]:
branch_res['branch-2r_all-cf1'] = codeml.read("mlc")
print_branch_test(branch_res['branch-2r_all-cf1'], branch_res['branch-1r-cf1'])

Main result (codon model = F1x4)
    lnL: -3421.85936; np: 18; kappa: 1.583; w: [0.12695, 1.77137]
Null model (codon model = F1x4)
    lnL: -3427.952669; np: 17; kappa: 1.592; w: 0.13685
    2*dlnL = 12.187, df = 1, LLR P-value = 4.81e-04


### F1x4, two ratio constrained (w0, w1=w2=w3=w4=1)

In [84]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/branch-2r_alleq1-cf1/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [33]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414-2r_all-cf1.nwk"
cml.out_file = "mlc"

In [34]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4 codon
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [0], # one site class 
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1,
    omega = 1,
    cleandata = 1,
    fix_blength = 0
)

In [35]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [36]:
res = cml.run()

In [85]:
branch_res['branch-2r_alleq1-cf1'] = codeml.read("mlc")
print_branch_test(branch_res['branch-2r_all-cf1'], branch_res['branch-2r_alleq1-cf1'])

Main result (codon model = F1x4)
    lnL: -3421.85936; np: 18; kappa: 1.583; w: [0.12695, 1.77137]
Null model (codon model = F1x4)
    lnL: -3421.992047; np: 17; kappa: 1.581; w: [0.1292, 1.0]
    2*dlnL = 0.265, df = 1, LLR P-value = 6.06e-01


### two ratio (w0, w1=w2=w3, F3x4)

In [79]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/branch-2r_all-cf2/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [80]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414-2r_all-cf2.nwk"
cml.out_file = "mlc"

In [81]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4 model
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [0], # one site class 
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [82]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [83]:
res = cml.run()

In [84]:
branch_res['branch-2r_all-cf2'] = codeml.read("mlc")
print_branch_test(branch_res['branch-2r_all-cf2'], branch_res['branch-1r-cf2'])

Main result (codon model = F3x4)
    lnL: -3360.672152; np: 18; kappa: 1.316; w: [0.0292, 0.00277]
    omega tree: ((Hil7 #0.0291987 , (((Hil1 #0.0291987 , Hil2 #0.0291987 ) #0.0291987 , (Hil4 #0.0291987 , Hil3 #0.0291987 ) #0.0291987 ) #0.0291987 , (Hil8 #0.0291987 , Hil6 #0.0291987 ) #0.00277474 ) #0.00277474 ) #0.00277474 , Hil5 #0.0291987 , OG #0.0291987 );
Null model (codon model = F3x4)
    lnL: -3362.613054; np: 17; kappa: 1.342; w: 0.01825
    2*dlnL = 3.882, df = 1, LLR P-value = 4.88e-02


Note that the foreground lineages in the alternative model, which were selected based on them having significantly elevated dN/dS in the free ratio model, actually had a lower dN/dS estimate than the background. This is difficult to interpret.

## Branch-site model
This model tests selection on a subset of the sites in a subset of lineages (foreground). The alternative model is specified by `Model A: model = 2, NSsites = 2, fix_omega = 0` and the null model is specified by `Model A1: model = 2, NSsites = 2, fix_omega = 1, omega = 1`
### Alternative model, (F1x4)

In [72]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/brnsite-alt-2r_all-cf1/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [39]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414-2r_all-cf1.nwk"
cml.out_file = "mlc"

In [40]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4 model
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [2], # three classes, constrained, neutral and positive
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [41]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [42]:
res = cml.run()

### Null model (F1x4)

In [43]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/brnsite-null-2r_all-cf1/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [44]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414-2r_all-cf1.nwk"
cml.out_file = "mlc"

In [45]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4 model
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [2], # three classes, constrained, neutral and positive
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1,
    omega = 1,
    cleandata = 1,
    fix_blength = 0
)

In [46]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [47]:
res = cml.run()

### Branch-site model test

In [87]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/"
os.chdir(working_dir)
brnsite_res = {
    'alt-2r_all-cf1': codeml.read('brnsite-alt-2r_all-cf1/mlc'),
    'null-2r_all-cf1': codeml.read('brnsite-null-2r_all-cf1/mlc')
}

In [90]:
dl_bs_cf1 = get_ll(brnsite_res['alt-2r_all-cf1'], m = 2) - get_ll(brnsite_res['null-2r_all-cf1'], m = 2)
df_bs_cf1 = get_np(brnsite_res['alt-2r_all-cf1'], m = 2) - get_ll(brnsite_res['null-2r_all-cf1'], m = 2)
p_bs_cf1 = 1 - chi2.cdf(2*dl_bs_cf1, df = df_bs_cf1)
print("Testing one ratio vs free ratio models, with CodonFreq=1")
print(f'df={df_bs_cf1}, 2*∆lnL={2*dl_bs_cf1:.2f}, p={p_bs_cf1:.2e}')

Testing one ratio vs free ratio models, with CodonFreq=1
df=3362.942643, 2*∆lnL=0.16, p=1.00e+00


No evidence of positive selection.

# p697-981

## Site models

### M0,1,2,7,8 (codonfreq=1)

In [150]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/site-cf1/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [151]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [152]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [0,1,2,7,8],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [153]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [154]:
res = cml.run()

### M8a (codonfreq=1)

In [155]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/site-M8a-cf1/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [156]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [157]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [8], # M8a
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1,
    omega = 1,     # fix omega to be 1 as null model
    cleandata = 1,
    fix_blength = 0
)

In [158]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [159]:
res = cml.run()

### M0,1,2,7,8 (codonfreq=2)

In [160]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/site-cf2/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [161]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [162]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [0,1,2,7,8],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [163]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [164]:
res = cml.run()

### M8a (codonfreq=2)

In [165]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/site-M8a-cf2/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [166]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [167]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [8], # M8a
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1, # fix omega = 1 as null
    omega = 1,
    cleandata = 1,
    fix_blength = 0
)

In [168]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [169]:
res = cml.run()

### Site model test result

In [170]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/"
os.chdir(working_dir)


In [171]:
# read site model results into a dictionary
site_res1 = {
    'cf1': codeml.read('site-cf1/mlc'),
    'cf2': codeml.read('site-cf2/mlc'),
    'M8a-cf1': codeml.read('site-M8a-cf1/mlc'),
    'M8a-cf2': codeml.read('site-M8a-cf2/mlc')
}

In [172]:
# process the data and write the output to a csv file
data_rows = []
for t in site_res1.values():
    data_rows += get_site_model_fit(t)
fields = ['codonfreq', 'model', 'npar', 'lnL', 'kappa', 'omega', 'description']
with open("20220729-p697-981-site-model-summary-table.tsv", "w") as f:
    write = csv.writer(f, delimiter = "\t")
    write.writerow(fields)
    write.writerows(data_rows)

No evidence was found to support positive selection by any of the tests.

## Branch models

### one ratio (F1x4)

In [173]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/branch-1r-cf1/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [174]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [175]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1,  # F1x4
    model = 0,      # 1 ratio
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [176]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [177]:
res = cml.run()

### free ratio (F1x4)

In [178]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/branch-freer-cf1/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [179]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [180]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1,  # F1x4
    clock = 0,
    model = 1,      # free ratio
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [181]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [182]:
res = cml.run()

### one ratio (F3x4)

In [183]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/branch-1r-cf2/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [184]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [185]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2,  # F3x4
    model = 0,      # one ratio
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [186]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [187]:
res = cml.run()

### free ratio (F3x4)

In [188]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/branch-freer-cf2/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [189]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [190]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2,  # F3x4
    clock = 0,
    model = 1,      # free ratio
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [191]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [192]:
res = cml.run()

### Free vs one-omega models test result
First, we reimport all the model results

In [193]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/"
os.chdir(working_dir)
branch_res = {
    '1r-cf1': codeml.read('branch-1r-cf1/mlc'),
    '1r-cf2': codeml.read('branch-1r-cf2/mlc'),
    'freer-cf1': codeml.read('branch-freer-cf1/mlc'),
    'freer-cf2': codeml.read('branch-freer-cf2/mlc')
}

In [194]:
print_branch_test(branch_res['freer-cf1'], branch_res['1r-cf1'])

Main result (codon model = F1x4)
    lnL: -2391.612283; np: 31; kappa: 1.429; w: [0.23095, 0.00445, 998.99991, 0.14377, 0.07403, 999.0, 92.44628, 511.2296, 0.1592, 0.11967, 0.10617, 957.23845, 0.04349, 0.62278, 0.17973]
Null model (codon model = F1x4)
    lnL: -2411.367957; np: 17; kappa: 1.413; w: 0.18181
    2*dlnL = 39.511, df = 14, LLR P-value = 3.04e-04


In [195]:
print_branch_test(branch_res['freer-cf2'], branch_res['1r-cf2'])

Main result (codon model = F3x4)
    lnL: -2380.28829; np: 31; kappa: 1.194; w: [0.65755, 0.00425, 999.0, 0.12997, 0.01553, 999.0, 0.1662, 999.0, 0.1177, 0.07865, 999.0, 0.11371, 0.0142, 0.28089, 0.02319]
Null model (codon model = F3x4)
    lnL: -2391.8604; np: 17; kappa: 1.443; w: 0.03329
    2*dlnL = 23.144, df = 14, LLR P-value = 5.80e-02


there is thus strong evidence for non-equal dN/dS along the branches. But this test is almost always significant because the null hypothesis of homogeneous dN/dS across the entire tree is not realistic under most cases.

I plotted the free ratio model estimates on the gene tree using the `ggtree` package in R:
![free ratio estimates](../output/figure/20220727-B8441-OG-p1-414-branch-freer-tree.png)

By comparing the results obtained with CodonFreq=2 or CodonFreq=1, I see that two of the three branches identified as having elevated dN/dS by the F3x4 model was also implicated by the F1x4 model. Given Ziheng's [suggestion](https://groups.google.com/g/pamlsoftware/c/i7-NFSgnhq8/m/80rWE37kBgAJ), I decided to focus on the F3x4 model result and test the three branches together and separately (labeled as ω1, ω2 and ω3, respectively).

My next goal is to separately test whether there are statistical support for these two branches having a dN/dS > 1.