# Goal
Programmatically run PAML analyses with different input alignments, trees and options, and collect the results for output. This helps with reproducibility and makes documenting the analysis easier.

# Approach
Use `Biopython`'s integration for PAML.

## Setup environments

In [1]:
from Bio.Phylo.PAML import codeml
from scipy.stats import chi2
import os
import csv

In [2]:
# store the script directory
script_dir = os.path.abspath('')
print(script_dir)

/Users/bhe2/Documents/work/current/C037-Cand-auris-adhesin/02-case-studies/09-natural-selection/script


## Helper functions

In [3]:
def get_ll(res, m = 0):
    """
    Return the log likelihood (lnL) stored for one NSsites model.

    res is a parsed codeml result dictionary; m is the key of the
    model under res['NSsites'] and defaults to 0.
    """
    model_fit = res['NSsites'][m]
    return model_fit['lnL']

def get_np(res, m = 0):
    """
    Return the number of estimated parameters for one NSsites model.

    The count is taken from the whitespace-separated 'parameter list'
    string stored under res['NSsites'][m]['parameters'].
    m defaults to 0.
    """
    par = res['NSsites'][m]['parameters']['parameter list']
    # split() without an argument collapses runs of whitespace, so padded
    # columns in the list do not inflate the count with empty strings
    return len(par.split())

In [76]:
# helper that turns one NSsites model fit into a [label, omega summary] pair
def get_omega(fit, m):
    """
    Build a printable summary of the omega-related estimates of one model.

    fit is a single entry of res['NSsites'] and m is its model number.
    Returns a two-element list [label, summary]: label is 'M<m>', or 'M8a'
    when m is 8 and the estimated w equals 1.0; summary is an empty string
    for model numbers above 8.
    """
    label = f'M{m}'
    estimates = fit.get('parameters')

    # M0: a single omega for all sites
    if m == 0:
        return [label, f'w={estimates["omega"]:.3f}']

    # remaining models below 7 report a proportion and omega per site class
    if m < 7:
        classes = estimates.get('site classes')
        summary = ''.join(
            f'p{idx}={cls["proportion"]:.3f}, w{idx}={float(cls["omega"]):.3f}; '
            for idx, cls in classes.items()
        )
        return [label, summary]

    # M7: the two beta-distribution parameters only
    if m == 7:
        return [label, f'p={estimates["p"]:.3f}, q={estimates["q"]:.3f}']

    # M8: beta parameters plus one extra class with its own w
    if m == 8:
        summary = (
            f'p0={estimates["p0"]:.3f}, p={estimates["p"]:.3f}, q={estimates["q"]:.3f}; '
            f'p1={1-estimates["p0"]:.3f}, w={estimates["w"]:.3f}'
        )
        if estimates['w'] == 1.0: # model M8a
            label = 'M8a'
        return [label, summary]

    return [label, '']
   

In [74]:
# function to extract the useful information from each model fit
def get_site_model_fit(res):
    """
    Summarise every NSsites model fit in a parsed codeml result.

    Returns a list with one row per model in res['NSsites'], each row being
    [codon model, model label, number of parameters, lnL, kappa,
     omega summary, description].
    The model label and omega summary come from get_omega(), so the table
    and the printed test reports always use the same formatting.
    """
    codon = res['codon model'] # codon model used
    out = []
    for m, fit in res['NSsites'].items():
        par = fit.get('parameters')
        # count whitespace-separated entries; split() ignores repeated
        # spaces, which split(' ') would count as extra parameters
        n_par = len(par['parameter list'].split())
        # model label ('M<m>' or 'M8a') and the model-specific estimates
        M, omega = get_omega(fit, m)
        out.append([
            codon,
            M,
            n_par,
            fit.get('lnL'),
            par.get('kappa'),
            omega,
            fit.get('description'),
        ])
    return out

In [119]:
def site_test(res1, m1, m0, res0 = None):
    """
    Print a likelihood ratio test comparing two NSsites models.

    res1 / m1 identify the alternative model and res0 / m0 the null model.
    By default both models are read from the same result object; pass a
    second result object as res0 when the null model lives in a separate
    run, e.g. when comparing M8 against M8a.

    Prints 2*dlnL, the difference in parameter count used as degrees of
    freedom, the chi-square P-value, and a summary of both model fits.
    Returns None.
    """
    if res0 is None:
        res0 = res1
    # get the lnL and npar for both models
    lnL1 = get_ll(res1, m1); npar1 = get_np(res1, m1)
    lnL0 = get_ll(res0, m0); npar0 = get_np(res0, m0)
    dlnL = lnL1 - lnL0; dnpar = npar1 - npar0
    # survival function = 1 - cdf, but stays accurate for very small
    # P-values where the subtraction would round to zero
    llr_P = chi2.sf(2*dlnL, df = dnpar)
    # get the parameter lists for both models
    res1_fit = res1["NSsites"][m1]; res1_par = res1_fit["parameters"]
    res0_fit = res0["NSsites"][m0]; res0_par = res0_fit["parameters"]
    M1, omega1 = get_omega(res1_fit, m1)
    M0, omega0 = get_omega(res0_fit, m0)
    # print test result
    print(f'2*dlnL = {2*dlnL:.3f}, df = {dnpar}, LLR P-value = {llr_P:.2e}')
    print('')
    print(f'{M1} ({res1["codon model"]}): lnL: {lnL1:.3f}; np: {npar1}; kappa: {res1_par["kappa"]:.3f}\n'
          f'    w: {omega1}')
    print('')
    print(f'{M0} ({res0["codon model"]}): lnL: {lnL0:.3f}; np: {npar0}; kappa: {res0_par["kappa"]:.3f}\n'
          f'    w: {omega0}')
    

In [123]:
# Run the three site-model comparisons: M2a vs M1a, M8 vs M7 and M8a vs M8
def print_site_test(res, resM8a):
    """
    Print the likelihood ratio tests for the three site-model pairs.

    res holds the models 1, 2, 7 and 8; resM8a is the separate result
    that supplies the null model for the M8a vs M8 comparison.
    Each report is preceded by a heading and followed by a '///' line.
    """
    # (heading, alternative model, null model, result holding the null model)
    comparisons = (
        ('-- M2a vs M1a --', 2, 1, None),
        ("-- M8 vs M7 --", 8, 7, None),
        ("-- M8a vs M8 --", 8, 8, resM8a),
    )
    for heading, alt_model, null_model, null_res in comparisons:
        print(heading)
        site_test(res, alt_model, null_model, null_res)
        print('///\n')

In [102]:
# print branch test results
def print_branch_test(res1, res0, m = 0):
    """
    Print a branch-model comparison and its likelihood ratio test.

    res1 is the alternative model result and res0 the null model result;
    m is the NSsites key read from both and defaults to 0.

    For each model the codon model, lnL, number of parameters, kappa and
    omega are printed, followed by 2*dlnL, the degrees of freedom
    (difference in parameter count) and the chi-square P-value.
    Returns None.
    """
    # get the lnL and npar for both models
    lnL1 = get_ll(res1, m); npar1 = get_np(res1, m)
    lnL0 = get_ll(res0, m); npar0 = get_np(res0, m)
    dlnL = lnL1 - lnL0; dnpar = npar1 - npar0
    # survival function = 1 - cdf, but stays accurate for very small
    # P-values where the subtraction would round to zero
    llr_P = chi2.sf(2*dlnL, df = dnpar)
    # get the parameter lists for both models
    res1_fit = res1["NSsites"][m]; res1_par = res1_fit["parameters"]
    res0_fit = res0["NSsites"][m]; res0_par = res0_fit["parameters"]
    # print useful model output
    print(f'Main result (codon model = {res1["codon model"]})\n'
          f'    lnL: {lnL1}; np: {npar1}; kappa: {res1_par["kappa"]:.3f}; w: {res1_par["omega"]}')
    # omega tree: {res1_fit["omega tree"]}')
    print(f'Null model (codon model = {res0["codon model"]})\n'
          f'    lnL: {lnL0}; np: {npar0}; kappa: {res0_par["kappa"]:.3f}; w: {res0_par["omega"]}\n'
          f'    2*dlnL = {2*dlnL:.3f}, df = {dnpar}, LLR P-value = {llr_P:.2e}')

# p1-414
## Site models

### M0,1,2,7,8 (codonfreq=1)

In [6]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir, creating it on first use so the
# chdir does not fail on a fresh checkout
working_dir = "../output/paml/B8441-OG-part/p1-414/site-cf1/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [7]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414.nwk"
cml.out_file = "mlc"

In [8]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [0,1,2,7,8],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [9]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [10]:
res = cml.run()

### M8a (codonfreq=1)

In [11]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir, creating it on first use so the
# chdir does not fail on a fresh checkout
working_dir = "../output/paml/B8441-OG-part/p1-414/site-M8a-cf1/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [12]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414.nwk"
cml.out_file = "mlc"

In [13]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [8], # M8a
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1,
    omega = 1,     # fix omega to be 1 as null model
    cleandata = 1,
    fix_blength = 0
)

In [14]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [15]:
res = cml.run()

### M0,1,2,7,8 (codonfreq=2)

In [17]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/site-cf2/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [18]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414.nwk"
cml.out_file = "mlc"

In [19]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [0,1,2,7,8],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [20]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [21]:
res = cml.run()

### M8a (codonfreq=2)

In [22]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/site-M8a-cf2/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [23]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414.nwk"
cml.out_file = "mlc"

In [24]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [8], # M8a
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1, # fix omega = 1 as null
    omega = 1,
    cleandata = 1,
    fix_blength = 0
)

In [25]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [26]:
res = cml.run()

### Site model test result

In [132]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/"
os.chdir(working_dir)


In [133]:
# read site model results into a dictionary
site_res = {
    'site-cf1': codeml.read('site-cf1/mlc'),
    'site-cf2': codeml.read('site-cf2/mlc'),
    'site-M8a-cf1': codeml.read('site-M8a-cf1/mlc'),
    'site-M8a-cf2': codeml.read('site-M8a-cf2/mlc')
}

In [134]:
# process the data and write the output to a tab-separated (tsv) file
data_rows = []
for t in site_res.values():
    data_rows += get_site_model_fit(t)
fields = ['codonfreq', 'model', 'npar', 'lnL', 'kappa', 'omega', 'description']
# newline="" hands line-ending control to the csv module, as its
# documentation requires; otherwise extra blank rows can appear on Windows
with open("20220727-p1-414-site-model-summary-table.tsv", "w", newline="") as f:
    write = csv.writer(f, delimiter = "\t")
    write.writerow(fields)
    write.writerows(data_rows)

In [135]:
# perform the three pairs of tests
print("CodonFreq = 1")
print_site_test(site_res['site-cf1'], site_res['site-M8a-cf1'])
print('')                    
print("CodonFreq = 2")
print_site_test(site_res['site-cf2'], site_res['site-M8a-cf2'])

CodonFreq = 1
-- M2a vs M1a --
2*dlnL = 0.000, df = 2, LLR P-value = 1.00e+00

M2 (F1x4): lnL: -3412.984; np: 20; kappa: 1.758
    w: p0=0.884, w0=0.130; p1=0.057, w1=1.000; p2=0.060, w2=1.000; 

M1 (F1x4): lnL: -3412.984; np: 18; kappa: 1.758
    w: p0=0.884, w0=0.130; p1=0.116, w1=1.000; 
///

-- M8 vs M7 --
2*dlnL = -0.001, df = 2, LLR P-value = 1.00e+00

M8 (F1x4): lnL: -3390.595; np: 20; kappa: 1.696
    w: p0=1.000, p=1.275, q=6.783; p1=0.000, w=4.777

M7 (F1x4): lnL: -3390.594; np: 18; kappa: 1.696
    w: p=1.275, q=6.783
///

-- M8a vs M8 --
2*dlnL = 1.164, df = 1, LLR P-value = 2.81e-01

M8 (F1x4): lnL: -3390.595; np: 20; kappa: 1.696
    w: p0=1.000, p=1.275, q=6.783; p1=0.000, w=4.777

M8a (F1x4): lnL: -3391.177; np: 19; kappa: 1.705
    w: p0=0.990, p=1.217, q=6.490; p1=0.010, w=1.000
///


CodonFreq = 2
-- M2a vs M1a --
2*dlnL = 0.000, df = 2, LLR P-value = 1.00e+00

M2 (F3x4): lnL: -3351.508; np: 20; kappa: 1.471
    w: p0=0.903, w0=0.058; p1=0.016, w1=1.000; p2=0.081, w2

No evidence was found to support positive selection by any of the tests.

## Branch models

### one ratio (codonfreq = 1)

In [48]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/branch-1r-cf1/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [49]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414.nwk"
cml.out_file = "mlc"

In [50]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1,
    model = 0,
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [51]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [52]:
res = cml.run()

### free ratio (codonfreq = 1)

In [53]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir, creating it on first use so the
# chdir does not fail on a fresh checkout
working_dir = "../output/paml/B8441-OG-part/p1-414/branch-freer-cf1/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [54]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414.nwk"
cml.out_file = "mlc"

In [55]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1,
    clock = 0,
    model = 1,
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [56]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [57]:
res = cml.run()

### one ratio (codonfreq = 2)

In [None]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir, creating it on first use so the
# chdir does not fail on a fresh checkout
working_dir = "../output/paml/B8441-OG-part/p1-414/branch-1r-cf2/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [40]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414.nwk"
cml.out_file = "mlc"

In [84]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2,
    model = 0,
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [85]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [86]:
res = cml.run()

### free ratio (codonfreq = 2)

In [None]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir, creating it on first use so the
# chdir does not fail on a fresh checkout
working_dir = "../output/paml/B8441-OG-part/p1-414/branch-freer-cf2/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [89]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414.nwk"
cml.out_file = "mlc"

In [90]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2,
    clock = 0,
    model = 1,
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [91]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [92]:
res = cml.run()

### Free vs one-omega models test result
First, we reimport all the model results

In [31]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/"
os.chdir(working_dir)
branch_res = {
    '1r-cf1': codeml.read('branch-1r-cf1/mlc'),
    '1r-cf2': codeml.read('branch-1r-cf2/mlc'),
    'freer-cf1': codeml.read('branch-freer-cf1/mlc'),
    'freer-cf2': codeml.read('branch-freer-cf2/mlc')
}

In [33]:
# likelihood ratio test: free ratio (alternative) vs one ratio (null), CodonFreq=1
dl_cf1 = get_ll(branch_res['freer-cf1']) - get_ll(branch_res['1r-cf1'])
df_cf1 = get_np(branch_res['freer-cf1']) - get_np(branch_res['1r-cf1'])
# chi2.sf is 1 - cdf but stays accurate for very small P-values
p_cf1 = chi2.sf(2*dl_cf1, df = df_cf1)
print("Testing one ratio vs free ratio models, with CodonFreq=1")
print(f'df={df_cf1}, 2*∆lnL={2*dl_cf1:.2f}, p={p_cf1:.2e}')

Testing one ratio vs free ratio models, with CodonFreq=1
df=14, 2*∆lnL=55.73, p=6.47e-07


In [35]:
# likelihood ratio test: free ratio (alternative) vs one ratio (null), CodonFreq=2
dl_cf2 = get_ll(branch_res['freer-cf2']) - get_ll(branch_res['1r-cf2'])
df_cf2 = get_np(branch_res['freer-cf2']) - get_np(branch_res['1r-cf2'])
# chi2.sf is 1 - cdf but stays accurate for very small P-values
p_cf2 = chi2.sf(2*dl_cf2, df = df_cf2)
# p_cf2 = 1.427e-02 # calculated with the chi2 program from PAML package
# the label must name the codon frequency setting of the results tested here (cf2)
print("Testing one ratio vs free ratio models, with CodonFreq=2")
print(f'df={df_cf2}, 2*∆lnL={2*dl_cf2:.2f}, p={p_cf2:.2e}')

Testing one ratio vs free ratio models, with CodonFreq=1
df=14, 2*∆lnL=27.99, p=1.43e-02


There is thus strong evidence for non-equal dN/dS along the branches. But this test is almost always significant because the null hypothesis of homogeneous dN/dS across the entire tree is not realistic in most cases.

I plotted the free ratio model estimates on the gene tree using the `ggtree` package in R:
![free ratio estimates](../output/figure/20220727-B8441-OG-p1-414-branch-freer-tree.png)

By comparing the results obtained with CodonFreq=2 or CodonFreq=1, I see that two of the three branches identified as having elevated dN/dS by the F3x4 model were also implicated by the F1x4 model. Given Ziheng's [suggestion](https://groups.google.com/g/pamlsoftware/c/i7-NFSgnhq8/m/80rWE37kBgAJ), I decided to focus on the F3x4 model result and test the three branches together and separately (labeled as ω1, ω2 and ω3, respectively).

My next goal is to separately test whether there is statistical support for these two branches having a dN/dS > 1.

### F1x4, two ratio (w0, w1=w2=w3=w4)

In [46]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/branch-2r_all-cf1/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [41]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414-2r_all-cf1.nwk"
cml.out_file = "mlc"

In [42]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4 codon
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [0], # one site class 
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [43]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [44]:
res = cml.run()

In [47]:
branch_res['2r_all-cf1'] = codeml.read("mlc")
print_branch_test(branch_res['2r_all-cf1'], branch_res['1r-cf1'])

Main result (codon model = F1x4)
    lnL: -3421.85936; np: 18; kappa: 1.583; w: [0.12695, 1.77137]
Null model (codon model = F1x4)
    lnL: -3427.952669; np: 17; kappa: 1.592; w: 0.13685
    2*dlnL = 12.187, df = 1, LLR P-value = 4.81e-04


### F1x4, two ratio constrained (w0, w1=w2=w3=w4=1)

In [48]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/branch-2r_alleq1-cf1/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [33]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414-2r_all-cf1.nwk"
cml.out_file = "mlc"

In [34]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4 codon
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [0], # one site class 
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1,
    omega = 1,
    cleandata = 1,
    fix_blength = 0
)

In [35]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [36]:
res = cml.run()

In [49]:
branch_res['2r_alleq1-cf1'] = codeml.read("mlc")
print_branch_test(branch_res['2r_all-cf1'], branch_res['2r_alleq1-cf1'])

Main result (codon model = F1x4)
    lnL: -3421.85936; np: 18; kappa: 1.583; w: [0.12695, 1.77137]
Null model (codon model = F1x4)
    lnL: -3421.992047; np: 17; kappa: 1.581; w: [0.1292, 1.0]
    2*dlnL = 0.265, df = 1, LLR P-value = 6.06e-01


No evidence of positive selection on the selected branches

### F3x4, two ratio (w0, w1=w2=w3)

In [79]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/branch-2r_all-cf2/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [80]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414-2r_all-cf2.nwk"
cml.out_file = "mlc"

In [81]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4 model
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [0], # one site class 
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [82]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [83]:
res = cml.run()

In [84]:
# branch_res keys carry no 'branch-' prefix ('1r-cf2', 'freer-cf2', '2r_all-cf1', ...);
# looking up 'branch-1r-cf2' would raise KeyError, so use the same key scheme here
branch_res['2r_all-cf2'] = codeml.read("mlc")
print_branch_test(branch_res['2r_all-cf2'], branch_res['1r-cf2'])

Main result (codon model = F3x4)
    lnL: -3360.672152; np: 18; kappa: 1.316; w: [0.0292, 0.00277]
    omega tree: ((Hil7 #0.0291987 , (((Hil1 #0.0291987 , Hil2 #0.0291987 ) #0.0291987 , (Hil4 #0.0291987 , Hil3 #0.0291987 ) #0.0291987 ) #0.0291987 , (Hil8 #0.0291987 , Hil6 #0.0291987 ) #0.00277474 ) #0.00277474 ) #0.00277474 , Hil5 #0.0291987 , OG #0.0291987 );
Null model (codon model = F3x4)
    lnL: -3362.613054; np: 17; kappa: 1.342; w: 0.01825
    2*dlnL = 3.882, df = 1, LLR P-value = 4.88e-02


Note that the foreground lineages in the alternative model, which were selected based on them having significantly elevated dN/dS in the free ratio model, actually had a lower dN/dS estimate than the background. This is difficult to interpret.

## Branch-site model
This model tests selection on a subset of the sites in a subset of lineages (foreground). The alternative model is specified by `Model A: model = 2, NSsites = 2, fix_omega = 0` and the null model is specified by `Model A1: model = 2, NSsites = 2, fix_omega = 1, omega = 1`
### Alternative model, (F1x4)

In [50]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/brnsite-alt-2r_all-cf1/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [39]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414-2r_all-cf1.nwk"
cml.out_file = "mlc"

In [40]:
# Options for the branch-site alternative model (model = 2, NSsites = 2, omega estimated).
# NOTE(review): this cell sits under the "(F1x4)" heading and runs in the
# brnsite-alt-2r_all-cf1 directory, yet CodonFreq = 2 is annotated as F3x4 in
# this file. Confirm which codon frequency model is intended; if it is changed,
# the matching null model run must be changed the same way.
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4 model
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [2], # three classes, constrained, neutral and positive
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [41]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [42]:
res = cml.run()

### Null model (F1x4)

In [43]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/brnsite-null-2r_all-cf1/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [44]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414.nuc"
cml.tree = "../p1-414-2r_all-cf1.nwk"
cml.out_file = "mlc"

In [45]:
# Options for the branch-site null model (model = 2, NSsites = 2, omega fixed at 1).
# NOTE(review): this cell sits under the "(F1x4)" heading and runs in the
# brnsite-null-2r_all-cf1 directory, yet CodonFreq = 2 is annotated as F3x4 in
# this file. Confirm which codon frequency model is intended; if it is changed,
# the matching alternative model run must be changed the same way.
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4 model
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [2], # three classes, constrained, neutral and positive
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1,
    omega = 1,
    cleandata = 1,
    fix_blength = 0
)

In [46]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [47]:
res = cml.run()

### Branch-site model test

In [51]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/"
os.chdir(working_dir)
brnsite_res = {
    'alt-2r_all-cf1': codeml.read('brnsite-alt-2r_all-cf1/mlc'),
    'null-2r_all-cf1': codeml.read('brnsite-null-2r_all-cf1/mlc')
}

In [52]:
dl_bs_cf1 = get_ll(brnsite_res['alt-2r_all-cf1'], m = 2) - get_ll(brnsite_res['null-2r_all-cf1'], m = 2)
df_bs_cf1 = get_np(brnsite_res['alt-2r_all-cf1'], m = 2) - get_np(brnsite_res['null-2r_all-cf1'], m = 2)
p_bs_cf1 = 1 - chi2.cdf(2*dl_bs_cf1, df = df_bs_cf1)
print("Testing branch site model, with CodonFreq=1")
print(f'df={df_bs_cf1}, 2*∆lnL={2*dl_bs_cf1:.2f}, p={p_bs_cf1:.2e}')

Testing branch site model, with CodonFreq=1
df=1, 2*∆lnL=0.16, p=6.89e-01


No evidence of positive selection.

# p697-981

## Site models

### M0,1,2,7,8 (codonfreq=1)

In [150]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/site-cf1/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [151]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [152]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [0,1,2,7,8],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [153]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [154]:
res = cml.run()

### M8a (codonfreq=1)

In [155]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/site-M8a-cf1/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [156]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [157]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [8], # M8a
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1,
    omega = 1,     # fix omega to be 1 as null model
    cleandata = 1,
    fix_blength = 0
)

In [158]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [159]:
res = cml.run()

### M0,1,2,7,8 (codonfreq=2)

In [160]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/site-cf2/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [161]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [162]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [0,1,2,7,8],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [163]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [164]:
res = cml.run()

### M8a (codonfreq=2)

In [165]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/site-M8a-cf2/"
if not os.path.exists(working_dir):
    os.mkdir(working_dir)
os.chdir(working_dir)

In [166]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [167]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [8], # M8a
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1, # fix omega = 1 as null
    omega = 1,
    cleandata = 1,
    fix_blength = 0
)

In [168]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [169]:
res = cml.run()

### Site model test result

In [136]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/"
os.chdir(working_dir)


In [137]:
# read site model results into a dictionary
site_res1 = {
    'cf1': codeml.read('site-cf1/mlc'),
    'cf2': codeml.read('site-cf2/mlc'),
    'M8a-cf1': codeml.read('site-M8a-cf1/mlc'),
    'M8a-cf2': codeml.read('site-M8a-cf2/mlc')
}

In [138]:
# process the data and write the output to a tab-separated (tsv) file
data_rows = []
for t in site_res1.values():
    data_rows += get_site_model_fit(t)
fields = ['codonfreq', 'model', 'npar', 'lnL', 'kappa', 'omega', 'description']
# newline="" hands line-ending control to the csv module, as its
# documentation requires; otherwise extra blank rows can appear on Windows
with open("20220729-p697-981-site-model-summary-table.tsv", "w", newline="") as f:
    write = csv.writer(f, delimiter = "\t")
    write.writerow(fields)
    write.writerows(data_rows)

In [140]:
# perform the three pairs of tests
print("CodonFreq = 1")
print_site_test(site_res1['cf1'], site_res1['M8a-cf1'])
print('')                    
print("CodonFreq = 2")
print_site_test(site_res1['cf2'], site_res1['M8a-cf2'])

CodonFreq = 1
-- M2a vs M1a --
2*dlnL = 0.000, df = 2, LLR P-value = 1.00e+00

M2 (F1x4): lnL: -2342.393; np: 20; kappa: 1.641
    w: p0=0.598, w0=0.077; p1=0.223, w1=1.000; p2=0.179, w2=1.000; 

M1 (F1x4): lnL: -2342.393; np: 18; kappa: 1.641
    w: p0=0.598, w0=0.077; p1=0.402, w1=1.000; 
///

-- M8 vs M7 --
2*dlnL = 9.744, df = 2, LLR P-value = 7.66e-03

M8 (F1x4): lnL: -2310.397; np: 20; kappa: 1.507
    w: p0=0.948, p=0.514, q=1.907; p1=0.052, w=14.926

M7 (F1x4): lnL: -2315.269; np: 18; kappa: 1.448
    w: p=0.442, q=1.319
///

-- M8a vs M8 --
2*dlnL = 7.817, df = 1, LLR P-value = 5.18e-03

M8 (F1x4): lnL: -2310.397; np: 20; kappa: 1.507
    w: p0=0.948, p=0.514, q=1.907; p1=0.052, w=14.926

M8a (F1x4): lnL: -2314.305; np: 19; kappa: 1.449
    w: p0=0.902, p=0.510, q=2.314; p1=0.098, w=1.000
///


CodonFreq = 2
-- M2a vs M1a --
2*dlnL = 4.299, df = 2, LLR P-value = 1.17e-01

M2 (F3x4): lnL: -2326.418; np: 20; kappa: 1.346
    w: p0=0.660, w0=0.064; p1=0.288, w1=1.000; p2=0.052, w

to be filled in

## Branch models

### one ratio (F1x4)

In [173]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for the one-ratio branch model (F1x4) on partition p697-981.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no
# check-then-create race, and missing parent directories are created too.
working_dir = "../output/paml/B8441-OG-part/p697-981/branch-1r-cf1/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [174]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [175]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1,  # F1x4
    model = 0,      # 1 ratio
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [176]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [177]:
res = cml.run()

### free ratio (F1x4)

In [178]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for the free-ratio branch model (F1x4) on partition p697-981.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no
# check-then-create race, and missing parent directories are created too.
working_dir = "../output/paml/B8441-OG-part/p697-981/branch-freer-cf1/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [179]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [180]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1,  # F1x4
    clock = 0,
    model = 1,      # free ratio
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [181]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [182]:
res = cml.run()

### one ratio (F3x4)

In [183]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for the one-ratio branch model (F3x4) on partition p697-981.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no
# check-then-create race, and missing parent directories are created too.
working_dir = "../output/paml/B8441-OG-part/p697-981/branch-1r-cf2/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [184]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [185]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2,  # F3x4
    model = 0,      # one ratio
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [186]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [187]:
res = cml.run()

### free ratio (F3x4)

In [188]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for the free-ratio branch model (F3x4) on partition p697-981.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no
# check-then-create race, and missing parent directories are created too.
working_dir = "../output/paml/B8441-OG-part/p697-981/branch-freer-cf2/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [189]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981.nwk"
cml.out_file = "mlc"

In [190]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2,  # F3x4
    clock = 0,
    model = 1,      # free ratio
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [191]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [192]:
res = cml.run()

### Free vs one-omega models test result
First, we reimport all the model results

In [268]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/"
os.chdir(working_dir)
branch_res = {
    '1r-cf1': codeml.read('branch-1r-cf1/mlc'),
    '1r-cf2': codeml.read('branch-1r-cf2/mlc'),
    'freer-cf1': codeml.read('branch-freer-cf1/mlc'),
    'freer-cf2': codeml.read('branch-freer-cf2/mlc')
}

In [194]:
print_branch_test(branch_res['freer-cf1'], branch_res['1r-cf1'])

Main result (codon model = F1x4)
    lnL: -2391.612283; np: 31; kappa: 1.429; w: [0.23095, 0.00445, 998.99991, 0.14377, 0.07403, 999.0, 92.44628, 511.2296, 0.1592, 0.11967, 0.10617, 957.23845, 0.04349, 0.62278, 0.17973]
Null model (codon model = F1x4)
    lnL: -2411.367957; np: 17; kappa: 1.413; w: 0.18181
    2*dlnL = 39.511, df = 14, LLR P-value = 3.04e-04


In [195]:
print_branch_test(branch_res['freer-cf2'], branch_res['1r-cf2'])

Main result (codon model = F3x4)
    lnL: -2380.28829; np: 31; kappa: 1.194; w: [0.65755, 0.00425, 999.0, 0.12997, 0.01553, 999.0, 0.1662, 999.0, 0.1177, 0.07865, 999.0, 0.11371, 0.0142, 0.28089, 0.02319]
Null model (codon model = F3x4)
    lnL: -2391.8604; np: 17; kappa: 1.443; w: 0.03329
    2*dlnL = 23.144, df = 14, LLR P-value = 5.80e-02


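Under the F1x4 model there is thus strong evidence for non-equal dN/dS along the branches (P = 3.04e-04); under the F3x4 model the evidence is only marginal (P = 5.80e-02). Keep in mind that this test is almost always significant because the null hypothesis of homogeneous dN/dS across the entire tree is unrealistic in most cases.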

I plotted the free ratio model estimates on the gene tree using the `ggtree` package in R:
![free ratio estimates](../output/figure/20220730-B8441-OG-p697-981-branch-freer-tree.png)

For this partition, the two models gave quite similar results.


Comparing the two results, I hypothesize that there has been accelerated evolution in the internal branches 11..12, 11..13, 13..14, 14..15 and 15..16. Considering that the placement of Hil7 in the PF11765 domain tree is uncertain (low BS support), and since the p1-414 tree placed Hil7 as the outgroup for Hil1, 2, 3, 4, 6 and 8, it is conceivable that Hil7 is still the outgroup here and that the foreground branches encompass all the internal branches after the divergence of Hil1, 2, 3, 4, 6 and 8 from Hil5 and Hil7.

### F1x4, two ratio (w0, w1=w2=w3=w4)

In [305]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for the two-ratio branch model (F1x4) on partition p697-981.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no
# check-then-create race, and missing parent directories are created too.
working_dir = "../output/paml/B8441-OG-part/p697-981/branch-2r_all-cf1/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [306]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981-2r_all-cf1.nwk"
cml.out_file = "mlc"

In [307]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4 codon
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [0], # one site class 
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [308]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [309]:
res = cml.run()

In [310]:
# Parse the output of the run above; "mlc" is a relative path, so this cell
# relies on the cwd still being the branch-2r_all-cf1 run directory.
branch_res['2r_all-cf1'] = codeml.read("mlc")
# Compare the two-ratio model (first argument, reported as "Main result")
# against the one-ratio model (second argument, reported as "Null model").
print_branch_test(branch_res['2r_all-cf1'], branch_res['1r-cf1'])

Main result (codon model = F1x4)
    lnL: -2406.811492; np: 18; kappa: 1.407; w: [0.13764, 1.52567]
Null model (codon model = F1x4)
    lnL: -2411.367957; np: 17; kappa: 1.413; w: 0.18181
    2*dlnL = 9.113, df = 1, LLR P-value = 2.54e-03


### F3x4, two ratio (w0, w1=w2=w3=w4)

In [311]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for the two-ratio branch model (F3x4) on partition p697-981.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no
# check-then-create race, and missing parent directories are created too.
working_dir = "../output/paml/B8441-OG-part/p697-981/branch-2r_all-cf2/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [312]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
# following the same foreground group choices as for F1x4 model
cml.tree = "../p697-981-2r_all-cf1.nwk"
cml.out_file = "mlc"

In [313]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4 codon
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [0], # one site class 
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [314]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [315]:
res = cml.run()

In [316]:
# Parse the output of the run above; "mlc" is a relative path, so this cell
# relies on the cwd still being the branch-2r_all-cf2 run directory.
branch_res['2r_all-cf2'] = codeml.read("mlc")
# Compare the two-ratio model (first argument, reported as "Main result")
# against the one-ratio model (second argument, reported as "Null model").
print_branch_test(branch_res['2r_all-cf2'], branch_res['1r-cf2'])

Main result (codon model = F3x4)
    lnL: -2389.812147; np: 18; kappa: 1.412; w: [0.03353, 999.0]
Null model (codon model = F3x4)
    lnL: -2391.8604; np: 17; kappa: 1.443; w: 0.03329
    2*dlnL = 4.097, df = 1, LLR P-value = 4.30e-02


### F1x4, two ratio constrained (w0, w1=w2=w3=w4=1)

In [317]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for the constrained two-ratio branch model (F1x4) on
# partition p697-981. makedirs(exist_ok=True) replaces the exists()/mkdir()
# pair: no check-then-create race, and missing parents are created too.
working_dir = "../output/paml/B8441-OG-part/p697-981/branch-2r_alleq1-cf1/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [318]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981-2r_all-cf1.nwk"
cml.out_file = "mlc"

In [319]:
# Constrained two-ratio branch model (F1x4). The options match the
# unconstrained two-ratio F1x4 run except for fix_omega/omega, which hold
# omega at 1 instead of estimating it (per the section heading this pins the
# foreground ratio: w1=w2=w3=w4=1).
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4 codon
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [0], # one site class
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1, # omega is fixed rather than estimated
    omega = 1,     # the value omega is fixed at
    cleandata = 1,
    fix_blength = 0
)

In [320]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [321]:
res = cml.run()

In [322]:
# Parse the output of the run above; "mlc" is a relative path, so this cell
# relies on the cwd still being the branch-2r_alleq1-cf1 run directory.
branch_res['2r_alleq1-cf1'] = codeml.read("mlc")
# Compare the unconstrained two-ratio model (first argument) against the
# constrained one (second argument, reported as "Null model"). Requires that
# branch_res['2r_all-cf1'] was already stored by the earlier two-ratio cell.
print_branch_test(branch_res['2r_all-cf1'], branch_res['2r_alleq1-cf1'])

Main result (codon model = F1x4)
    lnL: -2406.811492; np: 18; kappa: 1.407; w: [0.13764, 1.52567]
Null model (codon model = F1x4)
    lnL: -2406.843304; np: 17; kappa: 1.406; w: [0.14099, 1.0]
    2*dlnL = 0.064, df = 1, LLR P-value = 8.01e-01


No evidence of positive selection

### F3x4, two ratio constrained (w0, w1=w2=w3=w4=1)

In [323]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for the constrained two-ratio branch model (F3x4) on
# partition p697-981. makedirs(exist_ok=True) replaces the exists()/mkdir()
# pair: no check-then-create race, and missing parents are created too.
working_dir = "../output/paml/B8441-OG-part/p697-981/branch-2r_alleq1-cf2/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [324]:
# Configure the codeml run; all paths are relative to the run directory
# entered in the previous cell.
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
# NOTE(review): the tree file name ends in "-cf1" although this is an F3x4
# (cf2) run. The unconstrained F3x4 two-ratio cell states that it follows the
# same foreground choices as the F1x4 model; presumably the same holds here —
# confirm.
cml.tree = "../p697-981-2r_all-cf1.nwk"
cml.out_file = "mlc"

In [325]:
# Constrained two-ratio branch model (F3x4). The options match the
# unconstrained two-ratio F3x4 run except for fix_omega/omega, which hold
# omega at 1 instead of estimating it (per the section heading this pins the
# foreground ratio: w1=w2=w3=w4=1).
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4 codon
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [0], # one site class
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1, # omega is fixed rather than estimated
    omega = 1,     # the value omega is fixed at
    cleandata = 1,
    fix_blength = 0
)

In [326]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [327]:
res = cml.run()

In [328]:
# Parse the output of the run above; "mlc" is a relative path, so this cell
# relies on the cwd still being the branch-2r_alleq1-cf2 run directory.
branch_res['2r_alleq1-cf2'] = codeml.read("mlc")
# Compare the unconstrained two-ratio model (first argument) against the
# constrained one (second argument, reported as "Null model"). Requires that
# branch_res['2r_all-cf2'] was already stored by the earlier two-ratio cell.
print_branch_test(branch_res['2r_all-cf2'], branch_res['2r_alleq1-cf2'])

Main result (codon model = F3x4)
    lnL: -2389.812147; np: 18; kappa: 1.412; w: [0.03353, 999.0]
Null model (codon model = F3x4)
    lnL: -2389.952596; np: 17; kappa: 1.414; w: [0.03357, 1.0]
    2*dlnL = 0.281, df = 1, LLR P-value = 5.96e-01


No evidence of positive selection

## Branch-site model
This model tests selection on a subset of the sites in a subset of lineages (foreground). The alternative model is specified by `Model A: model = 2, NSsites = 2, fix_omega = 0` and the null model is specified by `Model A1: model = 2, NSsites = 2, fix_omega = 1, omega = 1`

### Alternative model, (F1x4)

In [329]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for the branch-site alternative model (F1x4) on partition
# p697-981. makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no
# check-then-create race, and missing parent directories are created too.
working_dir = "../output/paml/B8441-OG-part/p697-981/brnsite-alt-2r_all-cf1/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [330]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981-2r_all-cf1.nwk"
cml.out_file = "mlc"

In [331]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4 model
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [2], # three classes, constrained, neutral and positive
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [332]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [333]:
res = cml.run()

### Null model (F1x4)

In [334]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for the branch-site null model (F1x4) on partition
# p697-981. makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no
# check-then-create race, and missing parent directories are created too.
working_dir = "../output/paml/B8441-OG-part/p697-981/brnsite-null-2r_all-cf1/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [335]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
cml.tree = "../p697-981-2r_all-cf1.nwk"
cml.out_file = "mlc"

In [336]:
# Branch-site null model (F1x4): model = 2 with NSsites = 2 and omega held
# fixed at 1 (the "Model A1" specification given in the section intro). It
# differs from the alternative-model run only in fix_omega/omega.
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4 model
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [2], # three classes, constrained, neutral and positive
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1, # omega is fixed rather than estimated
    omega = 1,     # the value omega is fixed at
    cleandata = 1,
    fix_blength = 0
)

In [337]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [338]:
res = cml.run()

### Alternative model, (F3x4)

In [141]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for the branch-site alternative model (F3x4) on partition
# p697-981. makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no
# check-then-create race, and missing parent directories are created too.
working_dir = "../output/paml/B8441-OG-part/p697-981/brnsite-alt-2r_all-cf2/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [142]:
# Configure the codeml run; all paths are relative to the run directory
# entered in the previous cell.
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
# NOTE(review): the tree file name ends in "-cf1" although this is an F3x4
# (cf2) run. The F3x4 two-ratio branch cell states that it follows the same
# foreground choices as the F1x4 model; presumably the same holds here —
# confirm.
cml.tree = "../p697-981-2r_all-cf1.nwk"
cml.out_file = "mlc"

In [143]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4 model
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [2], # three classes, constrained, neutral and positive
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [144]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [145]:
res = cml.run()

### Null model (F3x4)

In [146]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for the branch-site null model (F3x4) on partition
# p697-981. makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no
# check-then-create race, and missing parent directories are created too.
working_dir = "../output/paml/B8441-OG-part/p697-981/brnsite-null-2r_all-cf2/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [147]:
# Configure the codeml run; all paths are relative to the run directory
# entered in the previous cell.
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-697-981.nuc"
# NOTE(review): the tree file name ends in "-cf1" although this is an F3x4
# (cf2) run. The F3x4 two-ratio branch cell states that it follows the same
# foreground choices as the F1x4 model; presumably the same holds here —
# confirm.
cml.tree = "../p697-981-2r_all-cf1.nwk"
cml.out_file = "mlc"

In [148]:
# Branch-site null model (F3x4): model = 2 with NSsites = 2 and omega held
# fixed at 1 (the "Model A1" specification given in the section intro). It
# differs from the alternative-model run only in fix_omega/omega.
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4 model
    model = 2,     # 2 or more dN/dS ratios for branches
    NSsites = [2], # three classes, constrained, neutral and positive
    icode = 8,     # yeast alternative nuclear code
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1, # omega is fixed rather than estimated
    omega = 1,     # the value omega is fixed at
    cleandata = 1,
    fix_blength = 0
)

In [149]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [150]:
res = cml.run()

### Branch-site model test

In [151]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/"
os.chdir(working_dir)
brnsite_res = {
    'alt-2r_all-cf1': codeml.read('brnsite-alt-2r_all-cf1/mlc'),
    'null-2r_all-cf1': codeml.read('brnsite-null-2r_all-cf1/mlc'),
    'alt-2r_all-cf2': codeml.read('brnsite-alt-2r_all-cf2/mlc'),
    'null-2r_all-cf2': codeml.read('brnsite-null-2r_all-cf2/mlc')
}

In [152]:
# Likelihood-ratio test of the branch-site alternative vs null model (F1x4).
# Both results are indexed with NSsites model 2 (m = 2).
# difference in log likelihood between alternative and null
dl_bs_cf1 = get_ll(brnsite_res['alt-2r_all-cf1'], m = 2) - get_ll(brnsite_res['null-2r_all-cf1'], m = 2)
# degrees of freedom = difference in the number of parameters
df_bs_cf1 = get_np(brnsite_res['alt-2r_all-cf1'], m = 2) - get_np(brnsite_res['null-2r_all-cf1'], m = 2)
# Upper-tail chi-square probability. chi2.sf computes the survival function
# directly and keeps precision for very small p-values, where 1 - cdf would
# round to 0.
p_bs_cf1 = chi2.sf(2*dl_bs_cf1, df = df_bs_cf1)
print("Testing branch site models, with CodonFreq=1")
print(f'df={df_bs_cf1}, 2*∆lnL={2*dl_bs_cf1:.2f}, p={p_bs_cf1:.2e}')

Testing branch site models, with CodonFreq=1
df=1, 2*∆lnL=11.82, p=5.85e-04


In [153]:
# Likelihood-ratio test of the branch-site alternative vs null model (F3x4).
# Both results are indexed with NSsites model 2 (m = 2).
# difference in log likelihood between alternative and null
dl_bs_cf2 = get_ll(brnsite_res['alt-2r_all-cf2'], m = 2) - get_ll(brnsite_res['null-2r_all-cf2'], m = 2)
# degrees of freedom = difference in the number of parameters
df_bs_cf2 = get_np(brnsite_res['alt-2r_all-cf2'], m = 2) - get_np(brnsite_res['null-2r_all-cf2'], m = 2)
# Upper-tail chi-square probability. chi2.sf computes the survival function
# directly and keeps precision for very small p-values, where 1 - cdf would
# round to 0.
p_bs_cf2 = chi2.sf(2*dl_bs_cf2, df = df_bs_cf2)
print("Testing branch site models, with CodonFreq=2")
print(f'df={df_bs_cf2}, 2*∆lnL={2*dl_bs_cf2:.2f}, p={p_bs_cf2:.2e}')

Testing branch site models, with CodonFreq=2
df=1, 2*∆lnL=8.94, p=2.78e-03


# p1-414+697-981
Joining the first two partitions

## Site models

### M0,1,2,7,8 (codonfreq=1)

In [196]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for site models M0,1,2,7,8 (CodonFreq=1) on the joined
# p1-414_697-981 partition. makedirs(exist_ok=True) replaces the
# exists()/mkdir() pair: no check-then-create race, and missing parent
# directories are created too.
working_dir = "../output/paml/B8441-OG-part/p1-414_697-981/site-cf1/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [197]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414_697-981.nuc"
cml.tree = "../p1-414_697-981.nwk"
cml.out_file = "mlc"

In [198]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [0,1,2,7,8],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [199]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [200]:
res = cml.run()

### M8a (codonfreq=1)

In [201]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for site model M8a (CodonFreq=1) on the joined
# p1-414_697-981 partition. makedirs(exist_ok=True) replaces the
# exists()/mkdir() pair: no check-then-create race, and missing parent
# directories are created too.
working_dir = "../output/paml/B8441-OG-part/p1-414_697-981/site-M8a-cf1/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [202]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414_697-981.nuc"
cml.tree = "../p1-414_697-981.nwk"
cml.out_file = "mlc"

In [203]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1, # F1x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [8], # M8a
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1,
    omega = 1,     # fix omega to be 1 as null model
    cleandata = 1,
    fix_blength = 0
)

In [204]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [205]:
res = cml.run()

### M0,1,2,7,8 (codonfreq=2)

In [206]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for site models M0,1,2,7,8 (CodonFreq=2) on the joined
# p1-414_697-981 partition. makedirs(exist_ok=True) replaces the
# exists()/mkdir() pair: no check-then-create race, and missing parent
# directories are created too.
working_dir = "../output/paml/B8441-OG-part/p1-414_697-981/site-cf2/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [207]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414_697-981.nuc"
cml.tree = "../p1-414_697-981.nwk"
cml.out_file = "mlc"

In [208]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [0,1,2,7,8],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [209]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [210]:
res = cml.run()

### M8a (codonfreq=2)

In [211]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for site model M8a (CodonFreq=2) on the joined
# p1-414_697-981 partition. makedirs(exist_ok=True) replaces the
# exists()/mkdir() pair: no check-then-create race, and missing parent
# directories are created too.
working_dir = "../output/paml/B8441-OG-part/p1-414_697-981/site-M8a-cf2/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [212]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414_697-981.nuc"
cml.tree = "../p1-414_697-981.nwk"
cml.out_file = "mlc"

In [213]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2, # F3x4
    clock = 0,
    model = 0,     # one ratio for all branches
    NSsites = [8], # M8a
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 1, # fix omega = 1 as null
    omega = 1,
    cleandata = 1,
    fix_blength = 0
)

In [214]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [215]:
res = cml.run()

### Site model test result

In [216]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the joined p1-414_697-981 partition's analysis dir; the relative
# `site-*/mlc` paths read in the next cell are resolved against it
working_dir = "../output/paml/B8441-OG-part/p1-414_697-981/"
os.chdir(working_dir)


In [217]:
# Parse the `mlc` output of every site-model run into one dictionary keyed by
# run label; each run lives in a `site-<label>/` sub-directory of the cwd.
site_res1 = {
    run_label: codeml.read(f'site-{run_label}/mlc')
    for run_label in ('cf1', 'cf2', 'M8a-cf1', 'M8a-cf2')
}

In [218]:
# Flatten the per-run site-model summaries into rows and write them to a
# tab-separated file in the current working directory.
data_rows = []
for t in site_res1.values():
    data_rows.extend(get_site_model_fit(t))
fields = ['codonfreq', 'model', 'npar', 'lnL', 'kappa', 'omega', 'description']
# newline="" is required by the csv module so that it controls line endings
# itself; without it some platforms end up with doubled line terminators.
with open("20220730-p1-414_697-981-site-model-summary-table.tsv", "w", newline="") as f:
    write = csv.writer(f, delimiter="\t")
    write.writerow(fields)
    write.writerows(data_rows)

## Branch models

### one ratio (F1x4)

In [219]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for the one-ratio branch model (F1x4) on the joined
# p1-414_697-981 partition. makedirs(exist_ok=True) replaces the
# exists()/mkdir() pair: no check-then-create race, and missing parent
# directories are created too.
working_dir = "../output/paml/B8441-OG-part/p1-414_697-981/branch-1r-cf1/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [241]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414_697-981.nuc"
cml.tree = "../p1-414_697-981.nwk"
cml.out_file = "mlc"

In [242]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1,  # F1x4
    model = 0,      # 1 ratio
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [243]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [244]:
res = cml.run()

### free ratio (F1x4)

In [245]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for the free-ratio branch model (F1x4) on the joined
# p1-414_697-981 partition. makedirs(exist_ok=True) replaces the
# exists()/mkdir() pair: no check-then-create race, and missing parent
# directories are created too.
working_dir = "../output/paml/B8441-OG-part/p1-414_697-981/branch-freer-cf1/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [246]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414_697-981.nuc"
cml.tree = "../p1-414_697-981.nwk"
cml.out_file = "mlc"

In [247]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 1,  # F1x4
    clock = 0,
    model = 1,      # free ratio
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [248]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [249]:
res = cml.run()

### one ratio (F3x4)

In [250]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for the one-ratio branch model (F3x4) on the joined
# p1-414_697-981 partition. makedirs(exist_ok=True) replaces the
# exists()/mkdir() pair: no check-then-create race, and missing parent
# directories are created too.
working_dir = "../output/paml/B8441-OG-part/p1-414_697-981/branch-1r-cf2/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [251]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414_697-981.nuc"
cml.tree = "../p1-414_697-981.nwk"
cml.out_file = "mlc"

In [252]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2,  # F3x4
    model = 0,      # one ratio
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [253]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [254]:
res = cml.run()

### free ratio (F3x4)

In [255]:
# Start from the script dir so this cell can be run out of order.
os.chdir(script_dir)
# Run directory for the free-ratio branch model (F3x4) on the joined
# p1-414_697-981 partition. makedirs(exist_ok=True) replaces the
# exists()/mkdir() pair: no check-then-create race, and missing parent
# directories are created too.
working_dir = "../output/paml/B8441-OG-part/p1-414_697-981/branch-freer-cf2/"
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)

In [256]:
cml = codeml.Codeml()
cml.working_dir = "./"
cml.alignment = "../B8441-OG-Hil-PF11765-1-414_697-981.nuc"
cml.tree = "../p1-414_697-981.nwk"
cml.out_file = "mlc"

In [257]:
cml.set_options(
    noisy = 2,
    verbose = 0,
    seqtype = 1,
    CodonFreq = 2,  # F3x4
    clock = 0,
    model = 1,      # free ratio
    NSsites = [0],
    icode = 8,
    fix_kappa = 0,
    kappa = 2,
    fix_omega = 0,
    omega = .4,
    cleandata = 1,
    fix_blength = 0
)

In [258]:
cml.ctl_file = "codeml.ctl"
cml.write_ctl_file()

In [259]:
res = cml.run()

### Free vs one-omega models test result
First, we reimport all the model results

In [260]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the first segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414_697-981/"
os.chdir(working_dir)
branch_res = {
    '1r-cf1': codeml.read('branch-1r-cf1/mlc'),
    '1r-cf2': codeml.read('branch-1r-cf2/mlc'),
    'freer-cf1': codeml.read('branch-freer-cf1/mlc'),
    'freer-cf2': codeml.read('branch-freer-cf2/mlc')
}

In [261]:
print_branch_test(branch_res['freer-cf1'], branch_res['1r-cf1'])

Main result (codon model = F1x4)
    lnL: -5836.118516; np: 31; kappa: 1.566; w: [0.12668, 0.78876, 999.0, 0.44644, 0.14999, 0.0555, 240.40584, 0.15117, 0.0878, 0.39503, 999.0, 0.15505, 0.11936, 0.63334, 0.07837]
Null model (codon model = F1x4)
    lnL: -5877.793871; np: 17; kappa: 1.517; w: 0.15386
    2*dlnL = 83.351, df = 14, LLR P-value = 6.73e-12


In [262]:
print_branch_test(branch_res['freer-cf2'], branch_res['1r-cf2'])

Main result (codon model = F3x4)
    lnL: -5778.514525; np: 31; kappa: 1.361; w: [0.00937, 0.00514, 0.00466, 0.01248, 0.03504, 0.09473, 35.69055, 0.00986, 0.04317, 0.11246, 0.00796, 0.01923, 0.01071, 0.17742, 0.01633]
Null model (codon model = F3x4)
    lnL: -5786.491306; np: 17; kappa: 1.394; w: 0.01933
    2*dlnL = 15.954, df = 14, LLR P-value = 3.16e-01


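Under the F1x4 model there is thus strong evidence for non-equal dN/dS along the branches (P = 6.73e-12), whereas under the F3x4 model the test is not significant (P = 3.16e-01). Keep in mind that this test is almost always significant because the null hypothesis of homogeneous dN/dS across the entire tree is unrealistic in most cases.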

I plotted the free ratio model estimates on the gene tree using the `ggtree` package in R:
![free ratio estimates](../output/figure/20220727-B8441-OG-p1-414-branch-freer-tree.png)

By comparing the results obtained with CodonFreq=2 or CodonFreq=1, I see that two of the three branches identified as having elevated dN/dS by the F3x4 model were also implicated by the F1x4 model. Given Ziheng's [suggestion](https://groups.google.com/g/pamlsoftware/c/i7-NFSgnhq8/m/80rWE37kBgAJ), I decided to focus on the F3x4 model result and test the three branches together and separately (labeled as ω1, ω2 and ω3, respectively).

My next goal is to separately test whether there is statistical support for these two branches having a dN/dS > 1.