# Log likelihood ratio tests for codon evolution of the PF11765 domain in _C. auris_

---
- author: Bin He
- date: 2022-08-02


## Setup environments

In [2]:
from Bio.Phylo.PAML import codeml
from scipy.stats import chi2
import os
import csv

In [3]:
# Remember the directory the notebook was started from. Later cells chdir
# back here before entering an analysis folder, so they can be re-run in
# any order.
script_dir = os.path.abspath(os.curdir)
print(script_dir)

/Users/bhe2/Documents/work/current/C037-Cand-auris-adhesin/02-case-studies/09-natural-selection/analysis


## Helper functions

In [4]:
def get_ll(res, m = 0):
    """
    Look up the log likelihood (lnL) stored for one NSsites model.

    res: result object, indexed as res['NSsites'][m]['lnL']
    m:   NSsites model number; defaults to 0
    """
    model_fit = res['NSsites'][m]
    return model_fit['lnL']

def get_np(res, m = 0):
    """
    Count the parameters listed for one NSsites model.

    The count is the number of whitespace-separated entries in the string
    stored at res['NSsites'][m]['parameters']['parameter list'].

    res: result object holding the NSsites model fits
    m:   NSsites model number; defaults to 0
    """
    par = res['NSsites'][m]['parameters']['parameter list']
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so repeated spaces are not counted as extra
    # (empty) parameters and an empty list counts as zero
    return len(par.split())

In [73]:
# extract the model fit from NSsite model objects
def get_omega(fit, m):
    """
    Summarise the omega-related parameters of one NSsites model fit.

    fit: the entry for a single NSsites model; its 'parameters' item is read
    m:   NSsites model number (0, 1, 2, 7 and 8 are formatted; any other
         number yields an empty summary)

    Returns [label, summary], where label is the model name ('M0', 'M1a',
    'M2a', 'M7', 'M8' or 'M8a') and summary is a formatted string.
    """
    par = fit.get('parameters')
    label = f'M{m}'
    summary = ''
    if m == 0:
        # single omega for all sites
        summary = f'w={par["omega"]:.3f}'
    elif m in (1, 2):
        # reported as M1a / M2a; one "p, w" pair per site class
        label += 'a'
        classes = par.get('site classes')
        summary = ''.join(
            f'p{idx}={cls["proportion"]:.3f}, w{idx}={float(cls["omega"]):.3f}; '
            for idx, cls in classes.items()
        )
    elif m == 7:
        summary = f'p={par["p"]:.3f}, q={par["q"]:.3f}'
    elif m == 8:
        summary = (
            f'p0={par["p0"]:.3f}, p={par["p"]:.3f}, q={par["q"]:.3f}; '
            f'p1={1-par["p0"]:.3f}, w={par["w"]:.3f}'
        )
        # an M8 fit whose extra omega is exactly 1 is labelled as M8a
        if par['w'] == 1.0:
            label = 'M8a'
    return [label, summary]
   

In [36]:
def sig_code(p):
    """
    Map a P-value to a significance code in the style of R's anova():
    '***' below 0.001, '**' below 0.01, '*' below 0.05, '.' below 0.1,
    and a single space otherwise.
    """
    # thresholds are checked from most to least stringent; first match wins
    cutoffs = ((0.001, '***'), (0.01, '**'), (0.05, '*'), (0.1, '.'))
    for bound, mark in cutoffs:
        if p < bound:
            return mark
    return ' '

In [32]:
def site_test(res1, m1, m0, res0 = None):
    """
    Run a log likelihood ratio test between two NSsites models and print it.

    res1: result object holding the alternative model
    m1:   NSsites model number of the alternative model within res1
    m0:   NSsites model number of the null model
    res0: result object holding the null model. By default both models are
          presumed to be in the same object (res1); pass a second object
          here when the null comes from a separate run, e.g. when comparing
          M8 with M8a.

    Prints the test statistic, degrees of freedom, P-value with its
    significance code, and a parameter summary for each model.
    """
    if res0 is None:
        res0 = res1
    # get the lnL and npar for both models
    lnL1 = get_ll(res1, m1)
    npar1 = get_np(res1, m1)
    lnL0 = get_ll(res0, m0)
    npar0 = get_np(res0, m0)
    dlnL = lnL1 - lnL0
    dnpar = npar1 - npar0
    # use the survival function rather than 1 - cdf: the subtraction rounds
    # to exactly 0 once the P-value drops below machine precision
    llr_P = chi2.sf(2*dlnL, df = dnpar)
    # get the significance code
    sig = sig_code(llr_P)
    # get the parameter lists for both models
    res1_fit = res1["NSsites"][m1]
    res1_par = res1_fit["parameters"]
    res0_fit = res0["NSsites"][m0]
    res0_par = res0_fit["parameters"]
    M1, omega1 = get_omega(res1_fit, m1)
    M0, omega0 = get_omega(res0_fit, m0)
    # print test result
    print(f'-- {M1} vs {M0} --')
    print(f'2*dlnL = {2*dlnL:.3f}, df = {dnpar}, LLR P-value = {llr_P:.2e} {sig}')
    print('')
    print(
        f'{M1} ({res1["codon model"]}): lnL: {lnL1:.3f}; np: {npar1}; kappa: {res1_par["kappa"]:.3f}\n'
        f'    w: {omega1}'
    )
    print('')
    print(
        f'{M0} ({res0["codon model"]}): lnL: {lnL0:.3f}; np: {npar0}; kappa: {res0_par["kappa"]:.3f}\n'
        f'    w: {omega0}'
    )
    print("///\n")
    

In [20]:
# Test M2a vs M1a, M8 vs M7 and M8a vs M8
def print_site_test(res, resM8a):
    """
    Print the three site-model log likelihood ratio tests in sequence:
    M2a vs M1a, M8 vs M7, and M8 vs M8a, followed by the legend for the
    significance codes.

    res:    result object holding the M1a, M2a, M7 and M8 fits
    resM8a: result object from the separate M8a run, used as the null in
            the last comparison
    """
    # (alternative model, null model, object holding the null; None = same as res)
    comparisons = (
        (2, 1, None),
        (8, 7, None),
        (8, 8, resM8a),
    )
    for alt_m, null_m, null_res in comparisons:
        site_test(res, alt_m, null_m, null_res)
    print("---")
    print("Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1")

In [81]:
# print branch test results
def print_branch_test(res1, res0, m = 0):
    """
    Print the key branch-model results and a log likelihood ratio test.

    For each model the codon model, model description, lnL, number of
    parameters, kappa and omega are printed, followed by the test comparing
    the alternative (res1) with the null (res0) and the legend for the
    significance codes.

    res1: result object for the alternative model
    res0: result object for the null model
    m:    NSsites model number read from both objects; defaults to 0
    """
    # get the model names
    model1 = res1['model']
    model0 = res0['model']
    # get the lnL and npar for both models
    lnL1 = get_ll(res1, m)
    npar1 = get_np(res1, m)
    lnL0 = get_ll(res0, m)
    npar0 = get_np(res0, m)
    dlnL = lnL1 - lnL0
    dnpar = npar1 - npar0
    # use the survival function rather than 1 - cdf: the subtraction rounds
    # to exactly 0 once the P-value drops below machine precision
    llr_P = chi2.sf(2*dlnL, df = dnpar)
    sig = sig_code(llr_P) # get the significance code
    # get the parameter lists for both models
    res1_fit = res1["NSsites"][m]
    res1_par = res1_fit["parameters"]
    res0_fit = res0["NSsites"][m]
    res0_par = res0_fit["parameters"]
    # print useful model output
    print(
        f'H1 ({res1["codon model"]}): {model1}\n'
        f'    lnL: {lnL1}; np: {npar1}; kappa: {res1_par["kappa"]:.3f}; w: {res1_par["omega"]}'
    )
    # omega tree: {res1_fit["omega tree"]}')
    print(
        f'H0 ({res0["codon model"]}): {model0}\n'
        f'    lnL: {lnL0}; np: {npar0}; kappa: {res0_par["kappa"]:.3f}; w: {res0_par["omega"]}'
    )
    print(
        f'LLR test\n'
        f'    2*dlnL = {2*dlnL:.3f}, df = {dnpar}, LLR P-value = {llr_P:.2e} {sig}'
    )
    print("---")
    print("Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1")

## p1-414
### Site model tests

In [10]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
# (working_dir below is relative, so it only resolves correctly from there)
os.chdir(script_dir)
# change to the first segment (p1-414) analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/"
os.chdir(working_dir)


In [11]:
# read site model results into a dictionary keyed by run label;
# the output of each run is read from site-<label>/mlc
site_res = {
    run: codeml.read(f'site-{run}/mlc')
    for run in ('cf1', 'cf2', 'M8a-cf1', 'M8a-cf2')
}

- F1x4

In [59]:
print_site_test(site_res['cf1'], site_res['M8a-cf1'])

-- M2a vs M1a --
2*dlnL = 0.000, df = 2, LLR P-value = 1.00e+00  

M2a (F1x4): lnL: -3412.984; np: 20; kappa: 1.758
    w: p0=0.884, w0=0.130; p1=0.057, w1=1.000; p2=0.060, w2=1.000; 

M1a (F1x4): lnL: -3412.984; np: 18; kappa: 1.758
    w: p0=0.884, w0=0.130; p1=0.116, w1=1.000; 
///

-- M8 vs M7 --
2*dlnL = -0.001, df = 2, LLR P-value = 1.00e+00  

M8 (F1x4): lnL: -3390.595; np: 20; kappa: 1.696
    w: p0=1.000, p=1.275, q=6.783; p1=0.000, w=4.777

M7 (F1x4): lnL: -3390.594; np: 18; kappa: 1.696
    w: p=1.275, q=6.783
///

-- M8 vs M8a --
2*dlnL = 1.164, df = 1, LLR P-value = 2.81e-01  

M8 (F1x4): lnL: -3390.595; np: 20; kappa: 1.696
    w: p0=1.000, p=1.275, q=6.783; p1=0.000, w=4.777

M8a (F1x4): lnL: -3391.177; np: 19; kappa: 1.705
    w: p0=0.990, p=1.217, q=6.490; p1=0.010, w=1.000
///

---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


- F3x4

In [22]:
print_site_test(site_res['cf2'], site_res['M8a-cf2'])

-- M2a vs M1a --
2*dlnL = 0.000, df = 2, LLR P-value = 1.00e+00  

M2a (F3x4): lnL: -3351.508; np: 20; kappa: 1.471
    w: p0=0.903, w0=0.058; p1=0.016, w1=1.000; p2=0.081, w2=1.000; 

M1a (F3x4): lnL: -3351.508; np: 18; kappa: 1.471
    w: p0=0.903, w0=0.058; p1=0.097, w1=1.000; 
///

-- M8 vs M7 --
2*dlnL = -0.000, df = 2, LLR P-value = 1.00e+00  

M8 (F3x4): lnL: -3316.619; np: 20; kappa: 1.392
    w: p0=1.000, p=1.120, q=36.018; p1=0.000, w=35.572

M7 (F3x4): lnL: -3316.618; np: 18; kappa: 1.392
    w: p=1.120, q=36.008
///

-- M8 vs M8a --
2*dlnL = 2.400, df = 1, LLR P-value = 1.21e-01  

M8 (F3x4): lnL: -3316.619; np: 20; kappa: 1.392
    w: p0=1.000, p=1.120, q=36.018; p1=0.000, w=35.572

M8a (F3x4): lnL: -3317.819; np: 19; kappa: 1.421
    w: p0=0.991, p=1.087, q=32.633; p1=0.009, w=1.000
///

---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


### Branch model tests

In [24]:
# go back to the script dir first so that this chunk can be run out of order
os.chdir(script_dir)
# then enter the first segment (p1-414) analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/"
os.chdir(working_dir)
# read branch model results, keyed by run label (read from branch-<label>/mlc)
branch_res = {
    run: codeml.read(f'branch-{run}/mlc')
    for run in (
        '1r-cf1', '1r-cf2',
        'freer-cf1', 'freer-cf2',
        '2r_all-cf1', '2r_all-cf2',
        '2r_alleq1-cf1',
    )
}

#### Free vs one-omega models

- F1x4

In [83]:
print_branch_test(branch_res['freer-cf1'], branch_res['1r-cf1'])

H1 (F1x4): free dN/dS Ratios for branches for branches, 
    lnL: -3400.08549; np: 31; kappa: 1.653; w: [0.09547, 1.01075, 999.0, 597.86035, 0.29957, 0.03466, 0.10976, 0.22273, 0.26868, 0.13454, 700.72088, 0.08481, 0.18013, 999.0, 0.07511]
H0 (F1x4): One dN/dS ratio for branches, 
    lnL: -3427.952669; np: 17; kappa: 1.592; w: 0.13685
LLR test
    2*dlnL = 55.734, df = 14, LLR P-value = 6.47e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


- F3x4

In [84]:
print_branch_test(branch_res['freer-cf2'], branch_res['1r-cf2'])

H1 (F3x4): free dN/dS Ratios for branches for branches, 
    lnL: -3348.618799; np: 31; kappa: 1.278; w: [390.96881, 0.00822, 307.96948, 0.00156, 0.00657, 0.02765, 0.043, 0.00589, 0.11662, 0.13653, 999.0, 0.0092, 0.07179, 0.17532, 0.04439]
H0 (F3x4): One dN/dS ratio for branches, 
    lnL: -3362.613054; np: 17; kappa: 1.342; w: 0.01825
LLR test
    2*dlnL = 27.989, df = 14, LLR P-value = 1.43e-02 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


there is thus strong evidence for non-equal dN/dS along the branches. But this test is almost always significant because the null hypothesis of homogeneous dN/dS across the entire tree is not realistic under most cases.

I plotted the free ratio model estimates on the gene tree using the `ggtree` package in R:
![free ratio estimates](../output/figure/20220727-B8441-OG-p1-414-branch-freer-tree.png)

By comparing the results obtained with CodonFreq=2 or CodonFreq=1, I see that two of the three branches identified as having elevated dN/dS by the F3x4 model were also implicated by the F1x4 model. Ziheng [suggested](https://groups.google.com/g/pamlsoftware/c/i7-NFSgnhq8/m/80rWE37kBgAJ) using F3x4 as the default model. I decided to test both F1x4 and F3x4 model results.

My next goal is to separately test whether there is statistical support for these two branches having a dN/dS > 1.

#### two ratio vs one ratio
Here we are testing the alternative model with one foreground group vs the null model of equal omega across the tree. The foreground groups were designated based on the free ratio model estimates (see above, Hil7 branch in F1x4 pic is not considered a foreground, as its omega is closer to the background than to the four designated as the foreground).

- F1x4

In [85]:
print_branch_test(branch_res['2r_all-cf1'], branch_res['1r-cf1'])

H1 (F1x4): several dN/dS ratios for branches for branches, 
    lnL: -3421.85936; np: 18; kappa: 1.583; w: [0.12695, 1.77137]
H0 (F1x4): One dN/dS ratio for branches, 
    lnL: -3427.952669; np: 17; kappa: 1.592; w: 0.13685
LLR test
    2*dlnL = 12.187, df = 1, LLR P-value = 4.81e-04 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


The result provided support for the foreground groups having a different omega than the background. Next we will test for evidence of the foreground omega being significantly greater than 1. We do so by comparing the same two ratio model result with a different null, where the same foreground groups are allowed to have a different omega than the background but, at the same time, the foreground omega is fixed at 1 (neutral). This test directly tests for evidence of positive selection.

In [86]:
print_branch_test(branch_res['2r_all-cf1'], branch_res['2r_alleq1-cf1'])

H1 (F1x4): several dN/dS ratios for branches for branches, 
    lnL: -3421.85936; np: 18; kappa: 1.583; w: [0.12695, 1.77137]
H0 (F1x4): several dN/dS ratios for branches for branches,  omega = 1.000 fixed
    lnL: -3421.992047; np: 17; kappa: 1.581; w: [0.1292, 1.0]
LLR test
    2*dlnL = 0.265, df = 1, LLR P-value = 6.06e-01  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


Here the result is not significant. Therefore, while we have evidence for the foreground groups having significantly elevated dN/dS compared with the background groups, we cannot determine if that elevated dN/dS is all due to relaxed constraint or if there has been positive selection.

- F3x4

In [87]:
print_branch_test(branch_res['2r_all-cf2'], branch_res['1r-cf2'])

H1 (F3x4): several dN/dS ratios for branches for branches, 
    lnL: -3360.672152; np: 18; kappa: 1.316; w: [0.0292, 0.00277]
H0 (F3x4): One dN/dS ratio for branches, 
    lnL: -3362.613054; np: 17; kappa: 1.342; w: 0.01825
LLR test
    2*dlnL = 3.882, df = 1, LLR P-value = 4.88e-02 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


The P-value is smaller than 0.05, but the significance level is lower than F1x4. Moreover, the omega for the foreground was surprisingly estimated to be lower than that for the background. Given this result, I decided not to test for positive selection as I did above.

### Branch site test
Formally known as the **branch-site test for positive selection**, or as "test 2" in Zhang _et al._ 2005 (PMID: 16107592), it contrasts two models: the alternative model, specified by `model = 2; NSsites = 2`, is compared to the null model, which is a modified version of the previous one with `w2 = 1` fixed (`fix_omega = 1; omega = 1`). In the alternative model, both the foreground and background branches have the constrained and neutral classes, i.e., `0<w<1` and `w=1`. What's different between them is that the foreground branches have a third class, with w > 1. At those sites, the corresponding positions in the background branches would belong to either the constrained or neutral class. In the null model, `w2 = 1` is fixed and thus the LLR test specifically tests for evidence of positive selection on a **subset of the sites** and **only along the foreground branches**.

In [93]:
# go back to the script dir first so that this chunk can be run out of order
os.chdir(script_dir)
# then enter the first segment (p1-414) analysis dir
working_dir = "../output/paml/B8441-OG-part/p1-414/"
os.chdir(working_dir)
# read branch-site model results, keyed by run label (read from brnsite-<label>/mlc)
brnsite_res = {
    run: codeml.read(f'brnsite-{run}/mlc')
    for run in ('alt-2r_all-cf1', 'null-2r_all-cf1')
}

In [94]:
# branch-site log likelihood ratio test with CodonFreq=1: alternative vs null run,
# both read at NSsites = 2
dl_bs_cf1 = get_ll(brnsite_res['alt-2r_all-cf1'], m = 2) - get_ll(brnsite_res['null-2r_all-cf1'], m = 2)
df_bs_cf1 = get_np(brnsite_res['alt-2r_all-cf1'], m = 2) - get_np(brnsite_res['null-2r_all-cf1'], m = 2)
# survival function instead of 1 - cdf, which rounds to 0 for very small P-values
p_bs_cf1 = chi2.sf(2*dl_bs_cf1, df = df_bs_cf1)
sig_bs_cf1 = sig_code(p_bs_cf1)
print("Testing branch site model, with CodonFreq=1")
print(f'df={df_bs_cf1}, 2*∆lnL={2*dl_bs_cf1:.2f}, p={p_bs_cf1:.2e} {sig_bs_cf1}')

Testing branch site model, with CodonFreq=1
df=1, 2*∆lnL=0.16, p=6.89e-01  


No evidence for positive selection.

I didn't perform the branch-site test with F3x4 because the 2r vs 1r test above showed an unexpected result (the omega estimate for the foreground is smaller than that for the background).

### Conclusion for p1-414
- No evidence for positive selection from the site test
- **Evidence for dN/dS rate heterogeneity along the branches**.
    - Statistical support for the foreground branches having a higher dN/dS ratio than the background when using the F1x4 model.
    - The test for positive selection (foreground dN/dS > 1) is not significant.
    - The same test with F3x4 model gave an unexpected result, where the foreground dN/dS estimate is lower than that for the background, even though in the free ratio model these branches were identified as having very large dN/dS
- Branch-site test using F1x4 didn't provide statistical support for positive selection on the foreground branches.

## p697-981
### Site model tests

In [95]:
# to allow for this code chunk to be run out of order, first switch back to the script dir
os.chdir(script_dir)
# change to the p697-981 segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/"
os.chdir(working_dir)


In [96]:
# read site model results into a dictionary keyed by run label;
# the output of each run is read from site-<label>/mlc
site_res = {
    run: codeml.read(f'site-{run}/mlc')
    for run in ('cf1', 'cf2', 'M8a-cf1', 'M8a-cf2')
}

- F1x4

In [97]:
print_site_test(site_res['cf1'], site_res['M8a-cf1'])

-- M2a vs M1a --
2*dlnL = 0.000, df = 2, LLR P-value = 1.00e+00  

M2a (F1x4): lnL: -2342.393; np: 20; kappa: 1.641
    w: p0=0.598, w0=0.077; p1=0.223, w1=1.000; p2=0.179, w2=1.000; 

M1a (F1x4): lnL: -2342.393; np: 18; kappa: 1.641
    w: p0=0.598, w0=0.077; p1=0.402, w1=1.000; 
///

-- M8 vs M7 --
2*dlnL = 9.744, df = 2, LLR P-value = 7.66e-03 **

M8 (F1x4): lnL: -2310.397; np: 20; kappa: 1.507
    w: p0=0.948, p=0.514, q=1.907; p1=0.052, w=14.926

M7 (F1x4): lnL: -2315.269; np: 18; kappa: 1.448
    w: p=0.442, q=1.319
///

-- M8 vs M8a --
2*dlnL = 7.817, df = 1, LLR P-value = 5.18e-03 **

M8 (F1x4): lnL: -2310.397; np: 20; kappa: 1.507
    w: p0=0.948, p=0.514, q=1.907; p1=0.052, w=14.926

M8a (F1x4): lnL: -2314.305; np: 19; kappa: 1.449
    w: p0=0.902, p=0.510, q=2.314; p1=0.098, w=1.000
///

---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


- F3x4

In [98]:
print_site_test(site_res['cf2'], site_res['M8a-cf2'])

-- M2a vs M1a --
2*dlnL = 4.299, df = 2, LLR P-value = 1.17e-01  

M2a (F3x4): lnL: -2326.418; np: 20; kappa: 1.346
    w: p0=0.660, w0=0.064; p1=0.288, w1=1.000; p2=0.052, w2=20.134; 

M1a (F3x4): lnL: -2328.567; np: 18; kappa: 1.277
    w: p0=0.676, w0=0.060; p1=0.324, w1=1.000; 
///

-- M8 vs M7 --
2*dlnL = 7.283, df = 2, LLR P-value = 2.62e-02 *

M8 (F3x4): lnL: -2282.279; np: 20; kappa: 1.326
    w: p0=0.939, p=0.518, q=5.868; p1=0.061, w=12.560

M7 (F3x4): lnL: -2285.920; np: 18; kappa: 1.517
    w: p=0.429, q=5.083
///

-- M8 vs M8a --
2*dlnL = 1.140, df = 1, LLR P-value = 2.86e-01  

M8 (F3x4): lnL: -2282.279; np: 20; kappa: 1.326
    w: p0=0.939, p=0.518, q=5.868; p1=0.061, w=12.560

M8a (F3x4): lnL: -2282.849; np: 19; kappa: 1.460
    w: p0=0.938, p=0.472, q=7.285; p1=0.062, w=1.000
///

---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


### Branch model tests

In [108]:
# go back to the script dir first so that this chunk can be run out of order
os.chdir(script_dir)
# then enter the p697-981 segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/"
os.chdir(working_dir)
# read branch model results, keyed by run label (read from branch-<label>/mlc)
branch_res = {
    run: codeml.read(f'branch-{run}/mlc')
    for run in (
        '1r-cf1', '1r-cf2',
        'freer-cf1', 'freer-cf2',
        '2r_all-cf1', '2r_all-cf2',
        '2r_alleq1-cf1', '2r_alleq1-cf2',
    )
}

#### Free vs one-omega models

- F1x4

In [100]:
print_branch_test(branch_res['freer-cf1'], branch_res['1r-cf1'])

H1 (F1x4): free dN/dS Ratios for branches for branches, 
    lnL: -2391.612283; np: 31; kappa: 1.429; w: [0.23095, 0.00445, 998.99991, 0.14377, 0.07403, 999.0, 92.44628, 511.2296, 0.1592, 0.11967, 0.10617, 957.23845, 0.04349, 0.62278, 0.17973]
H0 (F1x4): One dN/dS ratio for branches, 
    lnL: -2411.367957; np: 17; kappa: 1.413; w: 0.18181
LLR test
    2*dlnL = 39.511, df = 14, LLR P-value = 3.04e-04 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


- F3x4

In [101]:
print_branch_test(branch_res['freer-cf2'], branch_res['1r-cf2'])

H1 (F3x4): free dN/dS Ratios for branches for branches, 
    lnL: -2380.28829; np: 31; kappa: 1.194; w: [0.65755, 0.00425, 999.0, 0.12997, 0.01553, 999.0, 0.1662, 999.0, 0.1177, 0.07865, 999.0, 0.11371, 0.0142, 0.28089, 0.02319]
H0 (F3x4): One dN/dS ratio for branches, 
    lnL: -2391.8604; np: 17; kappa: 1.443; w: 0.03329
LLR test
    2*dlnL = 23.144, df = 14, LLR P-value = 5.80e-02 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


There is thus evidence for non-homogeneous dN/dS along the branches, strong under F1x4 (P = 3.04e-04) but only marginal under F3x4 (P = 0.058). Note that this test is almost always significant because the null hypothesis of homogeneous dN/dS across the entire tree is not realistic in most cases.

I plotted the free ratio model estimates on the gene tree using the `ggtree` package in R:
![free ratio estimates](../output/figure/20220730-B8441-OG-p697-981-branch-freer-tree.png)

For this partition, the two models gave quite similar results.


Comparing the two results, I hypothesize that there has been accelerated evolution in the internal branches **11..12, 11..13, 13..14, 14..15 and 15..16**. Considering that the placement of Hil7 in the PF11765 domain tree is uncertain (low BS support), and since the p1-414 tree placed Hil7 as the outgroup for Hil1, 2, 3, 4, 6 and 8, it is conceivable that Hil7 is still the outgroup here and that the foreground branches encompass all the internal branches after the divergence of Hil1, 2, 3, 4, 6 and 8 from Hil5 and Hil7.

#### two ratio vs one ratio
Here we are testing the alternative model with one foreground group vs the null model of equal omega across the tree. The foreground groups were designated based on the free ratio model estimates from both the F1x4 and F3x4 results: we added 13..14 to the foreground on top of the four branches identified in the F3x4 model.

- F1x4

In [102]:
print_branch_test(branch_res['2r_all-cf1'], branch_res['1r-cf1'])

H1 (F1x4): several dN/dS ratios for branches for branches, 
    lnL: -2406.811492; np: 18; kappa: 1.407; w: [0.13764, 1.52567]
H0 (F1x4): One dN/dS ratio for branches, 
    lnL: -2411.367957; np: 17; kappa: 1.413; w: 0.18181
LLR test
    2*dlnL = 9.113, df = 1, LLR P-value = 2.54e-03 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


The result provided support for the foreground groups having a different omega than the background. Next we will test for evidence of the foreground omega being significantly greater than 1. We do so by comparing the same two ratio model result with a different null, where the same foreground groups are allowed to have a different omega than the background but, at the same time, the foreground omega is fixed at 1 (neutral). This test directly tests for evidence of positive selection.

In [103]:
print_branch_test(branch_res['2r_all-cf1'], branch_res['2r_alleq1-cf1'])

H1 (F1x4): several dN/dS ratios for branches for branches, 
    lnL: -2406.811492; np: 18; kappa: 1.407; w: [0.13764, 1.52567]
H0 (F1x4): several dN/dS ratios for branches for branches,  omega = 1.000 fixed
    lnL: -2406.843304; np: 17; kappa: 1.406; w: [0.14099, 1.0]
LLR test
    2*dlnL = 0.064, df = 1, LLR P-value = 8.01e-01  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


Here the result is not significant. Therefore, while we have evidence for the foreground groups having significantly elevated dN/dS compared with the background groups, we cannot determine if that elevated dN/dS is all due to relaxed constraint or if there has been positive selection.

- F3x4

In [104]:
print_branch_test(branch_res['2r_all-cf2'], branch_res['1r-cf2'])

H1 (F3x4): several dN/dS ratios for branches for branches, 
    lnL: -2389.812147; np: 18; kappa: 1.412; w: [0.03353, 999.0]
H0 (F3x4): One dN/dS ratio for branches, 
    lnL: -2391.8604; np: 17; kappa: 1.443; w: 0.03329
LLR test
    2*dlnL = 4.097, df = 1, LLR P-value = 4.30e-02 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


The P-value is just below 0.05, providing modest support for rate heterogeneity.

In [110]:
print_branch_test(branch_res['2r_all-cf2'], branch_res['2r_alleq1-cf2'])

H1 (F3x4): several dN/dS ratios for branches for branches, 
    lnL: -2389.812147; np: 18; kappa: 1.412; w: [0.03353, 999.0]
H0 (F3x4): several dN/dS ratios for branches for branches,  omega = 1.000 fixed
    lnL: -2389.952596; np: 17; kappa: 1.414; w: [0.03357, 1.0]
LLR test
    2*dlnL = 0.281, df = 1, LLR P-value = 5.96e-01  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


Similar to the F1x4 model result, here there is no statistical support for positive selection on the foreground branches under the one site class model.

### Branch site test
Formally known as the **branch-site test for positive selection**, or as "test 2" in Zhang _et al._ 2005 (PMID: 16107592), it contrasts two models: the alternative model, specified by `model = 2; NSsites = 2`, is compared to the null model, which is a modified version of the previous one with `w2 = 1` fixed (`fix_omega = 1; omega = 1`). In the alternative model, both the foreground and background branches have the constrained and neutral classes, i.e., `0<w<1` and `w=1`. What's different between them is that the foreground branches have a third class, with w > 1. At those sites, the corresponding positions in the background branches would belong to either the constrained or neutral class. In the null model, `w2 = 1` is fixed and thus the LLR test specifically tests for evidence of positive selection on a **subset of the sites** and **only along the foreground branches**.

In [114]:
# go back to the script dir first so that this chunk can be run out of order
os.chdir(script_dir)
# then enter the p697-981 segment analysis dir
working_dir = "../output/paml/B8441-OG-part/p697-981/"
os.chdir(working_dir)
# read branch-site model results, keyed by run label (read from brnsite-<label>/mlc)
brnsite_res = {
    run: codeml.read(f'brnsite-{run}/mlc')
    for run in (
        'alt-2r_all-cf1', 'null-2r_all-cf1',
        'alt-2r_all-cf2', 'null-2r_all-cf2',
    )
}

In [115]:
# branch-site log likelihood ratio test with CodonFreq=1: alternative vs null run,
# both read at NSsites = 2
dl_bs_cf1 = get_ll(brnsite_res['alt-2r_all-cf1'], m = 2) - get_ll(brnsite_res['null-2r_all-cf1'], m = 2)
df_bs_cf1 = get_np(brnsite_res['alt-2r_all-cf1'], m = 2) - get_np(brnsite_res['null-2r_all-cf1'], m = 2)
# survival function instead of 1 - cdf, which rounds to 0 for very small P-values
p_bs_cf1 = chi2.sf(2*dl_bs_cf1, df = df_bs_cf1)
sig_bs_cf1 = sig_code(p_bs_cf1)
print("Testing branch site model, with CodonFreq=1")
print(f'df={df_bs_cf1}, 2*∆lnL={2*dl_bs_cf1:.2f}, p={p_bs_cf1:.2e} {sig_bs_cf1}')

Testing branch site model, with CodonFreq=1
df=1, 2*∆lnL=11.82, p=5.85e-04 ***


In [116]:
# branch-site log likelihood ratio test with CodonFreq=2: alternative vs null run,
# both read at NSsites = 2
dl_bs_cf2 = get_ll(brnsite_res['alt-2r_all-cf2'], m = 2) - get_ll(brnsite_res['null-2r_all-cf2'], m = 2)
df_bs_cf2 = get_np(brnsite_res['alt-2r_all-cf2'], m = 2) - get_np(brnsite_res['null-2r_all-cf2'], m = 2)
# survival function instead of 1 - cdf, which rounds to 0 for very small P-values
p_bs_cf2 = chi2.sf(2*dl_bs_cf2, df = df_bs_cf2)
sig_bs_cf2 = sig_code(p_bs_cf2)
print("Testing branch site model, with CodonFreq=2")
print(f'df={df_bs_cf2}, 2*∆lnL={2*dl_bs_cf2:.2f}, p={p_bs_cf2:.2e} {sig_bs_cf2}')

Testing branch site model, with CodonFreq=2
df=1, 2*∆lnL=8.94, p=2.78e-03 **


With p697-981, we got significant results for the branch site model test under both F1x4 and F3x4. Below is the model output copied from the `mlc` file. Note that this part is not automatically updated, so if the above model is re-run with different parameters or input, the results below need to be manually updated.

F1x4, alternative model (model A) output:
(based on PAML documentation, only the BEB post-analysis results are shown and the NEB results are ignored)

F3x4, alternative model (model A) output:

### Conclusion for p697-981
- **Mixed evidence** for positive selection from the site test
    - M2a vs M1a test was not significant under either F1x4 or F3x4, although the P-value under F3x4 is close to 0.1
    - M8 vs M7 test was significant under both F1x4 and F3x4, with P-values < 0.05
    - M8 vs M8a test (more stringent test for positive selection) was significant under F1x4 and not under F3x4
- **Evidence for dN/dS rate heterogeneity along the branches**.
    - Statistical support for the foreground branches having a higher dN/dS ratio than the background under the F1x4 model (P = 2.54e-03); the P-value under the F3x4 model is just below 0.05 (P = 4.30e-02).
    - The test for positive selection (foreground dN/dS > 1) was not significant under either codon model.
- **Evidence for branch-specific and site-specific positive selection**
    - Branch-site test performed under both F1x4 and F3x4 models yielded significant results, and the BEB procedure identified a shared set of residues potentially under positive selection among those branches.