Null hypothesis: positive selection is independent of duplications

Alternative hypothesis: positive selection is more common at nodes where a duplication has happened

Reasoning: when a gene duplicates, it is more likely that at least one of the resulting copies (the child branches) will undergo positive selection

In [1]:
import json
import re
import pandas as pd
from ete3 import Tree
from scipy.stats import chi2_contingency

In [2]:
# Sequence table: ';'-separated CSV whose first column becomes the index.
df = pd.read_csv("../cluster_seqs.csv", sep=';', index_col=0)

# Parsed aBSREL results (JSON); `res` is read again by later cells.
with open("aBSREL Results/trim5.json") as absrel_file:
    res = json.load(absrel_file)

# The tree is stored as a Newick string under input -> trees -> "0".
# A ';' is appended before parsing (presumably the stored string has no
# terminator). format=1 is the ete3 Newick flavour that keeps internal
# node names (the "NodeN" labels used as keys further down).
newick = res["input"]["trees"]["0"] + ";"
t = Tree(newick, format=1)

# Drop this leaf from the root's children. `remove_child` returns the
# removed node, which is what the cell displays.
# NOTE(review): the reason for excluding this sequence is not recorded here.
excluded_leaf = t & "XM_004389073_2_Trichechus_manatus_latirostris"
t.remove_child(excluded_leaf)

Tree node 'XM_004389073_2_Trichechus_manatus_latirostris' (0x2eef64f852)

In [3]:
# Leaf names look like "<XM|NM>_<digits>_<digits>_<species>"; everything after
# the accession prefix is stored on the leaf as its `species` feature.
accession_pattern = re.compile(r"[XN]M_\d+_\d+_(.+)")
for leaf in t.iter_leaves():
    name_match = accession_pattern.match(leaf.name)
    species_name = name_match.group(1)
    leaf.add_features(species=species_name)

# print() is needed here so the multi-line ASCII drawing renders as a tree.
print(t.get_ascii(show_internal=True))


                            /-XM_012950138_2_Jaculus_jaculus
                           |
                           |                                  /-NM_001368753_1_Mus_musculus
                           |                            /Node13
                           |                      /Node12     \-NM_001310602_1_Mus_musculus
                           |                     |     |
                           |                /Node11     \-NM_001146007_1_Mus_musculus
                           |               |     |
                           |          /Node10     \-XM_034506169_1_Arvicanthis_niloticus
                           |         |     |
                           |     /Node9     \-NM_001014023_1_Rattus_norvegicus
                           |    |    |
                           |    |    |      /-XM_006979025_3_Peromyscus_maniculatus_bairdii
                       /Node6   |     \Node19
                      |    |    |          |      /-XM_005370173_3_Microtus_o

In [4]:
def positive_child(node, alpha=0.05):
    """Tell whether any direct child branch of ``node`` shows positive selection.

    The per-branch statistics are looked up by child name in the module-level
    ``res`` dictionary, under ``res["branch attributes"]["0"]``.

    Parameters
    ----------
    node
        Tree node whose ``children`` are inspected; each child must expose a
        ``name`` that is a key of the branch-attribute table.
    alpha : float, optional
        Significance threshold applied to each child's "Corrected P-value".
        A child counts as positively selected when its value is ``<= alpha``.
        Defaults to 0.05.

    Returns
    -------
    bool
        True as soon as one child passes the threshold, False otherwise
        (including when ``node`` has no children).

    Raises
    ------
    KeyError
        If a child's name is not present in the branch-attribute table.
    """
    for child in node.children:
        child_stats = res["branch attributes"]["0"][child.name]
        # A branch without a numeric corrected p-value (key absent or null in
        # the JSON, presumably a branch that was not tested) is treated as
        # not significant instead of raising on the comparison.
        corrected_p = child_stats.get("Corrected P-value")
        if corrected_p is not None and corrected_p <= alpha:
            return True
    return False

In [8]:
# Classify every internal node of the tree along two boolean axes:
#   pos_sel - at least one child branch is positively selected (positive_child)
#   dup     - the same species occurs under more than one child subtree,
#             i.e. the node is treated as a duplication
#
# Flags are collected as {node name: (pos_sel, dup)} and the DataFrame is built
# once at the end. This avoids growing the frame row by row through `.loc`,
# which temporarily leaves a missing value in the other column of each new row
# and so can degrade the boolean dtype of both columns.
node_flags = {
    # iter_descendants() below does not yield the root itself, so its row is
    # entered by hand.
    "Root": (False, False),
    # Indicate that positive selection happened at this node (Cholopus seqs)
    # but not in correlation with a duplication.
    "Node0": (True, False),
}

# Maps each node to the species found among the leaves below it.
cached_sps = t.get_cached_content(store_attr="species")

for node in t.iter_descendants():
    if node.is_leaf():
        continue

    # A species seen under an earlier child and again under a later one marks
    # a duplication. All children are compared, so multifurcating nodes are
    # handled and a single-child node is simply "not a duplication".
    species_seen = set()
    is_dup = False
    for child in node.children:
        child_species = cached_sps[child]
        if not species_seen.isdisjoint(child_species):
            is_dup = True
            break
        species_seen.update(child_species)

    # Re-using an existing key (e.g. a hand-entered name that is also in the
    # tree) overwrites that row in place, as the `.loc` assignment did.
    node_flags[node.name] = (positive_child(node), is_dup)

posdf = pd.DataFrame.from_dict(node_flags, orient="index", columns=["pos_sel", "dup"])
posdf

Unnamed: 0,pos_sel,dup
Root,False,False
Node0,True,False
Node2,True,False
Node159,False,True
Node3,True,False
...,...,...
Node64,False,True
Node147,False,True
Node30,False,False
Node33,False,False


In [9]:
# 2x2 contingency table of node counts: rows = pos_sel, columns = dup.
cross = pd.crosstab(posdf["pos_sel"], posdf["dup"])
cross

dup,False,True
pos_sel,Unnamed: 1_level_1,Unnamed: 2_level_1
False,40,15
True,16,10


In [10]:
# Chi-square test of independence on the pos_sel x dup contingency table.
# With SciPy's defaults a 2x2 table (one degree of freedom) gets Yates'
# continuity correction.
# NOTE(review): the alternative hypothesis stated at the top is directional
# ("more common"), whereas this test only detects association in either
# direction - confirm this is the intended test.
chi_res = chi2_contingency(cross)
p_value = chi_res[1]  # second element of the result is the p-value
p_value

0.44718681462651844

Not significant (p ≈ 0.45): the null hypothesis of independence cannot be rejected :(