### Estimating divergence time between Jean-Talon and it's relatives in generations and years
Method adopted from Skoglund et al. 2011 (following Green et al. 2006).
Method uses triplets: Jean-Talon, relative & outgroup. Outgroup is S. cerevisiae, which is the reference genome.
First estimation of divergence between Jean-Talon & an outgroup (using molecular clock), by calculating fixed differences between Jean-Talon and outgroup (synonymous sites fixed for 1 in Jean-Talon). Second calculating sites which are shared between Jean-Talon & outgroup but not the relative (genotypes 0,1,0) and shared between relative and an outgroup but not Jean-Talon (genotypes 1, 0, 0). Number of these sites over total number of sites carry information about the proportion of the branch length from the split of Jean-Talon with its relative, relative to the branch length from the split of Jean-Talon (or relative) with an outgroup.
@author:aniafijarczyk

In [136]:
import pandas as pd
import glob
import random
import gzip
import numpy as np
from collections import defaultdict

#### Setting global variables

In [137]:
focal_species = "Jean-Talon"
relatives = ["A.Muntons","A.S-33","A.T-58","BE005","CFI","CFN","CFP"]
mutation_rate = 1.67E-10
gen_year_min = 150 # lower estimate of generation number per year (from Gallone et al. 2016)
gen_year_max = 2920 # higher estimate of generation number per year (from Fay & Benavides 2005)
jean_talon_synonymous_sites = 1640852.73 # total length of synonymous sites in Jean-Talon (no missing data)

#### Lengths of synonymous sites for pairs of genomes of different relatives with Jean-Talon

In [138]:
lengths = pd.read_csv("./input_files/synonymous_length.txt",sep="\t",header=0,names=['strain','length'])
lengths.head()

Unnamed: 0,strain,length
0,A.Muntons,963216.1761
1,A.S-33,775193.818
2,A.T-58,789363.5493
3,BE005,970227.5601
4,CFI,970243.8


#### Selecting strain indices for triplets (duplets here - S. cerevisiae is reference genome)

In [139]:
samples = pd.read_csv("./input_files/relatives_annot_Filtered2_01.samples", sep="\t", header=None, names=["haplotype"])
sample_names = list(samples["haplotype"])
T = []
for strain in relatives:
    test_samples = ["Jean-Talon",strain]
    test_samples_p1 = [[0,1][ele.split("_")[0] in test_samples[0]] for ele in sample_names]
    samp_indices_p1 = [i for i in range(len(test_samples_p1)) if test_samples_p1[i] == 1]
    test_samples_p2 = [[0,1][ele.split("_")[0] in test_samples[1]] for ele in sample_names]
    samp_indices_p2 = [i for i in range(len(test_samples_p2)) if test_samples_p2[i] == 1]
    samp_indices = samp_indices_p1 + samp_indices_p2
    T.append(samp_indices)
T

[[38, 39, 40, 41, 4, 5, 6, 7],
 [38, 39, 40, 41, 8, 9, 10, 11],
 [38, 39, 40, 41, 12, 13, 14, 15],
 [38, 39, 40, 41, 20, 21, 22, 23],
 [38, 39, 40, 41, 24, 25, 26, 27],
 [38, 39, 40, 41, 28, 29, 30, 31],
 [38, 39, 40, 41, 32, 33, 34, 35]]

#### Reading file with synonymous variants

In [146]:
fa = gzip.open("./input_files/relatives_annot_synonymous_snpEff.tab.gz", "rt").readlines()
#fa = gzip.open("./input_files/sample_annot_synonymous_snpEff.tab.gz", "rt").readlines()
ann = [ele.split() for ele in fa]
D = {a+"_"+b:c for a,b,c in ann}
print("Number of all synonymous variants = "+str(len(list(D.keys()))))

Number of all synonymous variants = 52812


#### Reading file with all variant genotypes & filtering only synonymous

In [148]:
fh = gzip.open("./input_files/relatives_annot_Filtered2_01.tab.gz","rt").readlines()
#fh = gzip.open("./input_files/sample_annot_Filtered2_01.tab.gz","rt").readlines()
d = {'_'.join(ele.split()[:2]):''.join(ele.split()[2:]) for ele in fh}
k = {ele:d[ele] for ele in D.keys()}
print("Number of filtered synonymous variants = "+str(len(list(k.keys()))))

Number of filtered synonymous variants = 52812


#### Calculating fixed differences between Jean-Talon & reference (outgroup)

In [149]:
jt = [] # fixed variants relative to reference
jt_tot = [] # all synonymous variants with no missing data
for pos in k.keys():
    newset = ''.join(k[pos][-4:])
    if (newset.count('.') == 0):
        jt_tot.append(newset)
        if (newset.count('0') == 0):
            jt.append(newset)
k_rate = len(jt)/jean_talon_synonymous_sites
t_out = k_rate/(2*mutation_rate)
print("Number of fixed synonymous differences between Jean-Talon & reference is "+str(len(jt)))
print("Divergence rate between Jean-Talon & reference is "+str(k_rate))
print("Number of generations since divergence of Jean-Talon with reference is "+str(t_out))

Number of fixed synonymous differences between Jean-Talon & reference is 7871
Divergence rate between Jean-Talon & reference is 0.004796896062695401
Number of generations since divergence of Jean-Talon with reference is 14361964.25956707


#### Calculating time of split of Jean-Talon with relatives, relative to time of split with reference

In [150]:
S = defaultdict(list)
S2 = defaultdict(list)
for duplex_index in range(len(T)):
    
    sec_strain = relatives[duplex_index]
    print(sec_strain)
    n = []
    for pos in k.keys():
        newset = ''.join([k[pos][i] for i in T[duplex_index]])
        if newset.count('.') == 0:
            n.append(newset)
    #n[:3]
    P = []
    P2 = []
    C2_aba = []
    C2_baa = []
    for site in n:
        # taxon 1 and taxon 2 bases are given by randomly selecting one base from all alleles in a given position
        #anc = random.sample(list(site[0:4]),1)
        sp1 = random.sample(list(site[0:4]),1)[0]
        sp2 = random.sample(list(site[4:8]),1)[0]
        #if sp1.intersection(set(anc)): p1 = "A"
        if sp1 == '1': p1 = "B"   
        else: p1 = "A"
        if sp2 == '1': p2 = "B"
        else: p2 = "A"
        pat = p1+p2+"A"
        P.append(pat)
        
        # derived bases in taxon 1 and 2 are all bases with derived mutations of any frequency 
        sp1 = set(list(site[0:4]))
        sp2 = set(list(site[4:8]))
        if sp1.intersection(set('1')): p1 = "B"
        else: p1 = "A"
        if sp2.intersection(set('1')): p2 = "B"
        else: p2 = "A"
        pat2 = p1+p2+"A"
        P2.append(pat2)
        if pat2 == "ABA":
            C2_aba.append(list(sp2).count("1")/4.)
        elif pat2 == "BAA":
            C2_baa.append(list(sp1).count("1")/4.)
     
    nnn = lengths.loc[lengths["strain"]==sec_strain,'length'].values[0]
    
    aba = P.count("ABA") 
    baa = P.count("BAA")
    Ss1 = aba/float(nnn)
    Ss2 = baa/float(nnn)
    S['strain'].append(sec_strain)
    S['ABA'].append(aba)
    S['BAA'].append(baa)
    S['Ss_ABA'].append(Ss1)
    S['Ss_BAA'].append(Ss2)
    S['mean_Ss'].append(np.mean([Ss1, Ss2]))
   
    
    aba2 = P2.count("ABA")
    baa2 = P2.count("BAA")
    Ss1 = aba2/float(nnn)
    Ss2 = baa2/float(nnn)
    S2['strain'].append(sec_strain)
    S2['ABA'].append(aba2)
    S2['BAA'].append(baa2)
    S2['Ss1'].append(Ss1)
    S2['Ss2'].append(Ss2)
    S2['meanSs'].append(np.mean([Ss1, Ss2]))
    # rate of aba and baa patterns is multiplied by frequency of corresponding derived mutations
    Ss1_freq = sum(C2_aba)/float(nnn)
    Ss2_freq = sum(C2_baa)/float(nnn)
    S2['Ss1_freq'].append(Ss1_freq)
    S2['Ss2_freq'].append(Ss2_freq)
    S2['meanSs_freq'].append(np.mean([Ss1_freq, Ss2_freq]))
    
dS1 = pd.DataFrame(S)
dS2 = pd.DataFrame(S2)

A.Muntons
A.S-33
A.T-58
BE005
CFI
CFN
CFP


#### Calculating divergence times

In [151]:
dS1['t_aba_150'] = (dS1['Ss_ABA']*t_out)/150
dS1['t_baa_150'] = (dS1['Ss_BAA']*t_out)/150
dS1['t_150'] = (dS1['mean_Ss']*t_out)/150
dS1['t_aba_2920'] = (dS1['Ss_ABA']*t_out)/2920
dS1['t_baa_2920'] = (dS1['Ss_BAA']*t_out)/2920
dS1['t_2920'] = (dS1['mean_Ss']*t_out)/2920
dS1['t_out'] = t_out
dS1['mut_rate'] = mutation_rate
dF = dS1.loc[dS1['strain'].isin(["A.Muntons","A.S-33","BE005","CFI","CFN"]),:]
dM = pd.merge(dF,lengths,on=['strain'],how='left')
dM

Unnamed: 0,strain,ABA,BAA,Ss_ABA,Ss_BAA,mean_Ss,t_aba_150,t_baa_150,t_150,t_aba_2920,t_baa_2920,t_2920,t_out,mut_rate,length
0,A.Muntons,1544,2949,0.001603,0.003062,0.002332,153.477993,293.13899,223.308492,7.884143,15.05851,11.471327,14361960.0,1.67e-10,963216.1761
1,A.S-33,796,1740,0.001027,0.002245,0.001636,98.31626,214.912428,156.614344,5.050493,11.040022,8.045257,14361960.0,1.67e-10,775193.818
2,BE005,4330,5631,0.004463,0.005804,0.005133,427.303915,555.692459,491.498187,21.950544,28.545845,25.248195,14361960.0,1.67e-10,970227.5601
3,CFI,4514,5683,0.004652,0.005857,0.005255,445.454408,560.814666,503.134537,22.882932,28.808973,25.845952,14361960.0,1.67e-10,970243.8
4,CFN,4419,5663,0.004555,0.005837,0.005196,436.080111,558.841745,497.460928,22.401376,28.707624,25.5545,14361960.0,1.67e-10,970242.5225


#### Saving table

In [152]:
dM.to_csv("calcRelativeTime_Scer.out",sep="\t",index=False,header=True)
#dM.to_csv("calcRelativeTime_Scer_sample.out",sep="\t",index=False,header=True)