In [1]:
import sys
import os
sys.path.append('/share/software/user/open/py-matplotlib/3.7.1_py39/lib/python3.9/site-packages')
sys.path.append('/share/software/user/open/py-numpy/1.24.2_py39/lib/python3.9/site-packages')
import numpy as np

In [2]:
# Read the precomputed table of two-site haplotype counts.
path = '/home/groups/bhgood/uhgg/site_pairs/MGYG-HGUT-02492/MGYG-HGUT-02492.npy'
pairtype = 0 # syn

ns = np.load(path).astype(float)

# Column layout as consumed here: 0-3 are read as the n11/n10/n01/n00 counts,
# 4 as `ells` and 5 as the pair-type code compared against `pairtype`.
n11s, n10s, n01s, n00s, ells, types = (ns[:, col] for col in range(6))
ntots = n11s + n10s + n01s + n00s

max_ntot = ntots.max()

# Keep a pair only when its total count exceeds 95% of the largest total
# (well covered) and its type code matches the requested pair type.
mask = (types == pairtype)
good_idxs = (ntots > 0.95 * max_ntot) & mask

n11s, n10s, n01s, n00s = (counts[good_idxs] for counts in (n11s, n10s, n01s, n00s))

# One row per retained pair, columns ordered n10, n01, n11, n00.
n_obs = np.vstack([n10s, n01s, n11s, n00s]).T
print("Total data shape: {}, {} bytes".format(n_obs.shape, n_obs.nbytes))

# Restrict the remaining per-pair arrays to the same retained pairs.
ntots, ells = ntots[good_idxs], ells[good_idxs]

Total data shape: (9697752, 4), 310328064 bytes


In [3]:
# Haplotype frequencies: every count divided by that pair's total count.
f11s, f10s, f01s, f00s = (
    counts * 1.0 / ntots for counts in (n11s, n10s, n01s, n00s)
)

# Marginals for the two sites: A sums the 11 and 10 haplotypes,
# B sums the 11 and 01 haplotypes (as frequencies, then as raw counts).
fAs, fBs = f11s + f10s, f11s + f01s
nAs, nBs = n11s + n10s, n11s + n01s

In [4]:
%%time
# Frequency windows: 100 log-spaced centres f0, each window spanning
# [0.3 * f0, 3 * f0].
f0s = np.logspace(-3, -0.5, 100)
fmins = f0s * 0.3
fmaxs = f0s * 3

# Per-window summaries, one entry per element of f0s.
mean_nonzero_LE = []  # mean of lambs restricted to lambs > 0
LE_prob = []          # fraction of pairs with lambs > 0
mean_LE = []          # mean of lambs over all pairs in the window
num_pairs = []        # number of pairs in the window

ellmin,ellmax = 1e6, 1e9
# The ell cut does not depend on the window, so build it once rather than
# re-scanning the full `ells` array on every iteration.
ell_ok = (ells > ellmin) & (ells < ellmax)

for fmin, fmax in zip(fmins, fmaxs):
    # Both marginal frequencies must fall inside [fmin, fmax].
    good_idxs = ell_ok & (fAs >= fmin) & (fAs <= fmax) & (fBs >= fmin) & (fBs <= fmax)

    # Slice the marginals once; they are each needed twice in the denominator.
    fA = fAs[good_idxs]
    fB = fBs[good_idxs]

    # lambs = f11*f10*f01*f00 / (fA^2 * fB^2 * (1-fA)^2 * (1-fB)^2)
    lambs = f11s[good_idxs] * f10s[good_idxs] * f01s[good_idxs] * f00s[good_idxs]
    lambs /= fA**2 * fB**2 * (1-fA)**2 * (1-fB)**2

    num_pairs.append(len(lambs))

    if len(lambs) == 0:
        # Empty window: record NaN explicitly instead of letting np.mean emit
        # a "Mean of empty slice" RuntimeWarning (the stored value is the same).
        mean_LE.append(np.nan)
        LE_prob.append(np.nan)
        mean_nonzero_LE.append(np.nan)
        continue

    mean_LE.append(np.mean(lambs))
    LE_prob.append(np.mean(lambs>0))

    # A non-empty window may still contain no positive lambs.
    nonzero_lambs = lambs[lambs > 0]
    mean_nonzero_LE.append(np.mean(nonzero_lambs) if len(nonzero_lambs) else np.nan)

CPU times: user 6.93 s, sys: 449 ms, total: 7.37 s
Wall time: 7.48 s


In [5]:
# Write the window centres and the per-window fraction of pairs with
# lambs > 0 to disk (np.save appends the ".npy" extension).
np.save(file="f0s_fig10d", arr=f0s)
np.save(file="LE_prob_fig10d", arr=LE_prob)

In [6]:
def four_gamete_theory(fA, fB, NR, n):
    """Evaluate the theory curve ``1 - (beta / (beta + n)) ** alpha``.

    Here ``alpha = 2 * NR * fA * fB`` and ``beta = 2 * NR``.

    Parameters
    ----------
    fA, fB : float or array-like of float
        Frequencies at the two sites; broadcast against each other.
    NR : float
        Parameter entering both ``alpha`` and ``beta``.
    n : float
        Parameter added to ``beta`` in the denominator.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        The probability, with the broadcast shape of the inputs.

    Notes
    -----
    The value is computed as ``-expm1(alpha * log(beta / (beta + n)))``.
    This is algebraically identical to ``1 - power(beta / (beta + n), alpha)``
    but does not lose precision when ``alpha`` is small; the direct form
    returns exactly 0 once ``alpha * log(...)`` drops below machine epsilon.
    """
    fA = np.asarray(fA, dtype=float)
    fB = np.asarray(fB, dtype=float)

    alpha = 2 * NR * fA * fB
    beta = 2 * NR

    # log(0) = -inf (when beta == 0) and 0 * -inf = nan are expected here and
    # are patched on the next line, so silence the corresponding warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        exponent = alpha * np.log(beta / (beta + n))

    # x ** 0 == 1 for every base (including 0), so alpha == 0 must yield a
    # probability of exactly 0, matching the direct power formulation.
    exponent = np.where(alpha == 0, 0.0, exponent)

    # "0.0 - ..." rather than unary minus so an exact zero comes out as +0.0.
    probs = 0.0 - np.expm1(exponent)
    return probs

In [7]:
# Computing an effective theory curve.
# Idea: average the theory prediction over the observed distribution of
# (fA, fB) within each (fmin, fmax) window.
f0s = np.logspace(-3, -0.5, 100)
fmins = f0s * 0.3
fmaxs = f0s * 3

# The two NR values bracketing the theory band.
NR_max = 1 / 0.025 / 0.0275
NR_min = 1 / 0.175 / 0.1925

# NOTE(review): 4600 is hard-coded as the `n` argument of four_gamete_theory;
# presumably the sample size of this dataset - confirm against the data.
theory_n = 4600

LE_prob = []     # fraction of pairs with lambs > 0, per window
theory_max = []  # window-averaged theory prediction at NR_max
theory_min = []  # window-averaged theory prediction at NR_min

ellmin,ellmax = 1e6, 1e9
# The ell cut does not depend on the window, so build it once rather than
# re-scanning the full `ells` array on every iteration.
ell_ok = (ells > ellmin) & (ells < ellmax)

for fmin, fmax in zip(fmins, fmaxs):
    # Both marginal frequencies must fall inside [fmin, fmax].
    good_idxs = ell_ok & (fAs >= fmin) & (fAs <= fmax) & (fBs >= fmin) & (fBs <= fmax)

    # Slice the marginals once; they feed both the statistic and the theory.
    fA = fAs[good_idxs]
    fB = fBs[good_idxs]

    if len(fA) == 0:
        # Empty window: record NaN explicitly instead of letting np.mean emit
        # a "Mean of empty slice" RuntimeWarning (the stored value is the same).
        theory_max.append(np.nan)
        theory_min.append(np.nan)
        LE_prob.append(np.nan)
        continue

    # lambs = f11*f10*f01*f00 / (fA^2 * fB^2 * (1-fA)^2 * (1-fB)^2)
    lambs = f11s[good_idxs] * f10s[good_idxs] * f01s[good_idxs] * f00s[good_idxs]
    lambs /= fA**2 * fB**2 * (1-fA)**2 * (1-fB)**2

    theory_max.append(np.mean(four_gamete_theory(fA, fB, NR_max, n=theory_n)))
    theory_min.append(np.mean(four_gamete_theory(fA, fB, NR_min, n=theory_n)))

    LE_prob.append(np.mean(lambs>0))

In [11]:
# Row 0 holds the NR_max curve, row 1 the NR_min curve
# (np.save appends the ".npy" extension).
theory_band = np.stack((theory_max, theory_min), axis=0)
np.save("LE_theory_fig10d", theory_band)