In [1]:
import pandas as pd
import numpy as np
import msprime as msp
import allel

from IPython.display import SVG

In [2]:
real_samples = pd.read_table("~/projects/nea-over-time/data/admixture_array_nea.tsv")

In [3]:
def plot_tree(tree):
    colour_map = {0 : "purple", 1 : "blue", 2 : "green", 3 : "red"}

    node_colours = {u: colour_map[tree.population(u)] for u in tree.nodes()}
    node_labels = {u if tree.is_leaf(u) else "": u for u in tree.nodes()}
    mut_labels = {"": m for m in tree.mutations()}
    mut_colours = {"black": m for m in tree.mutations()}

    return SVG(tree.draw(node_colours=node_colours, node_labels=node_labels, mutation_labels=mut_labels, mutation_colours=mut_colours, width=1000, height=600))

In [4]:
def yrs_to_gen(years, gen_time=25):
    return int(years / gen_time)

In [5]:
EUR_ages = [yrs_to_gen(y) for y in real_samples.age]

In [6]:
n_CH, n_AFR, n_EUR, n_NEA = 1, 100, len(EUR_ages), 4

In [7]:
# population IDs
CH, AFR, EUR, NEA = 0, 1, 2, 3

# effective population sizes at present
Ne0 = 10000
Ne_anc = 10000
Ne_bottle = 2000
Ne_NEA = 1000
Ne_AFR = 10000
Ne_EUR = 10000

# split times
T_split_CH = yrs_to_gen(6_000_000)
T_split_NEA = yrs_to_gen(600_000)
T_split_EUR = yrs_to_gen(60_000)

# migration start and end times
T_m_NEA = yrs_to_gen(55000)

samples = (
    [msp.Sample(population=CH, time=0) for _ in range(n_CH)] +
    [msp.Sample(population=AFR, time=0) for _ in range(n_AFR)] +
    [msp.Sample(population=EUR, time=age) for age in range(n_EUR)] +
    [msp.Sample(population=NEA, time=yrs_to_gen(70000)) for _ in range(n_NEA)]
)

pop_config = [
    msp.PopulationConfiguration(),
    msp.PopulationConfiguration(initial_size=Ne_AFR),
    msp.PopulationConfiguration(initial_size=Ne_EUR),
    msp.PopulationConfiguration(initial_size=Ne_NEA)
]

demography = [
    # Neanderthal admixture
    # out of Africa migration
    msp.MassMigration(time=T_m_NEA, source=EUR, destination=NEA, proportion=0.05),

    # out of Africa migration
    msp.MassMigration(time=T_split_EUR, source=EUR, destination=AFR, proportion=1.0),

    # Neanderthal split
    msp.MassMigration(time=T_split_NEA, source=NEA, destination=AFR, proportion=1.0),

    # chimpanzee split
    msp.MassMigration(time=T_split_CH, source=AFR, destination=CH, proportion=1.0),
]

msp.DemographyDebugger(
    Ne=Ne0,
    population_configurations=pop_config,
    demographic_events=demography
).print_history()

Epoch: 0 -- 2200.0 generations
     start     end      growth_rate |     0        1        2        3    
   -------- --------       -------- | -------- -------- -------- -------- 
0 |  1e+04    1e+04               0 |     0        0        0        0    
1 |  1e+04    1e+04               0 |     0        0        0        0    
2 |  1e+04    1e+04               0 |     0        0        0        0    
3 |  1e+03    1e+03               0 |     0        0        0        0    

Events @ generation 2200.0
   - Mass migration: lineages move from 2 to 3 with probability 0.05
Epoch: 2200.0 -- 2400.0 generations
     start     end      growth_rate |     0        1        2        3    
   -------- --------       -------- | -------- -------- -------- -------- 
0 |  1e+04    1e+04               0 |     0        0        0        0    
1 |  1e+04    1e+04               0 |     0        0        0        0    
2 |  1e+04    1e+04               0 |     0        0        0        0    
3 |  1e+03 

In [8]:
%%time

ts = msp.simulate(
    length=100_000_000,
    Ne=Ne0,
    mutation_rate=1e-8,
    recombination_rate=1e-8,
    samples=samples,
    population_configurations=pop_config,
    demographic_events=demography
)

CPU times: user 6min 41s, sys: 2.82 s, total: 6min 43s
Wall time: 6min 51s


In [9]:
snps = allel.HaplotypeArray(ts.genotype_matrix())
snps

Unnamed: 0,0,1,2,3,4,...,147,148,149,150,151,Unnamed: 12
0,1,0,0,0,0,...,0,0,0,0,0,
1,1,0,0,0,0,...,0,0,0,0,0,
2,0,1,1,1,1,...,1,1,1,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...
803371,0,1,1,1,1,...,1,1,1,1,1,
803372,0,0,0,0,0,...,0,0,0,0,0,
803373,0,0,1,0,0,...,0,0,0,0,0,


In [None]:
def admix_array(snps):
    

In [80]:
def pop_ids(sample_ids, pop):
    inds = [ind for ind in sample_ids.keys() if ind.startswith(pop)]
    return [sample_ids[ind] for ind in inds]

In [88]:
afr_ids = pop_ids(sample_ids, "afr")
nea_ids = pop_ids(sample_ids, "nea")

allele_counts = snps.count_alleles_subpops(subpops={"afr": afr_ids, "nea": nea_ids})

In [92]:
afr_freq = allele_counts["afr"] / len(afr_ids)
nea_freq = allele_counts["nea"] / len(nea_ids)

In [107]:
diff_pos = ((afr_freq[:, 0] == 0) | (afr_freq[:, 0] == 1)) & \
           ((nea_freq[:, 0] == 0) | (nea_freq[:, 0] == 1)) & \
           (abs(afr_freq[:, 0] - nea_freq[:, 0] > 0.5))

In [118]:
admix_array = snps[diff_pos]
admix_array

Unnamed: 0,0,1,2,3,4,...,147,148,149,150,151,Unnamed: 12
0,0,0,0,0,0,...,0,1,1,1,1,
1,0,0,0,0,0,...,0,1,1,1,1,
2,0,0,0,0,0,...,0,1,1,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...
32536,0,0,0,0,0,...,0,1,1,1,1,
32537,0,0,0,0,0,...,0,1,1,1,1,
32538,0,0,0,0,0,...,0,1,1,1,1,


In [122]:
(admix_array[:, sample_ids["eur0"]] == admix_array[:, sample_ids["nea0"]]).mean()

0.040413042810166264

In [62]:
snps[:, [sample_ids["afr0"], sample_ids["afr1"]]]

Unnamed: 0,0,1,Unnamed: 3
0,0,0,
1,0,0,
2,1,1,
...,...,...,...
803371,1,1,
803372,0,0,
803373,0,1,


In [19]:
sample_ids = {s: i for i, s in enumerate(["chimp" + str(i) for i in range(n_CH)] +
                                         ["afr" + str(i) for i in range(n_AFR)] +
                                         ["eur" + str(i) for i in range(n_EUR)] +
                                         ["nea" + str(i) for i in range(n_NEA)])}

In [45]:
def average_patterson_f4(aca, acb, acc, acd, blen):
    # calculate per-variant values
    num, den = allel.patterson_d(aca, acb, acc, acd)

    # N.B., nans can occur if any of the populations have completely missing
    # genotype calls at a variant (i.e., allele number is zero). Here we
    # assume that is rare enough to be negligible.

    # calculate overall estimate
    num_avg = np.nansum(num)

    # compute the numerator and denominator within each block
    num_bsum = allel.moving_statistic(num, statistic=np.nansum, size=blen)

    # estimate standard error
    _, se, vj = allel.stats.misc.jackknife(num_bsum, statistic=lambda n: np.sum(n))

    # compute Z score
    z = num_avg / se

    return num_avg, z

In [34]:
def dstat(a, b, c, d):
    aca = snps.count_alleles(subpop=[sample_ids[a]])
    acb = snps.count_alleles(subpop=[sample_ids[b]])
    acc = snps.count_alleles(subpop=[sample_ids[c]])
    acd = snps.count_alleles(subpop=[sample_ids[d]])
    d, se, z, vb, vj = allel.average_patterson_d(aca, acb, acc, acd, 10_000)
    return d, z

In [49]:
def f4(a, b, c, d):
    aca = snps.count_alleles(subpop=[sample_ids[a]])
    acb = snps.count_alleles(subpop=[sample_ids[b]])
    acc = snps.count_alleles(subpop=[sample_ids[c]])
    acd = snps.count_alleles(subpop=[sample_ids[d]])
    num, _ = allel.patterson_d(aca, acb, acc, acd)
    return np.mean(num)

In [41]:
def f4_ratio(x, a, b, c, d):
    acx = snps.count_alleles(subpop=[sample_ids[x]])
    aca = snps.count_alleles(subpop=[sample_ids[a]])
    acb = snps.count_alleles(subpop=[sample_ids[b]])
    acc = snps.count_alleles(subpop=[sample_ids[c]])
    acd = snps.count_alleles(subpop=[sample_ids[d]])
    
    f4num, _   = average_patterson_f4(aca, acd, acx, acc, 10000)
    d2f4den, _ = average_patterson_f4(aca, acd, acb, acd, 10000)

    return f4num / d2f4den

In [36]:
d("eur1", "afr0", "nea0", "chimp0")

(0.3163894023413432, 7.954536060052858)

In [37]:
d("eur1", "eur2", "nea0", "chimp0")

(0.03297644539614561, 0.6205610358085286)

In [48]:
f4_ratio("eur0", "nea0", "nea1", "afr0", "chimp0")

0.0029530948161319128

In [54]:
x = np.array([f4("nea0", "chimp0", "eur" + str(i), "afr0") / f4("nea0", "chimp0", "nea1", "afr0") for i in range(n_EUR)])

In [56]:
x.mean()

0.04005069763223921