In [1]:
%run setup.ipynb

In [2]:
callset = phase2_ar1.callset_pass_biallelic
# g3R = allel.GenotypeChunkedArray(callset['3R']['calldata']['genotype'])
# g3L = allel.GenotypeChunkedArray(callset['3L']['calldata']['genotype'])
df_samples = phase2_ar1.df_samples
df_samples.reset_index(inplace=True)
populations = phase2_ar1.pop_ids
#make idx dict for subpops
idx_dic = {p : list(df_samples[df_samples.population == p].index) for p in populations}

In [3]:
region_3R_free = '3R-free', '3R', 1, 37000000
region_3L_free = '3L-free', '3L', 15000000, 41000000

In [None]:
# ac_3R = g3R.count_alleles_subpops(idx_dic)
# ac_3L = g3L.count_alleles_subpops(idx_dic)

In [4]:
def load_ac(pop_id, chrom):
    genotypes = allel.GenotypeDaskArray(callset[chrom]['calldata/genotype'])
    ac = genotypes.take(idx_dic[pop_id], axis=1).count_alleles().compute()
    return ac

In [5]:
def compute_pair_fst(pops, regions, blen=10000):
    # order is irrelevant
    pop1, pop2 = pops
    log(pop1, pop2, regions, blen)
    
    with section('load allele counts'):
        ac1 = None
        ac2 = None
        for rname, chrom, start, stop in regions:
            pos = allel.SortedIndex(callset[chrom]['variants/POS'])
            loc = pos.locate_range(start, stop)
            rac1 = load_ac(pop1, chrom)[loc]
            rac2 = load_ac(pop2, chrom)[loc]
            if ac1 is None:
                ac1 = rac1
                ac2 = rac2
            else:
                ac1 = np.vstack([ac1, rac1])
                ac2 = np.vstack([ac2, rac2])
            log(rname, ac1.shape, ac2.shape)
    
    ac1 = allel.AlleleCountsArray(ac1)
    ac2 = allel.AlleleCountsArray(ac2)
    with section('ascertain SNPs'):
        loc_asc = ac1.is_segregating() & ac2.is_segregating() & (ac1.max_allele() <= 1) & (ac2.max_allele() <= 1)
        log('ascertainment', pop1, pop2, chrom, start, stop, nnz(loc_asc))
        ac1 = ac1[loc_asc]
        ac2 = ac2[loc_asc]
        log(ac1.shape)
    
    # compute Fst
    fst, se, _, _ = allel.blockwise_hudson_fst(ac1, ac2, blen)
    
    return fst, se

In [6]:
compute_pair_fst(pops=('BFcol', 'BFgam'), regions=(region_3R_free, region_3L_free))

BFcol BFgam (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load allele counts] done in a minute
[ascertain SNPs] begin
ascertainment BFcol BFgam 3L 15000000 41000000 2278245
(2278245, 2)
[ascertain SNPs] done in a second


(0.03246888951356084, 0.0042652011674895315)

In [7]:
def compute_pairwise_fst(regions, blen=10000):
    pairwise_fst = list()
    pairwise_fst_se = list()
    for pop1, pop2 in itertools.combinations(populations, 2):
        pops = tuple(sorted([pop1, pop2]))
        fst, se = compute_pair_fst(pops=pops, regions=regions, blen=blen)
        pairwise_fst.append(fst)
        pairwise_fst_se.append(se)
    return pairwise_fst, pairwise_fst_se

In [8]:
def tabulate_pairwise_fst(regions, blen=10000):
    fst, se = compute_pairwise_fst(regions, blen=blen)
    fstsq = scipy.spatial.distance.squareform(fst)
    sesq = scipy.spatial.distance.squareform(se)
    data = [zip(r1, r2) for r1, r2 in zip(fstsq, sesq)]
    tbl_fst = (etl
        .wrap(data)
        .pushheader(all_pops)
        .convertall(lambda v: '' if v == (0, 0) else '%.04f (%.04f)' % v)
        .addcolumn('population', populations, index=0)
    )
    tbl_fst.displayall(index_header=False, caption='Fst (standard error)')

In [9]:
tabulate_pairwise_fst(regions=(region_3R_free, region_3L_free))

AOcol GHcol (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load allele counts] done in 57 seconds
[ascertain SNPs] begin
ascertainment AOcol GHcol 3L 15000000 41000000 980434
(980434, 2)
[ascertain SNPs] done in a second
AOcol BFcol (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load allele counts] done in a minute
[ascertain SNPs] begin
ascertainment AOcol BFcol 3L 15000000 41000000 1080998
(1080998, 2)
[ascertain SNPs] done in a second
AOcol CIcol (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load allele counts] done in a minute
[ascertain SNPs] begin
ascertainment AOcol CIcol 3L 15000000 41000000 99635

[load allele counts] done in 34 seconds
[ascertain SNPs] begin
ascertainment GHcol GNgam 3L 15000000 41000000 1574323
(1574323, 2)
[ascertain SNPs] done in a moment
GAgam GHcol (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load allele counts] done in 47 seconds
[ascertain SNPs] begin
ascertainment GAgam GHcol 3L 15000000 41000000 1154589
(1154589, 2)
[ascertain SNPs] done in a moment
GHcol UGgam (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load allele counts] done in 58 seconds
[ascertain SNPs] begin
ascertainment GHcol UGgam 3L 15000000 41000000 1825106
(1825106, 2)
[ascertain SNPs] done in a second
GHcol GQgam (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (85354

BFgam CIcol (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load allele counts] done in a minute
[ascertain SNPs] begin
ascertainment BFgam CIcol 3L 15000000 41000000 1944244
(1944244, 2)
[ascertain SNPs] done in a second
CIcol GNgam (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load allele counts] done in 50 seconds
[ascertain SNPs] begin
ascertainment CIcol GNgam 3L 15000000 41000000 1631358
(1631358, 2)
[ascertain SNPs] done in a second
CIcol GAgam (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load allele counts] done in a minute
[ascertain SNPs] begin
ascertainment CIcol GAgam 3L 15000000 41000000 118

[ascertain SNPs] begin
ascertainment GAgam GW 3L 15000000 41000000 1371463
(1371463, 2)
[ascertain SNPs] done in a second
GW UGgam (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load allele counts] done in a minute
[ascertain SNPs] begin
ascertainment GW UGgam 3L 15000000 41000000 2418309
(2418309, 2)
[ascertain SNPs] done in a second
GQgam GW (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load allele counts] done in a minute
[ascertain SNPs] begin
ascertainment GQgam GW 3L 15000000 41000000 985291
(985291, 2)
[ascertain SNPs] done in a second
FRgam GW (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load al

3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load allele counts] done in 45 seconds
[ascertain SNPs] begin
ascertainment GAgam GHgam 3L 15000000 41000000 996047
(996047, 2)
[ascertain SNPs] done in a moment
GHgam UGgam (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load allele counts] done in a minute
[ascertain SNPs] begin
ascertainment GHgam UGgam 3L 15000000 41000000 1423913
(1423913, 2)
[ascertain SNPs] done in a moment
GHgam GQgam (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load allele counts] done in 17 seconds
[ascertain SNPs] begin
ascertainment GHgam GQgam 3L 15000000 41000000 776625
(776625, 2)
[ascertain SNPs] done in a second
FRgam GHgam (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 1500

(429718, 2)
[ascertain SNPs] done in a second
GQgam KE (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load allele counts] done in 53 seconds
[ascertain SNPs] begin
ascertainment GQgam KE 3L 15000000 41000000 374787
(374787, 2)
[ascertain SNPs] done in a second
FRgam KE (('3R-free', '3R', 1, 37000000), ('3L-free', '3L', 15000000, 41000000)) 10000
[load allele counts] begin
3R-free (8535400, 2) (8535400, 2)
3L-free (14525218, 2) (14525218, 2)
[load allele counts] done in 53 seconds
[ascertain SNPs] begin
ascertainment FRgam KE 3L 15000000 41000000 319771
(319771, 2)
[ascertain SNPs] done in a second


NameError: name 'all_pops' is not defined