In [1]:
!pip install -U malariagen_data

Requirement already up-to-date: malariagen_data in /opt/conda/lib/python3.7/site-packages (0.4.2)


In [2]:
from functools import lru_cache
import dask.array as da
import allel
from humanize import naturalsize
import numpy as np

In [3]:
import psutil
import sys
import os

def mem(after=None):
    """Print a one-line snapshot of process and system memory.

    The line reports, in this order: the resident set size of the current
    process (plus its share of total system memory as a percentage), the
    process's virtual memory size, and the system's free and available memory.
    Sizes are formatted with ``naturalsize``.

    Parameters
    ----------
    after : str, optional
        Label of the step that has just finished. When truthy it is appended
        to the line as ``"; after <label>"``.
    """
    system = psutil.virtual_memory()
    proc_info = psutil.Process(os.getpid()).memory_info()
    rss_share = proc_info.rss * 100 / system.total

    fields = [
        f"{naturalsize(proc_info.rss)} ({rss_share:.1f}%) rss",
        f"{naturalsize(proc_info.vms)} vms",
        f"{naturalsize(system.free)} free",
        f"{naturalsize(system.available)} avail",
    ]
    report = ", ".join(fields)
    if after:
        report += "; after " + after

    # Flush immediately so the reading is emitted before the next step runs.
    print(report, file=sys.stdout, flush=True)

In [4]:
# Memory reading taken ahead of the Dask client and dataset cells below.
mem()

120.8 MB (0.8%) rss, 1.1 GB vms, 12.6 GB free, 14.3 GB avail


In [5]:
# Show the value of the MALLOC_MMAP_THRESHOLD_ environment variable seen by
# this kernel. `.get` with a visible placeholder keeps the cell from raising
# KeyError (and halting a Run All) on machines where the variable is not set.
os.environ.get("MALLOC_MMAP_THRESHOLD_", "<unset>")

'16384'

In [6]:
from dask.distributed import Client

# Attach to an already-running Dask scheduler.
# NOTE(review): the scheduler address is hard-coded, so this cell presumably
# only works while that particular scheduler is up. The recorded repr shows
# 0 workers, and the KubeCluster cell further down rebinds `client`.
client = Client("tcp://10.34.4.118:45573")
client

0,1
Client  Scheduler: tcp://10.34.4.118:45573  Dashboard: /user/alimanfoo@googlemail.com/proxy/38639/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [6]:
import logging
# Only let ERROR-level (and above) records through for these `distributed`
# loggers, to cut down on log noise in the notebook output.
logging.getLogger("distributed.scheduler").setLevel(logging.ERROR)
logging.getLogger("distributed.core").setLevel(logging.ERROR)
logging.getLogger("distributed.deploy.adaptive").setLevel(logging.ERROR)
logging.getLogger("distributed.utils_perf").setLevel(logging.ERROR)
logging.getLogger("distributed.batched").setLevel(logging.ERROR)
from dask_kubernetes import KubeCluster
from dask.distributed import Client
# Request a 50-worker cluster. EXTRA_PIP_PACKAGES is passed in the worker
# environment — presumably so each worker installs malariagen_data at start-up
# (TODO confirm against the worker image).
cluster = KubeCluster(n_workers=50, 
                      env={'EXTRA_PIP_PACKAGES': 'malariagen_data'})
# Bind `client` to this cluster; this replaces any `client` created earlier.
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://10.34.4.118:36147  Dashboard: /user/alimanfoo@googlemail.com/proxy/8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [7]:
# Memory reading following the client/cluster cells above.
mem()

214.7 MB (1.4%) rss, 1.5 GB vms, 12.6 GB free, 14.3 GB avail


In [8]:
import malariagen_data
# Data accessor pointed at the gs://vo_agam_release bucket; `ag3` is used as a
# notebook-level global by compute_ac and compute_ac_xarray.
ag3 = malariagen_data.Ag3("gs://vo_agam_release")

In [9]:
# Load the per-sample metadata table and display it. `df_samples` is read as a
# notebook-level global by compute_ac / compute_ac_xarray, which evaluate
# population query strings against it.
df_samples = ag3.sample_metadata()
df_samples

Unnamed: 0,sample_id,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call,sample_set,release,aim_fraction_colu,aim_fraction_arab,species_gambcolu_arabiensis,species_gambiae_coluzzii,species
0,AR0047-C,LUA047,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,v3,0.945,0.001,gamb_colu,coluzzii,coluzzii
1,AR0049-C,LUA049,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,v3,0.933,0.001,gamb_colu,coluzzii,coluzzii
2,AR0051-C,LUA051,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,v3,0.937,0.002,gamb_colu,coluzzii,coluzzii
3,AR0061-C,LUA061,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,v3,0.938,0.002,gamb_colu,coluzzii,coluzzii
4,AR0078-C,LUA078,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,v3,0.926,0.001,gamb_colu,coluzzii,coluzzii
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2779,AC0295-C,K92,Martin Donnelly,Uganda,Kihihi,2012,11,-0.751,29.701,F,AG1000G-UG,v3,0.026,0.002,gamb_colu,gambiae,gambiae
2780,AC0296-C,K93,Martin Donnelly,Uganda,Kihihi,2012,11,-0.751,29.701,F,AG1000G-UG,v3,0.029,0.003,gamb_colu,gambiae,gambiae
2781,AC0297-C,K94,Martin Donnelly,Uganda,Kihihi,2012,11,-0.751,29.701,F,AG1000G-UG,v3,0.026,0.002,gamb_colu,gambiae,gambiae
2782,AC0298-C,K95,Martin Donnelly,Uganda,Kihihi,2012,11,-0.751,29.701,F,AG1000G-UG,v3,0.029,0.002,gamb_colu,gambiae,gambiae


In [10]:
# Evaluate a population query string against the metadata and take the result
# as a boolean NumPy array — the same expression form the `pop` argument of
# compute_ac / compute_ac_xarray expects.
df_samples.eval("species == 'gambiae'").values

array([False, False, False, ...,  True,  True,  True])

In [11]:
def compute_ac(contig, pop=None, site_mask=None):
    """Count alleles on one contig via the dask-array API, logging memory per stage.

    Uses the notebook-level globals ``ag3`` (data accessor), ``df_samples``
    (sample metadata frame) and ``mem`` (memory reporter). ``mem`` is called
    after each stage with a label naming that stage.

    Parameters
    ----------
    contig : str
        Contig name forwarded to ``ag3.snp_genotypes`` (e.g. ``"2R"``).
    pop : str, optional
        Query expression evaluated with ``df_samples.eval``; only samples for
        which it is true are kept. ``None`` keeps every sample.
    site_mask : str, optional
        Forwarded unchanged to ``ag3.snp_genotypes``.

    Returns
    -------
    None
        The number of segregating sites is printed; the allele counts
        themselves are not returned.
    """
    genotypes = ag3.snp_genotypes(contig=contig, site_mask=site_mask)
    mem("access genotypes")

    if pop is not None:
        # Boolean selector over samples, built from the metadata query.
        sample_selector = df_samples.eval(pop).values
        n_selected = sample_selector.sum()
        selector_size = naturalsize(sample_selector.nbytes)
        print(f'{n_selected} samples, {selector_size}')
        mem("locate samples")
        # Keep only the selected samples (axis 1 of the genotype array).
        genotypes = da.compress(sample_selector, genotypes, axis=1)
        mem("select samples")

    # Build the allele-count computation; nothing is computed until .compute().
    counts = allel.GenotypeDaskArray(genotypes).count_alleles(max_allele=3)
    mem("setup allele counts")

    counts = counts.compute()
    mem("compute allele counts")
    n_segregating = counts.is_segregating().sum()
    print(n_segregating)
    # return counts  (left disabled; every call in this notebook ignores the result)

In [27]:
def compute_ac_xarray(contig, pop=None, site_mask=None):
    """Count alleles on one contig via the dataset API, logging memory per stage.

    Counterpart of ``compute_ac`` that starts from ``ag3.snp_calls`` and selects
    samples with ``isel`` on the dataset. Uses the notebook-level globals
    ``ag3``, ``df_samples`` and ``mem``.

    Parameters
    ----------
    contig : str
        Contig name forwarded to ``ag3.snp_calls`` (e.g. ``"2R"``).
    pop : str, optional
        Query expression evaluated with ``df_samples.eval``; only samples for
        which it is true are kept. ``None`` keeps every sample.
    site_mask : str, optional
        Forwarded unchanged to ``ag3.snp_calls``.

    Returns
    -------
    None
        The number of segregating sites is printed; the allele counts
        themselves are not returned.
    """
    calls = ag3.snp_calls(contig=contig, site_mask=site_mask)
    mem("access dataset")

    if pop is not None:
        # Boolean selector over samples, built from the metadata query.
        sample_selector = df_samples.eval(pop).values
        n_selected = sample_selector.sum()
        selector_size = naturalsize(sample_selector.nbytes)
        print(f'{n_selected} samples, {selector_size}')
        mem("locate samples")
        # Subset the whole dataset along its "samples" dimension.
        calls = calls.isel(samples=sample_selector)
        mem("select samples")

    # Hand the array backing the genotype variable to scikit-allel.
    genotype_var = calls["call_genotype"]
    counts = allel.GenotypeDaskArray(genotype_var.data).count_alleles(max_allele=3)
    mem("setup allele counts")

    counts = counts.compute()
    mem("compute allele counts")
    n_segregating = counts.is_segregating().sum()
    print(n_segregating)
    # return counts  (left disabled; every call in this notebook ignores the result)

In [14]:
%%time
# Dask-array path: arabiensis samples on 2R with the gamb_colu_arab site mask.
# The mem() calls either side show the net change in memory across the call.
mem()
compute_ac("2R", pop="species == 'arabiensis'", site_mask="gamb_colu_arab")
mem()

430.6 MB (2.7%) rss, 2.0 GB vms, 12.4 GB free, 14.1 GB avail
431.6 MB (2.7%) rss, 2.0 GB vms, 12.4 GB free, 14.1 GB avail; after access genotypes
368 samples, 2.8 kB
431.6 MB (2.7%) rss, 2.0 GB vms, 12.4 GB free, 14.1 GB avail; after locate samples
431.6 MB (2.7%) rss, 2.0 GB vms, 12.4 GB free, 14.1 GB avail; after select samples
431.6 MB (2.7%) rss, 2.0 GB vms, 12.4 GB free, 14.1 GB avail; after setup allele counts
1.1 GB (7.3%) rss, 2.7 GB vms, 11.6 GB free, 13.4 GB avail; after compute allele counts
5166190
494.5 MB (3.1%) rss, 2.1 GB vms, 12.3 GB free, 14.0 GB avail
CPU times: user 20.5 s, sys: 2.63 s, total: 23.1 s
Wall time: 24.9 s


In [17]:
%%time
# Dataset path: all samples on 2R, no site mask.
mem()
compute_ac_xarray("2R", pop=None, site_mask=None)
mem()

729.9 MB (4.6%) rss, 2.3 GB vms, 12.1 GB free, 13.8 GB avail
735.4 MB (4.7%) rss, 2.3 GB vms, 12.1 GB free, 13.8 GB avail; after access dataset
735.4 MB (4.7%) rss, 2.3 GB vms, 12.1 GB free, 13.8 GB avail; after setup allele counts
1.7 GB (11.0%) rss, 3.3 GB vms, 11.1 GB free, 12.8 GB avail; after compute allele counts
41884531
771.6 MB (4.9%) rss, 2.3 GB vms, 12.0 GB free, 13.7 GB avail
CPU times: user 57.6 s, sys: 4.48 s, total: 1min 2s
Wall time: 1min 1s


In [18]:
%%time
# Dataset path: all samples on 2R, restricted to the gamb_colu_arab site mask.
mem()
compute_ac_xarray("2R", pop=None, site_mask="gamb_colu_arab")
mem()

771.6 MB (4.9%) rss, 2.3 GB vms, 12.0 GB free, 13.7 GB avail
775.9 MB (4.9%) rss, 2.3 GB vms, 12.0 GB free, 13.8 GB avail; after access dataset
775.9 MB (4.9%) rss, 2.3 GB vms, 12.0 GB free, 13.8 GB avail; after setup allele counts
1.6 GB (10.2%) rss, 3.2 GB vms, 11.2 GB free, 12.9 GB avail; after compute allele counts
25834162
949.9 MB (6.0%) rss, 2.5 GB vms, 11.8 GB free, 13.6 GB avail
CPU times: user 1min 14s, sys: 4.73 s, total: 1min 19s
Wall time: 1min 19s


In [30]:
# Restart the cluster through the client before the next timed run.
# The recorded repr shows 50 workers / 150 cores / 600 GB after the restart.
client.restart()

0,1
Client  Scheduler: tcp://10.34.4.118:36147  Dashboard: /user/alimanfoo@googlemail.com/proxy/8787/status,Cluster  Workers: 50  Cores: 150  Memory: 600.00 GB


In [31]:
%%time
# Dataset path: arabiensis samples on 2R with the gamb_colu_arab site mask —
# the same selection as the compute_ac("2R", ...) cell, for comparison.
mem()
compute_ac_xarray("2R", pop="species == 'arabiensis'", site_mask="gamb_colu_arab")
mem()

958.5 MB (6.1%) rss, 2.6 GB vms, 11.9 GB free, 13.6 GB avail
962.1 MB (6.1%) rss, 2.6 GB vms, 11.9 GB free, 13.6 GB avail; after access dataset
368 samples, 2.8 kB
962.1 MB (6.1%) rss, 2.6 GB vms, 11.9 GB free, 13.6 GB avail; after locate samples
962.7 MB (6.1%) rss, 2.6 GB vms, 11.9 GB free, 13.6 GB avail; after select samples
962.7 MB (6.1%) rss, 2.6 GB vms, 11.9 GB free, 13.6 GB avail; after setup allele counts
1.6 GB (10.2%) rss, 3.3 GB vms, 11.2 GB free, 12.9 GB avail; after compute allele counts
5166190
962.1 MB (6.1%) rss, 2.6 GB vms, 11.8 GB free, 13.5 GB avail
CPU times: user 30.3 s, sys: 3.08 s, total: 33.4 s
Wall time: 38.1 s


In [20]:
# Open the SNP calls for 3L without a site mask and display the dataset.
# The recorded repr shows arrays over 40,758,473 variants and 2,784 samples.
ds = ag3.snp_calls(contig="3L")
ds

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,524.29 kB
Shape,"(40758473,)","(524288,)"
Count,78 Tasks,78 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 40.76 MB 524.29 kB Shape (40758473,) (524288,) Count 78 Tasks 78 Chunks Type uint8 numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,524.29 kB
Shape,"(40758473,)","(524288,)"
Count,78 Tasks,78 Chunks
Type,uint8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,163.03 MB,2.10 MB
Shape,"(40758473,)","(524288,)"
Count,79 Tasks,78 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 163.03 MB 2.10 MB Shape (40758473,) (524288,) Count 79 Tasks 78 Chunks Type int32 numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,163.03 MB,2.10 MB
Shape,"(40758473,)","(524288,)"
Count,79 Tasks,78 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,66.82 kB,7.27 kB
Shape,"(2784,)","(303,)"
Count,81 Tasks,27 Chunks
Type,|S24,numpy.ndarray
"Array Chunk Bytes 66.82 kB 7.27 kB Shape (2784,) (303,) Count 81 Tasks 27 Chunks Type |S24 numpy.ndarray",2784  1,

Unnamed: 0,Array,Chunk
Bytes,66.82 kB,7.27 kB
Shape,"(2784,)","(303,)"
Count,81 Tasks,27 Chunks
Type,|S24,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,163.03 MB,1.57 MB
Shape,"(40758473, 4)","(524288, 3)"
Count,392 Tasks,156 Chunks
Type,|S1,numpy.ndarray
"Array Chunk Bytes 163.03 MB 1.57 MB Shape (40758473, 4) (524288, 3) Count 392 Tasks 156 Chunks Type |S1 numpy.ndarray",4  40758473,

Unnamed: 0,Array,Chunk
Bytes,163.03 MB,1.57 MB
Shape,"(40758473, 4)","(524288, 3)"
Count,392 Tasks,156 Chunks
Type,|S1,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,300.00 kB
Shape,"(40758473,)","(300000,)"
Count,137 Tasks,136 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 40.76 MB 300.00 kB Shape (40758473,) (300000,) Count 137 Tasks 136 Chunks Type bool numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,300.00 kB
Shape,"(40758473,)","(300000,)"
Count,137 Tasks,136 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,300.00 kB
Shape,"(40758473,)","(300000,)"
Count,137 Tasks,136 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 40.76 MB 300.00 kB Shape (40758473,) (300000,) Count 137 Tasks 136 Chunks Type bool numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,300.00 kB
Shape,"(40758473,)","(300000,)"
Count,137 Tasks,136 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,300.00 kB
Shape,"(40758473,)","(300000,)"
Count,137 Tasks,136 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 40.76 MB 300.00 kB Shape (40758473,) (300000,) Count 137 Tasks 136 Chunks Type bool numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,300.00 kB
Shape,"(40758473,)","(300000,)"
Count,137 Tasks,136 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,226.94 GB,30.00 MB
Shape,"(40758473, 2784, 2)","(300000, 50, 2)"
Count,18523 Tasks,9248 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 226.94 GB 30.00 MB Shape (40758473, 2784, 2) (300000, 50, 2) Count 18523 Tasks 9248 Chunks Type int8 numpy.ndarray",2  2784  40758473,

Unnamed: 0,Array,Chunk
Bytes,226.94 GB,30.00 MB
Shape,"(40758473, 2784, 2)","(300000, 50, 2)"
Count,18523 Tasks,9248 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,226.94 GB,30.00 MB
Shape,"(40758473, 2784)","(300000, 50)"
Count,18523 Tasks,9248 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 226.94 GB 30.00 MB Shape (40758473, 2784) (300000, 50) Count 18523 Tasks 9248 Chunks Type int16 numpy.ndarray",2784  40758473,

Unnamed: 0,Array,Chunk
Bytes,226.94 GB,30.00 MB
Shape,"(40758473, 2784)","(300000, 50)"
Count,18523 Tasks,9248 Chunks
Type,int16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,226.94 GB,30.00 MB
Shape,"(40758473, 2784)","(300000, 50)"
Count,18523 Tasks,9248 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 226.94 GB 30.00 MB Shape (40758473, 2784) (300000, 50) Count 18523 Tasks 9248 Chunks Type int16 numpy.ndarray",2784  40758473,

Unnamed: 0,Array,Chunk
Bytes,226.94 GB,30.00 MB
Shape,"(40758473, 2784)","(300000, 50)"
Count,18523 Tasks,9248 Chunks
Type,int16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,907.77 GB,120.00 MB
Shape,"(40758473, 2784, 4)","(300000, 50, 4)"
Count,18523 Tasks,9248 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 907.77 GB 120.00 MB Shape (40758473, 2784, 4) (300000, 50, 4) Count 18523 Tasks 9248 Chunks Type int16 numpy.ndarray",4  2784  40758473,

Unnamed: 0,Array,Chunk
Bytes,907.77 GB,120.00 MB
Shape,"(40758473, 2784, 4)","(300000, 50, 4)"
Count,18523 Tasks,9248 Chunks
Type,int16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,226.94 GB,30.00 MB
Shape,"(40758473, 2784, 2)","(300000, 50, 2)"
Count,27771 Tasks,9248 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 226.94 GB 30.00 MB Shape (40758473, 2784, 2) (300000, 50, 2) Count 27771 Tasks 9248 Chunks Type bool numpy.ndarray",2  2784  40758473,

Unnamed: 0,Array,Chunk
Bytes,226.94 GB,30.00 MB
Shape,"(40758473, 2784, 2)","(300000, 50, 2)"
Count,27771 Tasks,9248 Chunks
Type,bool,numpy.ndarray


In [21]:
# Same contig with the gamb_colu_arab site mask applied; the recorded repr
# shows 25,869,385 variants remaining.
# NOTE(review): this rebinds `ds`, so later cells see whichever of the two
# snp_calls cells was executed most recently.
ds = ag3.snp_calls(contig="3L", site_mask="gamb_colu_arab")
ds

Unnamed: 0,Array,Chunk
Bytes,25.87 MB,429.06 kB
Shape,"(25869385,)","(429065,)"
Count,1434 Tasks,78 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 25.87 MB 429.06 kB Shape (25869385,) (429065,) Count 1434 Tasks 78 Chunks Type uint8 numpy.ndarray",25869385  1,

Unnamed: 0,Array,Chunk
Bytes,25.87 MB,429.06 kB
Shape,"(25869385,)","(429065,)"
Count,1434 Tasks,78 Chunks
Type,uint8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,103.48 MB,1.72 MB
Shape,"(25869385,)","(429065,)"
Count,1435 Tasks,78 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 103.48 MB 1.72 MB Shape (25869385,) (429065,) Count 1435 Tasks 78 Chunks Type int32 numpy.ndarray",25869385  1,

Unnamed: 0,Array,Chunk
Bytes,103.48 MB,1.72 MB
Shape,"(25869385,)","(429065,)"
Count,1435 Tasks,78 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,66.82 kB,7.27 kB
Shape,"(2784,)","(303,)"
Count,81 Tasks,27 Chunks
Type,|S24,numpy.ndarray
"Array Chunk Bytes 66.82 kB 7.27 kB Shape (2784,) (303,) Count 81 Tasks 27 Chunks Type |S24 numpy.ndarray",2784  1,

Unnamed: 0,Array,Chunk
Bytes,66.82 kB,7.27 kB
Shape,"(2784,)","(303,)"
Count,81 Tasks,27 Chunks
Type,|S24,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,103.48 MB,1.29 MB
Shape,"(25869385, 4)","(429065, 3)"
Count,2174 Tasks,156 Chunks
Type,|S1,numpy.ndarray
"Array Chunk Bytes 103.48 MB 1.29 MB Shape (25869385, 4) (429065, 3) Count 2174 Tasks 156 Chunks Type |S1 numpy.ndarray",4  25869385,

Unnamed: 0,Array,Chunk
Bytes,103.48 MB,1.29 MB
Shape,"(25869385, 4)","(429065, 3)"
Count,2174 Tasks,156 Chunks
Type,|S1,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,25.87 MB,269.68 kB
Shape,"(25869385,)","(269675,)"
Count,546 Tasks,136 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 25.87 MB 269.68 kB Shape (25869385,) (269675,) Count 546 Tasks 136 Chunks Type bool numpy.ndarray",25869385  1,

Unnamed: 0,Array,Chunk
Bytes,25.87 MB,269.68 kB
Shape,"(25869385,)","(269675,)"
Count,546 Tasks,136 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,25.87 MB,269.68 kB
Shape,"(25869385,)","(269675,)"
Count,409 Tasks,136 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 25.87 MB 269.68 kB Shape (25869385,) (269675,) Count 409 Tasks 136 Chunks Type bool numpy.ndarray",25869385  1,

Unnamed: 0,Array,Chunk
Bytes,25.87 MB,269.68 kB
Shape,"(25869385,)","(269675,)"
Count,409 Tasks,136 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,25.87 MB,269.68 kB
Shape,"(25869385,)","(269675,)"
Count,546 Tasks,136 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 25.87 MB 269.68 kB Shape (25869385,) (269675,) Count 546 Tasks 136 Chunks Type bool numpy.ndarray",25869385  1,

Unnamed: 0,Array,Chunk
Bytes,25.87 MB,269.68 kB
Shape,"(25869385,)","(269675,)"
Count,546 Tasks,136 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,144.04 GB,26.97 MB
Shape,"(25869385, 2784, 2)","(269675, 50, 2)"
Count,27908 Tasks,9248 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 144.04 GB 26.97 MB Shape (25869385, 2784, 2) (269675, 50, 2) Count 27908 Tasks 9248 Chunks Type int8 numpy.ndarray",2  2784  25869385,

Unnamed: 0,Array,Chunk
Bytes,144.04 GB,26.97 MB
Shape,"(25869385, 2784, 2)","(269675, 50, 2)"
Count,27908 Tasks,9248 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,144.04 GB,26.97 MB
Shape,"(25869385, 2784)","(269675, 50)"
Count,27908 Tasks,9248 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 144.04 GB 26.97 MB Shape (25869385, 2784) (269675, 50) Count 27908 Tasks 9248 Chunks Type int16 numpy.ndarray",2784  25869385,

Unnamed: 0,Array,Chunk
Bytes,144.04 GB,26.97 MB
Shape,"(25869385, 2784)","(269675, 50)"
Count,27908 Tasks,9248 Chunks
Type,int16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,144.04 GB,26.97 MB
Shape,"(25869385, 2784)","(269675, 50)"
Count,27908 Tasks,9248 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 144.04 GB 26.97 MB Shape (25869385, 2784) (269675, 50) Count 27908 Tasks 9248 Chunks Type int16 numpy.ndarray",2784  25869385,

Unnamed: 0,Array,Chunk
Bytes,144.04 GB,26.97 MB
Shape,"(25869385, 2784)","(269675, 50)"
Count,27908 Tasks,9248 Chunks
Type,int16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,576.16 GB,107.87 MB
Shape,"(25869385, 2784, 4)","(269675, 50, 4)"
Count,27908 Tasks,9248 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 576.16 GB 107.87 MB Shape (25869385, 2784, 4) (269675, 50, 4) Count 27908 Tasks 9248 Chunks Type int16 numpy.ndarray",4  2784  25869385,

Unnamed: 0,Array,Chunk
Bytes,576.16 GB,107.87 MB
Shape,"(25869385, 2784, 4)","(269675, 50, 4)"
Count,27908 Tasks,9248 Chunks
Type,int16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,144.04 GB,26.97 MB
Shape,"(25869385, 2784, 2)","(269675, 50, 2)"
Count,37156 Tasks,9248 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 144.04 GB 26.97 MB Shape (25869385, 2784, 2) (269675, 50, 2) Count 37156 Tasks 9248 Chunks Type bool numpy.ndarray",2  2784  25869385,

Unnamed: 0,Array,Chunk
Bytes,144.04 GB,26.97 MB
Shape,"(25869385, 2784, 2)","(269675, 50, 2)"
Count,37156 Tasks,9248 Chunks
Type,bool,numpy.ndarray


In [24]:
# Elementwise comparison on a dataset variable; the recorded repr shows a bool
# array that is still chunked (78 chunks), i.e. nothing has been computed yet.
ds['variant_position'] > 0

Unnamed: 0,Array,Chunk
Bytes,25.87 MB,429.06 kB
Shape,"(25869385,)","(429065,)"
Count,1513 Tasks,78 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 25.87 MB 429.06 kB Shape (25869385,) (429065,) Count 1513 Tasks 78 Chunks Type bool numpy.ndarray",25869385  1,

Unnamed: 0,Array,Chunk
Bytes,25.87 MB,429.06 kB
Shape,"(25869385,)","(429065,)"
Count,1513 Tasks,78 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,25.87 MB,429.06 kB
Shape,"(25869385,)","(429065,)"
Count,1434 Tasks,78 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 25.87 MB 429.06 kB Shape (25869385,) (429065,) Count 1434 Tasks 78 Chunks Type uint8 numpy.ndarray",25869385  1,

Unnamed: 0,Array,Chunk
Bytes,25.87 MB,429.06 kB
Shape,"(25869385,)","(429065,)"
Count,1434 Tasks,78 Chunks
Type,uint8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,103.48 MB,1.72 MB
Shape,"(25869385,)","(429065,)"
Count,1435 Tasks,78 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 103.48 MB 1.72 MB Shape (25869385,) (429065,) Count 1435 Tasks 78 Chunks Type int32 numpy.ndarray",25869385  1,

Unnamed: 0,Array,Chunk
Bytes,103.48 MB,1.72 MB
Shape,"(25869385,)","(429065,)"
Count,1435 Tasks,78 Chunks
Type,int32,numpy.ndarray


In [22]:
# The array object backing the variable (a chunked dask array per the repr).
ds['variant_position'].data

Unnamed: 0,Array,Chunk
Bytes,103.48 MB,1.72 MB
Shape,"(25869385,)","(429065,)"
Count,1435 Tasks,78 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 103.48 MB 1.72 MB Shape (25869385,) (429065,) Count 1435 Tasks 78 Chunks Type int32 numpy.ndarray",25869385  1,

Unnamed: 0,Array,Chunk
Bytes,103.48 MB,1.72 MB
Shape,"(25869385,)","(429065,)"
Count,1435 Tasks,78 Chunks
Type,int32,numpy.ndarray


In [23]:
# Chunk sizes along the variants axis; with the site mask applied they are
# irregular (see the recorded tuple below).
ds['variant_position'].data.chunks

((85223,
  106329,
  166370,
  378897,
  316354,
  364909,
  383980,
  126864,
  260767,
  397768,
  335595,
  381383,
  211527,
  193963,
  285880,
  322009,
  214506,
  241811,
  318916,
  247707,
  291323,
  355570,
  362297,
  337299,
  280439,
  342911,
  324348,
  379953,
  227643,
  280983,
  417060,
  400454,
  323202,
  394005,
  371274,
  389359,
  363828,
  283667,
  340620,
  346479,
  336413,
  365200,
  269059,
  311252,
  328588,
  308938,
  323037,
  357377,
  343327,
  328571,
  393493,
  381763,
  330627,
  357638,
  405567,
  371890,
  389014,
  348121,
  385275,
  355984,
  376661,
  358619,
  379617,
  396204,
  375173,
  378047,
  390042,
  227437,
  388324,
  320683,
  387689,
  383912,
  410886,
  394379,
  429065,
  418433,
  426253,
  283355),)

In [19]:
# Attempt to add a derived variable to the dataset as a (dims, data) tuple.
# NOTE(review): the recorded run of this cell ended in
# "TypeError: unhashable type: 'numpy.ndarray'". The data element here is the
# result of comparing a dataset variable (presumably an xarray DataArray that
# already carries its dims), not a raw array — possibly assigning
# `ds['call_GQ'] < 0` directly, or its `.data`, was intended. Confirm before
# relying on `foo`.
ds['foo'] = ('variants', 'samples'), ds['call_GQ'] < 0
ds

TypeError: unhashable type: 'numpy.ndarray'

In [14]:
# Site-filter variable for the gamb_colu_arab mask, used below as a boolean
# selector over variants. The recorded repr has 40,758,473 entries, so `ds`
# was the unmasked 3L dataset when this cell ran.
loc_sites = ds['variant_filter_pass_gamb_colu_arab']
loc_sites

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,300.00 kB
Shape,"(40758473,)","(300000,)"
Count,137 Tasks,136 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 40.76 MB 300.00 kB Shape (40758473,) (300000,) Count 137 Tasks 136 Chunks Type bool numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,300.00 kB
Shape,"(40758473,)","(300000,)"
Count,137 Tasks,136 Chunks
Type,bool,numpy.ndarray


In [16]:
# The array backing the filter variable (chunked bool array per the repr).
loc_sites.data

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,300.00 kB
Shape,"(40758473,)","(300000,)"
Count,137 Tasks,136 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 40.76 MB 300.00 kB Shape (40758473,) (300000,) Count 137 Tasks 136 Chunks Type bool numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,300.00 kB
Shape,"(40758473,)","(300000,)"
Count,137 Tasks,136 Chunks
Type,bool,numpy.ndarray


In [17]:
# Subset the whole dataset to passing sites using the backing array as the
# boolean indexer, with memory readings either side.
# The recorded output shows rss going from 323.2 MB to 4.9 GB across this one
# call. NOTE(review): that suggests the selection does substantial eager work
# in the notebook process rather than staying lazy — confirm.
mem()
ds_fp = ds.isel(variants=loc_sites.data)
mem('select sites')

323.2 MB (2.1%) rss, 2.0 GB vms, 12.4 GB free, 14.3 GB avail
4.9 GB (31.0%) rss, 6.6 GB vms, 7.8 GB free, 9.7 GB avail; after select sites


In [29]:
# Same selection restricted to the genotype variable, this time passing the
# filter variable itself (not its .data) as the indexer.
# Recorded rss is 6.1 GB before and after.
mem()
a = ds['call_genotype'].isel(variants=loc_sites)
mem()
a

6.1 GB (38.5%) rss, 7.7 GB vms, 6.7 GB free, 8.6 GB avail
6.1 GB (38.5%) rss, 7.7 GB vms, 6.7 GB free, 8.5 GB avail


Unnamed: 0,Array,Chunk
Bytes,144.04 GB,26.97 MB
Shape,"(25869385, 2784, 2)","(269675, 50, 2)"
Count,27771 Tasks,9248 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 144.04 GB 26.97 MB Shape (25869385, 2784, 2) (269675, 50, 2) Count 27771 Tasks 9248 Chunks Type int8 numpy.ndarray",2  2784  25869385,

Unnamed: 0,Array,Chunk
Bytes,144.04 GB,26.97 MB
Shape,"(25869385, 2784, 2)","(269675, 50, 2)"
Count,27771 Tasks,9248 Chunks
Type,int8,numpy.ndarray


In [None]:
# Force a full reduction over the selected genotypes.
# NOTE(review): the execution count is empty and only the mem() line was
# recorded, so no result for the sum is available in this notebook.
mem()
a.sum().compute()

6.1 GB (38.5%) rss, 7.7 GB vms, 6.7 GB free, 8.6 GB avail


In [25]:
# Variant of the previous selection that passes the backing array
# (loc_sites.data) as the indexer; rebinds `a`.
mem()
a = ds['call_genotype'].isel(variants=loc_sites.data)
mem()

5.8 GB (36.6%) rss, 7.5 GB vms, 7.0 GB free, 8.9 GB avail
5.8 GB (36.7%) rss, 7.5 GB vms, 7.0 GB free, 8.8 GB avail


In [27]:
# Bypass the dataset wrapper: index the genotype dask array directly with the
# boolean dask array of passing sites.
mem()
b = ds['call_genotype'].data[loc_sites.data]
# Boolean indexing with a dask array leaves chunk sizes unknown; this resolves
# them so the array has a concrete shape (see the repr below).
b.compute_chunk_sizes()
mem()
b

5.8 GB (36.7%) rss, 7.5 GB vms, 7.0 GB free, 8.8 GB avail
5.8 GB (36.8%) rss, 7.5 GB vms, 7.0 GB free, 8.8 GB avail


Unnamed: 0,Array,Chunk
Bytes,144.04 GB,26.97 MB
Shape,"(25869385, 2784, 2)","(269675, 50, 2)"
Count,27908 Tasks,9248 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 144.04 GB 26.97 MB Shape (25869385, 2784, 2) (269675, 50, 2) Count 27908 Tasks 9248 Chunks Type int8 numpy.ndarray",2  2784  25869385,

Unnamed: 0,Array,Chunk
Bytes,144.04 GB,26.97 MB
Shape,"(25869385, 2784, 2)","(269675, 50, 2)"
Count,27908 Tasks,9248 Chunks
Type,int8,numpy.ndarray


In [28]:
# Full reduction over the directly-indexed array; completed with the value
# recorded below.
b.sum().compute()

2501042161

In [22]:
# Display the isel-based selection for comparison with `b`.
a

Unnamed: 0,Array,Chunk
Bytes,144.04 GB,26.97 MB
Shape,"(25869385, 2784, 2)","(269675, 50, 2)"
Count,27771 Tasks,9248 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 144.04 GB 26.97 MB Shape (25869385, 2784, 2) (269675, 50, 2) Count 27771 Tasks 9248 Chunks Type int8 numpy.ndarray",2  2784  25869385,

Unnamed: 0,Array,Chunk
Bytes,144.04 GB,26.97 MB
Shape,"(25869385, 2784, 2)","(269675, 50, 2)"
Count,27771 Tasks,9248 Chunks
Type,int8,numpy.ndarray


In [None]:
# Mean of call_MQ over the site-filtered dataset.
# NOTE(review): no execution count or output is recorded for this cell.
ds_fp['call_MQ'].mean().compute()

In [17]:
%%time
# Dataset path: arabiensis samples on 2R, no site mask.
mem()
compute_ac_xarray("2R", pop="species == 'arabiensis'", site_mask=None)
mem()

529.9 MB (3.4%) rss, 2.1 GB vms, 12.1 GB free, 14.0 GB avail
533.7 MB (3.4%) rss, 2.1 GB vms, 12.1 GB free, 14.0 GB avail; after access dataset
368 samples, 2.8 kB
533.7 MB (3.4%) rss, 2.1 GB vms, 12.1 GB free, 14.0 GB avail; after locate samples
534.2 MB (3.4%) rss, 2.1 GB vms, 12.1 GB free, 14.0 GB avail; after select samples
534.5 MB (3.4%) rss, 2.1 GB vms, 12.1 GB free, 14.0 GB avail; after setup allele counts
1.6 GB (9.8%) rss, 3.1 GB vms, 11.1 GB free, 13.0 GB avail; after compute allele counts
12370554
586.0 MB (3.7%) rss, 2.2 GB vms, 12.0 GB free, 13.9 GB avail
CPU times: user 31.8 s, sys: 5.82 s, total: 37.6 s
Wall time: 38 s


In [13]:
# Memory reading before the next timed run.
mem()

424.0 MB (2.7%) rss, 2.0 GB vms, 12.2 GB free, 14.1 GB avail


In [14]:
%%time
# Dask-array path: gambiae samples on 3R with the gamb_colu_arab site mask.
mem()
compute_ac("3R", pop="species == 'gambiae'", site_mask="gamb_colu_arab")
mem()

424.0 MB (2.7%) rss, 2.0 GB vms, 12.2 GB free, 14.1 GB avail
424.8 MB (2.7%) rss, 2.0 GB vms, 12.2 GB free, 14.1 GB avail; after access genotypes
1571 samples, 2.8 kB
424.8 MB (2.7%) rss, 2.0 GB vms, 12.2 GB free, 14.1 GB avail; after locate samples
425.1 MB (2.7%) rss, 2.0 GB vms, 12.2 GB free, 14.1 GB avail; after select samples
425.1 MB (2.7%) rss, 2.0 GB vms, 12.2 GB free, 14.1 GB avail; after setup allele counts
1.3 GB (8.1%) rss, 2.8 GB vms, 11.3 GB free, 13.2 GB avail; after compute allele counts
18294658
742.4 MB (4.7%) rss, 2.3 GB vms, 11.9 GB free, 13.8 GB avail
CPU times: user 53.8 s, sys: 4.18 s, total: 58 s
Wall time: 57.1 s


In [15]:
def clear_logs(dask_scheduler):
    """Empty the scheduler's in-memory log containers.

    Calls ``.clear()`` on the ``log``, ``transition_log`` and ``events``
    attributes of ``dask_scheduler``, in that order.

    Parameters
    ----------
    dask_scheduler : object
        The scheduler instance. When this function is run through
        ``client.run_on_scheduler``, the scheduler is supplied via this
        specially named argument.
    """
    for attr_name in ("log", "transition_log", "events"):
        getattr(dask_scheduler, attr_name).clear()
    

In [16]:
# Run clear_logs inside the scheduler process and compare memory readings
# taken before and after (735.9 MB vs 742.4 MB rss in the recorded output).
mem()
client.run_on_scheduler(clear_logs)
mem('clear scheduler logs')

742.4 MB (4.7%) rss, 2.3 GB vms, 11.9 GB free, 13.8 GB avail


distributed.worker - INFO - Run out-of-band function 'clear_logs'


735.9 MB (4.7%) rss, 2.3 GB vms, 11.9 GB free, 13.8 GB avail; after clear scheduler logs


UsageError: Line magic function `%memit` not found.


In [14]:
# NOTE(review): the recorded output ("... / ... (x %) used") does not match the
# line format produced by mem() as defined in this notebook, so it comes from
# an earlier version of the helper — re-run to refresh.
mem()

2.1 GB / 15.8 GB (15.1 %) used


In [11]:
# Check whether restarting the cluster changes memory use in this process;
# the recorded readings are 16.9 % before and 16.8 % after.
mem()
client.restart()
mem()

2.4 GB / 15.8 GB (16.9 %) used
2.4 GB / 15.8 GB (16.8 %) used


In [20]:
import dask

In [21]:
# Record the dask version in use for these measurements.
dask.__version__

'2.9.0'

In [22]:
import distributed

In [23]:
# Record the distributed version in use for these measurements.
distributed.__version__

'2.9.0'

In [13]:
# Shut the cluster down through the client, with memory readings either side
# (17.1 % both times in the recorded output).
# NOTE(review): the recorded stderr shows a failed scheduler reconnect and
# CancelledError tracebacks following this call.
mem()
client.shutdown()
mem()

2.4 GB / 15.8 GB (17.1 %) used


distributed.batched - INFO - Batched Comm Closed: 
distributed.batched - INFO - Batched Comm Closed: 
distributed.batched - INFO - Batched Comm Closed: 
distributed.batched - INFO - Batched Comm Closed: 
distributed.batched - INFO - Batched Comm Closed: 


2.4 GB / 15.8 GB (17.1 %) used


distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.utils - ERROR - 
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/distributed/utils.py", line 663, in log_errors
    yield
  File "/opt/conda/lib/python3.7/site-packages/distributed/client.py", line 1296, in _close
    await gen.with_timeout(timedelta(seconds=2), list(coroutines))
concurrent.futures._base.CancelledError
distributed.utils - ERROR - 
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/distributed/utils.py", line 663, in log_errors
    yield
  File "/opt/conda/lib/python3.7/site-packages/distributed/client.py", line 1025, in _reconnect
    await self._close()
  File "/opt/conda/lib/python3.7/site-packages/distributed/client.py", line 1296, in _close
    await gen.with_timeout(timedelta(seconds=2), list(coroutines))
concurrent.futures._base.CancelledError


In [30]:
# Close the KubeCluster object.
cluster.close()

In [32]:
# Close the client connection.
client.close()

In [33]:
# Memory reading after closing the cluster and client.
mem()

3.3 GB / 15.8 GB (22.7 %) used


In [15]:
%%time
# Dask-array path: gambiae samples on 3L with the gamb_colu_arab site mask.
# NOTE(review): the recorded output contains "locate sites", "select sites"
# and "compute chunk sizes" stages that compute_ac as defined in this notebook
# does not emit, so it was produced by an earlier version of the function.
mem()
compute_ac("3L", pop="species == 'gambiae'", site_mask="gamb_colu_arab")
mem()

1.7 GB / 15.8 GB (12.8 %) used
1.7 GB / 15.8 GB (12.7 %) used after access genotypes
1571 samples, 2.8 kB
1.7 GB / 15.8 GB (12.7 %) used after locate samples
1.7 GB / 15.8 GB (12.7 %) used after select samples
1.7 GB / 15.8 GB (12.7 %) used after locate sites
1.7 GB / 15.8 GB (12.7 %) used after select sites
1.7 GB / 15.8 GB (12.7 %) used after compute chunk sizes
(25869385, 1571, 2) ((60227, 40817, 52624, 56934, 113414, 207562, 207470, 173104, 184395, 197982, 249958, 215830, 153283, 16002, 103324, 209187, 225899, 216525, 181979, 204842, 238928, 153009, 65066, 91735, 174117, 161834, 191767, 172352, 120325, 130573, 131384, 160035, 193027, 156112, 138205, 171211, 186906, 200276, 186906, 212762, 187035, 202143, 203021, 88866, 228586, 192598, 185937, 216480, 205528, 146946, 107561, 161468, 220916, 241799, 243008, 214418, 174473, 199688, 236413, 219729, 208434, 222964, 222993, 197762, 160484, 184015, 191201, 203038, 199423, 185917, 193299, 195186, 210708, 211814, 135648, 183979, 161900, 206

In [17]:
import gc
# Force a garbage-collection pass, then read memory again.
gc.collect()
mem()

2.0 GB / 15.8 GB (14.2 %) used


In [20]:
%%time
# Repeat of the gambiae / 3L / gamb_colu_arab run.
# NOTE(review): the recorded output includes "locate sites" / "select sites"
# stages not present in compute_ac as defined in this notebook (stale output).
mem()
compute_ac("3L", pop="species == 'gambiae'", site_mask="gamb_colu_arab")
mem()

2.2 GB / 15.8 GB (16.1 %) used




2.2 GB / 15.8 GB (16.1 %) used after access genotypes
1571 samples, 2.8 kB
2.2 GB / 15.8 GB (16.1 %) used after locate samples
2.2 GB / 15.8 GB (16.1 %) used after select samples
2.2 GB / 15.8 GB (16.1 %) used after locate sites
2.2 GB / 15.8 GB (16.1 %) used after select sites
2.2 GB / 15.8 GB (16.1 %) used after setup allele counts




2.9 GB / 15.8 GB (20.1 %) used after compute allele counts
2.4 GB / 15.8 GB (17.5 %) used
CPU times: user 49.4 s, sys: 3.84 s, total: 53.2 s
Wall time: 53.4 s


In [10]:
# All samples on 3L, no site mask (first of two identical runs).
# Recorded usage: 14.0 % before, 15.2 % after.
mem()
compute_ac("3L", pop=None, site_mask=None)
mem()

1.9 GB / 15.8 GB (14.0 %) used
1.9 GB / 15.8 GB (14.0 %) used after access genotypes
1.9 GB / 15.8 GB (14.0 %) used after setup allele counts
2.7 GB / 15.8 GB (19.3 %) used after compute allele counts
2.1 GB / 15.8 GB (15.2 %) used


In [11]:
# Identical call to the previous cell, repeated to compare readings.
# Recorded usage: 15.0 % before, 15.5 % after.
mem()
compute_ac("3L", pop=None, site_mask=None)
mem()

2.1 GB / 15.8 GB (15.0 %) used
2.1 GB / 15.8 GB (15.0 %) used after access genotypes
2.1 GB / 15.8 GB (15.0 %) used after setup allele counts




2.8 GB / 15.8 GB (19.6 %) used after compute allele counts
2.1 GB / 15.8 GB (15.5 %) used


In [12]:
# All samples on 3R, no site mask (first of three identical runs).
# Recorded usage: 15.4 % before, 16.0 % after.
mem()
compute_ac("3R", pop=None, site_mask=None)
mem()

2.1 GB / 15.8 GB (15.4 %) used
2.1 GB / 15.8 GB (15.4 %) used after access genotypes
2.1 GB / 15.8 GB (15.4 %) used after setup allele counts




3.0 GB / 15.8 GB (21.3 %) used after compute allele counts
2.2 GB / 15.8 GB (16.0 %) used


In [13]:
# Second identical 3R run. Recorded usage: 16.0 % before, 17.2 % after.
mem()
compute_ac("3R", pop=None, site_mask=None)
mem()

2.2 GB / 15.8 GB (16.0 %) used
2.2 GB / 15.8 GB (16.0 %) used after access genotypes
2.2 GB / 15.8 GB (16.0 %) used after setup allele counts




3.2 GB / 15.8 GB (22.5 %) used after compute allele counts
2.4 GB / 15.8 GB (17.2 %) used


In [14]:
# Third identical 3R run. Recorded usage: 17.2 % before, 16.1 % after.
mem()
compute_ac("3R", pop=None, site_mask=None)
mem()

2.4 GB / 15.8 GB (17.2 %) used
2.4 GB / 15.8 GB (17.2 %) used after access genotypes
2.4 GB / 15.8 GB (17.2 %) used after setup allele counts




3.1 GB / 15.8 GB (21.4 %) used after compute allele counts
2.2 GB / 15.8 GB (16.1 %) used


In [19]:
# Arabiensis samples on 3L; site_mask is left at its default (None).
# Recorded usage: 16.0 % before, 16.8 % after.
mem()
compute_ac("3L", pop="species == 'arabiensis'")
mem()

2.2 GB / 15.8 GB (16.0 %) used
2.2 GB / 15.8 GB (16.0 %) used after access genotypes
368 samples, 2.9 kB
2.2 GB / 15.8 GB (16.0 %) used after locate samples
2.2 GB / 15.8 GB (16.0 %) used after select samples
2.2 GB / 15.8 GB (16.0 %) used after setup allele counts




3.0 GB / 15.8 GB (21.0 %) used after compute allele counts
2.3 GB / 15.8 GB (16.8 %) used


In [20]:
# Identical call to the previous cell. Recorded usage: 16.1 % before,
# 20.8 % after — a larger retained increase than in the first run.
mem()
compute_ac("3L", pop="species == 'arabiensis'")
mem()

2.2 GB / 15.8 GB (16.1 %) used
2.2 GB / 15.8 GB (16.1 %) used after access genotypes
368 samples, 2.9 kB
2.2 GB / 15.8 GB (16.1 %) used after locate samples
2.2 GB / 15.8 GB (16.1 %) used after select samples
2.2 GB / 15.8 GB (16.1 %) used after setup allele counts




3.6 GB / 15.8 GB (25.0 %) used after compute allele counts
3.0 GB / 15.8 GB (20.8 %) used


In [41]:
import gc

In [44]:
# Small integer array; used below as a set of indices for a dask take() experiment.
x = np.array([1, 3, 5])
x

array([1, 3, 5])

In [55]:
# Ship x to the cluster via client.scatter (with broadcast=True) and wrap the
# returned handle as a dask array with x's shape and dtype.
xf = client.scatter(x, broadcast=True)
xd = da.from_delayed(xf, shape=x.shape, dtype=x.dtype)
xd

Unnamed: 0,Array,Chunk
Bytes,24 B,24 B
Shape,"(3,)","(3,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 24 B 24 B Shape (3,) (3,) Count 2 Tasks 1 Chunks Type int64 numpy.ndarray",3  1,

Unnamed: 0,Array,Chunk
Bytes,24 B,24 B
Shape,"(3,)","(3,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray


In [61]:
# Dask array of the integers 0..99 split into chunks of 2 elements
# (50 chunks, as shown in the repr below).
a = da.arange(100, chunks=2)
a

Unnamed: 0,Array,Chunk
Bytes,800 B,16 B
Shape,"(100,)","(2,)"
Count,50 Tasks,50 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 800 B 16 B Shape (100,) (2,) Count 50 Tasks 50 Chunks Type int64 numpy.ndarray",100  1,

Unnamed: 0,Array,Chunk
Bytes,800 B,16 B
Shape,"(100,)","(2,)"
Count,50 Tasks,50 Chunks
Type,int64,numpy.ndarray


In [62]:
# Index the chunked dask array with the dask-wrapped indices and materialise the
# result; since a is arange(100), the values returned equal the indices themselves.
da.take(a, xd).compute()

array([1, 3, 5])

In [59]:
# Check whether an explicit garbage collection releases memory.
# The recorded output shows the same usage before and after.
mem()
gc.collect()
mem()

5.7 GB / 15.8 GB (38.4 %) used
5.7 GB / 15.8 GB (38.4 %) used


In [21]:
import tracemalloc

In [25]:
# Use tracemalloc to find which Python source lines account for the memory growth
# across one compute_ac call: snapshot before, snapshot after, then diff by line.
tracemalloc.start()
try:
    mem()
    tm1 = tracemalloc.take_snapshot()
    compute_ac("3L", pop="species == 'arabiensis'", site_mask="gamb_colu_arab")
    tm2 = tracemalloc.take_snapshot()
    mem()
finally:
    # Always switch tracing off again: leaving it enabled adds overhead and
    # bookkeeping memory to every later cell, which would skew subsequent
    # mem() readings. Snapshots already taken remain usable after stop().
    tracemalloc.stop()
# Largest per-line allocation differences between the two snapshots.
top_stats = tm2.compare_to(tm1, 'lineno')
for stat in top_stats[:10]:
    print(stat)

2.7 GB / 15.8 GB (19.0 %) used
2.7 GB / 15.8 GB (19.4 %) used after access genotypes
368 samples, 2.9 kB
2.7 GB / 15.8 GB (19.4 %) used after locate samples
2.7 GB / 15.8 GB (19.4 %) used after select samples
25869385 sites, 207.0 MB
3.0 GB / 15.8 GB (20.7 %) used after locate sites
3.3 GB / 15.8 GB (23.1 %) used after select sites
3.3 GB / 15.8 GB (23.1 %) used after setup allele counts
6.9 GB / 15.8 GB (46.0 %) used after compute allele counts
6.1 GB / 15.8 GB (40.5 %) used
/opt/conda/lib/python3.7/site-packages/distributed/protocol/core.py:193: size=44.3 MiB (+10084 KiB), count=758897 (+168544), average=61 B
/opt/conda/lib/python3.7/site-packages/distributed/scheduler.py:4372: size=12.6 MiB (+2977 KiB), count=46950 (+10921), average=282 B
/opt/conda/lib/python3.7/site-packages/distributed/scheduler.py:3575: size=1513 KiB (+1416 KiB), count=10758 (+10071), average=144 B
/opt/conda/lib/python3.7/site-packages/distributed/comm/tcp.py:201: size=4936 KiB (+1301 KiB), count=45 (+5), avera

In [27]:
# Display the scheduler object attached to the cluster.
cluster.scheduler

<Scheduler: "tcp://10.34.177.27:44011" processes: 50 cores: 150>

In [39]:
# Inspect the scheduler's per-worker used_resources mapping; every entry visible
# in the recorded (truncated) output is an empty dict.
cluster.scheduler.used_resources.items()

ItemsView(<class 'distributed.scheduler._StateLegacyMapping'>({'tcp://10.32.108.5:35213': {}, 'tcp://10.32.116.4:39093': {}, 'tcp://10.32.117.4:43283': {}, 'tcp://10.32.118.4:44183': {}, 'tcp://10.32.119.4:37711': {}, 'tcp://10.32.120.4:33057': {}, 'tcp://10.32.121.4:34901': {}, 'tcp://10.32.122.4:41633': {}, 'tcp://10.32.123.4:35283': {}, 'tcp://10.32.124.3:36877': {}, 'tcp://10.32.125.3:45441': {}, 'tcp://10.32.126.3:44833': {}, 'tcp://10.32.127.3:37473': {}, 'tcp://10.32.128.3:36671': {}, 'tcp://10.32.129.3:41359': {}, 'tcp://10.32.130.3:41823': {}, 'tcp://10.32.131.3:37013': {}, 'tcp://10.32.132.3:40377': {}, 'tcp://10.32.133.3:41077': {}, 'tcp://10.32.134.3:35063': {}, 'tcp://10.32.135.3:34919': {}, 'tcp://10.32.136.3:37479': {}, 'tcp://10.32.137.3:41169': {}, 'tcp://10.32.138.3:38621': {}, 'tcp://10.32.139.3:44411': {}, 'tcp://10.32.140.3:43869': {}, 'tcp://10.32.141.3:32947': {}, 'tcp://10.32.142.3:32843': {}, 'tcp://10.32.143.3:37305': {}, 'tcp://10.32.144.3:37355': {}, 'tcp://

In [40]:
# Print current memory usage.
mem()

6.0 GB / 15.8 GB (40.1 %) used


In [20]:
# Memory before/after an allele-count computation on "3R" restricted to samples
# matching species == 'coluzzii' (675 samples in the recorded output).
mem()
compute_ac("3R", pop="species == 'coluzzii'")
mem()

2.9 GB / 15.8 GB (20.1 %) used




2.9 GB / 15.8 GB (20.1 %) used after access genotypes
675 samples
2.9 GB / 15.8 GB (20.1 %) used after locate samples
2.9 GB / 15.8 GB (20.1 %) used after select samples
2.9 GB / 15.8 GB (20.1 %) used after setup allele counts




3.7 GB / 15.8 GB (25.7 %) used after compute allele counts
2.9 GB / 15.8 GB (20.4 %) used


In [21]:
# Memory before/after an allele-count computation on "3L" restricted to samples
# matching species == 'gambiae' (1571 samples in the recorded output).
mem()
compute_ac("3L", pop="species == 'gambiae'")
mem()

2.9 GB / 15.8 GB (20.2 %) used
2.9 GB / 15.8 GB (20.2 %) used after access genotypes
1571 samples
2.9 GB / 15.8 GB (20.2 %) used after locate samples
2.9 GB / 15.8 GB (20.2 %) used after select samples
2.9 GB / 15.8 GB (20.2 %) used after setup allele counts




3.6 GB / 15.8 GB (24.8 %) used after compute allele counts
2.9 GB / 15.8 GB (20.7 %) used


In [25]:
# Memory before/after an allele-count computation on "3L" for arabiensis samples,
# additionally restricted by the "gamb_colu_arab" site mask (the output shows
# extra "locate sites" / "select sites" stages for this case).
mem()
compute_ac("3L", pop="species == 'arabiensis'", site_mask="gamb_colu_arab")
mem()

5.3 GB / 15.8 GB (35.4 %) used




5.3 GB / 15.8 GB (35.4 %) used after access genotypes
368 samples
5.3 GB / 15.8 GB (35.4 %) used after locate samples
5.3 GB / 15.8 GB (35.4 %) used after select samples
5.5 GB / 15.8 GB (36.6 %) used after locate sites
5.9 GB / 15.8 GB (39.2 %) used after select sites
5.9 GB / 15.8 GB (39.2 %) used after setup allele counts




6.4 GB / 15.8 GB (42.4 %) used after compute allele counts
5.3 GB / 15.8 GB (35.8 %) used


In [27]:
# Same arabiensis "3L" computation as with the "gamb_colu_arab" mask, but using
# the "gamb_colu" site mask instead, bracketed by memory readings.
mem()
compute_ac("3L", pop="species == 'arabiensis'", site_mask="gamb_colu")
mem()

6.0 GB / 15.8 GB (39.9 %) used
5.8 GB / 15.8 GB (39.0 %) used after access genotypes
368 samples
5.8 GB / 15.8 GB (39.0 %) used after locate samples
5.8 GB / 15.8 GB (39.0 %) used after select samples
6.0 GB / 15.8 GB (40.2 %) used after locate sites
6.5 GB / 15.8 GB (43.0 %) used after select sites
6.5 GB / 15.8 GB (43.0 %) used after setup allele counts




6.9 GB / 15.8 GB (45.9 %) used after compute allele counts
5.8 GB / 15.8 GB (38.7 %) used


In [23]:
# Drop the notebook's reference to ag3 and check whether memory is released.
# The recorded output shows no change.
# NOTE(review): cells further down still use ag3 (ag3.fs..., ag3.site_filters);
# on a top-to-bottom run they would fail with NameError unless ag3 is re-created.
mem()
del ag3
mem()

6.8 GB / 15.8 GB (44.8 %) used
6.8 GB / 15.8 GB (44.8 %) used


In [22]:
# Clear the instance cache on ag3's filesystem object and check whether memory
# is released. The recorded output shows no change.
mem()
ag3.fs.clear_instance_cache()
mem()

6.8 GB / 15.8 GB (44.9 %) used
6.8 GB / 15.8 GB (44.9 %) used


In [25]:
# Close the dask client and check whether memory is released.
# The recorded output shows no change.
# NOTE(review): later cells call client.restart() / client.scatter-style APIs;
# confirm they are not expected to run after this cell.
mem()
client.close()
mem()

6.8 GB / 15.8 GB (44.9 %) used
6.8 GB / 15.8 GB (44.9 %) used


In [27]:
# Close the cluster and check whether memory is released.
# The recorded output shows no change.
# NOTE(review): later cells still use cluster (cluster.scheduler, cluster.adapt());
# confirm they are not expected to run after this cell.
mem()
cluster.close()
mem()

6.8 GB / 15.8 GB (44.8 %) used
6.8 GB / 15.8 GB (44.8 %) used


In [17]:
# Show the raw system-wide memory statistics reported by psutil.
psutil.virtual_memory()

svmem(total=15767891968, available=8667013120, percent=45.0, used=6786740224, free=6732668928, active=7423770624, inactive=994439168, buffers=387637248, cached=1860845568, shared=1224704, slab=457560064)

In [11]:
# Human-readable rendering of ac_arab.nbytes (size of the allele counts array).
naturalsize(ac_arab.nbytes)

'413.9 MB'

In [15]:
# Drop the notebook's reference to the dask client and check whether memory is
# released. The recorded output shows no change.
# NOTE(review): the next cell calls client.restart(); on a top-to-bottom run it
# would fail with NameError after this del.
mem()
del  client
mem()

5.9 GB (39.5 %) used
5.9 GB (39.5 %) used


In [12]:
# Restart via the dask client and check whether local memory usage changes.
# The recorded output shows no change.
mem()
client.restart()
mem()

6.9 GB (46.1 %) used
6.9 GB (46.1 %) used


In [13]:
# Display the scheduler object attached to the cluster.
# (The stray trailing "." left the cell as a SyntaxError; the recorded output is
# the scheduler repr, so the bare attribute is what was actually run.)
cluster.scheduler

<Scheduler: "tcp://10.34.177.19:33585" processes: 50 cores: 150>

In [16]:
# Compute allele counts on "3L" for samples matching species == 'coluzzii' and
# keep the result as ac_colu, with memory readings around the call.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so the
# output below does not show a completed result.
mem()
ac_colu = compute_ac("3L", pop="species == 'coluzzii'")
mem()
ac_colu

5.9 GB (39.4 %) used
675 samples
5.9 GB (39.4 %) used after locate samples
5.9 GB (39.4 %) used after access genotypes
5.9 GB (39.4 %) used after select samples
5.9 GB (39.4 %) used after setup allele counts


KeyboardInterrupt: 

In [None]:
# Compute allele counts on "3L" for samples matching species == 'gambiae',
# restricted by the "gamb_colu_arab" site mask, and keep the result as ac_gamb.
# NOTE(review): this cell has no execution count or output in the saved notebook.
mem()
ac_gamb = compute_ac("3L", pop="species == 'gambiae'", site_mask="gamb_colu_arab")
ac_gamb

In [25]:
# Allele counts on "3L" for the combined group of gambiae, coluzzii and
# intermediate_gambiae_coluzzii samples (2415 samples in the recorded output).
ac_gambcolu = compute_ac("3L", pop="species in ['gambiae', 'coluzzii', 'intermediate_gambiae_coluzzii']")
ac_gambcolu

2415 samples
5.0 GB (33.6 %) used after locate samples
5.0 GB (33.6 %) used after access genotypes
5.0 GB (33.6 %) used after take genotypes
5.9 GB (39.5 %) used after compute allele counts


Unnamed: 0,0,1,2,3,Unnamed: 5
0,0,0,0,0,
1,0,0,0,0,
2,0,0,0,0,
...,...,...,...,...,...
40758470,98,0,0,0,
40758471,94,0,0,0,
40758472,82,0,0,0,


In [14]:
# Count the rows flagged by ac_arab.is_segregating()
# (presumably a per-site boolean array, so the sum is the number of True entries).
np.sum(ac_arab.is_segregating())

8842883

In [15]:
# Count the rows flagged by ac_gamb.is_segregating()
# (presumably a per-site boolean array, so the sum is the number of True entries).
np.sum(ac_gamb.is_segregating())

24320283

In [16]:
# Count the rows flagged by ac_colu.is_segregating()
# (presumably a per-site boolean array, so the sum is the number of True entries).
np.sum(ac_colu.is_segregating())

17323206

In [26]:
# Fixed differences: count rows where the two groups have no allele in common.
# Both arrays must have the same shape for the element-wise comparison below.
ac1 = ac_arab
ac2 = ac_gambcolu
# True for each (row, allele) entry that has a non-zero count in both groups.
loc_joint = (ac1 > 0) & (ac2 > 0)
# Per row: number of alleles observed in both groups.
joint_allelism = loc_joint.sum(axis=1)
# Keep rows where allelism() is at least 1 in each group (presumably: at least
# one allele observed, i.e. the row is not entirely missing) and where the
# groups share no allele.
loc_fd = (ac1.allelism() >= 1) & (ac2.allelism() >= 1) & (joint_allelism == 0)
# Number of rows meeting the criterion.
loc_fd.sum()

4281

In [28]:
# Print current memory usage.
mem()

5.9 GB (39.6 %) used


In [30]:
# Switch the cluster to adaptive scaling; the recorded log shows it retiring
# workers 0-49 right after this call.
cluster.adapt()

<distributed.deploy.adaptive.Adaptive at 0x7f8321db4710>

distributed.deploy.adaptive - INFO - Retiring workers [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]


In [38]:
# Load the "gamb_colu_arab" site filter for contig "3L" and materialise it with
# .compute(); the recorded output is a boolean array.
loc_pass = ag3.site_filters(contig="3L", mask="gamb_colu_arab").compute()
loc_pass

array([False, False, False, ..., False, False, False])

In [39]:
# Keep only the rows of ac_arab where loc_pass is True (selection along axis 0).
ac_arab_pass = np.compress(loc_pass, ac_arab, axis=0)
ac_arab_pass

Unnamed: 0,0,1,2,3,Unnamed: 5
0,736,0,0,0,
1,736,0,0,0,
2,736,0,0,0,
...,...,...,...,...,...
25869382,736,0,0,0,
25869383,736,0,0,0,
25869384,736,0,0,0,
