In [1]:
!pip install -Uq malariagen_data

In [2]:
import logging

# Only let errors through from the chattiest dask.distributed loggers.
for _noisy_logger in (
    "distributed.scheduler",
    "distributed.core",
    "distributed.deploy.adaptive",
    "distributed.utils_perf",
    "distributed.batched",
):
    logging.getLogger(_noisy_logger).setLevel(logging.ERROR)

from dask_kubernetes import KubeCluster
from dask.distributed import Client

# Request 20 workers; each worker pip-installs malariagen_data on start-up
# via the EXTRA_PIP_PACKAGES environment variable.
cluster = KubeCluster(
    n_workers=20,
    env={'EXTRA_PIP_PACKAGES': 'malariagen_data'},
)
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://10.35.97.2:43395  Dashboard: /user/alimanfoo@googlemail.com/proxy/8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [3]:
import functools
import bisect
import allel
import numpy as np
import dask.array as da
from dask.diagnostics import ProgressBar
import malariagen_data
import bokeh.io
import bokeh.plotting
import bokeh.models
import bokeh
import seaborn as sns
from bokeh.core.enums import MarkerType
from matplotlib.colors import to_hex
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [4]:
# Configure Bokeh to render plots inline in the notebook output cells.
bokeh.io.output_notebook()

In [5]:
# Register a dask ProgressBar globally so that later dask computations
# report their progress.
ProgressBar().register()

In [6]:
# Handle on the Ag3 data resource at the given Google Cloud Storage URL.
# NOTE: `ag3` is read as a global by setup_pca_data() below.
ag3 = malariagen_data.Ag3("gs://vo_agam_release/")

In [34]:
import random

In [43]:
@functools.lru_cache(maxsize=None)
def setup_pca_data(
    contig,
    region_start,
    region_stop,
    sample_sets,
    sample_query,
    site_mask,
    min_ac,
    max_an_missing,
    n_snps,
    seed):
    """Prepare genotype data for PCA.

    Loads SNP genotypes through the module-level ``ag3`` object, restricts
    them to a genome region and (optionally) a subset of samples, keeps
    biallelic sites involving the reference allele that pass the allele-count
    and missingness filters, thins them to roughly ``n_snps`` sites and
    converts them to alternate-allele counts.

    Results are memoised with ``functools.lru_cache``: every argument must be
    hashable, and the returned objects are shared between calls with the same
    arguments, so callers should not modify them in place.

    Parameters
    ----------
    contig
        Contig to load, passed through to ``ag3.snp_sites`` and
        ``ag3.snp_genotypes``.
    region_start, region_stop
        Bounds of the region, located in the SNP positions by bisection.
        A falsy value (``None`` or ``0``) leaves that side unbounded.
    sample_sets
        Passed through to ``ag3.sample_metadata`` and ``ag3.snp_genotypes``.
    sample_query
        Optional expression evaluated with ``DataFrame.eval`` on the sample
        metadata to select samples; a falsy value keeps all samples.
    site_mask
        Site mask passed through to ``ag3``.
    min_ac
        A site is kept only if its reference allele count lies between
        ``min_ac`` and ``n_chroms - min_ac`` (inclusive).
    max_an_missing
        Maximum number of missing allele calls allowed at a site.
    n_snps
        Approximate number of SNPs to keep after thinning. If fewer sites
        are available, all of them are kept.
    seed
        Seed for the random offset at which thinning starts.

    Returns
    -------
    df_samples
        Sample metadata, restricted to the selected samples.
    gn_var
        2-D array of alternate-allele counts (sites along axis 0, samples
        along axis 1), excluding sites where every sample has the same value.

    Raises
    ------
    ValueError
        If no site passes the filters.
    """

    # load sample metadata
    df_samples = ag3.sample_metadata(sample_sets=sample_sets)

    # load SNP positions
    pos = ag3.snp_sites(contig=contig, field='POS', site_mask=site_mask).compute()

    # access SNP genotypes
    gt = ag3.snp_genotypes(contig=contig, sample_sets=sample_sets, site_mask=site_mask)

    # locate genome region
    if region_start or region_stop:
        loc_region = slice(
            bisect.bisect_left(pos, region_start) if region_start else None,
            bisect.bisect_right(pos, region_stop) if region_stop else None,
        )
        gt = gt[loc_region]

    # locate selected samples
    if sample_query:
        loc_samples = df_samples.eval(sample_query).values
        df_samples = df_samples.loc[loc_samples, :]
        gt = da.compress(loc_samples, gt, axis=1)

    # perform allele count
    ac = allel.GenotypeDaskArray(gt).count_alleles(max_allele=3).compute()

    # calculate some convenience variables
    n_chroms = gt.shape[1] * 2
    an_called = ac.sum(axis=1)
    an_missing = n_chroms - an_called

    # locate segregating sites above threshold frequency
    max_ac = n_chroms - min_ac
    # here we choose biallelic sites involving the reference allele
    loc_seg = np.nonzero(ac.is_biallelic() &
                         (ac[:, 0] >= min_ac) &
                         (ac[:, 0] <= max_ac) &
                         (an_missing <= max_an_missing))[0]

    n_seg = loc_seg.shape[0]
    if n_seg == 0:
        raise ValueError(
            "no segregating sites pass the min_ac / max_an_missing filters"
        )

    # thin SNPs to desired number; when fewer than n_snps sites are available
    # the integer division would give a step of 0, so clamp to 1 (keep all)
    step = max(1, n_seg // n_snps)
    # start at a random offset, drawn from a private generator so that the
    # global `random` state is not reseeded as a side effect
    offset = random.Random(seed).choice(range(step))
    loc_seg_ds = loc_seg[offset::step]

    # subset genotypes to selected sites
    gt_seg = da.take(gt, loc_seg_ds, axis=0)

    # convert to genotype alt counts
    gn_seg = allel.GenotypeDaskArray(gt_seg).to_n_alt().compute()

    # remove any edge-cases where all genotypes are identical
    loc_var = np.any(gn_seg != gn_seg[:, 0, np.newaxis], axis=1)
    gn_var = np.compress(loc_var, gn_seg, axis=0)

    return df_samples, gn_var
    

In [45]:
@functools.lru_cache(maxsize=None)
def run_pca(
    contig="3L",
    region_start=15_000_000,
    region_stop=41_000_000,
    sample_sets="v3_wild",
    sample_query=None,
    site_mask="gamb_colu_arab",
    min_ac=3,
    max_an_missing=0,
    n_snps=100_000,
    seed=42):
    """Run a PCA on genotype data prepared by ``setup_pca_data``.

    All parameters are forwarded unchanged to ``setup_pca_data``. The result
    is memoised, so the arguments must be hashable.

    Returns
    -------
    tuple
        ``(df_samples, coords, model)``: the sample metadata from
        ``setup_pca_data`` followed by the two values returned by
        ``allel.pca``.
    """
    # Collect the data-selection parameters. The keyword order is kept fixed
    # because it is part of the cache key of setup_pca_data.
    data_params = dict(
        contig=contig,
        region_start=region_start,
        region_stop=region_stop,
        sample_sets=sample_sets,
        sample_query=sample_query,
        site_mask=site_mask,
        min_ac=min_ac,
        max_an_missing=max_an_missing,
        n_snps=n_snps,
        seed=seed,
    )
    samples, alt_counts = setup_pca_data(**data_params)

    # Fit the PCA on the alternate-allele counts.
    coords, model = allel.pca(alt_counts)

    return samples, coords, model
    

In [67]:
def plot_pca(
    df_samples,
    coords,
    pcx=1,
    pcy=2,
    color_field='country',
    color_palette="Set1",
    figsize=300,
    title='PCA',
    marker_size=12,
    ):
    """Show an interactive Bokeh scatter plot of two principal components.

    Parameters
    ----------
    df_samples
        Sample metadata with (at least) the columns ``sample_id``,
        ``species``, ``country``, ``location``, ``year``, ``sample_set`` and
        the column named by ``color_field``; rows correspond to rows of
        ``coords``.
    coords
        2-D array of PCA coordinates, one row per sample and one column per
        component.
    pcx, pcy
        1-based numbers of the components drawn on the x and y axes.
    color_field
        Column of ``df_samples`` that determines fill colour and the legend.
    color_palette
        Either the name of a seaborn palette (one colour is assigned to each
        unique value of ``color_field``) or a dict mapping each value of
        ``color_field`` to a colour.
    figsize
        Width and height of the (square) figure.
    title
        Figure title.
    marker_size
        Size of the scatter markers.

    Raises
    ------
    ValueError
        If ``color_palette`` is neither a str nor a dict, or if a sample has
        a species for which no marker is defined.
    """

    data = dict(
        x=coords[:, pcx-1],
        y=coords[:, pcy-1],
        sample_id=df_samples['sample_id'],
        species=df_samples['species'],
        country=df_samples['country'],
        location=df_samples['location'],
        year=df_samples['year'],
        sample_set=df_samples['sample_set'],
    )
    # the legend is grouped by color_field, so that column has to be present
    # in the data source even when it is not one of the tooltip columns
    if color_field not in data:
        data[color_field] = df_samples[color_field]

    # marker by species
    species_markers = {
        'arabiensis': 'triangle',
        'gambiae': 'circle',
        'coluzzii': 'square',
        'intermediate_arabiensis_gambiae': 'plus',
        'intermediate_gambiae_coluzzii': 'star',
    }
    try:
        data['marker'] = [species_markers[s] for s in df_samples['species']]
    except KeyError as err:
        raise ValueError(
            "no marker defined for species {!r}".format(err.args[0])
        ) from None

    # color by whatever you ask for
    if isinstance(color_palette, str):
        color_keys = df_samples[color_field].unique().tolist()
        # honour the requested palette name rather than a fixed one
        palette = sns.color_palette(color_palette, n_colors=len(color_keys))
        colors = [to_hex(palette[color_keys.index(v)])
                  for v in df_samples[color_field]]
    elif isinstance(color_palette, dict):
        colors = [color_palette[v] for v in df_samples[color_field]]
    else:
        raise ValueError(
            "color_palette must be a palette name (str) or a dict mapping "
            "values to colors, got {!r}".format(type(color_palette).__name__)
        )
    data['color'] = colors

    source = bokeh.plotting.ColumnDataSource(data)
    tools = "pan,wheel_zoom,box_zoom,reset,hover,save"
    fig = bokeh.plotting.figure(title=title, tools=tools, active_scroll="wheel_zoom",
                                width=figsize, height=figsize)
    # legend entries follow the same field that drives the fill colour
    fig.scatter('x', 'y', marker="marker", size=marker_size, source=source,
                line_color="black", fill_color='color', legend_field=color_field)

    # setup hover tooltips
    hover = fig.select(dict(type=bokeh.models.HoverTool))
    hover.tooltips = {
        "sample_id": "@sample_id",
        "species": "@species",
        "country": "@country",
        "location": "@location",
        "year": "@year",
        "sample_set": "@sample_set",
    }

    # axis labels
    fig.xaxis.axis_label = "PC{}".format(pcx)
    fig.yaxis.axis_label = "PC{}".format(pcy)

    fig.legend.location = "top_left"

    # TODO marker legend

    bokeh.plotting.show(fig)

## Sandbox

In [68]:
# PCA restricted to samples whose species is "arabiensis", using the 'arab'
# site mask; every other run_pca parameter keeps its default.
df_samples, coords, model = run_pca(
    sample_query='species == "arabiensis"',
    site_mask='arab',
    seed=42,
)

# plot PC1 against PC2
plot_pca(df_samples, coords, 
         pcx=1,
         pcy=2,
         title='An. arabiensis; 3L:15-41 Mbp')

In [69]:
# Same run_pca call (same keywords, same order) as the PC1/PC2 cell, so the
# lru_cache on run_pca can serve the result without recomputation.
df_samples, coords, model = run_pca(
    sample_query='species == "arabiensis"',
    site_mask='arab',
    seed=42,
)

# plot PC5 against PC6
plot_pca(df_samples, coords, 
         pcx=5,
         pcy=6,
         title='An. arabiensis; 3L:15-41 Mbp')

In [51]:
# Repeat the arabiensis PCA with a different seed, i.e. a different random
# starting offset for SNP thinning in setup_pca_data.
df_samples, coords, model = run_pca(
    sample_query='species == "arabiensis"',
    site_mask='arab',
    seed=1,
)

# plot with the default components (PC1 vs PC2)
plot_pca(df_samples, coords, title='An. arabiensis; 3L:15-41 Mbp')

In [9]:
# Display the sample metadata table currently bound to df_samples.
df_samples

Unnamed: 0,sample_id,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call,sample_set,release,aim_fraction_colu,aim_fraction_arab,species_gambcolu_arabiensis,species_gambiae_coluzzii,species
269,AB0333-C,BF18-11,Austin Burt,Burkina Faso,Bana,2014,7,11.233,-4.472,F,AG1000G-BF-B,v3,0.472,0.731,arabiensis,,arabiensis
324,AB0457-C,BF17-34,Austin Burt,Burkina Faso,Pala,2014,7,11.150,-4.235,M,AG1000G-BF-B,v3,0.452,0.712,arabiensis,,arabiensis
341,AB0502-C,BF17-15,Austin Burt,Burkina Faso,Pala,2014,7,11.150,-4.235,M,AG1000G-BF-B,v3,0.441,0.693,arabiensis,,arabiensis
1012,AN0341-C,621,Brad White,Cameroon,Lagdo,2013,10,9.049,13.656,F,AG1000G-CM-C,v3,0.478,0.651,arabiensis,,arabiensis
1013,AN0338-C,557,Brad White,Cameroon,Lagdo,2013,10,9.049,13.656,F,AG1000G-CM-C,v3,0.493,0.618,arabiensis,,arabiensis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2571,AC0085-C,3_E2,Martin Donnelly,Uganda,Nagongera,2012,10,0.770,34.026,F,AG1000G-UG,v3,0.454,0.733,arabiensis,,arabiensis
2572,AC0086-C,3_F1,Martin Donnelly,Uganda,Nagongera,2012,10,0.770,34.026,F,AG1000G-UG,v3,0.451,0.742,arabiensis,,arabiensis
2573,AC0087-Cx,3_F2,Martin Donnelly,Uganda,Nagongera,2012,10,0.770,34.026,F,AG1000G-UG,v3,0.454,0.741,arabiensis,,arabiensis
2574,AC0088-C,3_G1,Martin Donnelly,Uganda,Nagongera,2012,10,0.770,34.026,F,AG1000G-UG,v3,0.460,0.747,arabiensis,,arabiensis


In [10]:
# NOTE(review): in the cells visible here `gn_var` is only a local variable
# inside setup_pca_data()/run_pca() and is never assigned at notebook scope,
# so this cell appears to rely on leftover kernel state and would raise
# NameError after Restart & Run All. Confirm and rebind gn_var explicitly.
gn_var.shape

(108176, 368)