In [1]:
import zarr
from pyprojroot import here
import pandas as pd
import numpy as np
import allel
import yaml
import matplotlib.pyplot as plt
import functools
import seaborn as sns
sns.set_context('paper')
sns.set_style('darkgrid')
import dask.array as da
import scipy.interpolate
import scipy.stats
import petl as etl
import pyfasta
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Reference genome FASTA; each sequence is keyed by the first
# whitespace-delimited token of its header line.
genome_path = here() / 'data/external/vectorbase/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa'
genome = pyfasta.Fasta(
    str(genome_path),
    key_fn=lambda header: header.split()[0],
)

In [3]:
chromosomes = ('2', '3', 'X')

# Length of each chromosome: chromosomes 2 and 3 are the sum of their
# R and L arm sequences, X is a single sequence.
chromosome_plen = {
    '2': sum(len(genome[arm]) for arm in ('2R', '2L')),
    '3': sum(len(genome[arm]) for arm in ('3R', '3L')),
    'X': len(genome['X']),
}

In [4]:
# Population definitions, keyed by population identifier.
pop_defs_path = here() / 'notebooks/gwss/pop_defs.yml'
with pop_defs_path.open(mode='rt') as f:
    pop_defs = yaml.safe_load(f)
pops = [pop for pop in pop_defs]

In [5]:
h12_root_path = here() / 'data/gwss/h12/h12.zarr'
h12_root = zarr.open_consolidated(str(h12_root_path))


def load_h12_gwss(pop, chromosome, window_size=None, window_step=200):
    """Load H12 selection scan arrays for one population and chromosome.

    Parameters
    ----------
    pop : str
        Population identifier. Used as the first component of the zarr
        group path and, when `window_size` is not given, as a key into
        `pop_defs`.
    chromosome : str
        Chromosome name; the last component of the zarr group path.
    window_size : int, optional
        Window size component of the group path. If None (the default),
        `pop_defs[pop]['h12_window_size']` is used.
    window_step : int, optional
        Window step component of the group path. Defaults to 200.

    Returns
    -------
    tuple of arrays
        `(windows, gwindows, h1, h12, h123, h2_h1)`, each read in full
        from the selected group.

    """
    if window_size is None:
        # fall back to the window size configured for this population
        window_size = pop_defs[pop]['h12_window_size']
    grp = h12_root[f'{pop}/{window_size}/{window_step}/{chromosome}']
    array_names = ('windows', 'gwindows', 'h1', 'h12', 'h123', 'h2_h1')
    return tuple(grp[name][:] for name in array_names)

In [6]:
ihs_root_path = here() / 'data/gwss/ihs/ihs.zarr'
ihs_root = zarr.open_consolidated(str(ihs_root_path))


@functools.lru_cache(maxsize=None)
def load_ihs_gwss(pop, chromosome, window_size=200, window_step=100):
    """Load standardised IHS values and summarise them in moving windows.

    The absolute value of the `ihs_std` array is summarised over windows
    of `window_size` consecutive values, advancing by `window_step`
    values. Results are memoised per argument combination.

    Parameters
    ----------
    pop : str
        Population identifier (first component of the zarr group path).
    chromosome : str
        Chromosome name (second component of the zarr group path).
    window_size : int, optional
        Number of values per window.
    window_step : int, optional
        Number of values between the starts of successive windows.

    Returns
    -------
    tuple of arrays
        `(x, y_max, y_pc95, y_pc75, y_pc50)`: the mean position per
        window, then the maximum, 95th percentile, 75th percentile and
        median of |IHS| per window.

    """
    scan = ihs_root[f'{pop}/{chromosome}']
    positions = scan['pos'][:]
    abs_ihs = np.fabs(scan['ihs_std'][:])

    def windowed(values, statistic):
        # apply `statistic` to each moving window of `values`
        return allel.moving_statistic(values, statistic, size=window_size, step=window_step)

    x = windowed(positions, np.mean)
    summaries = [
        windowed(abs_ihs, statistic)
        for statistic in (
            np.max,
            lambda v: np.percentile(v, 95),
            lambda v: np.percentile(v, 75),
            np.median,
        )
    ]
    return (x, *summaries)


In [7]:
xpehh_root_path = here() / 'data/gwss/xpehh/xpehh.zarr'
xpehh_root = zarr.open_consolidated(str(xpehh_root_path))


@functools.lru_cache(maxsize=None)
def load_xpehh_gwss(pop1, pop2, chromosome, window_size=500, window_step=250):
    """Load XP-EHH values for a population pair, summarised in moving windows.

    The scan is looked up under the sorted pair of population names, so
    the same group serves both orderings. When the requested order is the
    reverse of the sorted order, the sign of the values is flipped. Values
    are then centred on their median and negative values are clipped to
    zero, to focus on selection in `pop1`. Results are memoised per
    argument combination.

    Parameters
    ----------
    pop1, pop2 : str
        Population identifiers; positive values are retained for `pop1`.
    chromosome : str
        Chromosome name (last component of the zarr group path).
    window_size : int, optional
        Number of values per window.
    window_step : int, optional
        Number of values between the starts of successive windows.

    Returns
    -------
    tuple of arrays
        `(x, y_max, y_pc95, y_pc75, y_pc50)`: the mean position per
        window, then the maximum, 95th percentile, 75th percentile and
        median of the clipped values per window.

    """
    # avoid running the same scan twice: look up by the sorted pair
    first, second = sorted([pop1, pop2])
    scan = xpehh_root[f'{first}_{second}/{chromosome}']
    positions = scan['pos'][:]
    scores = scan['xpehh'][:]

    # sorting swapped the requested order, so flip the sign back
    if first == pop2:
        scores = -scores

    # centre on the median, then clip at zero to focus on selection in pop1
    scores = scores - np.median(scores)
    pop1_scores = np.clip(scores, a_min=0, a_max=None)

    def windowed(values, statistic):
        # apply `statistic` to each moving window of `values`
        return allel.moving_statistic(values, statistic, size=window_size, step=window_step)

    x = windowed(positions, np.mean)
    summaries = [
        windowed(pop1_scores, statistic)
        for statistic in (
            np.max,
            lambda v: np.percentile(v, 95),
            lambda v: np.percentile(v, 75),
            np.median,
        )
    ]
    return (x, *summaries)


In [8]:
pbs_root_path = here() / 'data/gwss/pbs/pbs.zarr'
pbs_root = zarr.open_consolidated(str(pbs_root_path))


def load_pbs_gwss(pop1, pop2, pop3, chromosome, window_size=500, window_step=250):
    """Load PBS scan arrays for a population triplet and chromosome.

    Parameters
    ----------
    pop1, pop2, pop3 : str
        Population identifiers, joined with underscores to form the
        first component of the zarr group path.
    chromosome : str
        Chromosome name (last component of the zarr group path).
    window_size : int, optional
        Window size component of the group path.
    window_step : int, optional
        Window step component of the group path.

    Returns
    -------
    tuple of arrays
        `(windows, gwindows, pbs, pbs_scaled)`, each read in full from
        the selected group.

    """
    scan = pbs_root[f'/{pop1}_{pop2}_{pop3}/{window_size}/{window_step}/{chromosome}']
    array_names = ('windows', 'gwindows', 'pbs', 'pbs_scaled')
    return tuple(scan[name][:] for name in array_names)


In [9]:
def load_genes():
    """Load protein-coding gene annotations with chromosome-wide coordinates.

    Reads the GFF3 feature file, keeps rows whose type is 'gene' and whose
    biotype is 'protein_coding', and adds four columns:

    - `chromosome`: the `seqid`, with arms '2R'/'2L' mapped to '2' and
      '3R'/'3L' mapped to '3'; other values are left unchanged.
    - `chromosome_start`, `chromosome_end`: `start` and `end`, shifted by
      the length of the R arm for features on the L arm of chromosomes
      2 and 3.
    - `chromosome_center`: the midpoint of `chromosome_start` and
      `chromosome_end`.

    Returns
    -------
    pandas.DataFrame
        Gene records indexed by the GFF3 `ID` attribute.

    """
    features_path = here() / 'data/external/vectorbase/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3'
    df_genes = (
        allel.gff3_to_dataframe(
            str(features_path),
            attributes=['ID', 'Name', 'biotype']
        )
        .set_index('ID')
        .query("type == 'gene' and biotype == 'protein_coding'")
        # work on an independent frame so the column assignments below do
        # not write into a filtered selection (SettingWithCopyWarning)
        .copy()
    )

    # start from arm coordinates, then convert to chromosomal coordinates
    df_genes['chromosome'] = df_genes['seqid'].copy()
    df_genes['chromosome_start'] = df_genes['start'].copy()
    df_genes['chromosome_end'] = df_genes['end'].copy()

    # arm -> (chromosome, coordinate offset); L arms are placed after R arms
    arm_layout = {
        '2R': ('2', 0),
        '2L': ('2', len(genome['2R'])),
        '3R': ('3', 0),
        '3L': ('3', len(genome['3R'])),
    }
    for arm, (chromosome, offset) in arm_layout.items():
        on_arm = df_genes['seqid'] == arm
        df_genes.loc[on_arm, 'chromosome'] = chromosome
        if offset:
            df_genes.loc[on_arm, 'chromosome_start'] = df_genes.loc[on_arm, 'start'] + offset
            df_genes.loc[on_arm, 'chromosome_end'] = df_genes.loc[on_arm, 'end'] + offset

    df_genes['chromosome_center'] = (df_genes['chromosome_start'] + df_genes['chromosome_end']) / 2

    return df_genes


df_genes = load_genes()

In [10]:
# preview the first five gene records
df_genes.head(n=5)

Unnamed: 0_level_0,seqid,source,type,start,end,score,strand,phase,Name,biotype,chromosome,chromosome_start,chromosome_end,chromosome_center
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AGAP004677,2L,VectorBase,gene,157348,186936,-1,-,-1,.,protein_coding,2,61702453,61732041,61717247.0
AGAP004678,2L,VectorBase,gene,203779,205293,-1,+,-1,.,protein_coding,2,61748884,61750398,61749641.0
AGAP004679,2L,VectorBase,gene,207894,210460,-1,+,-1,.,protein_coding,2,61752999,61755565,61754282.0
AGAP004680,2L,VectorBase,gene,271285,271815,-1,+,-1,.,protein_coding,2,61816390,61816920,61816655.0
AGAP004681,2L,VectorBase,gene,358329,359280,-1,-,-1,.,protein_coding,2,61903434,61904385,61903909.5


In [19]:
import warnings

In [21]:
def _named_gene(gene_id, name):
    """Return an independent copy of the `df_genes` row for `gene_id`, with `Name` set to `name`.

    Copying before assigning keeps `df_genes` untouched and avoids pandas'
    SettingWithCopyWarning, so no warning filter is needed.
    """
    gene = df_genes.loc[gene_id].copy()
    gene['Name'] = name
    return gene


ace1 = _named_gene('AGAP001356', 'Ace1')
cyp6p3 = _named_gene('AGAP002865', 'Cyp6p3')
vgsc = _named_gene('AGAP004707', 'Vgsc')
gaba = _named_gene('AGAP006028', 'Gaba')
gste2 = _named_gene('AGAP009194', 'Gste2')
cyp9k1 = _named_gene('AGAP000818', 'Cyp9k1')
ir_genes = [ace1, cyp6p3, vgsc, gaba, gste2, cyp9k1]

In [12]:
# Novel loci of interest: label -> (chromosome, position).
# C and D add the length of 2R to their position, the same offset that
# load_genes applies to features on 2L.
novel_loci = dict(
    A=('2', 24_860_000),
    B=('2', 40_940_000),
    C=('2', 28_549_590 + len(genome['2R'])),
    D=('2', 34_050_000 + len(genome['2R'])),
    E=('X', 4_360_000),
    F=('X', 9_220_000),
)