In [1]:
import zarr
from pyprojroot import here
import pandas as pd
import numpy as np
import allel
import yaml
import matplotlib.pyplot as plt
import functools
import seaborn as sns
sns.set_context('paper')
sns.set_style('darkgrid')
import dask.array as da
import scipy.interpolate
import scipy.stats
import petl as etl
import pyfasta
import matplotlib as mpl
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
import numba
import lmfit
from bisect import bisect_left, bisect_right
import itertools
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# OLD VECTORBASE - gone

# genome_path = here() / 'data/external/vectorbase/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa'
# genome = pyfasta.Fasta(str(genome_path), key_fn=lambda x: x.split()[0])

# NEW VECTORBASE

def _genome_key_fn(s):
    k = s.split()[0]
    if k.startswith('AgamP4'):
        k = k.split('_')[1]
    return k

# Open the reference genome FASTA (located relative to the project root via
# here()); sequence names are normalised by _genome_key_fn so that they can be
# looked up by short ids such as '2R' or 'X'.
genome_path = here() / 'data/external/vectorbase/VectorBase-48_AgambiaePEST_Genome.fasta'
genome = pyfasta.Fasta(str(genome_path), key_fn=_genome_key_fn)

In [3]:
# Whole chromosomes used in the analysis; chromosomes 2 and 3 are each made up
# of an R arm and an L arm in the reference genome.
chromosomes = ('2', '3', 'X')

# Physical length of each chromosome, as the sum of the lengths of its arms.
chromosome_plen = {
    '2': sum(len(genome[arm]) for arm in ('2R', '2L')),
    '3': sum(len(genome[arm]) for arm in ('3R', '3L')),
    'X': len(genome['X']),
}

In [4]:
def chrom2arm(chromosome, pos):
    """Translate a whole-chromosome coordinate into an arm coordinate.

    Chromosomes '2' and '3' are laid out as the R arm followed by the L arm,
    so a position beyond the end of the R arm falls on the L arm and is
    shifted back by the R arm length. Any other identifier (e.g. 'X') is
    returned unchanged together with the position.

    Parameters
    ----------
    chromosome : str
        Chromosome identifier, e.g. '2', '3' or 'X'.
    pos : int
        Position on the chromosome.

    Returns
    -------
    (str, int)
        Arm identifier and position on that arm.
    """
    # Use exact membership here: `chromosome in '23'` is a substring test and
    # would also accept '' and '23'.
    if chromosome not in ('2', '3'):
        return chromosome, pos

    right_arm_len = len(genome[f'{chromosome}R'])
    if pos > right_arm_len:
        return f'{chromosome}L', pos - right_arm_len
    return f'{chromosome}R', pos


In [5]:
# Load the population definitions from YAML (safe_load, so only plain data is
# constructed). `pops` is the list of top-level keys, i.e. the population ids
# used to index pop_defs elsewhere in this notebook.
pop_defs_path = here() / 'notebooks/gwss/pop_defs.yml'
with open(pop_defs_path, 'rt') as f:
    pop_defs = yaml.safe_load(f)
pops = list(pop_defs)

In [6]:
# Open the zarr hierarchy holding the H12 scan results, using its consolidated
# metadata.
h12_root_path = here() / 'data/gwss/h12/h12.zarr'
h12_root = zarr.open_consolidated(str(h12_root_path))

    
def load_h12_gwss(pop, chromosome, window_step=200):
    """Load the stored H12 scan arrays for one population and chromosome.

    The window size is taken from ``pop_defs[pop]['h12_window_size']``; the
    arrays are read from the group ``{pop}/{window_size}/{window_step}/{chromosome}``
    under ``h12_root``.

    Parameters
    ----------
    pop : str
        Population id (a key of ``pop_defs``).
    chromosome : str
        Chromosome id used in the zarr hierarchy.
    window_step : int, optional
        Window step component of the group path. Defaults to 200, the value
        that was previously fixed inside the function.

    Returns
    -------
    tuple
        ``(windows, gwindows, h1, h12, h123, h2_h1)``, each read fully into
        memory.
    """
    window_size = pop_defs[pop]['h12_window_size']
    grp = h12_root[f'{pop}/{window_size}/{window_step}/{chromosome}']
    array_names = ('windows', 'gwindows', 'h1', 'h12', 'h123', 'h2_h1')
    return tuple(grp[name][:] for name in array_names)

In [7]:
# Open the zarr hierarchy holding the per-variant IHS results, using its
# consolidated metadata.
ihs_root_path = here() / 'data/gwss/ihs/ihs.zarr'
ihs_root = zarr.open_consolidated(str(ihs_root_path))


@functools.lru_cache(maxsize=None)
def load_ihs_gwss(pop, chromosome, window_size=200, window_step=100):
    """Summarise absolute standardised IHS in moving windows of variants.

    Reads ``pos``, ``gpos`` and ``ihs_std`` from ``ihs_root[f'{pop}/{chromosome}']``,
    takes the absolute value of ``ihs_std`` and computes windowed statistics
    with ``allel.moving_statistic``.

    Results are memoised per argument combination (``lru_cache``), so repeated
    calls return the same array objects.

    Returns
    -------
    tuple
        ``(x, gx, y_max, y_pc95, y_pc75, y_pc50)``: windowed mean of ``pos``,
        windowed mean of ``gpos``, and the windowed maximum, 95th percentile,
        75th percentile and median of ``|ihs_std|``.
    """
    grp = ihs_root[f'{pop}/{chromosome}']

    def windowed(values, statistic):
        # every summary uses the same window size and step
        return allel.moving_statistic(values, statistic, size=window_size, step=window_step)

    def percentile_of(q):
        return lambda v: np.percentile(v, q)

    abs_ihs = np.fabs(grp['ihs_std'][:])

    x = windowed(grp['pos'][:], np.mean)
    gx = windowed(grp['gpos'][:], np.mean)
    y_max = windowed(abs_ihs, np.max)
    y_pc95 = windowed(abs_ihs, percentile_of(95))
    y_pc75 = windowed(abs_ihs, percentile_of(75))
    y_pc50 = windowed(abs_ihs, np.median)
    return x, gx, y_max, y_pc95, y_pc75, y_pc50


In [8]:
# Open the zarr hierarchy holding the per-variant XP-EHH results, using its
# consolidated metadata.
xpehh_root_path = here() / 'data/gwss/xpehh/xpehh.zarr'
xpehh_root = zarr.open_consolidated(str(xpehh_root_path))


@functools.lru_cache(maxsize=None)
def load_xpehh_gwss(pop1, pop2, chromosome, window_size=500, window_step=250):
    """Summarise XP-EHH for ``pop1`` versus ``pop2`` in moving windows of variants.

    Each pair is looked up under its alphabetically sorted name
    (``{first}_{second}/{chromosome}``), so only one ordering of a pair has to
    be stored. When the requested order is the reverse of the stored one, the
    sign of the scores is flipped. Scores are then centred on their median and
    negative values are clipped to zero, so that only signal in the direction
    of ``pop1`` remains.

    Results are memoised per argument combination (``lru_cache``), so repeated
    calls return the same array objects.

    Returns
    -------
    tuple
        ``(x, gx, y_max, y_pc95, y_pc75, y_pc50)``: windowed mean of ``pos``,
        windowed mean of ``gpos``, and the windowed maximum, 95th percentile,
        75th percentile and median of the clipped scores.
    """
    # the stored group name always has the two population ids in sorted order
    first, second = sorted([pop1, pop2])
    grp = xpehh_root[f'{first}_{second}/{chromosome}']
    pos = grp['pos'][:]
    gpos = grp['gpos'][:]
    scores = grp['xpehh'][:]

    # stored orientation is the reverse of the requested one: flip the sign
    if first == pop2:
        scores = -scores

    # centre on the median, then keep only the non-negative part
    scores = scores - np.median(scores)
    clipped = np.clip(scores, a_min=0, a_max=None)

    def windowed(values, statistic):
        return allel.moving_statistic(values, statistic, size=window_size, step=window_step)

    def percentile_of(q):
        return lambda v: np.percentile(v, q)

    x = windowed(pos, np.mean)
    gx = windowed(gpos, np.mean)
    y_max = windowed(clipped, np.max)
    y_pc95 = windowed(clipped, percentile_of(95))
    y_pc75 = windowed(clipped, percentile_of(75))
    y_pc50 = windowed(clipped, np.median)

    return x, gx, y_max, y_pc95, y_pc75, y_pc50


In [9]:
# Open the zarr hierarchy holding the windowed PBS results, using its
# consolidated metadata.
pbs_root_path = here() / 'data/gwss/pbs/pbs.zarr'
pbs_root = zarr.open_consolidated(str(pbs_root_path))


def load_pbs_gwss(pop1, pop2, pop3, chromosome, window_size=500, window_step=250):
    """Load the stored PBS scan arrays for a population trio and chromosome.

    Arrays are read from the group
    ``/{pop1}_{pop2}_{pop3}/{window_size}/{window_step}/{chromosome}`` under
    ``pbs_root``.

    Returns
    -------
    tuple
        ``(windows, gwindows, pbs, pbs_scaled)``, each read fully into memory.
    """
    trio = f'{pop1}_{pop2}_{pop3}'
    grp = pbs_root[f'/{trio}/{window_size}/{window_step}/{chromosome}']
    array_names = ('windows', 'gwindows', 'pbs', 'pbs_scaled')
    return tuple(grp[name][:] for name in array_names)


In [12]:
def load_genes():
    """Load protein-coding gene annotations with whole-chromosome coordinates.

    Reads the GFF3 feature file, keeps rows with ``type == 'gene'`` and
    ``biotype == 'protein_coding'``, indexes them by ``ID`` and adds:

    - ``chromosome``: '2' for 2R/2L, '3' for 3R/3L, otherwise the ``seqid``;
    - ``chromosome_start`` / ``chromosome_end``: ``start`` / ``end``, shifted
      by the length of the R arm for genes on an L arm (the R arm comes first
      in the chromosome layout);
    - ``chromosome_center``: midpoint of ``chromosome_start`` and
      ``chromosome_end``.

    Returns
    -------
    pandas.DataFrame
        Gene table indexed by gene ID.
    """

    # OLD VECTORBASE - prefer this, as it has gene names
    features_path = here() / 'data/external/vectorbase/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3'
    df_genes = (
        allel.gff3_to_dataframe(
            str(features_path),
            attributes=['ID', 'Name', 'description', 'biotype']
        )
        .set_index('ID')
        .query("type == 'gene' and biotype == 'protein_coding'")
        # take an explicit copy: columns are added below, and assigning onto
        # the filtered result directly raises SettingWithCopyWarning
        .copy()
    )

#     # NEW VECTORBASE
#     features_path = here() / 'data/external/vectorbase/VectorBase-48_AgambiaePEST.gff'
#     df_genes = (
#         allel.gff3_to_dataframe(
#             str(features_path),
#             attributes=['ID', 'description']
#         )
#         .sort_values(['seqid', 'start'])
#         .set_index('ID')
#         .query("type == 'gene'")
#     )
#     df_genes['Name'] = '.'
#     # fix chromosome IDs
#     df_genes['seqid'] = df_genes['seqid'].str.split('_', expand=True).loc[:, 1]

    # convert to chromosomal coordinates
    df_genes['chromosome'] = df_genes['seqid'].copy()
    df_genes['chromosome_start'] = df_genes['start'].copy()
    df_genes['chromosome_end'] = df_genes['end'].copy()
    for chrom in ('2', '3'):
        right_arm, left_arm = f'{chrom}R', f'{chrom}L'
        right_arm_len = len(genome[right_arm])
        on_right = df_genes['seqid'] == right_arm
        on_left = df_genes['seqid'] == left_arm
        df_genes.loc[on_right | on_left, 'chromosome'] = chrom
        # L-arm coordinates follow on from the end of the R arm
        df_genes.loc[on_left, 'chromosome_start'] = df_genes.loc[on_left, 'start'] + right_arm_len
        df_genes.loc[on_left, 'chromosome_end'] = df_genes.loc[on_left, 'end'] + right_arm_len
    df_genes['chromosome_center'] = (df_genes['chromosome_start'] + df_genes['chromosome_end']) / 2

    return df_genes


# Gene table (indexed by gene ID) shared by the rest of the notebook.
df_genes = load_genes()

In [13]:
import warnings

In [15]:
# hard-code some gene names
def _named_gene(gene_id, name):
    """Return a copy of the ``df_genes`` row for ``gene_id`` with 'Name' set to ``name``.

    Working on an explicit copy leaves ``df_genes`` untouched and avoids the
    chained-assignment warning, so no warnings need to be suppressed.
    """
    gene = df_genes.loc[gene_id].copy()
    gene['Name'] = name
    return gene


ace1 = _named_gene('AGAP001356', 'Ace1')
cyp6p3 = _named_gene('AGAP002865', 'Cyp6p3')
vgsc = _named_gene('AGAP004707', 'Vgsc')
gaba = _named_gene('AGAP006028', 'Gaba')
gste2 = _named_gene('AGAP009194', 'Gste2')
cyp9k1 = _named_gene('AGAP000818', 'Cyp9k1')
# tep1 is defined here but is not part of either list below
tep1 = _named_gene('AGAP010815', 'Tep1')
# AGAP003645 (Keap1)
keap1 = _named_gene('AGAP003645', 'Keap1')
# AGAP006228 (COEAE2F)
coeae2f = _named_gene('AGAP006228', 'Coeae2f')
# AGAP000519 (Rdga/Dgkz)
rdga = _named_gene('AGAP000519', 'Rdga')

candidate_ir_genes = [keap1, coeae2f, rdga]
validated_ir_genes = [ace1, cyp6p3, vgsc, gaba, gste2, cyp9k1]

In [17]:
# novel_loci = {
#     'A': ('2', 24_860_000),
#     'B': ('2', 40_940_000),
#     'C': ('2', 28_549_590 + len(genome['2R'])),
#     'D': ('2', 34_050_000 + len(genome['2R'])),
#     'E': ('X', 4_360_000),
#     'F': ('X', 9_220_000),
# }

In [18]:
# Table of named chromatin regions, one row per region, with a header row
# first (the layout petl expects). `chrom` is a chromosome-arm id; `start` and
# `end` are used by build_gmap as 1-based inclusive coordinates on that arm.
# build_gmap assigns the lower recombination rate to regions whose name
# contains 'H'.
tbl_chromatin = [
    ('name', 'chrom', 'start', 'end'),
    ('CHX', 'X', 20009764, 24393108),
    ('CH2R', '2R', 58984778, 61545105),
    ('CH2L', '2L', 1, 2431617),
    ('PEU2L', '2L', 2487770, 5042389),
    ('IH2L', '2L', 5078962, 5788875),
    ('IH3R', '3R', 38988757, 41860198),
    ('CH3R', '3R', 52161877, 53200684),
    ('CH3L', '3L', 1, 1815119),
    ('PEU3L', '3L', 1896830, 4235209),
    ('IH3L', '3L', 4264713, 5031692)
]

In [19]:
# Chromosome-arm ids as they appear in the reference genome; for chromosomes 2
# and 3 the R arm is listed before the L arm.
seq_ids = ('2R', '2L', '3R', '3L', 'X')

In [20]:
def build_gmap():
    """Build a crude per-base genetic map for every arm and for chromosomes 2 and 3.

    Every base gets a recombination rate in cM / bp: 2e-6 by default
    (2 cM / Mbp), and .5e-6 inside the regions of ``tbl_chromatin`` whose name
    contains 'H'. The genetic map of an arm is the cumulative sum of these
    rates; the maps for chromosomes '2' and '3' are the R-arm map followed by
    the L-arm map offset by the total R-arm genetic length.

    Returns
    -------
    dict
        Maps each id in ``seq_ids``, plus '2' and '3', to a float64 array with
        one cumulative cM value per base.
    """

    def region_rate(rec):
        # lower rate for regions whose name contains 'H', default rate otherwise
        return .5e-6 if 'H' in rec.name else 2e-6

    # crude recombination rate lookup, keyed off chromatin state
    tbl_rr = (
        etl.wrap(tbl_chromatin)
        # extend heterochromatin on 2L - this is empirical, based on making vgsc peaks symmetrical
        .update('end', 2840000, where=lambda r: r.name == 'CH2L')
        .update('start', 2840001, where=lambda r: r.name == 'PEU2L')
        .addfield('rr', region_rate)
    )

    # per-base recombination rate, starting from the default everywhere
    rr_map = {}
    for seq_id in seq_ids:
        rr_map[seq_id] = np.full(len(genome[seq_id]), fill_value=2e-6, dtype='f8')

    # overwrite the rate inside each region (1-based inclusive -> 0-based slice)
    for row in tbl_rr.records():
        first, last = row.start - 1, row.end
        rr_map[row.chrom][first:last] = row.rr

    # cumulative genetic position along each arm
    gmap = {seq_id: np.cumsum(rates) for seq_id, rates in rr_map.items()}

    # whole chromosomes: R arm first, then the L arm continuing from its end
    for chrom in ('2', '3'):
        right = gmap[f'{chrom}R']
        left = gmap[f'{chrom}L']
        gmap[chrom] = np.concatenate([right, left + right[-1]])

    return gmap

# Genetic map used by bp2cm / mbp2cm: one cumulative cM value per base.
gmap = build_gmap()

In [21]:
def mbp2cm(chromosome, pos):
    """Convert a physical position in Mbp to a genetic position in cM.

    The position is scaled by 1e6 to base pairs and passed on to ``bp2cm``.
    """
    pos_bp = pos * 1e6
    return bp2cm(chromosome, pos_bp)


def bp2cm(chromosome, pos):
    """Convert a physical position in bp to a genetic position in cM.

    ``pos`` is truncated to an integer and treated as 1-based: the value at
    index ``int(pos) - 1`` of ``gmap[chromosome]`` is returned. Note that a
    position of 0 therefore indexes the last element of the map.
    """
    cumulative_cm = gmap[chromosome]
    index = int(pos) - 1
    return cumulative_cm[index]


In [22]:
def tex_italicize_species(s):
    r"""Wrap species names in a LaTeX ``\textit{...}`` command.

    Every occurrence of 'An. gambiae' and 'An. coluzzii' in ``s`` is replaced
    by the same name inside ``\textit{}``; all other text is returned
    unchanged.

    Parameters
    ----------
    s : str
        Text that may contain species names.

    Returns
    -------
    str
        Text with the species names italicised for LaTeX.
    """
    # Raw strings are required here: in a normal string literal '\t' is a TAB
    # escape, which would turn '\textit' into TAB + 'extit'.
    replacements = (
        ('An. gambiae', r'\textit{An. gambiae}'),
        ('An. coluzzii', r'\textit{An. coluzzii}'),
    )
    for plain, italic in replacements:
        s = s.replace(plain, italic)
    return s


In [23]:
def tex_math_species(s):
    """Wrap species names in ``$...$`` so they are rendered as math text.

    Every occurrence of 'An. gambiae' and 'An. coluzzii' in ``s`` is replaced
    by the same name enclosed in dollar signs; all other text is returned
    unchanged.
    """
    replacements = (
        ('An. gambiae', '$An. gambiae$'),
        ('An. coluzzii', '$An. coluzzii$'),
    )
    for plain, marked in replacements:
        s = s.replace(plain, marked)
    return s


In [24]:
def root_mean_square(s):
    """Return the square root of the mean of the squared values of ``s``."""
    squared = s ** 2
    mean_of_squares = np.mean(squared)
    return np.sqrt(mean_of_squares)


def mean_absolute(s):
    """Return the mean of the absolute values of ``s``."""
    magnitudes = np.fabs(s)
    return np.mean(magnitudes)


In [25]:
# Sequential seaborn palettes, one colour family per group of populations.
reds = sns.color_palette('Reds', 5)
blues = sns.color_palette('Blues', 4)
greens = sns.color_palette('Greens', 2)
browns = sns.color_palette('YlOrBr', 4)
purples = sns.color_palette('Purples_d', 2)
greys = sns.color_palette('Greys', 3)

# Plot colour for each population id. Populations drawn from the same palette
# are distinguished by the palette index used.
pop_colors = {
    # '*_col' populations: reds
    'ao_col': reds[4],
    'gh_col': reds[3],
    'bf_col': reds[2],
    'ci_col': reds[1],
    'gn_col': reds[0],
    # '*_gam' populations: blues, greens and purples
    'cm_sav_gam': blues[3],
    'gh_gam': blues[2],
    'bf_gam': blues[1],
    'gn_gam': blues[0],
    'ga_gam': greens[1],
    'ug_gam': greens[0],
    'fr_gam': purples[1],
    'gq_gam': purples[0],
    # populations without a species suffix: browns and grey
    'gw': browns[1],
    'gm': browns[2],
    'ke': greys[1],
}

In [26]:
# Display label for each population id. Species names are wrapped in $...$ so
# that matplotlib renders them as math text.
pop_labels = {
    'ao_col': 'Angola $An. coluzzii$',
    'bf_col': 'Burkina Faso $An. coluzzii$',
    'gh_col': 'Ghana $An. coluzzii$',
    'ci_col': "Côte d'Ivoire $An. coluzzii$",
    'gn_col': 'Guinea $An. coluzzii$',
    # labels without a species name
    'gw': 'Guinea-Bissau',
    'gm': 'The Gambia',
    'gn_gam': 'Guinea $An. gambiae$',
    'bf_gam': 'Burkina Faso $An. gambiae$',
    'gh_gam': 'Ghana $An. gambiae$',
    'cm_sav_gam': 'Cameroon $An. gambiae$',
    'ug_gam': 'Uganda $An. gambiae$',
    'ga_gam': 'Gabon $An. gambiae$',
    'gq_gam': 'Bioko $An. gambiae$',
    'fr_gam': 'Mayotte $An. gambiae$',
    'ke': 'Kenya',
}