# Shared setup

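Shared setup for the analysis notebooks: imports, plotting defaults, general utilities and handles onto the data releases. Load it from another notebook with, e.g.: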

    %run setup.ipynb

In [1]:
import os
# Report the docker image this kernel runs in, when the environment names one.
if os.environ.get('DOCKER_IMAGE') is not None:
    print('docker image:', os.environ['DOCKER_IMAGE'])

In [2]:
%%HTML
<style type="text/css">
.container {
    width: 96%;
}
#maintoolbar {
    display: none;
}
#header-container {
    display: none;
}
#notebook {
    padding-top: 0;
}
</style>

## Imports

In [3]:
# python standard library
import sys
import os
import operator
import itertools
import collections
import functools
import glob
import csv
import datetime
import bisect
import sqlite3
import subprocess
import random
import gc
import shutil
import shelve
import contextlib
import tempfile
import math

In [1]:
# general purpose third party packages

import cython
%reload_ext Cython

import numpy as np
nnz = np.count_nonzero
import scipy
import scipy.stats
import scipy.spatial.distance
import numexpr
import h5py
import tables
import bcolz
import dask
import dask.array as da
import pandas
import IPython
from IPython.display import clear_output, display, HTML
import rpy2
import rpy2.robjects as ro
# %reload_ext rpy2.ipython
import statsmodels
import sklearn
import sklearn.decomposition
import sklearn.manifold
import sh
import sqlalchemy
import pymysql
import psycopg2
import petl as etl
etl.config.display_index_header = True
import humanize
from humanize import naturalsize, intcomma, intword
import zarr

In [5]:
import sys
import warnings


def showwarning(message, category, filename, lineno, file=None, line=None):
    """Replacement for ``warnings.showwarning`` that mutes one matplotlib warning.

    Warnings whose message starts with ``'axes.color_cycle'`` are dropped; all
    other warnings are formatted with ``warnings.formatwarning`` and written to
    ``file``.

    Parameters
    ----------
    message, category, filename, lineno
        The warning details, as supplied by the ``warnings`` machinery.
    file : file-like, optional
        Stream to write to; defaults to ``sys.stderr``.
    line : str, optional
        Source line to include in the formatted text.

    """
    # ignore this one from matplotlib
    if str(message).startswith('axes.color_cycle'):
        return
    if file is None:
        file = sys.stderr
        if file is None:
            # no stderr is attached to this process, so there is nowhere to write
            return
    text = warnings.formatwarning(message, category, filename, lineno, line=line)
    try:
        file.write(text)
    except OSError:
        # the stream is unusable; dropping the warning is better than failing
        pass


warnings.showwarning = showwarning

In [6]:
# plotting setup
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.gridspec import GridSpec
import matplotlib_venn as venn
import seaborn as sns
# seaborn theme: 'paper' context, 'white' style then overridden by 'ticks'
sns.set_context('paper')
sns.set_style('white')
sns.set_style('ticks')

# matplotlib defaults shared by every figure; `rcParams` stays available as a
# module-level alias of plt.rcParams
rcParams = plt.rcParams
#rcParams['font.family'] = 'arial'
rcParams.update({
    # text sizes
    'font.size': 8,
    'axes.labelsize': 8,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'legend.fontsize': 8,
    # line widths
    'axes.linewidth': .5,
    'lines.linewidth': .5,
    'patch.linewidth': .5,
    'lines.markeredgewidth': .5,
    # tick direction
    'ytick.direction': 'out',
    'xtick.direction': 'out',
    # saved figure output
    'savefig.jpeg_quality': 100,
    'savefig.dpi': 120,
    # default figure size
    'figure.figsize': (4.85, 3),
})

In [7]:
# bio third party packages
import Bio
import pyfasta
import pysam
import pysamstats
import petlx
import petlx.bio
import vcf
import vcfnp
import anhima
import allel
import ete3

## General utilities

In [8]:
# current indentation of section() log lines; section() adds 2 on entry
_slog_indent = -2


def log(*msg):
    """Write the arguments to stdout as one space-separated line, then flush."""
    text = ' '.join(str(part) for part in msg)
    print(text, file=sys.stdout)
    sys.stdout.flush()


@contextlib.contextmanager
def timer(*msg):
    """Context manager that reports how long the enclosed block took.

    On success a line ending in 'done in <duration>' is printed to stdout; if
    the block raises, a line ending in 'errored after <duration>' is printed to
    stderr and the exception is re-raised. Any positional arguments are joined
    with ', ' and placed in front of that text.

    """
    started = datetime.datetime.now()

    def report(outcome, stream):
        # build and emit the one-line summary for this run
        seconds = (datetime.datetime.now() - started).total_seconds()
        summary = outcome % humanize.naturaldelta(seconds)
        if msg:
            summary = ', '.join(map(str, msg)) + ', ' + summary
        print(summary, file=stream)
        stream.flush()

    try:
        yield
    except BaseException:
        report('errored after %s', sys.stderr)
        raise
    else:
        report('done in %s', sys.stdout)
        
        
@contextlib.contextmanager
def section(*title):
    """Context manager that logs the start, end and duration of a titled section.

    The title parts are joined with ', ' and shown in square brackets in front
    of every line. Nested sections are indented by two further spaces per
    level, tracked in the module-level ``_slog_indent`` counter.

    Yields
    ------
    slog : callable
        ``slog(*msg, file=sys.stdout)`` prints a prefixed line and flushes.

    On success 'done in <duration>' is logged to stdout; if the block raises,
    'errored after <duration>' is logged to stderr and the exception is
    re-raised.

    """
    global _slog_indent
    before = datetime.datetime.now()
    _slog_indent += 2
    prefix = (' ' * _slog_indent) + '[' + ', '.join(map(str, title)) + '] '

    def slog(*msg, file=sys.stdout):
        print(prefix + ' '.join(map(str, msg)), file=file)
        file.flush()

    def elapsed():
        # human-readable time since the section was entered
        seconds = (datetime.datetime.now() - before).total_seconds()
        return humanize.naturaldelta(seconds)

    try:
        slog('begin')
        try:
            yield slog
        except BaseException:
            slog('errored after %s' % elapsed(), file=sys.stderr)
            raise
        else:
            slog('done in %s' % elapsed(), file=sys.stdout)
    finally:
        # always undo the indent, even if writing a log line itself failed,
        # so later sections are not left permanently shifted
        _slog_indent -= 2
    

In [9]:
def subplots(*args, despine=None, **kwargs):
    """Call ``plt.subplots`` and despine every axes it creates.

    Parameters
    ----------
    *args, **kwargs
        Passed through to ``plt.subplots``.
    despine : dict, optional
        Keyword arguments for ``sns.despine``; defaults to
        ``dict(offset=10, trim=False)``.

    Returns
    -------
    fig, ax
        Exactly what ``plt.subplots`` returned.

    """
    fig, ax = plt.subplots(*args, **kwargs)
    if despine is None:
        despine = dict(offset=10, trim=False)
    if hasattr(ax, '__len__'):
        # flatten first: iterating a 2-D grid directly would yield whole rows
        # of axes rather than individual axes
        for a in np.ravel(ax):
            sns.despine(ax=a, **despine)
    else:
        sns.despine(ax=ax, **despine)
    return fig, ax


In [10]:
def str_ascii(b):
    """Decode ASCII bytes to str.

    Tuples and lists are decoded element by element and returned as the same
    container type; numpy arrays are decoded element by element and returned
    with the original shape; anything else is decoded directly.

    """
    def decode(item):
        return str(item, 'ascii')

    if isinstance(b, np.ndarray):
        decoded = np.array(list(map(decode, b.flatten())))
        return decoded.reshape(b.shape)
    if isinstance(b, (tuple, list)):
        return type(b)(list(map(decode, b)))
    return decode(b)
        

In [11]:
def _h5ls(h5o, currentdepth, maxdepth, maxitems, prefix):
    """Recursive worker for h5ls: print one line per child of `h5o`.

    Stops descending once `currentdepth` equals `maxdepth`. Within a group at
    most `maxitems` children are listed, followed by a '/...' line if there are
    more. Children that have a `keys` attribute are descended into.

    """
    depth_exhausted = maxdepth is not None and currentdepth == maxdepth
    if depth_exhausted:
        return
    for position, name in enumerate(h5o.keys()):
        child_path = prefix + '/' + name
        truncated = maxitems is not None and position == maxitems
        if truncated:
            print(prefix + '/...')
            break
        child = h5o[name]
        print(child_path + ' : ' + repr(child))
        if not hasattr(child, 'keys'):
            continue
        _h5ls(child, currentdepth + 1, maxdepth=maxdepth, maxitems=maxitems,
              prefix=child_path)


def h5ls(h5o, maxdepth=None, maxitems=None):
    """Print the hierarchy under `h5o`, one 'path : repr' line per member.

    `maxdepth` limits how many levels are listed and `maxitems` limits how many
    members are listed per group; None means no limit.

    """
    _h5ls(h5o, 0, maxdepth=maxdepth, maxitems=maxitems, prefix='')
    

## Phase 1 AR3 data release

In [1]:
class ReleasePhase1AR3(object):
    """Handles onto the files of the phase 1 AR3 data release.

    Constructing an instance opens the reference genome, the HDF5 data files
    (all read-only) and the sample metadata table, and defines the gene labels,
    genome regions, chromatin table, populations and population colours used
    for plotting.

    Parameters
    ----------
    release_dir : str, optional
        Root directory of the release; defaults to `default_release_dir`.

    The `callset*` attributes are only set when the corresponding HDF5 file
    exists, so accessing them raises AttributeError otherwise.

    """

    # default root directory of the release
    default_release_dir = '/data/coluzzi/ag1000g/data/phase1/release/AR3'

    def __init__(self, release_dir=None):
        self.title = 'Phase 1 AR3 release'
        if release_dir is None:
            release_dir = self.default_release_dir
        self.release_dir = release_dir

        # reference genome
        self.genome_fn = os.path.join(self.release_dir, 'genome', 'agamP3', 'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa')
        self.genome = pyfasta.Fasta(self.genome_fn)
        self.gff_fn = os.path.join(self.release_dir, 'geneset', 'Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.sorted.gff3.gz')
        self.gene_labels = {
            'AGAP009195': 'GSTE1',
            'AGAP009194': 'GSTE2',
            'AGAP009197': 'GSTE3',
            'AGAP009193': 'GSTE4',
            'AGAP009192': 'GSTE5',
            'AGAP009191': 'GSTE6',
            'AGAP009196': 'GSTE7',
            'AGAP009190': 'GSTU4',
            'AGAP004707': 'VGSC',
            'AGAP002862': 'CYP6AA1',
            'AGAP013128': 'CYP6AA2',
            'AGAP002863': 'COEAE6O',
            'AGAP002865': 'CYP6P3',
            'AGAP002866': 'CYP6P5',
            'AGAP002867': 'CYP6P4',
            'AGAP002868': 'CYP6P1',
            'AGAP002869': 'CYP6P2',
            'AGAP002870': 'CYP6AD1',
            'AGAP002915': 'PCSK4/furin',
            'AGAP002825': 'PP01',
            'AGAP002824': 'GPRTAK1',
            'AGAP006028': 'GABA',
            'AGAP010815': 'TEP1'
        }
        self.autosomes = '2R', '2L', '3R', '3L'
        self.chromosomes = self.autosomes + ('X',)

        # variant callsets (attributes are only set when the file is present)

        h5_callset_fn = os.path.join(self.release_dir, 'variation', 'main', 'hdf5', 'ag1000g.phase1.ar3.h5')
        if os.path.exists(h5_callset_fn):
            self.callset_fn = h5_callset_fn
            self.callset = h5py.File(self.callset_fn, mode='r')

        h5_callset_pass_fn = os.path.join(self.release_dir, 'variation', 'main', 'hdf5', 'ag1000g.phase1.ar3.pass.h5')
        if os.path.exists(h5_callset_pass_fn):
            self.callset_pass_fn = h5_callset_pass_fn
            self.callset_pass = h5py.File(self.callset_pass_fn, mode='r')

        # genome accessibility (opened read-only, like every other data file here)
        self.accessibility_fn = os.path.join(self.release_dir, 'accessibility', 'accessibility.h5')
        self.accessibility = h5py.File(self.accessibility_fn, mode='r')

        # sample metadata
        # NOTE(review): the positional range below stops at column 35, while the
        # header recorded in this notebook's output runs to column 38 -- confirm
        # whether columns 36-38 should also be converted to float.
        self.tbl_samples = (
            etl
            .fromtsv(os.path.join(self.release_dir, 'samples', 'samples.all.txt'))
            .convert(('index', 'year', 'n_sequences', 'kt_2la', 'kt_2rb'), int)
            .convert(('mean_coverage', 'latitude', 'longitude') + tuple(range(20, 36)), float)
        )
        self.lkp_samples = self.tbl_samples.recordlookupone('ox_code')
        self.samples = self.tbl_samples.values('ox_code').list()
        self.df_samples = self.tbl_samples.todataframe(index='index')

        # allele counts
        self.extras_dir = os.path.join(self.release_dir, 'extras')
        self.allele_counts_fn = os.path.join(self.extras_dir, 'allele_counts.h5')
        self.allele_counts = h5py.File(self.allele_counts_fn, mode='r')
        self.allele_counts_gq10_fn = os.path.join(self.extras_dir, 'allele_counts.gq10.h5')
        self.allele_counts_gq10 = h5py.File(self.allele_counts_gq10_fn, mode='r')
        self.outgroup_species = 'arab', 'meru', 'mela', 'quad', 'epir', 'chri'
        self.outgroup_alleles_fn = os.path.join(self.extras_dir, 'outgroup_alleles.h5')
        self.outgroup_alleles = h5py.File(self.outgroup_alleles_fn, mode='r')
        self.outgroup_allele_counts_fn = os.path.join(self.extras_dir, 'outgroup_allele_counts.h5')
        self.outgroup_allele_counts = h5py.File(self.outgroup_allele_counts_fn, mode='r')

        # genome regions, as (name, chrom, start, stop)
        self.region_X_speciation = 'X-speciation', 'X', 15000000, 24000000
        self.region_X_free = 'X-free', 'X', 1, 14000000
        self.region_3L_free = '3L-free', '3L', 15000000, 41000000
        self.region_3R_free = '3R-free', '3R', 1, 37000000

        # chromatin
        data_chromatin = b"""CHX     chro    X       20009764        24393108
        CH2R    chro    2R      58984778        61545105
        CH2L    chro    2L      1       2431617
        PEU2L   chro    2L      2487770 5042389
        IH2L    chro    2L      5078962 5788875
        IH3R    chro    3R      38988757        41860198
        CH3R    chro    3R      52161877        53200684
        CH3L    chro    3L      1       1815119
        PEU3L   chro    3L      1896830 4235209
        IH3L    chro    3L      4264713 5031692
        """
        # Strip the source-code indentation and drop blank lines, so that
        # splitting each row on whitespace yields exactly the five fields and
        # no leading empty one.
        data_chromatin = b'\n'.join(
            row.strip() for row in data_chromatin.splitlines() if row.strip()
        ) + b'\n'
        self.tbl_chromatin = (etl
            .fromtext(etl.MemorySource(data_chromatin))
            .split('lines', r'\s+', ['name', 'type', 'chrom', 'start', 'stop'])
            .convert(('start', 'stop'), int)
            .cutout('type')
        )

        # populations
        self.populations = 'AOM', 'BFM', 'GWA', 'GNS', 'BFS', 'CMS', 'GAS', 'UGS', 'KES'
        self.pop_labels = {
            'AOM': 'AO $coluzzii$',
            'BFM': 'BF $coluzzii$',
            'GWA': 'GW',
            'GNS': 'GN $gambiae$',
            'BFS': 'BF $gambiae$',
            'CMS': 'CM $gambiae$',
            'UGS': 'UG $gambiae$',
            'GAS': 'GA $gambiae$',
            'KES': 'KE',
            'colony': 'colony',
        }
        self.populations_gamb = 'BFS', 'GNS', 'CMS', 'UGS', 'GAS', 'KES'
        self.populations_colu = 'BFM', 'AOM'
        self.populations_hybr = 'GWA',
        pop_colours = {
            'AOM': sns.color_palette('YlOrBr', 5)[4],
            'BFM': sns.color_palette('Reds', 3)[1],
            'GWA': sns.color_palette('YlOrBr', 5)[1],
            'GNS': sns.color_palette('Blues', 3)[0],
            'BFS': sns.color_palette('Blues', 3)[1],
            'CMS': sns.color_palette('Blues', 3)[2],
            'UGS': sns.color_palette('Greens', 2)[0],
            'GAS': sns.color_palette('Greens', 2)[1],
            'KES': sns.color_palette('Greys', 5)[2],
            'colony': sns.color_palette('Greys', 5)[-1]
        }
        # store the colours as '#rrggbb' hex strings
        self.pop_colours = {
            pop: '#%02x%02x%02x' % tuple(int(255*c) for c in colour)
            for pop, colour in pop_colours.items()
        }

    def get_features(self, chrom, start=None, stop=None):
        """Return a table of GFF3 features for a chromosome or a region of it.

        The query is restricted to 'chrom:start-stop' when both `start` and
        `stop` are given (and non-zero); otherwise the whole chromosome is
        returned.

        """
        if start and stop:
            region = '%s:%s-%s' % (chrom, start, stop)
        else:
            region = chrom
        return etl.fromgff3(self.gff_fn, region=region)

    def plot_genes(self, chrom, start=1, stop=None, ax=None, height=.3, label=False, labels=None,
                   label_unnamed=True, barh_kwargs=None):
        """Draw the genes in a region as bars, forward strand above the midline.

        Parameters
        ----------
        chrom : str
            Chromosome to plot.
        start, stop : int, optional
            Region to plot; `stop` defaults to the chromosome length.
        ax : matplotlib axes, optional
            Axes to draw on; a new 7x1 inch figure is created if omitted.
        height : float, optional
            Height of the gene bars, in axes data units (the y axis spans 0-1).
        label : bool, optional
            If True, write a text label next to each gene.
        labels : dict, optional
            Mapping from gene ID to the text to show instead of the ID.
        label_unnamed : bool, optional
            If False and `labels` is given, genes missing from `labels` are
            left unlabelled.
        barh_kwargs : dict, optional
            Extra keyword arguments for `ax.broken_barh`; the colour defaults
            to black. The dict passed in is not modified.

        """

        if stop is None:
            stop = len(self.genome[chrom])

        if ax is None:
            fig, ax = plt.subplots(figsize=(7, 1))
            sns.despine(ax=ax, offset=5)

        # materialise the records once; they are iterated up to three times
        # below and the petl view is lazy
        genes = list(self.get_features(chrom, start, stop).eq('type', 'gene').records())

        fwd_ranges = [(g.start, (g.end - g.start)) for g in genes if g.strand == '+']
        rev_ranges = [(g.start, (g.end - g.start)) for g in genes if g.strand == '-']
        # work on a copy so the caller's dict is left untouched
        barh_kwargs = dict(barh_kwargs) if barh_kwargs else dict()
        barh_kwargs.setdefault('color', 'k')
        ax.broken_barh(fwd_ranges, (.5, height), **barh_kwargs)
        ax.broken_barh(rev_ranges, (.5-height, height), **barh_kwargs)
        ax.set_ylim(0, 1)
        ax.axhline(.5, color='k', linestyle='-')
        ax.set_xlim(start, stop)
        ax.set_yticks([.5-(height/2), .5+(height/2)])
        ax.set_yticklabels(['-', '+'])
        ax.set_ylabel('genes', rotation=0, ha='right', va='center')

        if label:
            for gene in genes:
                gid = gene.attributes['ID']
                has_name = bool(labels) and gid in labels
                if labels and not has_name and not label_unnamed:
                    continue
                text = labels[gid] if has_name else gid
                # keep the label anchor inside the plotted region
                x = min(max(gene.start, start), stop)
                if gene.strand == '+':
                    ax.text(x, .5 + height, text, rotation=45, fontsize=6, ha='left', va='bottom')
                else:
                    ax.text(x, .5 - height, text, rotation=-45, fontsize=6, ha='left', va='top')

    @functools.lru_cache(maxsize=None)
    def equally_accessible_windows(self, chrom, size):
        """Return windows over `chrom` holding `size` accessible positions each.

        Each window is given as the first and last accessible position
        (1-based) of a run of `size` consecutive accessible positions. Results
        are cached per (instance, chrom, size).

        """
        is_accessible = self.accessibility[chrom]['is_accessible'][:]
        # array indices are 0-based; positions are reported 1-based
        pos_accessible = np.nonzero(is_accessible)[0] + 1
        windows = allel.stats.moving_statistic(pos_accessible, lambda v: [v[0], v[-1]], size=size)
        return windows


# singleton instance
release_phase1_ar3 = ReleasePhase1AR3()

NameError: name 'functools' is not defined

## Phase 1 AR3.1 data release

In [13]:
# class ReleasePhase1AR31(object):

#     def __init__(self):
        
#         self.release_dir = '/data/coluzzi/ag1000g/data/phase1/release/AR3.1'

#         zarr_callset_fn = os.path.join(self.release_dir, 'variation', 'main', 'zarr2', 'ag1000g.phase1.ar3')
#         if os.path.exists(zarr_callset_fn):
#             self.callset_fn = zarr_callset_fn
#             self.callset = zarr.open_group(self.callset_fn, mode='r')
#         zarr_callset_pass_fn = os.path.join(self.release_dir, 'variation', 'main', 'zarr2', 'ag1000g.phase1.ar3.pass')
#         if os.path.exists(zarr_callset_pass_fn):
#             self.callset_pass_fn = zarr_callset_pass_fn
#             self.callset_pass = zarr.open_group(self.callset_pass_fn, mode='r')
        
#         self.haplotypes_fn = os.path.join(self.release_dir, 
#                                           'haplotypes', 'main', 'zarr2', 'ag1000g.phase1.ar3.1.haplotypes')
#         self.haplotypes = zarr.open_group(self.haplotypes_fn, mode='r')

#         self.colony_samples = {
#             'AD0231-C': 'Ghana',
#             'AD0232-C': 'Kisumu',
#             'AD0254-C': 'Ghana',
#             'AD0255-C': 'Mali',
#             'AD0305-C': 'Mali',
#             'AD0306-C': 'Kisumu',
#             'AD0347-C': 'Pimperena',
#             'AD0348-C': 'Mali',
#         }

#         # TODO double check this
#         self.colony_sample_sex = {
#             'AD0231-C': 'F',
#             'AD0232-C': 'M',
#             'AD0254-C': 'F',
#             'AD0255-C': 'M',
#             'AD0305-C': 'F',
#             'AD0306-C': 'M',
#             'AD0347-C': 'F',
#             'AD0348-C': 'M',    
#         }

#         self.phased_samples = [str(s, 'ascii') for s in self.haplotypes['3R']['samples'][:]]
#         self.haplotype_labels = list(itertools.chain(*[[s + 'a', s + 'b'] for s in self.phased_samples]))

#         self.tbl_haplotypes = (
#             etl
#             .empty()
#             .addcolumn('label', self.haplotype_labels)
#             .addrownumbers(start=0)
#             .rename('row', 'index')
#             .addfield('ox_code', lambda row: row.label[:-1])
#             .hashleftjoin(release_phase1_ar3.tbl_samples.cutout('index'), key='ox_code')
#             .convert('population', 
#                      lambda v, row: 'colony' if row.ox_code in self.colony_samples else v, 
#                      pass_row=True)
#             .convert('region', 
#                      lambda v, row: colony_samples[row.ox_code] if row.ox_code in self.colony_samples else v, 
#                      pass_row=True)
#             .convert('sex', 
#                      lambda v, row: row.sex if row.ox_code not in self.colony_samples else self.colony_sample_sex[row.ox_code], 
#                      pass_row=True)
#             .convert('m_s', lambda v, row: (row.m_s if row.m_s is not None 
#                                             else 'M' if row.population == 'colony' and row.region == 'Mali'
#                                             else 'S' if row.population == 'colony' and row.region == 'Pimperena'
#                                             else '?'), pass_row=True)  
#             .addfield('label_aug', lambda row: '%s [%s, %s, %s, %s]' % (row.label, row.country, row.region, row.m_s, row.sex))
#             .cut('index', 'label', 'ox_code', 'population', 'label_aug', 'country', 'region', 'sex', 'm_s', 'kt_2la', 'kt_2rb')
#         )
        
#         self.lkp_haplotypes = self.tbl_haplotypes.recordlookupone('label')

#         self.n_haplotypes = self.tbl_haplotypes.nrows()
#         self.n_haplotype_pairs = (self.n_haplotypes * (self.n_haplotypes - 1)) // 2

#         self.df_haplotypes = self.tbl_haplotypes.todataframe(index='index')

#     @functools.lru_cache(maxsize=None)
#     def load_haplotypes(chrom, start, stop):
#         """Load haplotypes into memory (uncompressed) for a given genome region."""

#         pos = allel.SortedIndex(self.haplotypes[chrom]['variants']['POS'][:])
#         loc = pos.locate_range(start, stop)

#         g = allel.GenotypeArray(self.haplotypes[chrom]['calldata']['genotype'][loc])

#         return g.to_haplotypes()

    
# # singleton instance
# release_phase1_ar3_1 = ReleasePhase1AR31()

## Other utilities

In [14]:
def fig_linear_genome(plotf, genome, chromosomes=None, fig=None,
                      bottom=0, height=1, width_factor=1.08, chrom_pad=0.035,
                      clip_patch_kwargs=None, **kwargs):
    """Lay out one axes per chromosome side by side and plot into each.

    Axes widths are proportional to chromosome length. For every chromosome
    `plotf(chrom=..., ax=..., clip_patch=..., **kwargs)` is called, where
    `clip_patch` is a chromosome-arm shaped patch in axes coordinates.

    Parameters
    ----------
    plotf : callable
        Plotting function, called once per chromosome as described above.
    genome : mapping
        `len(genome[chrom])` must give the chromosome length.
    chromosomes : sequence of str, optional
        Chromosomes to draw, left to right; defaults to 2R, 2L, 3R, 3L, X.
    fig : matplotlib figure, optional
        Figure to draw into; a new 8x1 inch figure is created if omitted.
    bottom, height : float, optional
        Vertical placement of the axes, in figure coordinates.
    width_factor : float, optional
        Total genome length is scaled by this factor when computing widths.
    chrom_pad : float, optional
        Gap added after the 2L and 3L axes, in figure coordinates.
    clip_patch_kwargs : dict, optional
        Keyword arguments for the clip patch; edgecolor 'k', facecolor 'none'
        and lw 1 are used unless overridden. The dict passed in is not modified.
    **kwargs
        Passed through to `plotf`.

    Returns
    -------
    axs : dict
        Mapping from chromosome name to its axes.

    """
    from matplotlib.path import Path
    from matplotlib.patches import PathPatch

    if chromosomes is None:
        chromosomes = ['2R', '2L', '3R', '3L', 'X']
    chrom_sizes = [len(genome[chrom]) for chrom in chromosomes]
    genome_size = sum(chrom_sizes)

    if fig is None:
        fig = plt.figure(figsize=(8, 1))

    # copy so that the defaults below do not leak into the caller's dict
    clip_patch_kwargs = dict(clip_patch_kwargs) if clip_patch_kwargs else dict()
    clip_patch_kwargs.setdefault('edgecolor', 'k')
    clip_patch_kwargs.setdefault('facecolor', 'none')
    clip_patch_kwargs.setdefault('lw', 1)

    # clip outlines in axes coordinates; all are closed seven-vertex polygons
    codes = [Path.MOVETO, Path.LINETO, Path.LINETO, Path.LINETO, Path.LINETO, Path.LINETO, Path.CLOSEPOLY]
    verts_right_arm = [(0.01, 0.02), (0.9, 0.02), (1.01, 0.3), (1.01, 0.7), (0.9, .98), (0.01, .98), (0.01, 0.02)]
    verts_x = [(0.01, 0.02), (0.9, 0.02), (0.99, 0.3), (0.99, 0.7), (0.9, .98), (0.01, .98), (0.01, 0.02)]
    verts_other = [(0.1, 0.02), (.99, 0.02), (.99, .98), (.1, .98), (-0.01, .7), (-0.01, .3), (0.1, 0.02)]

    left = 0
    axs = dict()
    for chrom, chrom_size in zip(chromosomes, chrom_sizes):

        # calculate width needed for this chrom
        width = chrom_size / (genome_size * width_factor)

        # create axes with a transparent background
        ax = fig.add_axes([left, bottom, width, height])
        # set_axis_bgcolor is absent from newer matplotlib; prefer set_facecolor
        set_background = getattr(ax, 'set_facecolor', None) or ax.set_axis_bgcolor
        set_background((1, 1, 1, 0))
        axs[chrom] = ax

        # construct clip path
        if chrom in {'2R', '3R'}:
            verts = verts_right_arm
        elif chrom == "X":
            verts = verts_x
        else:
            verts = verts_other
        path = Path(verts, codes)
        clip_patch = PathPatch(path, transform=ax.transAxes, **clip_patch_kwargs)

        # do the plotting
        plotf(chrom=chrom, ax=ax, clip_patch=clip_patch, **kwargs)

        # increment left coordinate
        left += width
        if chrom in {'2L', '3L'}:
            left += chrom_pad

    return axs


class GenomeFigure(object):
    """Figure with one subplot per chromosome on a 3x2 grid.

    All subplots share the same x scale (that of the longest chromosome).
    Subplots in the left column are right-aligned and those in the right column
    left-aligned. X ticks are placed every 5 Mbp and labelled in Mbp.

    Parameters
    ----------
    genome : mapping
        `np.array(genome[chrom]).size` must give the chromosome length.
    chromosomes : list of str, optional (keyword only)
        Chromosomes to draw; defaults to 2R, 2L, 3R, 3L, X.
    *args, **kwargs
        Passed through to `plt.figure`.

    Attributes
    ----------
    fig : the matplotlib figure
    ax : dict mapping chromosome name to its axes

    """

    def __init__(self, genome, *args, **kwargs):
        self.chromosomes = kwargs.pop('chromosomes', ['2R', '2L', '3R', '3L', 'X'])
        # measure each chromosome once; the conversion to an array is costly
        sizes = [np.array(genome[chrom]).size for chrom in self.chromosomes]
        maxchrsize = max(sizes)
        fig = plt.figure(*args, **kwargs)
        self.fig = fig
        self.ax = dict()
        for i, (chrom, size) in enumerate(zip(self.chromosomes, sizes)):
            ax = fig.add_subplot(3, 2, i+1)
            self.ax[chrom] = ax
            if i % 2 == 1:
                # right-hand column: start at 0, y axis on the right
                sns.despine(ax=ax, offset=10, top=True, left=True, right=False)
                ax.set_xlim(0, maxchrsize)
                ax.yaxis.tick_right()
                ax.yaxis.set_label_position('right')
            else:
                # left-hand column: end at the chromosome end, y axis on the left
                ax.set_xlim(size - maxchrsize, size)
                ax.yaxis.tick_left()
                sns.despine(ax=ax, offset=10, top=True, left=False, right=True)
            ticks, ticklabels = self._megabase_ticks(size)
            ax.set_xticks(ticks)
            ax.set_xticklabels(ticklabels)
            ax.set_title(chrom, fontweight='bold')
            ax.xaxis.tick_bottom()
        fig.tight_layout()

    @staticmethod
    def _megabase_ticks(size, step=5):
        """Return (tick positions in bp, tick labels in Mbp) every `step` Mbp below `size`."""
        ticks = list(range(0, size, int(step * 1e6)))
        # derive the labels from the ticks so the two always have equal length
        ticklabels = [tick // 10**6 for tick in ticks]
        return ticks, ticklabels

    def apply(self, f, **kwargs):
        """Call `f(chrom, ax, **kwargs)` for each chromosome.

        A `chromosomes` keyword restricts (or reorders) the chromosomes
        visited; it is not passed on to `f`.

        """
        chromosomes = kwargs.pop('chromosomes', self.chromosomes)
        for chrom in chromosomes:
            ax = self.ax[chrom]
            f(chrom, ax, **kwargs)


## Checks

In [15]:
release_phase1_ar3.get_features('2R')

0|seqid,1|source,2|type,3|start,4|end,5|score,6|strand,7|phase,8|attributes
2R,VectorBase,contig,1,61545105,.,.,.,"{'topology': 'linear', 'localization': 'chromosomal', 'translation_table': '1', 'ID': '2R', 'molecule_type': 'dsDNA'}"
2R,VectorBase,exon,6577,7219,.,+,.,"{'Parent': 'AGAP001096-RA', 'ID': 'AGAP001096-RA-E1A'}"
2R,VectorBase,five_prime_UTR,6577,6717,.,+,.,{'Parent': 'AGAP001096-RA'}
2R,VectorBase,gene,6577,7851,.,+,.,"{'biotype': 'protein_coding', 'ID': 'AGAP001096'}"
2R,VectorBase,mRNA,6577,7851,.,+,.,"{'description': 'hypothetical protein', 'Parent': 'AGAP001096', 'Dbxref': 'RefSeq_NA:XM_551751,RefSeq_NA:XM_551751,RefSeq_mRNA_predicted:XM_551751,RefSeq_Prot:XP_551751,RefSeq_rna_predicted:XM_551751,GenBank:AAAB01008987,UniProtKB:Q380A9,protein_id:EAL38657,UniParc:UPI0001D4BB89,EMBL:AAAB01008987,EMBL_predicted:AAAB01008987,RefSeq_peptide_predicted:XP_551751,Uniprot/SPTREMBL_predicted:Q380A9,Uniprot/SPTREMBL:Q380A9,protein_id_predicted:EAL38657', 'ID': 'AGAP001096-RA'}"


In [16]:
try:
    release_phase1_ar3.callset
except AttributeError as e:
    print(e)

'ReleasePhase1AR3' object has no attribute 'callset'


In [17]:
try:
    release_phase1_ar3.callset_pass
except AttributeError as e:
    print(e)

'ReleasePhase1AR3' object has no attribute 'callset_pass'


In [18]:
release_phase1_ar3.accessibility

<HDF5 file "accessibility.h5" (mode r)>

In [19]:
release_phase1_ar3.tbl_samples

0|index,1|ox_code,2|src_code,3|sra_sample_accession,4|population,5|country,6|region,7|contributor,8|contact,9|year,10|m_s,11|sex,12|n_sequences,13|mean_coverage,14|latitude,15|longitude,16|kt_2la,17|kt_2rb,18|kdr_1014,19|rdl_296,20|f_roh_100kb,21|count_roh_100kb,22|sum_roh_100kb,23|pca_3R_free_pc1,24|pca_3R_free_pc2,25|pca_3R_free_pc3,26|pca_3R_free_pc4,27|pca_3L_free_pc1,28|pca_3L_free_pc2,29|pca_3L_free_pc3,30|pca_3L_free_pc4,31|pca_2La_pc1,32|pca_2La_pc2,33|pca_2La_pc3,34|pca_2La_pc4,35|pca_2Rb_pc1,36|pca_2Rb_pc2,37|pca_2Rb_pc3,38|pca_2Rb_pc4
0,AB0085-C,BF2-4,ERS223996,BFS,Burkina Faso,Pala,Austin Burt,Sam O'Loughlin,2012,S,F,89905852,28.01,11.15,-4.235,2,2,F/F,+/G,0.0421070108732297,29.0,8677141.0,23.6793027843,-18.5070597845,12.0772681634,19.2280281052,21.9745708329,10.7956364888,-8.29093998681,-18.5427676041,-55.5113887109,32.6821432536,-1.83373939012,-0.381984489959,-38.9343010433,31.9393825655,19.3456057294,7.36200361786
1,AB0087-C,BF3-3,ERS224013,BFM,Burkina Faso,Bana,Austin Burt,Sam O'Loughlin,2012,M,F,116706234,36.76,11.233,-4.472,2,1,+/F,S/S,0.0251457167876643,21.0,5181867.0,15.7323873539,-1.83549452896,32.9818625773,-89.1273994494,18.2159670837,-4.68369329711,-38.6030764755,76.9156170044,-51.8566327426,27.4012490277,2.58648776622,0.643100495995,-10.0728787877,29.0072663226,-21.7360873147,-30.3095623125
2,AB0088-C,BF3-5,ERS223991,BFM,Burkina Faso,Bana,Austin Burt,Sam O'Loughlin,2012,M,F,112090460,23.3,11.233,-4.472,2,0,F/F,+/S,0.0521155978560955,22.0,10739646.0,15.9144742202,-3.28434973341,34.0105947774,-96.0052368219,16.0453888787,-4.64766358271,-35.5533400502,73.9854882996,-50.9424557656,28.5722072116,3.07258266003,-0.643137464416,12.2817439138,22.2884170021,-43.6613009253,-51.5571401018
3,AB0089-C,BF3-8,ERS224031,BFM,Burkina Faso,Bana,Austin Burt,Sam O'Loughlin,2012,M,F,145350454,41.36,11.233,-4.472,2,1,F/F,+/S,0.027995121838662,12.0,5769054.0,15.4642367255,-1.17455185408,34.1155964442,-95.490470647,15.8183147597,-5.18368348209,-36.6215680942,76.4539930728,-51.1692466079,29.4149747305,2.19843391766,0.01356240989,-10.6640721216,29.6013328528,-16.8315586278,-25.4528535287
4,AB0090-C,BF3-10,ERS223936,BFM,Burkina Faso,Bana,Austin Burt,Sam O'Loughlin,2012,M,F,105012254,34.64,11.233,-4.472,2,0,+/F,+/S,0.0230731217828081,20.0,4754760.0,16.1964455617,-3.65671498231,32.0968063169,-89.2493228645,11.297362397,-7.25321336364,-36.0977560333,72.0363093309,-48.6074156394,26.4075318513,1.64322606755,0.102581833433,12.8971133302,22.1944442666,-48.8823782537,-52.4204868387


In [20]:
release_phase1_ar3.extras_dir

'/data/coluzzi/ag1000g/data/phase1/release/AR3/extras'

In [21]:
release_phase1_ar3.allele_counts

<HDF5 file "allele_counts.h5" (mode r)>

In [22]:
release_phase1_ar3.allele_counts_gq10

<HDF5 file "allele_counts.gq10.h5" (mode r)>

In [23]:
release_phase1_ar3.outgroup_alleles

<HDF5 file "outgroup_alleles.h5" (mode r)>

In [24]:
release_phase1_ar3.outgroup_allele_counts

<HDF5 file "outgroup_allele_counts.h5" (mode r)>

In [25]:
# release_phase1_ar3_1.haplotypes

Group(/, 6)
  arrays: 1; samples
  groups: 5; 2L, 2R, 3L, 3R, X
  store: DirectoryStore

In [26]:
# release_phase1_ar3_1.phased_samples[:5]

['AB0085-C', 'AB0087-C', 'AB0088-C', 'AB0089-C', 'AB0090-C']

In [27]:
# release_phase1_ar3_1.haplotype_labels[:5]

['AB0085-Ca', 'AB0085-Cb', 'AB0087-Ca', 'AB0087-Cb', 'AB0088-Ca']

In [28]:
# release_phase1_ar3_1.tbl_haplotypes

0|index,1|label,2|ox_code,3|population,4|label_aug,5|country,6|region,7|sex,8|m_s,9|kt_2la,10|kt_2rb
0,AB0085-Ca,AB0085-C,BFS,"AB0085-Ca [Burkina Faso, Pala, S, F]",Burkina Faso,Pala,F,S,2,2
1,AB0085-Cb,AB0085-C,BFS,"AB0085-Cb [Burkina Faso, Pala, S, F]",Burkina Faso,Pala,F,S,2,2
2,AB0087-Ca,AB0087-C,BFM,"AB0087-Ca [Burkina Faso, Bana, M, F]",Burkina Faso,Bana,F,M,2,1
3,AB0087-Cb,AB0087-C,BFM,"AB0087-Cb [Burkina Faso, Bana, M, F]",Burkina Faso,Bana,F,M,2,1
4,AB0088-Ca,AB0088-C,BFM,"AB0088-Ca [Burkina Faso, Bana, M, F]",Burkina Faso,Bana,F,M,2,0


In [29]:
release_phase1_ar3.tbl_chromatin

0|name,1|chrom,2|start,3|stop
CHX,X,20009764,24393108
CH2R,2R,58984778,61545105
CH2L,2L,1,2431617
PEU2L,2L,2487770,5042389
IH2L,2L,5078962,5788875


In [30]:
release_phase1_ar3.pop_colours

{'AOM': '#a93b03',
 'BFM': '#fa6949',
 'BFS': '#6aadd5',
 'CMS': '#2070b4',
 'GAS': '#37a055',
 'GNS': '#c5daee',
 'GWA': '#fece65',
 'KES': '#959595',
 'UGS': '#addea7',
 'colony': '#333333'}