In [6]:
from __future__ import print_function, division, absolute_import
import datetime

In [7]:
import sys


def log(*msg):
    """Write a timestamped message to standard error.

    Every positional argument is converted with ``str`` and the pieces are
    joined with single spaces. The line is prefixed with the current local
    time followed by `` :: ``, and stderr is flushed right after writing so
    the message shows up immediately.
    """
    timestamp = str(datetime.datetime.now())
    text = ' '.join(str(part) for part in msg)
    print(timestamp + ' :: ' + text, file=sys.stderr)
    sys.stderr.flush()

In [8]:
from IPython.display import display_html


def pre(msg):
    """Display ``msg`` in the notebook as a small highlighted ``<pre>`` block.

    ``msg`` is converted with ``str`` and HTML-escaped before being embedded
    in the markup, so ``&``, ``<`` and ``>`` in the text are shown literally
    instead of being interpreted as entities or tags.
    """
    # '&' has to be escaped first; otherwise the ampersands introduced by the
    # '<' and '>' replacements would themselves be escaped a second time.
    text = (str(msg)
            .replace('&', '&amp;')
            .replace('<', '&lt;')
            .replace('>', '&gt;'))
    style = ('line-height: 100%; font-size: .7em; display: inline-block; '
             'white-space: pre; background-color: #ff6; '
             'border: 0px solid #ddd; padding: 1px')
    display_html('<pre style="' + style + '">' + text + '</pre>', raw=True)

    
def html(content):
    """Hand ``content`` to ``display_html`` as raw HTML data (``raw=True``)."""
    display_html(content, raw=True)
    

In [9]:
import numpy as np


# Short alias for ``np.count_nonzero``.
nnz = np.count_nonzero


def ntitv(ref, alt):
    """Count transitions and transversions for paired ref/alt alleles.

    Parameters
    ----------
    ref, alt : numpy.ndarray
        Arrays of single-character alleles (bytes or str); only the first
        character of each item is used and case is ignored. The two arrays
        are combined element-wise.

    Returns
    -------
    ti : int
        Number of pairs that are A<->G or C<->T.
    tv : int
        Number of all remaining pairs. Note that *every* pair that is not a
        transition is counted here, including pairs where ref equals alt or
        where either allele is not one of A, C, G, T.
    """
    pairs = np.char.upper(np.char.add(ref.astype('S1'), alt.astype('S1')))
    # `pairs` has a bytes ('S2') dtype, so compare against bytes literals.
    # On Python 3 a comparison with str literals never matches, which made
    # ti always 0; on Python 2 bytes and str literals are the same thing.
    is_transition = ((pairs == b'AG') | (pairs == b'GA') |
                     (pairs == b'CT') | (pairs == b'TC'))
    ti = np.count_nonzero(is_transition)
    tv = pairs.size - ti
    return ti, tv


def site_nucl_diversity(af, an):
    """Return ``(an / (an - 1)) * 2 * af * (1 - af)``.

    The site is treated as having two alleles with frequencies ``af`` and
    ``1 - af``. Scalars and numpy arrays both work since only arithmetic
    operators are used.

    NOTE(review): ``an`` is presumably the number of alleles called at the
    site (sample-size correction) -- confirm against callers. The division
    is expected to be true division (``from __future__ import division``).
    """
    other_af = 1 - af
    correction = an / (an - 1)
    return correction * 2 * af * other_af


def site_heterozygosity(af):
    """Return one minus the sum of squared allele frequencies per row.

    ``af`` is assumed to hold the frequencies of all alleles, with alleles
    along axis 1. The sum of squares along that axis is the probability of
    drawing the same allele twice; its complement is returned.
    """
    sum_of_squares = np.sum(np.power(af, 2), axis=1)
    return 1 - sum_of_squares


In [10]:
import os
import pyfasta

# Chromosome arm names: the four autosomal arms, then the same tuple plus 'X'.
autosomes = '2R', '2L', '3R', '3L'
chromosomes = autosomes + ('X',)

# AgamP3 reference FASTA. The genome is opened only when the file exists on
# this machine; otherwise a message is logged and `agamp3_genome` is left
# undefined. The key function keeps only the first whitespace-separated token
# of each key it is given.
agamp3_fasta_fn = '/data/anopheles/ag1000g/data/genome/AgamP3/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa'
if os.path.exists(agamp3_fasta_fn):
    agamp3_genome = pyfasta.Fasta(agamp3_fasta_fn, key_fn=lambda key: key.split()[0])
else:
    log('AgamP3 reference genome not available')

# AgamP3.7 base-features GFF3 path and the path of its sorted, gzipped
# counterpart (derived by appending a suffix).
agamp3_gff_fn = '/data/anopheles/ag1000g/data/genome/AgamP3/Anopheles-gambiae-PEST_BASEFEATURES_AgamP3.7.gff3'
agamp3_gff_sorted_fn = agamp3_gff_fn + '.sorted.gff.gz'

# AgamP4 reference FASTA, handled the same way as AgamP3 above:
# `agamp4_genome` is only defined when the file exists.
agamp4_fasta_fn = '/data/anopheles/ag1000g/data/genome/AgamP4/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa'
if os.path.exists(agamp4_fasta_fn):
    agamp4_genome = pyfasta.Fasta(agamp4_fasta_fn, key_fn=lambda key: key.split()[0])
else:
    log('AgamP4 reference genome not available')

# AgamP4.2 base-features GFF3 paths (as released and sorted).
agamp42_gff_fn = '/data/coluzzi/ag1000g/data/phase1/release/AR3/geneset/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.gff3.gz'
agamp42_gff_sorted_fn = '/data/coluzzi/ag1000g/data/phase1/release/AR3/geneset/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.sorted.gff3.gz'

# AgamP3 repeat-features GFF3 path.
agamp3_repeats_gff_fn = '/data/anopheles/ag1000g/data/genome/AgamP3/Anopheles-gambiae-PEST_REPEATFEATURES_AgamP3.gff3.gz'

2016-06-28 10:31:00.333752 :: AgamP4 reference genome not available


In [11]:
# Two-letter prefix associated with each country name.
# NOTE(review): presumably the prefix used in sample identifiers -- confirm.
country_to_prefix = {
    'Angola': 'AR',
    'Burkina Faso': 'AB',
    'Cameroon': 'AN',
    "Cote d'Ivoire": 'AY',
    'Equatorial Guinea': 'AQ',
    'France': 'AP',
    'Gabon': 'AS',
    'Gambia, The': 'AG',
    'Ghana': 'AA',
    'Guinea': 'AV',
    'Guinea-Bissau': 'AJ',
    'Kenya': 'AK',
    'Uganda': 'AC',
}
# Reverse lookup: prefix -> country name.
prefix_to_country = dict(
    (prefix, country) for country, prefix in country_to_prefix.items()
)