In [38]:
from statistics import mean, stdev
from math import sqrt
import vcf

### Time to the most recent common ancestor of all anatomically modern humans

In [2]:
# TMRCA values that the cells below multiply by alpha to obtain T_NR.
# NOTE(review): units are not stated in this notebook (presumably thousands of years) — confirm.
genome_a00_tmrca = 275  # multiplied with alphas from both the exome and the 560 kb data below
exome_a00_tmrca = 120   # multiplied with alphas from the exome data below

### Counts of branch mutations published by Mendez et al.

In [3]:
# Branch mutation counts published by Mendez et al., keyed the same way as the
# counters built by count_mutations_in_vcf().
# NOTE(review): 'variable' (146) is larger than the sum of branches a-f (143)
# while 'inconsistent' is 0 — confirm the figures against the publication.
mendez_counts = {
    'a': 24,
    'b': 4,
    'c': 0,
    'd': 16,
    'e': 22,
    'f': 77,
    'inconsistent': 0,
    'variable': 146,
}

mendez_counts

{'a': 24,
 'b': 4,
 'c': 0,
 'd': 16,
 'e': 22,
 'f': 77,
 'inconsistent': 0,
 'variable': 146}

### Function definitions

In [4]:
def get_base(sample_id, vcf_rec):
    '''Look up the base that a sample carries at the site of a VCF record.

    The special name 'ref' yields the record's reference allele (``REF``);
    any other name is looked up among the record's genotype calls and its
    ``gt_bases`` value is returned.
    '''
    if sample_id != 'ref':
        return vcf_rec.genotype(sample_id).gt_bases
    return vcf_rec.REF

In [5]:
def update_counter(vcf_rec, counter, human):
    '''Classify one VCF record by branch and update the mutation counter in place.

    The bases of 'Chimp', the chosen human sample, 'ElSidron' (``nea``) and
    'A00' are compared. Sites where any of the four bases is missing, or where
    human, El Sidron and A00 all agree, are skipped without touching the
    counter. Every other site increments ``counter['variable']`` and exactly
    one of the following keys:

    * 'a' -- human and A00 share a base that differs from chimp == El Sidron
    * 'b' -- human and El Sidron share a base that differs from chimp == A00
    * 'c' -- A00 and El Sidron share a base that differs from chimp == human
    * 'd' -- only the human sample differs from the other three
    * 'e' -- only A00 differs from the other three
    * 'f' -- only El Sidron differs from the other three
    * 'inconsistent' -- none of the patterns above; the site is also printed

    Parameters:
        vcf_rec: VCF record providing ``REF``, ``POS`` and ``genotype()``.
        counter: dict with the keys listed above plus 'variable'.
        human: sample name of the human to use, or 'ref' for the reference allele.

    Returns None.
    '''
    chimp, hum, nea, a00 = (get_base(s, vcf_rec) for s in ['Chimp', human, 'ElSidron', 'A00'])

    # skip sites with missing base in any of the samples
    if not (chimp and hum and nea and a00): return
    # skip non-variable sites
    if (hum == nea == a00): return

    # branch a mutation
    if chimp == nea != hum == a00:
        counter['a'] += 1

    # branch b mutation
    elif chimp == a00 != nea == hum:
        counter['b'] += 1

    # branch c mutation
    elif chimp == hum != a00 == nea:
        counter['c'] += 1

    # branch d mutation
    elif hum != chimp == a00 == nea:
        counter['d'] += 1

    # branch e mutation
    elif a00 != chimp == nea == hum:
        counter['e'] += 1

    # branch f mutation
    elif nea != chimp == a00 == hum:
        counter['f'] += 1

    # inconsistent site
    else:
        # Report the human base via `hum` (the previous code referenced an
        # undefined name here) and separate the last field with a real tab.
        print(vcf_rec.POS, 'chimp: {}\tnea: {}\ta00: {}\thum: {}'.format(chimp, nea, a00, hum))
        counter['inconsistent'] += 1

    counter['variable'] += 1

In [6]:
def count_mutations_in_vcf(vcf_file, human='ref'):
    '''Count the number of mutations on all possible branches
    of the tree with chimpanzee, El Sidron, A00 and a human sample
    in a given VCF.

    Parameters:
        vcf_file: path of the VCF file to read; records are fetched for
            chromosome 'Y' only.
        human: name of the human sample to compare against, or 'ref'
            (the default) to use the reference allele of each record.

    Returns a dict with one count per branch ('a'-'f') plus the
    'inconsistent' and 'variable' totals.
    '''
    counter = {i : 0 for i in list('abcdef') + ['inconsistent', 'variable']}

    # This function is called once per sample, so close the file handle
    # deterministically instead of leaving it to the garbage collector.
    with open(vcf_file, 'rb') as vcf_handle:
        vcf_reader = vcf.Reader(vcf_handle)
        for rec in vcf_reader.fetch('Y'):
            update_counter(rec, counter, human)

    return counter

In [7]:
def calc_proportions(counter):
    '''Print, for each branch a-f, its mutation count and its share of
    counter['variable'] formatted as a percentage.
    '''
    template = 'number of mutations on branch {}: \t{}\t({:.2%})'
    for branch in 'abcdef':
        n_mutations = counter[branch]
        share = n_mutations / counter['variable']
        print(template.format(branch, n_mutations, share))

In [8]:
def branch_a_prop(counter):
    '''Return the branch 'a' count as a fraction of the combined
    counts of branches 'a', 'd' and 'e'.
    '''
    on_a = counter['a']
    total_ade = on_a + counter['d'] + counter['e']
    return on_a / total_ade

def calc_alpha_1(counter):
    '''Return alpha = (1 + p) / (1 - p), where p is the proportion
    given by branch_a_prop(counter).
    '''
    prop_a = branch_a_prop(counter)
    numerator = 1 + prop_a
    denominator = 1 - prop_a
    return numerator / denominator

### Proportions of branch mutations in Mendez et al. data

In [29]:
calc_proportions(mendez_counts)

number of mutations on branch a: 	24	(16.44%)
number of mutations on branch b: 	4	(2.74%)
number of mutations on branch c: 	0	(0.00%)
number of mutations on branch d: 	16	(10.96%)
number of mutations on branch e: 	22	(15.07%)
number of mutations on branch f: 	77	(52.74%)


# exome 120 kb &mdash; using different modern humans as 'ref'

In [11]:
# Samples used in turn as the human in the comparison; 'ref' selects the
# reference allele of each VCF record instead of a genotyped sample.
samples = ['ref', 'Dai', 'French', 'Han', 'Mandenka', 'Mbuti', 'Papuan', 'San', 'Sardinian', 'Yoruba', 'Karitiana', 'Australian', 'Dinka']

In [12]:
# One alpha per entry of `samples`, appended by the loop in the next cell.
# Re-run this cell before re-running that loop, otherwise values accumulate.
exome_alphas = []

In [13]:
# For every sample: count branch mutations in the exome VCF, print the
# per-branch proportions, then print alpha and T_NR (alpha * exome_a00_tmrca).
for s in samples:
    print(s, end='\n\n')
    counts = count_mutations_in_vcf('../vcf/merged_exome.vcf.gz', s)
    calc_proportions(counts)
    print()
    alpha = calc_alpha_1(counts)
    # T_NR is in the hundreds, so '{:.2}' (two significant digits) would print
    # it in scientific notation such as '2.3e+02'; use fixed-point instead.
    print('alpha: {:.2}\nT_NR: {:.1f}'.format(alpha, alpha * exome_a00_tmrca))
    print('======================================\n\n')

    exome_alphas.append(alpha)

ref

number of mutations on branch a: 	15	(15.15%)
number of mutations on branch b: 	3	(3.03%)
number of mutations on branch c: 	0	(0.00%)
number of mutations on branch d: 	15	(15.15%)
number of mutations on branch e: 	18	(18.18%)
number of mutations on branch f: 	48	(48.48%)

alpha: 1.9
T_NR: 229.0909090909091


Dai

number of mutations on branch a: 	14	(13.86%)
number of mutations on branch b: 	3	(2.97%)
number of mutations on branch c: 	1	(0.99%)
number of mutations on branch d: 	17	(16.83%)
number of mutations on branch e: 	18	(17.82%)
number of mutations on branch f: 	48	(47.52%)

alpha: 1.8
T_NR: 215.99999999999997


French

number of mutations on branch a: 	14	(14.43%)
number of mutations on branch b: 	3	(3.09%)
number of mutations on branch c: 	0	(0.00%)
number of mutations on branch d: 	14	(14.43%)
number of mutations on branch e: 	18	(18.56%)
number of mutations on branch f: 	48	(49.48%)

alpha: 1.9
T_NR: 225.0


Han

number of mutations on branch a: 	15	(15.15%)
number of mu

In [47]:
# Average alpha over all entries collected in exome_alphas.
mean(exome_alphas)

1.8621874609431173

In [23]:
# Mean exome alpha scaled by exome_a00_tmrca (same alpha * TMRCA product printed as T_NR in the loop).
mean(exome_alphas) * exome_a00_tmrca

223.46249531317406

In [46]:
# Same mean exome alpha, scaled by genome_a00_tmrca instead.
mean(exome_alphas) * genome_a00_tmrca

512.1015517593572

In [49]:
# NOTE(review): 1.9 is hard-coded — presumably the rounded alpha printed for 'ref' in the exome loop; confirm.
1.9 * exome_a00_tmrca

228.0

In [50]:
# NOTE(review): 1.9 is hard-coded — presumably the rounded alpha printed for 'ref' in the exome loop; confirm.
1.9 * genome_a00_tmrca

522.5

# 560 kb &mdash; using different modern humans as 'ref'

In [17]:
# Samples used in turn as the human in the comparison; 'ref' selects the
# reference allele of each VCF record instead of a genotyped sample.
samples = ['ref', 'Dai', 'French', 'Han', 'Mandenka', 'Mbuti', 'Papuan', 'San', 'Sardinian', 'Yoruba', 'Karitiana', 'Australian', 'Dinka']
# One alpha per entry of `samples`, appended by the loop in the next cell.
lippold_alphas = []

In [18]:
# For every sample: count branch mutations in the Lippold VCF, print the
# per-branch proportions, then print alpha and T_NR (alpha * genome_a00_tmrca).
for s in samples:
    print(s, end='\n\n')
    counts = count_mutations_in_vcf('../vcf/merged_lippold.vcf.gz', s)
    calc_proportions(counts)
    print()
    alpha = calc_alpha_1(counts)
    # T_NR is in the hundreds, so '{:.2}' (two significant digits) would print
    # it in scientific notation such as '4e+02'; use fixed-point instead.
    print('alpha: {:.2}\nT_NR: {:.1f}'.format(alpha, alpha * genome_a00_tmrca))
    print('======================================\n\n')

    lippold_alphas.append(alpha)

ref

number of mutations on branch a: 	62	(13.03%)
number of mutations on branch b: 	4	(0.84%)
number of mutations on branch c: 	12	(2.52%)
number of mutations on branch d: 	180	(37.82%)
number of mutations on branch e: 	87	(18.28%)
number of mutations on branch f: 	131	(27.52%)

alpha: 1.5	T_NR: 402.71535580524346


Dai

number of mutations on branch a: 	55	(15.15%)
number of mutations on branch b: 	4	(1.10%)
number of mutations on branch c: 	3	(0.83%)
number of mutations on branch d: 	84	(23.14%)
number of mutations on branch e: 	88	(24.24%)
number of mutations on branch f: 	129	(35.54%)

alpha: 1.6	T_NR: 450.8720930232558


French

number of mutations on branch a: 	62	(13.28%)
number of mutations on branch b: 	4	(0.86%)
number of mutations on branch c: 	13	(2.78%)
number of mutations on branch d: 	170	(36.40%)
number of mutations on branch e: 	87	(18.63%)
number of mutations on branch f: 	131	(28.05%)

alpha: 1.5	T_NR: 407.6848249027237


Han

number of mutations on branch a: 	55	(1

In [48]:
# Average alpha over all entries collected in lippold_alphas.
mean(lippold_alphas)

1.5795288366726294

In [25]:
# Mean Lippold alpha scaled by genome_a00_tmrca (same alpha * TMRCA product printed as T_NR in the loop).
mean(lippold_alphas) * genome_a00_tmrca

434.3704300849731