In [2]:
import vcf

### Time to the most recent common ancestor of all anatomically modern humans

In [21]:
# Constant that the alpha estimates later in this notebook are multiplied by.
# NOTE(review): the unit is not stated anywhere in this notebook — presumably
# thousands of years; confirm and cite the source of the value.
a00_tmrca = 275

### Function definitions

In [3]:
def get_base(sample_id, vcf_rec):
    '''Look up the called base(s) of one sample in a VCF record.

    sample_id -- name of the sample, passed to the record's genotype() lookup
    vcf_rec   -- record object exposing genotype(sample_id)

    Returns the gt_bases attribute of that sample's call, unchanged.
    '''
    sample_call = vcf_rec.genotype(sample_id)
    return sample_call.gt_bases

In [4]:
def update_counter(vcf_rec, counter):
    '''Assign one VCF record to a branch and bump the tallies in place.

    The record's reference allele is compared with the bases of the
    'Chimp', 'ElSidron' and 'A00' samples. Records where any of the three
    samples has no base, or where the reference, El Sidron and A00 all
    agree, are ignored and leave the counter untouched.

    Every other record increments exactly one of the keys 'a'-'f' or
    'inconsistent', and then 'variable':

        a -- chimp == El Sidron, reference == A00, the two pairs differ
        b -- chimp == A00, El Sidron == reference, the two pairs differ
        c -- chimp == reference, A00 == El Sidron, the two pairs differ
        d -- reference differs from the other three, which agree
        e -- A00 differs from the other three, which agree
        f -- El Sidron differs from the other three, which agree

    A record matching none of these patterns is printed and counted as
    'inconsistent'. Nothing is returned.
    '''
    ref = vcf_rec.REF
    chimp = get_base('Chimp', vcf_rec)
    nea = get_base('ElSidron', vcf_rec)
    a00 = get_base('A00', vcf_rec)

    # a missing base in any of the three samples makes the site unusable
    if not chimp or not nea or not a00:
        return

    # reference, El Sidron and A00 all agree: not a variable site
    if ref == nea and nea == a00:
        return

    # patterns are listed in the order they must be tried; the first hit wins
    branch_patterns = (
        ('a', chimp == nea and nea != ref and ref == a00),
        ('b', chimp == a00 and a00 != nea and nea == ref),
        ('c', chimp == ref and ref != a00 and a00 == nea),
        ('d', ref != chimp and chimp == a00 and a00 == nea),
        ('e', a00 != chimp and chimp == nea and nea == ref),
        ('f', nea != chimp and chimp == a00 and a00 == ref),
    )
    branch = next((name for name, matched in branch_patterns if matched), None)

    if branch is None:
        # no pattern fits: report the site before counting it
        print(vcf_rec.POS, 'chimp: {}\tnea: {}\ta00: {}\tref: {}'.format(chimp, nea, a00, ref))
        branch = 'inconsistent'

    counter[branch] += 1
    counter['variable'] += 1

In [8]:
def count_mutations_in_vcf(vcf_file):
    '''Count the number of mutations on all possible branches
    of the tree with chimpanzee, El Sidron, A00 and hg19 in
    a given VCF.

    vcf_file -- path of the VCF to read; it is opened in binary mode and
                only records fetched for chromosome 'Y' are examined

    Returns a dict with the keys 'a'-'f', 'inconsistent' and 'variable',
    each holding the number of records update_counter assigned to it.
    '''
    counter = {i : 0 for i in list('abcdef') + ['inconsistent', 'variable']}

    # The context manager closes the file handle once the tally is done;
    # previously the handle was opened inline and never closed.
    with open(vcf_file, 'rb') as vcf_handle:
        vcf_reader = vcf.Reader(vcf_handle)

        for rec in vcf_reader.fetch('Y'):
            update_counter(rec, counter)

    return counter

In [16]:
def calc_proportions(counter):
    '''Print, for each branch a-f, its count as a percentage of counter['variable'].

    counter -- dict with the keys 'a'-'f' and 'variable'

    Output goes to stdout, one line per branch; nothing is returned.
    '''
    line_template = 'proportion of mutations on branch {}: {:.2%}'
    for branch in 'abcdef':
        fraction = counter[branch] / counter['variable']
        print(line_template.format(branch, fraction))

In [23]:
def branch_a_prop(counter):
    '''Return the branch a count as a fraction of the a + d + e counts.

    counter -- dict with at least the keys 'a', 'd' and 'e'
    '''
    on_a = counter['a']
    on_a_d_e = on_a + counter['d'] + counter['e']
    return on_a / on_a_d_e

def calc_alpha_1(counter):
    '''Return (1 + p) / (1 - p), where p is branch_a_prop(counter).

    This is the quantity the notebook reports as the "method 1" estimate
    of alpha.
    '''
    prop_a = branch_a_prop(counter)
    numerator = 1 + prop_a
    denominator = 1 - prop_a
    return numerator / denominator

### Analysis of exome data

In [9]:
# Tally branch mutations over the chromosome Y records of the merged exome VCF.
# The path is relative, so it resolves against the kernel's working directory.
exome_counts = count_mutations_in_vcf('../vcf/merged_exome.vcf.gz')

In [10]:
exome_counts

{'a': 15,
 'b': 3,
 'c': 0,
 'd': 15,
 'e': 18,
 'f': 48,
 'inconsistent': 0,
 'variable': 99}

### Counts from Mendez et al.

In [15]:
# Branch tallies entered by hand, in the same layout count_mutations_in_vcf returns.
# NOTE(review): branches a-f sum to 143 while 'variable' is 146 — confirm
# both against the publication these values were taken from.
mendez_counts = {
    'a': 24,
    'b': 4,
    'c': 0,
    'd': 16,
    'e': 22,
    'f': 77,
    'inconsistent': 0,
    'variable': 146,
}

mendez_counts

{'a': 24,
 'b': 4,
 'c': 0,
 'd': 16,
 'e': 22,
 'f': 77,
 'inconsistent': 0,
 'variable': 146}

### Proportions of branch mutations in our analysis of exome data

In [17]:
calc_proportions(exome_counts)

proportion of mutations on branch a: 15.15%
proportion of mutations on branch b: 3.03%
proportion of mutations on branch c: 0.00%
proportion of mutations on branch d: 15.15%
proportion of mutations on branch e: 18.18%
proportion of mutations on branch f: 48.48%


### Proportions of branch mutations in Mendez et al. data

In [18]:
calc_proportions(mendez_counts)

proportion of mutations on branch a: 16.44%
proportion of mutations on branch b: 2.74%
proportion of mutations on branch c: 0.00%
proportion of mutations on branch d: 10.96%
proportion of mutations on branch e: 15.07%
proportion of mutations on branch f: 52.74%


# Estimation of $\alpha$ &mdash; method 1 (exome data)

In [28]:
# Method 1 estimate from the exome tallies: (1 + p) / (1 - p) with p = a / (a + d + e).
exome_alpha = calc_alpha_1(exome_counts)

In [40]:
exome_alpha

1.9090909090909092

In [29]:
# NOTE(review): 1.64, 2.14 and 2.89 are unexplained literals — presumably
# published alpha values (an estimate and its bounds) from Mendez et al.;
# TODO confirm and cite. Each is scaled by a00_tmrca for comparison below.
1.64 * a00_tmrca, 2.14 * a00_tmrca, 2.89 * a00_tmrca

(451.0, 588.5, 794.75)

In [30]:
# Scale a00_tmrca by the alpha estimated from the exome tallies.
exome_alpha * a00_tmrca

525.0

# 570 kb data

In [31]:
# Tally branch mutations over the chromosome Y records of the merged Lippold VCF.
# The path is relative, so it resolves against the kernel's working directory.
lippold_counts = count_mutations_in_vcf('../vcf/merged_lippold.vcf.gz')

In [34]:
lippold_counts

{'a': 62,
 'b': 4,
 'c': 12,
 'd': 180,
 'e': 87,
 'f': 131,
 'inconsistent': 0,
 'variable': 476}

### Proportions of branch mutations in our analysis of the 570 kb data

In [36]:
calc_proportions(lippold_counts)

proportion of mutations on branch a: 13.03%
proportion of mutations on branch b: 0.84%
proportion of mutations on branch c: 2.52%
proportion of mutations on branch d: 37.82%
proportion of mutations on branch e: 18.28%
proportion of mutations on branch f: 27.52%


# Estimation of $\alpha$ &mdash; method 1 (570 kb data)

In [37]:
# Method 1 estimate from the 570 kb tallies: (1 + p) / (1 - p) with p = a / (a + d + e).
lippold_alpha = calc_alpha_1(lippold_counts)

In [38]:
lippold_alpha

1.4644194756554307

In [39]:
# Scale by the shared a00_tmrca constant instead of repeating its literal
# value, so this cell stays in step with the exome calculation if it changes.
lippold_alpha * a00_tmrca

402.71535580524346