In [29]:
from statistics import mean, stdev
from math import sqrt
import vcf

In [30]:
def get_base(sample_id, vcf_rec):
    '''Look up the base carried by `sample_id` at the site in `vcf_rec`.

    The pseudo-sample 'ref' maps to the record's REF field; any other
    identifier is resolved through `vcf_rec.genotype(sample_id).gt_bases`.
    '''
    wants_reference = sample_id == 'ref'
    if not wants_reference:
        return vcf_rec.genotype(sample_id).gt_bases
    return vcf_rec.REF

In [31]:
# Mutation rate used below to convert mutation counts into times
# (presumably per base pair per year -- TODO confirm units and source).
mutation_rate = 7.4e-10

In [32]:
# Lengths of the three target regions (presumably in base pairs -- TODO confirm).
# Each one is multiplied by mutation_rate when a mutation count is converted to a time.
exome_length   = 118643
lippold_length = 556259
ychr_length    = 6913053

# Re-analysis of the Mendez et al. approach to time the A00 divergence

In [5]:
# Open the merged Y-chromosome VCF; `reader` is consumed by the counting loop in the next cell.
# NOTE(review): the handle passed to vcf.Reader is not closed in any visible cell.
vcf_file = '../../y-selection/vcf/merged_y.vcf.gz'
reader = vcf.Reader(open(vcf_file, 'rb'))

In [6]:
# Tally, over all 'Y' records, which branch each variable site falls on,
# using the bases of Chimp, A00, Ust_Ishim and the reference sequence.
counter = dict.fromkeys(list('adef') + ['inconsistent', 'variable'], 0)

for vcf_rec in reader.fetch('Y'):
    chimp = get_base('Chimp', vcf_rec)
    a00 = get_base('A00', vcf_rec)
    ui = get_base('Ust_Ishim', vcf_rec)
    hum = get_base('ref', vcf_rec)

    # a missing base in any of the four samples makes the site unusable
    if not all([chimp, a00, ui, hum]):
        continue
    # A00, Ust_Ishim and the reference agree: not a variable site
    if a00 == ui and ui == hum:
        continue

    if chimp == a00 and a00 != hum and hum == ui:
        # branch a mutation
        branch = 'a'
    elif chimp == a00 and a00 == ui and ui != hum:
        # branch d mutation
        branch = 'd'
    elif chimp == a00 and a00 == hum and hum != ui:
        # branch e mutation
        branch = 'e'
    elif chimp == hum and hum == ui and ui != a00:
        # branch f mutation
        branch = 'f'
    else:
        # pattern matches none of the branches above
        branch = 'inconsistent'

    counter[branch] += 1
    # every site that reaches this point counts as variable
    counter['variable'] += 1

In [7]:
counter

{'a': 1136, 'd': 253, 'e': 56, 'f': 1187, 'inconsistent': 43, 'variable': 2675}

In [8]:
# Average of the two mutation counts (a + d, and f), divided by
# (sequence length * mutation rate) to turn a mutation count into a time.
mean([counter['a'] + counter['d'], counter['f']]) / (ychr_length * mutation_rate)

251775.95782074006

In [35]:
# Same conversion using only the a + d count. 1136 and 253 are the 'a' and 'd'
# values from the counter output shown above, hard-coded here.
(1136 + 253) / (ychr_length * mutation_rate)

271519.25886103103

In [36]:
# Same conversion using only the f count. 1187 is the 'f' value from the
# counter output shown above, hard-coded here.
1187 / (ychr_length * mutation_rate)

232032.65678044912

# Direct estimate of A00/B-team divergence time

Using counts of private mutations on the A00 lineage.

## Exome

In [9]:
# Point `reader` at the merged exome VCF for the per-sample counts below.
# NOTE(review): the handle passed to vcf.Reader is not closed in any visible cell.
vcf_file = '../vcf/merged_exome.vcf.gz'
reader = vcf.Reader(open(vcf_file, 'rb'))

In [10]:
# Samples compared against A00 one at a time; 'ref' is resolved by get_base to the record's REF base.
samples = ['ref', 'Dai', 'French', 'Han', 'Mandenka', 'Mbuti', 'Papuan', 'San', 'Sardinian', 'Yoruba', 'Karitiana', 'Australian', 'Dinka']
# One (A00-specific, sample-specific) count pair per sample is appended by the loop in the next cell.
exome_counts = []

In [11]:
# For each sample, classify every usable 'Y' site by which of A00 and the
# sample differ from chimp, then keep the two lineage-specific counts.
# Re-running this cell appends a second set of pairs to exome_counts.
for s in samples:
    counter = dict.fromkeys(('a00', 'hum', 'both', 'total'), 0)

    for vcf_rec in reader.fetch('Y'):
        chimp = get_base('Chimp', vcf_rec)
        a00 = get_base('A00', vcf_rec)
        hum = get_base(s, vcf_rec)

        # a missing base in any of the three samples makes the site unusable
        if not all([chimp, a00, hum]):
            continue

        a00_derived = a00 != chimp
        hum_derived = hum != chimp

        if a00_derived and a00 == hum:
            # shared mutation
            counter['both'] += 1
        elif a00_derived and not hum_derived:
            # A00-specific mutation
            counter['a00'] += 1
        elif hum_derived and not a00_derived:
            # sample-specific mutation
            counter['hum'] += 1

        counter['total'] += 1

    exome_counts.append((counter['a00'], counter['hum']))

In [12]:
exome_counts

[(19, 19),
 (20, 20),
 (19, 18),
 (18, 20),
 (19, 25),
 (19, 20),
 (18, 16),
 (19, 22),
 (19, 20),
 (20, 21),
 (19, 18),
 (19, 15),
 (20, 22)]

In [13]:
# Average each (A00-specific, sample-specific) pair, average those values over all
# samples, then divide by (sequence length * mutation rate) to obtain a time.
exome_a00_tmrca = mean(mean(x) for x in exome_counts) / (exome_length * mutation_rate)

In [14]:
exome_a00_tmrca

220792.00791809204

## Lippold

In [15]:
# Point `reader` at the merged Lippold-region VCF for the per-sample counts below.
# NOTE(review): the handle passed to vcf.Reader is not closed in any visible cell.
vcf_file = '../vcf/merged_lippold.vcf.gz'
reader = vcf.Reader(open(vcf_file, 'rb'))

In [16]:
# Samples compared against A00 one at a time; 'ref' is resolved by get_base to the record's REF base.
samples = ['ref', 'Dai', 'French', 'Han', 'Mandenka', 'Mbuti', 'Papuan', 'San', 'Sardinian', 'Yoruba', 'Karitiana', 'Australian', 'Dinka']
# One (A00-specific, sample-specific) count pair per sample is appended by the loop in the next cell.
lippold_counts = []

In [17]:
# For each sample, classify every usable 'Y' site by which of A00 and the
# sample differ from chimp, then keep the two lineage-specific counts.
# Re-running this cell appends a second set of pairs to lippold_counts.
for s in samples:
    counter = dict.fromkeys(('a00', 'hum', 'both', 'total'), 0)

    for vcf_rec in reader.fetch('Y'):
        chimp = get_base('Chimp', vcf_rec)
        a00 = get_base('A00', vcf_rec)
        hum = get_base(s, vcf_rec)

        # a missing base in any of the three samples makes the site unusable
        if not all([chimp, a00, hum]):
            continue

        a00_derived = a00 != chimp
        hum_derived = hum != chimp

        if a00_derived and a00 == hum:
            # shared mutation
            counter['both'] += 1
        elif a00_derived and not hum_derived:
            # A00-specific mutation
            counter['a00'] += 1
        elif hum_derived and not a00_derived:
            # sample-specific mutation
            counter['hum'] += 1

        counter['total'] += 1

    lippold_counts.append((counter['a00'], counter['hum']))

In [18]:
lippold_counts

[(101, 197),
 (93, 96),
 (102, 187),
 (95, 131),
 (95, 118),
 (93, 108),
 (95, 88),
 (97, 96),
 (97, 131),
 (97, 158),
 (95, 145),
 (93, 98),
 (96, 92)]

In [19]:
# Average each (A00-specific, sample-specific) pair, average those values over all
# samples, then divide by (sequence length * mutation rate) to obtain a time.
lippold_a00_tmrca = mean(mean(x) for x in lippold_counts) / (lippold_length * mutation_rate)

In [20]:
lippold_a00_tmrca

270406.05260463274

## Whole Y chromosome

In [21]:
# Point `reader` at the merged whole-Y VCF for the per-sample counts below.
# NOTE(review): the handle passed to vcf.Reader is not closed in any visible cell.
vcf_file = '../../y-selection/vcf/merged_y.vcf.gz'
reader = vcf.Reader(open(vcf_file, 'rb'))

In [22]:
# Samples compared against A00 one at a time; 'ref' is resolved by get_base to the record's REF base.
samples = ['ref', 'Dai', 'French', 'Han', 'Mandenka', 'Mbuti', 'Papuan', 'San', 'Sardinian', 'Yoruba', 'Karitiana', 'Australian', 'Dinka']
# One (A00-specific, sample-specific) count pair per sample is appended by the loop in the next cell.
genome_counts = []

In [23]:
# For each sample, classify every usable 'Y' site by which of A00 and the
# sample differ from chimp, then keep the two lineage-specific counts.
# Re-running this cell appends a second set of pairs to genome_counts.
for s in samples:
    counter = dict.fromkeys(('a00', 'hum', 'both', 'total'), 0)

    for vcf_rec in reader.fetch('Y'):
        chimp = get_base('Chimp', vcf_rec)
        a00 = get_base('A00', vcf_rec)
        hum = get_base(s, vcf_rec)

        # a missing base in any of the three samples makes the site unusable
        if not all([chimp, a00, hum]):
            continue

        a00_derived = a00 != chimp
        hum_derived = hum != chimp

        if a00_derived and a00 == hum:
            # shared mutation
            counter['both'] += 1
        elif a00_derived and not hum_derived:
            # A00-specific mutation
            counter['a00'] += 1
        elif hum_derived and not a00_derived:
            # sample-specific mutation
            counter['hum'] += 1

        counter['total'] += 1

    genome_counts.append((counter['a00'], counter['hum']))

In [24]:
genome_counts

[(1212, 1388),
 (1188, 1343),
 (1192, 1365),
 (1185, 1344),
 (1194, 1378),
 (1190, 1362),
 (1195, 1328),
 (1193, 1296),
 (1189, 1376),
 (1196, 1375),
 (1194, 1362),
 (1189, 1362),
 (1207, 1293)]

In [25]:
# Average each (A00-specific, sample-specific) pair, average those values over all
# samples, then divide by (sequence length * mutation rate) to obtain a time.
chromosome_a00_tmrca = mean(mean(x) for x in genome_counts) / (ychr_length * mutation_rate)

In [26]:
chromosome_a00_tmrca

248828.7476121361

In [41]:
# Conversion of a single count; 1388 is the sample-specific count of the first
# sample ('ref') in the genome_counts output shown above, hard-coded here.
1388 / (ychr_length * mutation_rate)

271323.78063290933

# Re-analysis of the Mendez et al. approach to time the Neanderthal divergence

Using A00 TMRCAs estimated above.

### Time to the most recent common ancestor of all anatomically modern humans

In [175]:
# Fixed TMRCA values entered by hand (presumably in thousands of years and taken
# from a BEAST run outside this notebook -- TODO confirm units and provenance).
beast_genome_a00_tmrca = 275
beast_exome_a00_tmrca = 120

### Function definitions

In [56]:
def update_counter(vcf_rec, counter, human):
    '''Update the counter of mutations on each possible branch.

    Reads the bases of 'Chimp', the modern human `human`, 'ElSidron' and
    'A00' at the site in `vcf_rec` and increments exactly one of the keys
    'a'-'f' or 'inconsistent' in `counter`, followed by 'variable'.

    Sites with a missing base in any of the four samples, and sites at
    which the human, El Sidron and A00 bases all agree, leave `counter`
    unchanged.

    Args:
        vcf_rec: VCF record passed on to `get_base`; its POS is printed
            for inconsistent sites.
        counter: dict with integer values under the keys 'a', 'b', 'c',
            'd', 'e', 'f', 'inconsistent' and 'variable'; modified in place.
        human: sample identifier of the modern human ('ref' selects the
            record's REF base, see `get_base`).

    Returns:
        None.
    '''
    chimp, hum, nea, a00 = (get_base(s, vcf_rec) for s in ['Chimp', human, 'ElSidron', 'A00'])

    # skip sites with missing base in any of the samples
    if not (chimp and hum and nea and a00): return
    # skip non-variable sites
    if (hum == nea == a00): return

    # branch a mutation
    if chimp == nea != hum == a00:
        counter['a'] += 1

    # branch b mutation
    elif chimp == a00 != nea == hum:
        counter['b'] += 1

    # branch c mutation
    elif chimp == hum != a00 == nea:
        counter['c'] += 1

    # branch d mutation
    elif hum != chimp == a00 == nea:
        counter['d'] += 1

    # branch e mutation
    elif a00 != chimp == nea == hum:
        counter['e'] += 1

    # branch f mutation
    elif nea != chimp == a00 == hum:
        counter['f'] += 1

    # inconsistent site
    else:
        # report the local `hum` base: the previous code referenced an
        # undefined name here and used '\h' where a tab was intended
        print(vcf_rec.POS, 'chimp: {}\tnea: {}\ta00: {}\thum: {}'.format(chimp, nea, a00, hum))
        counter['inconsistent'] += 1

    counter['variable'] += 1

In [57]:
def count_mutations_in_vcf(vcf_file, human='ref'):
    '''Count the number of mutations on all possible branches
    of the tree with chimpanzee, El Sidron, A00 and hg19 in
    a given VCF.

    Args:
        vcf_file: path of the VCF file to read; opened in binary mode.
        human: sample identifier passed on to `update_counter`
            (default 'ref').

    Returns:
        dict with the keys 'a'-'f', 'inconsistent' and 'variable',
        all starting at 0 and updated by `update_counter` for every
        record fetched for 'Y'.
    '''
    counter = {i : 0 for i in list('abcdef') + ['inconsistent', 'variable']}

    # the context manager closes the handle once all records are read,
    # also when reading or counting raises
    with open(vcf_file, 'rb') as vcf_handle:
        vcf_reader = vcf.Reader(vcf_handle)

        for rec in vcf_reader.fetch('Y'):
            update_counter(rec, counter, human)

    return counter

In [58]:
def calc_proportions(counter):
    '''Print one line per branch 'a'-'f' with its count and its share of counter['variable'].

    Nothing is returned; the share is formatted as a percentage with two decimals.
    '''
    line_template = 'number of mutations on branch {}: \t{}\t({:.2%})'
    for branch in 'abcdef':
        n_mutations = counter[branch]
        print(line_template.format(branch, n_mutations, n_mutations / counter['variable']))

In [59]:
def branch_a_prop(counter):
    '''Return the branch 'a' count as a fraction of the summed 'a', 'd' and 'e' counts.'''
    on_a = counter['a']
    denominator = on_a + counter['d'] + counter['e']
    return on_a / denominator

def calc_alpha_1(counter):
    '''Return (1 + p) / (1 - p), where p is branch_a_prop(counter).'''
    share = branch_a_prop(counter)
    numerator, denominator = 1 + share, 1 - share
    return numerator / denominator

### Counts of branch mutations published by Mendez et al.

In [38]:
# Branch counts entered by hand, in the same key layout that
# count_mutations_in_vcf produces.
# NOTE(review): the six branch counts sum to 143 while 'variable' is 146 -- confirm against the publication.
mendez_counts = {
    'a': 24,
    'b': 4,
    'c': 0,
    'd': 16,
    'e': 22,
    'f': 77,
    'inconsistent': 0,
    'variable': 146,
}

mendez_counts

{'a': 24,
 'b': 4,
 'c': 0,
 'd': 16,
 'e': 22,
 'f': 77,
 'inconsistent': 0,
 'variable': 146}

In [60]:
calc_proportions(mendez_counts)

number of mutations on branch a: 	24	(16.44%)
number of mutations on branch b: 	4	(2.74%)
number of mutations on branch c: 	0	(0.00%)
number of mutations on branch d: 	16	(10.96%)
number of mutations on branch e: 	22	(15.07%)
number of mutations on branch f: 	77	(52.74%)


# exome 120 kb &mdash; using different modern humans as 'ref'

In [166]:
# Modern humans substituted for the human tip one at a time; 'ref' is resolved by get_base to the record's REF base.
samples = ['ref', 'Dai', 'French', 'Han', 'Mandenka', 'Mbuti', 'Papuan', 'San', 'Sardinian', 'Yoruba', 'Karitiana', 'Australian', 'Dinka']

In [167]:
# One alpha value per sample is appended by the loop in the next cell.
exome_alphas = []

In [168]:
# For every sample: count branch mutations in the exome VCF, print the
# per-branch proportions, then print alpha and alpha * exome_a00_tmrca
# (labelled T_NR) and record alpha.
# Re-running this cell appends a second set of values to exome_alphas.
for s in samples:
    print(s, end='\n\n')

    counts = count_mutations_in_vcf('../vcf/merged_exome.vcf.gz', human=s)
    calc_proportions(counts)
    print()

    alpha = calc_alpha_1(counts)
    t_nr = alpha * exome_a00_tmrca
    print('alpha: {:.2}\nT_NR: {:.2}'.format(alpha, t_nr))
    print('======================================\n\n')

    exome_alphas.append(alpha)

ref

number of mutations on branch a: 	15	(15.15%)
number of mutations on branch b: 	3	(3.03%)
number of mutations on branch c: 	0	(0.00%)
number of mutations on branch d: 	15	(15.15%)
number of mutations on branch e: 	18	(18.18%)
number of mutations on branch f: 	48	(48.48%)

alpha: 1.9
T_NR: 4.2e+05


Dai

number of mutations on branch a: 	14	(13.86%)
number of mutations on branch b: 	3	(2.97%)
number of mutations on branch c: 	1	(0.99%)
number of mutations on branch d: 	17	(16.83%)
number of mutations on branch e: 	18	(17.82%)
number of mutations on branch f: 	48	(47.52%)

alpha: 1.8
T_NR: 3.9e+05


French

number of mutations on branch a: 	14	(14.43%)
number of mutations on branch b: 	3	(3.09%)
number of mutations on branch c: 	0	(0.00%)
number of mutations on branch d: 	14	(14.43%)
number of mutations on branch e: 	18	(18.56%)
number of mutations on branch f: 	48	(49.48%)

alpha: 1.9
T_NR: 4.1e+05


Han

number of mutations on branch a: 	15	(15.15%)
number of mutations on branch b

#### Mean alpha estimate

In [169]:
mean(exome_alphas)

1.8621874609431173

#### Estimate of Neanderthal divergence time based on exome-specific value of A00 divergence

In [180]:
mean(exome_alphas) * exome_a00_tmrca

406506.618293196

#### Estimate of Neanderthal divergence time based on whole Y chromosome value of A00 divergence

In [179]:
# beast_genome_a00_tmrca is 275 (set above), so this product is on a different
# numeric scale than the estimate based on exome_a00_tmrca in the previous cell.
mean(exome_alphas) * beast_genome_a00_tmrca

512.1015517593572

In [188]:
# NOTE(review): `genome_a00_tmrca` is not assigned in any visible cell (the whole-chromosome
# estimate above is named `chromosome_a00_tmrca`), so this fails on a fresh kernel unless it is
# defined elsewhere -- confirm. 1.9 looks like the rounded alpha printed by the exome loop.
1.9 * genome_a00_tmrca

510339.32283932285

# 560 kb &mdash; using different modern humans as 'ref'

In [186]:
# Modern humans substituted for the human tip one at a time; 'ref' is resolved by get_base to the record's REF base.
samples = ['ref', 'Dai', 'French', 'Han', 'Mandenka', 'Mbuti', 'Papuan', 'San', 'Sardinian', 'Yoruba', 'Karitiana', 'Australian', 'Dinka']
# One alpha value per sample is appended by the loop in the next cell.
lippold_alphas = []

In [187]:
# For every sample: count branch mutations in the Lippold-region VCF, print
# the per-branch proportions, then print alpha and alpha * genome_a00_tmrca
# (labelled T_NR) and record alpha.
# NOTE(review): `genome_a00_tmrca` is not assigned in any visible cell -- confirm where it is defined.
# Re-running this cell appends a second set of values to lippold_alphas.
for s in samples:
    print(s, end='\n\n')

    counts = count_mutations_in_vcf('../vcf/merged_lippold.vcf.gz', human=s)
    calc_proportions(counts)
    print()

    alpha = calc_alpha_1(counts)
    t_nr = alpha * genome_a00_tmrca
    print('alpha: {:.2}\nT_NR: {:.2}'.format(alpha, t_nr))
    print('======================================\n\n')

    lippold_alphas.append(alpha)

ref

number of mutations on branch a: 	62	(13.03%)
number of mutations on branch b: 	4	(0.84%)
number of mutations on branch c: 	12	(2.52%)
number of mutations on branch d: 	180	(37.82%)
number of mutations on branch e: 	87	(18.28%)
number of mutations on branch f: 	131	(27.52%)

alpha: 1.5
T_NR: 3.9e+05


Dai

number of mutations on branch a: 	55	(15.15%)
number of mutations on branch b: 	4	(1.10%)
number of mutations on branch c: 	3	(0.83%)
number of mutations on branch d: 	84	(23.14%)
number of mutations on branch e: 	88	(24.24%)
number of mutations on branch f: 	129	(35.54%)

alpha: 1.6
T_NR: 4.4e+05


French

number of mutations on branch a: 	62	(13.28%)
number of mutations on branch b: 	4	(0.86%)
number of mutations on branch c: 	13	(2.78%)
number of mutations on branch d: 	170	(36.40%)
number of mutations on branch e: 	87	(18.63%)
number of mutations on branch f: 	131	(28.05%)

alpha: 1.5
T_NR: 4e+05


Han

number of mutations on branch a: 	55	(13.82%)
number of mutations on bra

In [189]:
mean(lippold_alphas)

1.5795288366726294

In [190]:
# NOTE(review): `genome_a00_tmrca` is not assigned in any visible cell -- confirm where it is defined.
mean(lippold_alphas) * genome_a00_tmrca

424260.88258562796

In [25]:
# NOTE(review): same expression as the previous cell, yet the recorded output below is roughly
# 1000x smaller; it presumably comes from a different kernel state of `genome_a00_tmrca` -- confirm.
mean(lippold_alphas) * genome_a00_tmrca

434.3704300849731