In [1]:
import vcf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplotlib inline

In [3]:
# Open the merged on-target VCF in binary mode and wrap it in a PyVCF reader;
# the later cells query this reader with fetch('Y').
vcf_path = '../vcf/merged_all_ontarget.vcf.gz'
vcf_reader = vcf.Reader(open(vcf_path, 'rb'))

In [4]:
def get_base(rec, sample_id):
    """Return the `gt_bases` attribute of `sample_id`'s genotype call in `rec`.

    `rec` must expose `genotype(sample_id)`; whatever that call object
    stores in `gt_bases` is returned unchanged.
    """
    call = rec.genotype(sample_id)
    return call.gt_bases

def is_called(rec, sample_id):
    """Return the `called` attribute of `sample_id`'s genotype call in `rec`.

    Used by the counting loops to skip sites where any relevant sample
    has no call.
    """
    call = rec.genotype(sample_id)
    return call.called


def print_matches(sample_name, matches, total):
    """Print match counts as proportions of `total`, followed by `total`.

    sample_name -- label inserted into the header line
    matches     -- match counts; must support division by `total`
                   (the notebook passes a pandas Series indexed by sample)
    total       -- number of sites the counts were collected over
    """
    header = 'Proportions of matches of {} to different individuals:'.format(sample_name)
    proportions = matches / total

    print(header)
    print(proportions)
    print('\ntotal # of sites: ', total, '\n')

# Using all fragments in Denisova 8

In [5]:
# Samples whose genotypes are compared against El Sidron and Denisova 8.
sample_ids = [
    'Chimp', 'ElSidron', 'A00',
    'HGDP00001', 'HGDP00099', 'HGDP00449', 'HGDP00511',
    'HGDP00540', 'HGDP00608', 'HGDP00703', 'HGDP00786',
]

# One zero-initialised counter per sample (Den8 included) for each reference:
# nea_matches counts agreement with ElSidron, den_matches agreement with Den8.
counted_ids = sample_ids + ['Den8']
nea_matches = pd.Series(dict.fromkeys(counted_ids, 0))
den_matches = pd.Series(dict.fromkeys(counted_ids, 0))

In [6]:
# Count, over every fully-called site returned by fetch('Y'), how often each
# sample carries the same bases as ElSidron and as Den8 (all fragments).
all_ids = sample_ids + ['Den8']

# (Re)initialise the counters together with `total` so that re-running this
# cell starts from zero. Previously only `total` was reset here, so a second
# run kept adding to the match counts and produced proportions above 1.
nea_matches = pd.Series({s: 0 for s in all_ids})
den_matches = pd.Series({s: 0 for s in all_ids})
total = 0

for rec in vcf_reader.fetch('Y'):
    # skip sites with missing information in relevant samples
    if not all(is_called(rec, s) for s in all_ids):
        continue

    # look up each sample's bases once per site instead of once per comparison
    bases = {s: get_base(rec, s) for s in all_ids}
    sidron = bases['ElSidron']
    den = bases['Den8']

    for s, base in bases.items():
        if sidron == base:
            nea_matches[s] += 1
        if den == base:
            den_matches[s] += 1

    total += 1

In [7]:
# Report both sets of match proportions over the same set of sites.
for label, counts in [('El Sidron', nea_matches), ('Denisova', den_matches)]:
    print_matches(label, counts, total)

Proportions of matches of El Sidron to different individuals:
A00          0.999665
Chimp        0.961633
Den8         0.997825
ElSidron     1.000000
HGDP00001    0.999387
HGDP00099    0.999331
HGDP00449    0.999387
HGDP00511    0.999387
HGDP00540    0.999387
HGDP00608    0.999387
HGDP00703    0.999331
HGDP00786    0.999442
dtype: float64

total # of sites:  17932 

Proportions of matches of Denisova to different individuals:
A00          0.997937
Chimp        0.960462
Den8         1.000000
ElSidron     0.997825
HGDP00001    0.997992
HGDP00099    0.997825
HGDP00449    0.997769
HGDP00511    0.997881
HGDP00540    0.997881
HGDP00608    0.997881
HGDP00703    0.997825
HGDP00786    0.997937
dtype: float64

total # of sites:  17932 



# Using damaged fragments in Denisova 8 only

In [17]:
# Zero-initialised per-sample counters for the run that uses the Den8_deam
# sample in place of Den8.
dmg_ids = sample_ids + ['Den8_deam']
dmg_nea_matches = pd.Series(dict.fromkeys(dmg_ids, 0))
dmg_den_matches = pd.Series(dict.fromkeys(dmg_ids, 0))

In [18]:
# Same counting as for the all-fragments run, but with the Den8_deam sample
# standing in for Den8.
dmg_all_ids = sample_ids + ['Den8_deam']

# (Re)initialise the counters together with `dmg_total` so that re-running
# this cell starts from zero. Previously only `dmg_total` was reset here, so
# a second run kept adding to the match counts and skewed the proportions.
dmg_nea_matches = pd.Series({s: 0 for s in dmg_all_ids})
dmg_den_matches = pd.Series({s: 0 for s in dmg_all_ids})
dmg_total = 0

for rec in vcf_reader.fetch('Y'):
    # skip sites with missing information in relevant samples
    if not all(is_called(rec, s) for s in dmg_all_ids):
        continue

    # look up each sample's bases once per site instead of once per comparison
    bases = {s: get_base(rec, s) for s in dmg_all_ids}
    sidron = bases['ElSidron']
    den = bases['Den8_deam']

    for s, base in bases.items():
        if sidron == base:
            dmg_nea_matches[s] += 1
        if den == base:
            dmg_den_matches[s] += 1

    dmg_total += 1

In [19]:
# Report both sets of match proportions for the Den8_deam run.
for label, counts in [('El Sidron', dmg_nea_matches), ('Denisova DEAM', dmg_den_matches)]:
    print_matches(label, counts, dmg_total)

Proportions of matches of El Sidron to different individuals:
A00          1.000000
Chimp        0.947152
Den8_deam    0.996770
ElSidron     1.000000
HGDP00001    1.000000
HGDP00099    0.999706
HGDP00449    1.000000
HGDP00511    1.000000
HGDP00540    0.999706
HGDP00608    0.999706
HGDP00703    1.000000
HGDP00786    1.000000
dtype: float64

total # of sites:  3406 

Proportions of matches of Denisova DEAM to different individuals:
A00          0.996770
Chimp        0.945097
Den8_deam    1.000000
ElSidron     0.996770
HGDP00001    0.996770
HGDP00099    0.996477
HGDP00449    0.996770
HGDP00511    0.996770
HGDP00540    0.996477
HGDP00608    0.996477
HGDP00703    0.996770
HGDP00786    0.996770
dtype: float64

total # of sites:  3406 

