In [1]:
import vcf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplotlib inline

In [4]:
def get_base(sample_id, rec):
    return rec.genotype(sample_id).gt_bases

def print_matches(sample_name, matches, total):
    print('Proportions of matches of {} to different individuals:'.format(sample_name))
    print(matches / total)
    print('\ntotal # of sites: ', total, '\n')

<br><br><br><br><br>
# All sites from VCF

### Using all fragments in Denisova 8

In [3]:
vcf_reader = vcf.Reader(open('../vcf/merged_all_ontarget.vcf.gz', 'rb'))

In [6]:
hum_count, nea_count, humnea_count, = 0, 0, 0
hum_tot, nea_tot, humnea_tot = 0, 0, 0

for rec in vcf_reader.fetch('Y'):
    chimp = get_base('Chimp', rec)
    den = get_base('Den8', rec)
    nea = get_base('ElSidron', rec)
    hum = get_base('HGDP00449', rec)
    
    # skip sites with missing information in relevant samples
    if not (chimp and nea and den and hum): continue
    
    # Denisova is on human lineage
    if hum != (nea == chimp):
        hum_tot += 1
        if den == hum: hum_count += 1
    
    # Denisova is on nea lineage
    if nea != (hum == chimp):
        nea_tot += 1
        if den == nea: nea_count += 1
    
    # Denisova is on shared hum-nea lineage
    if (hum == nea) != chimp:
        humnea_tot += 1
        if den == hum: humnea_count += 1

In [15]:
print('# of human-like alleles: {} out of {} ({:.2%}%)'.format(hum_count, hum_tot, hum_count / hum_tot))
print('# of nea-like alleles: {} out of {} ({:.2%}%)'.format(nea_count, nea_tot, nea_count / nea_tot))
print()
print('# of hum-nea-like alleles: {} out of {} ({:.2%}%)'.format(humnea_count, humnea_tot, humnea_count / humnea_tot))

# of human alleles: 1 out of 6 (16.67%%)
# of nea alleles: 1 out of 6 (16.67%%)

# of hum-nea alleles: 18942 out of 18985 (99.77%%)


### Using damaged fragments in Denisova 8

In [17]:
hum_count, nea_count, humnea_count, = 0, 0, 0
hum_tot, nea_tot, humnea_tot = 0, 0, 0

for rec in vcf_reader.fetch('Y'):
    chimp = get_base('Chimp', rec)
    den = get_base('Den8_deam', rec)
    nea = get_base('ElSidron', rec)
    hum = get_base('HGDP00449', rec)
    
    # skip sites with missing information in relevant samples
    if not (chimp and nea and den and hum): continue
    
    # Denisova is on human lineage
    if hum != (nea == chimp):
        hum_tot += 1
        if den == hum: hum_count += 1
    
    # Denisova is on nea lineage
    if nea != (hum == chimp):
        nea_tot += 1
        if den == nea: nea_count += 1
    
    # Denisova is on shared hum-nea lineage
    if (hum == nea) != chimp:
        humnea_tot += 1
        if den == hum: humnea_count += 1

In [20]:
print('# of human alleles: {} out of {}'.format(hum_count, hum_tot))
print('# of nea alleles: {} out of {}'.format(nea_count, nea_tot))
print()
print('# of hum-nea-like alleles: {} out of {} ({:.2%}%)'.format(humnea_count, humnea_tot, humnea_count / humnea_tot))

# of human alleles: 0 out of 0
# of nea alleles: 0 out of 0

# of hum-nea alleles: 3531 out of 3542


<br><br><br><br><br>
# Only variable sites from VCF

### Using all fragments in Denisova 8

In [21]:
vcf_reader = vcf.Reader(open('../vcf/merged_var_ontarget.vcf.gz', 'rb'))

In [22]:
hum_count, nea_count, humnea_count, = 0, 0, 0
hum_tot, nea_tot, humnea_tot = 0, 0, 0

for rec in vcf_reader.fetch('Y'):
    chimp = get_base('Chimp', rec)
    den = get_base('Den8', rec)
    nea = get_base('ElSidron', rec)
    hum = get_base('HGDP00449', rec)
    
    # skip sites with missing information in relevant samples
    if not (chimp and nea and den and hum): continue
    
    # Denisova is on human lineage
    if hum != (nea == chimp):
        hum_tot += 1
        if den == hum: hum_count += 1
    
    # Denisova is on nea lineage
    if nea != (hum == chimp):
        nea_tot += 1
        if den == nea: nea_count += 1
    
    # Denisova is on shared hum-nea lineage
    if (hum == nea) != chimp:
        humnea_tot += 1
        if den == hum: humnea_count += 1

In [26]:
print('# of human alleles: {} out of {}'.format(hum_count, hum_tot))
print('# of nea alleles: {} out of {}'.format(nea_count, nea_tot))
print()
print('# of hum-nea-like alleles: {} out of {} ({:.2%}%)'.format(humnea_count, humnea_tot, humnea_count / humnea_tot))

# of human alleles: 0 out of 0
# of nea alleles: 0 out of 0



ZeroDivisionError: division by zero

### Using damaged fragments in Denisova 8

In [24]:
hum_count, nea_count, humnea_count, = 0, 0, 0
hum_tot, nea_tot, humnea_tot = 0, 0, 0

for rec in vcf_reader.fetch('Y'):
    chimp = get_base('Chimp', rec)
    den = get_base('Den8_deam', rec)
    nea = get_base('ElSidron', rec)
    hum = get_base('HGDP00449', rec)
    
    # skip sites with missing information in relevant samples
    if not (chimp and nea and den and hum): continue
    
    # Denisova is on human lineage
    if hum != (nea == chimp):
        hum_tot += 1
        if den == hum: hum_count += 1
    
    # Denisova is on nea lineage
    if nea != (hum == chimp):
        nea_tot += 1
        if den == nea: nea_count += 1
    
    # Denisova is on shared hum-nea lineage
    if (hum == nea) != chimp:
        humnea_tot += 1
        if den == hum: humnea_count += 1

In [25]:
print('# of human alleles: {} out of {}'.format(hum_count, hum_tot))
print('# of nea alleles: {} out of {}'.format(nea_count, nea_tot))
print()
print('# of hum-nea-like alleles: {} out of {}'.format(humnea_count, humnea_tot))

# of human alleles: 0 out of 0
# of nea alleles: 0 out of 0

# of hum-nea alleles: 0 out of 0
