In [None]:
#ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/shapeit2_phased_haplotypes/ALL.chr21.SHAPEIT2_integrated_phase1_v3.20101123.snps_indels_svs.genotypes.all.vcf.gz
#ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/integrated_call_sets/integrated_call_samples.20101123.ped


#https://mathgen.stats.ox.ac.uk/impute/scripts/vcf2impute_legend_haps
#perl vcf2impute_legend_haps -vcf ALL.chr21.SHAPEIT2_integrated_phase1_v3.20101123.snps_indels_svs.genotypes.all.vcf.gz -leghap 21 -chr 21 -snps_only
#python merge.py 21.hap.gz 21.legend.gz 21 > 21.merge
#python clean_sample.py < 21.sample_list > 21.sample
#impute_to_ped  21.merge 21.sample g21
#germline  -input g21.ped 21.map -output good
#gzip good.match

In [None]:
from collections import defaultdict
import gzip

import numpy as np
import scipy.stats as stats

In [None]:
# Input files, given as paths relative to the notebook's working directory:
# the pedigree/sample listing and the gzip-compressed GERMLINE match output
# produced by the preparation commands listed at the top of the notebook.
sample_file = "integrated_call_samples.20101123.ped"
germline_file = "good.match.gz"

In [None]:
# Collect the individuals that appear in the GERMLINE output and look up
# their population in the sample file.
#   inds          - IDs found in fields 1 and 3 of every match line
#   ind_pop       - individual ID -> population, only for IDs in `inds`
#   selected_inds - family ID -> individual ID (last matching one per family)
#   pop_inds      - population -> list of individual IDs
inds = set()
ind_pop = {}
selected_inds = {}
pop_inds = defaultdict(list)

with gzip.open(germline_file, 'rt', encoding='utf-8') as f:
    for line in f:
        toks = line.split()
        if not toks:
            # Blank line (e.g. trailing newline): nothing to index into.
            continue
        inds.add(toks[1])
        inds.add(toks[3])

with open(sample_file, 'rt', encoding='utf-8') as f:
    f.readline()  # header
    for line in f:
        if not line.strip():
            # Blank line: splitting would not yield the expected columns.
            continue
        toks = line.rstrip().split('\t')
        fam = toks[0]
        ind = toks[1]
        pop = toks[6]
        if ind not in inds:
            continue
        # Later individuals of the same family overwrite earlier ones here.
        # NOTE(review): ind_pop below still records every matched individual,
        # so this one-per-family selection does not restrict ind_pop/pop_inds.
        selected_inds[fam] = ind  # We just want one per family
        ind_pop[ind] = pop

for ind, pop in ind_pop.items():
    pop_inds[pop].append(ind)



In [None]:
# Scan every match segment in the GERMLINE output and accumulate:
#   all_sizes     - size (field 10) of every segment, unfiltered
#   sizes         - size of every segment that survives the window filter
#   shared        - sorted (ind1, ind2) pair -> summed size of kept segments
#   larger_shared - (size, ind1, ind2) of the largest kept segment
#
# Segments lying strictly inside this coordinate window are dropped from
# sizes/shared/larger_shared but still counted in all_sizes.
# NOTE(review): presumably the centromere region of chr21 - confirm.
EXCLUDED_WINDOW_START = 10000000
EXCLUDED_WINDOW_END = 15000000

larger_shared = (0, None, None)
sizes = []
all_sizes = []
shared = defaultdict(int)
with gzip.open(germline_file, 'rt', encoding='utf-8') as f:
    for line in f:
        toks = line.split()
        if not toks:
            # Blank line (e.g. trailing newline): skip instead of IndexError.
            continue
        ind1 = toks[1]
        ind2 = toks[3]
        start = int(toks[5])
        end = int(toks[6])
        size = float(toks[10])
        all_sizes.append(size)
        if start > EXCLUDED_WINDOW_START and end < EXCLUDED_WINDOW_END:
            continue
        if size > larger_shared[0]:
            larger_shared = (size, ind1, ind2)
        # Sort the pair so (a, b) and (b, a) accumulate under the same key.
        shared[tuple(sorted((ind1, ind2)))] += size
        sizes.append(size)

In [None]:
# Summary statistics, first over every segment and then over the segments
# that were kept after filtering.
for segment_sizes in (all_sizes, sizes):
    print(stats.describe(segment_sizes))

In [None]:
# Show the largest kept segment as (size, ind1, ind2), followed by the
# populations of the two individuals that share it.
print(larger_shared)
print(*(ind_pop[member] for member in larger_shared[1:]))

In [None]:
# Group each pair's summed shared size by population, keeping only pairs
# whose two individuals belong to the same population.
pop_shared = defaultdict(list)
# Unpack the pair key directly rather than binding it to `inds`, which would
# overwrite the notebook-level `inds` set built when the data was loaded.
for (ind1, ind2), total_size in shared.items():
    pop1 = ind_pop[ind1]
    pop2 = ind_pop[ind2]
    if pop1 == pop2:
        pop_shared[pop1].append(total_size)

In [None]:
# Per-population report, one line each: population, number of individuals,
# number of within-population pairs, then mean and median of the pairs'
# summed shared sizes.
# np.mean / np.median replace scipy.mean / scipy.median: the name `scipy` is
# never bound by the notebook's imports (only `scipy.stats as stats`), and
# those scipy-level functions were deprecated aliases of the NumPy ones.
for pop, total_sizes in pop_shared.items():
    print("%s %3d %3d %*.2f %*.2f" % (pop,
                                    len(pop_inds[pop]), len(total_sizes),
                                    6, np.mean(total_sizes),
                                    6, np.median(total_sizes)))
print(pop_inds.keys())