In [1]:
import sys, os

# Make project modules importable: the directory two levels above the current
# working directory is placed at the front of the module search path.
cwd_parent = os.path.dirname(os.getcwd())
rootdir = os.path.dirname(cwd_parent)
sys.path[:0] = [rootdir]


# Pooling-imputation performance from real bead chip data

Performs pooling simulation and imputation on data from chromosome 20 of the 1000 Genomes Project (1000GP).
The first 1000 markers and all unrelated individuals (2504 samples) have been selected.
These samples are randomly split into a reference panel (REF) and a study population (IMP); the study population is pooled and then imputed.
Imputation is done with Beagle 4.1 using the default parameters.


In [2]:
print('Check number of samples')
# `bcftools query -l` lists one sample name per line; `wc -l` counts those lines.
# %sx captures the shell output as a list of lines, shown as the cell result.
%sx bcftools query -l ALL.chr20.snps.gt.vcf.gz | wc -l

Check number of samples


['2504']

In [3]:
print('Check number of intersected markers')
# `bcftools view -H` prints the variant records without the VCF header,
# so the line count is the number of markers in the file.
%sx bcftools view -H ALL.chr20.snps.gt.vcf.gz | wc -l

Check number of intersected markers


['1000']

In [4]:
print('Shuffle split into file for REF and IMP populations')
from VCFPooling.poolSNPs import poolvcf
from VCFPooling.poolSNPs.pooler import Design

# Pooling design and its matrix; the matrix is handed to the splitter below.
ds = Design()
dm = ds.matrix

# Split the full file into a reference panel (REF) and a study population (IMP).
# stu_size=0.1 is presumably the fraction of samples assigned to the study
# population (the Beagle pipeline log further down reports 2264 reference and
# 240 target samples) -- TODO confirm against ShuffleSplitVCF.
# NOTE(review): no random seed is set in this notebook, so the split may differ
# between runs -- verify how ShuffleSplitVCF draws its shuffle.
sfsp = poolvcf.ShuffleSplitVCF(dm, 'ALL.chr20.snps.gt.vcf.gz', stu_size=0.1, wd=os.getcwd())
# Creates and indexes the REF.* and IMP.* files, as reported in the cell output.
sfsp.split_file('chr20.snps.gt.vcf.gz')

Shuffle split into file for REF and IMP populations
/home/camille/1000Genomes/src/VCFPooling/examples/REF.chr20.snps.gt.vcf.gz:
 File created? -> True
/home/camille/1000Genomes/src/VCFPooling/examples/REF.chr20.snps.gt.vcf.gz:
 File indexed? -> True
/home/camille/1000Genomes/src/VCFPooling/examples/IMP.chr20.snps.gt.vcf.gz:
 File created? -> True
/home/camille/1000Genomes/src/VCFPooling/examples/IMP.chr20.snps.gt.vcf.gz:
 File indexed? -> True


In [5]:
print('Apply pooling on the study population')
# This cell only records the CPU model (Linux-based OS: reads /proc/cpuinfo),
# presumably to give context to the pooling time noted in the next cell.
%sx cat /proc/cpuinfo  | grep 'name'| uniq
# Output on the original machine:
# model name	: Intel(R) Core(TM) i7-7600U CPU @ 2.80GHz

Apply pooling on the study population


['model name\t: Intel(R) Core(TM) i7-7600U CPU @ 2.80GHz']

In [6]:
# Pooling simulation: pooling-ex.py receives the study-population file as input
# and the name of the pooled file to write as output (both echoed in its log).
%sx python3 -u pooling-ex.py IMP.chr20.snps.gt.vcf.gz IMP.chr20.pooled.snps.gl.vcf.gz 
# Timing noted from a previous run:
# Time for pooling 1000 variants = 6.537596869000481 sec

['',
 '*******************************************************************************',
 'Input file = IMP.chr20.snps.gt.vcf.gz',
 'Output file = IMP.chr20.pooled.snps.gl.vcf.gz',
 '*******************************************************************************',
 '',
 'Pooling data in /home/camille/1000Genomes/src/VCFPooling/examples/IMP.chr20.snps.gt.vcf.gz',
 '1 variants processed in 000.01 sec..............................................',
 '',
 'Writing metadata in /home/camille/1000Genomes/src/VCFPooling/examples/IMP.chr20.pooled.snps.gl.vcf',
 '',
 'Writing data in /home/camille/1000Genomes/src/VCFPooling/examples/IMP.chr20.pooled.snps.gl.vcf',
 'Writing data in /home/camille/1000Genomes/src/VCFPooling/examples/IMP.chr20.pooled.snps.gl.vcf: Done',
 '/home/camille/1000Genomes/src/VCFPooling/examples/IMP.chr20.pooled.snps.gl.vcf.gz:',
 ' File created? -> True',
 '/home/camille/1000Genomes/src/VCFPooling/examples/IMP.chr20.pooled.snps.gl.vcf.gz:',
 ' File indexed? -> True',
 'Tim

In [7]:
print('Impute missing genotypes in the pooled file')
# Run the Beagle phasing/imputation shell pipeline. Its captured log reports
# the checks on contigs, FORMAT fields, and sample/marker counts before phasing.
%sx bash bin/beagle_pipeline.sh

Impute missing genotypes in the pooled file


['Contigs in the reference file',
 '.................................................................................',
 'Chromosome  20    Startpos = 60343    Endpos = 99420',
 '',
 '',
 'Check FORMAT field in files for imputation',
 '.................................................................................',
 'FORMAT in reference panel:  GT',
 'FORMAT in target:  GL',
 '',
 '',
 'Check number of samples and number of markers in files for imputation',
 '.................................................................................',
 'reference:',
 '2264',
 '',
 'target:',
 '240',
 '',
 '',
 'Phase reference and target with BEAGLE',
 '.................................................................................',
 'Beagle .jar file used at: /home/camille/1000Genomes/src/beagle.11Mar19.69c.jar',
 '',
 'FORMAT in the phased ref file: GT',
 'beagle.11Mar19.69c.jar (version 4.1)',
 'Copyright (C) 2014-2015 Brian L. Browning',
 'Enter "java -jar beagle.11Mar19.69c.jar" for a

In [8]:
# Verify the files created at the different phasing and imputation steps.
# All missing files are collected and reported together, so a failed pipeline
# run can be diagnosed in one pass instead of one bare assert at a time.
expected_outputs = [
    'IMP.chr20.pooled.unphased.vcf.gz',
    'IMP.chr20.pooled.phased.vcf.gz',
    'IMP.chr20.pooled.phased.dedup.vcf.gz',
    'IMP.chr20.pooled.imputed.vcf.gz',
]
missing_outputs = [path for path in expected_outputs if not os.path.exists(path)]
assert not missing_outputs, (
    'Missing phasing/imputation output file(s): ' + ', '.join(missing_outputs)
)

### Compute results with customized metrics
Show classification task-based metrics and classical genotype imputation metrics

In [9]:
# Compute the imputation-quality metrics with the project script.
# NOTE(review): the arguments look like (working directory, true-genotype file,
# imputed file, GT-to-GL conversion script) -- inferred from the values passed;
# confirm against imputation_quality.py.
%sx python3 -u ../poolSNPs/imputation_quality.py ./ IMP.chr20.snps.gt.vcf.gz IMP.chr20.pooled.imputed.vcf.gz bin/gt_to_gl.sh  

['Figure(2000x800)']

In [10]:
# Verify that the imputation-quality figure was written to disk
# (presumably by the imputation_quality.py run in the previous cell -- confirm
# the file name in that script).
quality_figure = 'imputation_quality_gtgl.png'
assert os.path.exists(quality_figure), 'Missing figure: ' + quality_figure


In [11]:
# Optional, left disabled: compare the imputed file against the true genotypes
# with `bcftools stats` and plot the result with `plot-vcfstats`.
# Enable only if bcftools is configured for python 2.7 usage.
# NOTE(review): `deactivate` issued through %sx runs in a separate shell, so it
# presumably does not change the kernel's environment -- verify before enabling.
# print('Plotting results with bcftools stats')
# %sx deactivate
# # bcftools stats needs python 2.7
# %sx bcftools stats --af-bins 0.01,0.02,0.04,0.08,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.98 --collapse snps -S study.population IMP.chr20.pooled.imputed.vcf.gz IMP.chr20.snps.gt.vcf.gz > filestats.vchk
# %sx plot-vcfstats -p bcftoolstats -s filestats.vchk
