# Variant Filtering

### Steps
1. Remove monomorphic REF sites <br/>
   `bcftools view -m2 -Oz -o out.vcf.gz in.vcf.gz`
    <br/>

2. Remove INDELs (with SnpGap 0/5/10) <br/>
   `bcftools filter --SnpGap <INT> in.vcf.gz | bcftools view -V indels -Oz -o out.vcf.gz`
   <br/>

3. Remove monomorphic SNPs <br/>
   `bcftools view -e "AC==AN || AC==0" -Oz -o $outVCF $inVCF`
   <br/>

4. Remove multi-allelic SNPs <br/>
   `bcftools view -m2 -M2 -Oz -o $outVCF $inVCF`
   <br/>

5. Filter based on DEPTH, QUALITY, and MQ. <br/>
   `bcftools filter -e 'INFO/DP<500 | INFO/DP>7732 | QUAL<20 | MQ<30' -Oz -o $outVCF $inVCF`
   <br/>


__NB:__ "SNPs" in VCF files include all sites that are
- bi-allelic SNPs, i.e. only 1 ALT allele
- multi-allelic SNPs, i.e. >1 ALT allele
- fixed for the ALT allele

<br/>


In [4]:
## Load the environment modules for the command-line tools used in this notebook
## (bcftools and vcftools are called directly in the cells below)
module load bcftools vcftools python3

<br/>

## Test variant filtering steps

In [7]:
## Work from the directory that holds the small test data set
cd ~/snap_hap/variants/vcf_bcftools_Am_all/test

## Build the test VCF: region Chr8:1-10000 cut out of the Chr8 call set,
## written as bgzip-compressed VCF (-Oz)
bcftools view \
    -r Chr8:1-10000 \
    -Oz \
    -o ~/snap_hap/variants/vcf_bcftools_Am_all/test/test.vcf.gz \
    ~/snap_hap/variants/vcf_bcftools_Am_all/Chr8/Am_all_bcftools_Chr8.vcf.gz

## Absolute path of the test VCF; the stats report is written next to it
## with ".vcf.gz" swapped for "_stats.vchk"
inVCF=/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_Am_all/test/test.vcf.gz
statsFile=${inVCF/.vcf.gz/_stats.vchk}

## Index the test VCF (-f: overwrite an existing index)
bcftools tabix -f "$inVCF"

## Summary statistics: allele-frequency bin edges come from `seq 0 0.1 1`,
## depth histogram settings are 0,25000,25; then show the first 31 lines of the report
bcftools stats --threads 10 -s- --af-bins <(seq 0 0.1 1) --depth 0,25000,25 "$inVCF" > "$statsFile"
head -31 "$statsFile"

# This file was produced by bcftools stats (1.16+htslib-1.16) and can be plotted using plot-vcfstats.
# The command line was:	bcftools stats  --threads 10 -s- --af-bins /dev/fd/63 --depth 0,25000,25 /nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_Am_all/test/test.vcf.gz
#
# Definition of sets:
# ID	[2]id	[3]tab-separated file names
ID	0	/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_Am_all/test/test.vcf.gz
# SN, Summary numbers:
#   number of records   .. number of data rows in the VCF
#   number of no-ALTs   .. reference-only sites, ALT is either "." or identical to REF
#   number of SNPs      .. number of rows with a SNP
#   number of MNPs      .. number of rows with a MNP, such as CC>TT
#   number of indels    .. number of rows with an indel
#   number of others    .. number of rows with other type, for example a symbolic allele or
#                          a complex substitution, such as ACT>TCGA
#   number of multiallelic sites     .. number of rows wit

In [9]:
## Dry run of filtering steps 1-4 on the small test VCF.
## Every step reads $inVCF, writes $outVCF, and then stores a `bcftools stats`
## report next to the output (".vcf.gz" replaced by "_stats.vchk").
## The output of each step is the input of the next one.
cd ~/snap_hap/variants/vcf_bcftools_Am_all/test

## 1. Remove monomorphic REF sites (-m2: keep records with at least two alleles)
inVCF=test.vcf.gz
outVCF=test_allSNPs+allINDELs.vcf.gz
bcftools view --threads 20 -m2 -Oz -o "$outVCF" "$inVCF"
bcftools stats --threads 20 -s- --af-bins <(seq 0 0.1 1) --depth 0,25000,25 "$outVCF" > "${outVCF/.vcf.gz/_stats.vchk}"


## 2. Remove INDELs (with SnpGap 0, i.e. no `bcftools filter --SnpGap` pass is needed)
inVCF=test_allSNPs+allINDELs.vcf.gz
outVCF=test_SnpGap0_allSNPs.vcf.gz
bcftools view --threads 20 -V indels -Oz -o "$outVCF" "$inVCF"
bcftools stats --threads 20 -s- --af-bins <(seq 0 0.1 1) --depth 0,25000,25 "$outVCF" > "${outVCF/.vcf.gz/_stats.vchk}"


## 3. Remove monomorphic SNPs (exclude sites where the ALT count equals AN or is 0)
## The expression must be quoted as a whole: it contains spaces and `||`, which the
## shell would otherwise treat as its own OR operator.
inVCF=test_SnpGap0_allSNPs.vcf.gz
outVCF=test_SnpGap0_polySNPs.vcf.gz
bcftools view --threads 20 -e "AC==AN || AC==0" -Oz -o "$outVCF" "$inVCF"
bcftools stats --threads 20 -s- --af-bins <(seq 0 0.1 1) --depth 0,25000,25 "$outVCF" > "${outVCF/.vcf.gz/_stats.vchk}"


## 4. Remove multi-allelic SNPs (-m2 -M2: exactly two alleles; -v snps: SNP records only)
inVCF=test_SnpGap0_polySNPs.vcf.gz
outVCF=test_SnpGap0_biSNPs.vcf.gz
bcftools view --threads 20 -m2 -M2 -v snps -Oz -o "$outVCF" "$inVCF"
bcftools stats --threads 20 -s- --af-bins <(seq 0 0.1 1) --depth 0,25000,25 "$outVCF" > "${outVCF/.vcf.gz/_stats.vchk}"

In [21]:
## Print the number of data records (-H: no header lines) that `bcftools view`
## emits for the given options and file
nRecords() {
    bcftools view -H "$@" | wc -l
}

## --- Counts taken directly from the unfiltered test VCF ---
echo -e "Records from the original VCF"

echo Total records = $(nRecords test.vcf.gz)
echo Fixed REFs = $(nRecords -M1 test.vcf.gz)
echo without Fixed REFs = $(nRecords -m2 test.vcf.gz)

## Sites where every called allele is ALT (AC==AN), split by variant type
echo Fixed ALT indels = $(nRecords -v indels -i "AC==AN" test.vcf.gz)
echo Fixed ALT snps = $(nRecords -v snps -i "AC==AN" test.vcf.gz)

## Records with three or more alleles, excluding those with AC==AN
echo Multi-allelic sites = $(nRecords -m3 -e "AC==AN" test.vcf.gz)
echo indels at multi-allelic sites = $(nRecords -v indels -m3 -e "AC==AN" test.vcf.gz)
echo snps at multi-allelic sites = $(nRecords -v snps -m3 -e "AC==AN" test.vcf.gz)

## Records with exactly two alleles, excluding those with AC==AN
echo indels at TRUE bi-allelic sites =  $(nRecords -v indels -m2 -M2 -e "AC==AN" test.vcf.gz)
echo snps at TRUE biallelic sites = $(nRecords -v snps -m2 -M2 -e "AC==AN" test.vcf.gz)

## --- Counts after each filtering step, for comparison with the numbers above ---
echo -e "\n\nRecords from the filtered VCF"
echo Total Records: $(nRecords test.vcf.gz)
echo allSNPs + allINDELs: $(nRecords test_allSNPs+allINDELs.vcf.gz)
echo allSNPs: $(nRecords test_SnpGap0_allSNPs.vcf.gz)
echo polySNPs: $(nRecords test_SnpGap0_polySNPs.vcf.gz)
echo biSNPs: $(nRecords test_SnpGap0_biSNPs.vcf.gz)

Records from the original VCF
Total records = 10085
Fixed REFs = 9720
without Fixed REFs = 365
Fixed ALT indels = 25
Fixed ALT snps = 16
Multi-allelic sites = 19
indels at multi-allelic sites = 3
snps at multi-allelic sites = 16
indels at TRUE bi-allelic sites = 11
snps at TRUE biallelic sites = 294


Records from the filtered VCF
Total Records: 10085
allSNPs + allINDELs: 365
allSNPs: 326
polySNPs: 310
biSNPs: 294


<br/>
<hr/>

## Variant Filtering on cluster

### Filter1: Remove monomorphic REFs

In [None]:
## Filter1 on the cluster: submit one "remove monomorphic REFs" job per chromosome,
## then one `bcftools stats` job per *_allSNPs+allINDELs VCF.
## Job scripts receive: input VCF, output VCF, threads (filter job) / VCF, threads (stats job).
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/jobs/bcftools/4-variant-filtering
#chr=Chr1

## $threads is passed to every job below. It used to be set nowhere in this cell,
## so it was only defined if an earlier cell had left it in the session.
## Keep a value that is already set, otherwise fall back to 10.
threads=${threads:-10}

for chr in Chr{1..8}
do
    sbatch -J "${chr}-filter1-removeREFs" ~/snap_hap/_scripts/sbatch/vcf-utils/job-bcftools_filter1-removeREFs.sbatch \
        ~/snap_hap/variants/vcf_bcftools_Am_all/${chr}/Am_all_bcftools_${chr}.vcf.gz \
        ~/snap_hap/variants/vcf_bcftools_Am_all/${chr}/Am_all_bcftools_${chr}_allSNPs+allINDELs.vcf.gz \
        "$threads"
done

## Run bcftools stats
## NOTE: the glob only matches VCFs that already exist, so run this part once
## the filter jobs above have written their output.
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/jobs/bcftools/3-stats
for inVCF in ~/snap_hap/variants/vcf_bcftools_Am_all/*/Am_all_bcftools_*_allSNPs+allINDELs.vcf.gz
do
    echo "$inVCF"
    sbatch -J vcf-stats ~/snap_hap/_scripts/sbatch/vcf-utils/job-bcftools_stats.sbatch "$inVCF" "$threads"
done

```
#SBATCH --partition=defaultp
#SBATCH --exclude=zeta[243-262],beta[231-235]
#SBATCH --time=120:00:00
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=8G
#SBATCH --ntasks-per-node=1
#SBATCH --ntasks=1
```

### Filter2: Remove INDELs (incl. SNPs in vicinity)

In [None]:
## Filter2 on the cluster: submit one INDEL-removal job per chromosome and SnpGap value
## (0, 5, 10), then one `bcftools stats` job per *_allSNPs VCF.
## Filter job arguments, in order: input VCF, output VCF, threads, SnpGap.
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/jobs/bcftools/4-variant-filtering
#chr=Chr1 ## Chr{1..8}
#SnpGap=10 ## SnpGap: 0,5,10

## $threads used to be set nowhere in this cell. With it empty and unquoted, the
## SnpGap value moved up into the threads position of the job's argument list.
## Keep a value that is already set, otherwise fall back to 10.
threads=${threads:-10}

for chr in Chr{1..8}
do
    for SnpGap in 0 5 10
    do
        echo "$chr SnpGap$SnpGap"
        sbatch -J "${chr}-filter2-removeINDELs-SnpGap${SnpGap}" ~/snap_hap/_scripts/sbatch/vcf-utils/job-bcftools_filter2-removeINDELs.sbatch \
            ~/snap_hap/variants/vcf_bcftools_Am_all/${chr}/Am_all_bcftools_${chr}_allSNPs+allINDELs.vcf.gz \
            ~/snap_hap/variants/vcf_bcftools_Am_all/${chr}/Am_all_bcftools_${chr}_SnpGap${SnpGap}_allSNPs.vcf.gz \
            "$threads" \
            "$SnpGap"
    done
done

## Run bcftools stats
## NOTE: the glob only matches VCFs that already exist, so run this part once
## the filter jobs above have written their output.
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/jobs/bcftools/3-stats
for inVCF in ~/snap_hap/variants/vcf_bcftools_Am_all/*/Am_all_bcftools_*_allSNPs.vcf.gz
do
    echo "$inVCF"
    sbatch -J vcf-stats ~/snap_hap/_scripts/sbatch/vcf-utils/job-bcftools_stats.sbatch "$inVCF" "$threads"
done

### Filter3: Remove monomorphic SNPs

### Filter4: Remove multiallelic SNPs

In [None]:
## Filter4 on the cluster: submit one "remove multi-allelic SNPs" job per chromosome
## and SnpGap value, then a `bcftools stats` job and a parameter-extraction job
## per *_biSNPs VCF.
## Filter job arguments, in order: input VCF (*_polySNPs), output VCF (*_biSNPs), threads.
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/jobs/bcftools/4-variant-filtering
#chr=Chr1
#SnpGap=10

## $threads was first assigned only after the filter loop that already uses it.
## Define it up front: keep a value that is already set, otherwise fall back to 10.
threads=${threads:-10}

for chr in Chr{1..8}
do
    for SnpGap in 0 5 10
    do
        echo "$chr SnpGap$SnpGap"
        sbatch -J "${chr}-filter4-removeMultiSNPs" ~/snap_hap/_scripts/sbatch/vcf-utils/job-bcftools_filter4-removeMultiSNPs.sbatch \
            ~/snap_hap/variants/vcf_bcftools_Am_all/${chr}/Am_all_bcftools_${chr}_SnpGap${SnpGap}_polySNPs.vcf.gz \
            ~/snap_hap/variants/vcf_bcftools_Am_all/${chr}/Am_all_bcftools_${chr}_SnpGap${SnpGap}_biSNPs.vcf.gz \
            "$threads"
    done
done


## Run bcftools stats (always with 10 threads)
## NOTE: the globs below only match VCFs that already exist, so run these parts
## once the filter jobs above have written their output.
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/jobs/bcftools/3-stats
threads=10
for inVCF in ~/snap_hap/variants/vcf_bcftools_Am_all/*/Am_all_bcftools_*_biSNPs.vcf.gz
do
    echo "$inVCF"
    sbatch -J vcf-stats ~/snap_hap/_scripts/sbatch/vcf-utils/job-bcftools_stats.sbatch "$inVCF" "$threads"
done


## Run bcftools query to extract params
## The extraction job takes the VCF as its only argument.
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/jobs/bcftools/5-extract-params
for inVCF in ~/snap_hap/variants/vcf_bcftools_Am_all/*/Am_all_bcftools_*_SnpGap*_biSNPs.vcf.gz
do
    echo "$inVCF"
    sbatch -J extract-params-biSNPs ~/snap_hap/_scripts/sbatch/vcf-utils/job-bcftools_extract-params.sbatch "$inVCF"
done

<br/>

#### Checking filtered variants

In [1]:
## Look up selected positions in the per-chromosome SnpGap5 biSNPs parameter tables.
## -w matches each number as a whole word; one grep per chromosome (Chr5 has none listed).
grep -w \
    -e 955185 \
    -e 45535107 \
    -e 47381180 \
    -e 51252783 \
    -e 60537705 \
    -e 71126303 \
    ~/snap_hap/variants/vcf_bcftools_Am_all/Chr1/Am_all_bcftools_Chr1_SnpGap5_biSNPs_params.txt
grep -w -e 55473945 -e 58751341 \
    ~/snap_hap/variants/vcf_bcftools_Am_all/Chr2/Am_all_bcftools_Chr2_SnpGap5_biSNPs_params.txt
grep -w -e 1055959 -e 3488331 -e 44197469 \
    ~/snap_hap/variants/vcf_bcftools_Am_all/Chr3/Am_all_bcftools_Chr3_SnpGap5_biSNPs_params.txt
grep -w -e 7115521 -e 38397179 -e 44460308 -e 50972302 \
    ~/snap_hap/variants/vcf_bcftools_Am_all/Chr4/Am_all_bcftools_Chr4_SnpGap5_biSNPs_params.txt
grep -w -e 15125731 -e 32503688 -e 52921613 -e 52922047 \
    -e 52966811 -e 52999112 -e 53016551 -e 53061080 -e 53083229 -e 53090212 -e 53094790 -e 53104387 -e 53195127 -e 53944185 \
    ~/snap_hap/variants/vcf_bcftools_Am_all/Chr6/Am_all_bcftools_Chr6_SnpGap5_biSNPs_params.txt
grep -w -e 49828363 \
    ~/snap_hap/variants/vcf_bcftools_Am_all/Chr7/Am_all_bcftools_Chr7_SnpGap5_biSNPs_params.txt
grep -w -e 27807304 -e 31864104 -e 38142268 -e 44933278 -e 53715058 \
    ~/snap_hap/variants/vcf_bcftools_Am_all/Chr8/Am_all_bcftools_Chr8_SnpGap5_biSNPs_params.txt

Chr1	955185	C	A	26533.4	2130	853	1246	0	42
Chr1	45535107	C	A	28863.3	2464	836	1336	0	54
Chr1	47381180	C	A	16317.9	2576	314	1330	0	52
Chr1	51252783	A	C	18432.5	2946	388	1364	0	53
Chr1	60537705	T	C	14255.6	2498	311	1304	0	46
Chr1	71126303	T	G	34834.4	2830	909	1432	0	54
Chr2	55473945	A	G	22973.6	2175	445	1324	0	57
Chr2	58751341	A	G	8338.71	1981	221	1110	0	49
Chr3	1055959	T	A	25268.3	2470	813	1316	0	41
Chr3	3488331	C	T	29029.6	2497	845	1350	0	48
Chr3	44197469	G	C	18392.9	2598	368	1368	0	50
Chr4	7115521	T	G	32824.4	2682	948	1408	0	48
Chr4	38397179	T	A	20768.1	1917	471	1214	0	54
Chr4	44460308	A	G	26240.1	3136	462	1494	0	55
Chr4	50972302	G	A	30206.3	2642	831	1364	0	53
Chr6	15125731	A	T	33795.9	2614	965	1440	0	53
Chr6	32503688	A	G	25119.7	2748	457	1398	0	54
Chr6	52921613	C	G	18310.3	2466	369	1346	0	55
Chr6	52922047	G	T	22605	2515	458	1334	0	51
Chr6	52966811	T	A	12147	1920	313	1208	0	49
Chr6	52999112	C	T	14710.1	2106	346	1200	0	50
Chr6	53016551	C	G	18626.9	2535	318	1364	0	57
Chr6	53061080	A	G	2

<br>

### Filter5: Filter SNPs based on QUAL, DEPTH and MQ

Only SnpGap5 is retained. 


The average read depth for all samples (n=1074) is 3.6. So, maximum depth allowed = 3.6x1074x2 = 7732.8  
Theoretically, the total expected depth should be -  (130x2)+(120x1)+(30x6)+(20x12)+(12x152)+(2x787)+(1x48)+(1x38)+(4x15)+(4x13) = 4396

`bcftools filter --threads $threads -e "INFO/DP<500 | INFO/DP>7732 | QUAL<20 | MQ<30" -Oz -o $outVCF $inVCF`

## No. of SNPs per sample

## Extract mpileup genotypes for validation against 30 KASP markers

### Chr1

In [36]:
## For each Chr1 KASP marker: extract the record at the marker position from the
## filtered call set, once unchanged and once per per-sample depth cut-off, convert
## each to a vcftools 012 matrix, and paste the genotype columns side by side.
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_Am_all
KASPcoords=./KASPcoords.pos

chrom=Chr1
vcf=./${chrom}/Am_all_bcftools_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz
outDIR=./validation

for snpID in {1..6}
do
    ## The snpID-th line mentioning this chromosome in the coordinates file;
    ## tab-separated field 3 is used as marker ID, field 4 as position
    kaspRow=$(grep -w "$chrom" "$KASPcoords" | sed -n "${snpID}p")
    markerID=$(cut -f3 <<< "$kaspRow")
    pos=$(cut -f4 <<< "$kaspRow")
    outVCF=mpileup-${markerID}-${chrom}-${pos}
    outGT=${markerID#*_}-${chrom}-${pos}
    echo -e "\n${chrom}\t${markerID}\t${pos}\t${outGT}\n"

    ## No Filter: the record as it is in the call set
    echo -e '\nNo Filter'
    time bcftools view -r "${chrom}:${pos}" -Oz -o "$outDIR/$outVCF.noFilter.vcf.gz" "$vcf"
    bcftools tabix -f "$outDIR/$outVCF.noFilter.vcf.gz"
    vcftools --gzvcf "$outDIR/$outVCF.noFilter.vcf.gz" --chr "$chrom" --from-bp $((pos-2)) --to-bp $((pos+2)) --012 --out "$outDIR/${outGT}_noFilter"

    ## Softmask: genotypes with FORMAT/DP below the cut-off are set to "." (-S .)
    for minDP in 6 10
    do
        echo -e "\nSoftmask FORMAT/DP<${minDP}"
        time bcftools filter -r "${chrom}:${pos}" -S . -e "FMT/DP<${minDP}" -Oz -o "$outDIR/$outVCF.DP${minDP}.vcf.gz" "$vcf"
        bcftools tabix -f "$outDIR/$outVCF.DP${minDP}.vcf.gz"
        vcftools --gzvcf "$outDIR/$outVCF.DP${minDP}.vcf.gz" --chr "$chrom" --from-bp $((pos-2)) --to-bp $((pos+2)) --012 --out "$outDIR/${outGT}_DP${minDP}"
    done

    ## Strip the listed prefixes from the sample names (taken from the DP10 run)
    echo -e '\nChange longFormat sample names to shortFormat'
    sed 's/\(10x_\|10x2_\|10xNEW_\|Pla_\|Ave_\|GH_\|60x_\|60x2_\|2x_\)//g; s/\(TRIO-\|n96-\)//g' \
        "$outDIR/${outGT}_DP10.012.indv" > "$outDIR/${outGT}.shortFormat.indv"

    ## One row per sample: short name, then column 2 of each 012 matrix
    ## (noFilter, DP6, DP10)
    echo -e '\nCreate combined GT file'
    paste "$outDIR/${outGT}.shortFormat.indv" \
        <(cut -f2 "$outDIR/${outGT}_noFilter.012") \
        <(cut -f2 "$outDIR/${outGT}_DP6.012") \
        <(cut -f2 "$outDIR/${outGT}_DP10.012") > "$outDIR/mpileup_$outGT.gt"
done


Chr1	1_1	955185	1-Chr1-955185


No Filter

real	0m0.108s
user	0m0.098s
sys	0m0.008s

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf ./validation/mpileup-1_1-Chr1-955185.noFilter.vcf.gz
	--chr Chr1
	--to-bp 955187
	--012
	--out ./validation/1-Chr1-955185_noFilter
	--from-bp 955183

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run Time = 0.00 seconds

Softmask FORMAT/DP<6

real	0m0.087s
user	0m0.078s
sys	0m0.008s

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf ./validation/mpileup-1_1-Chr1-955185.DP6.vcf.gz
	--chr Chr1
	--to-bp 955187
	--012
	--out ./validation/1-Chr1-955185_DP6
	--from-bp 955183

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run Time = 0.00

### Chr2

In [35]:
## For each Chr2 KASP marker: extract the record at the marker position from the
## filtered call set, once unchanged and once per per-sample depth cut-off, convert
## each to a vcftools 012 matrix, and paste the genotype columns side by side.
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_Am_all
KASPcoords=./KASPcoords.pos

chrom=Chr2
vcf=./${chrom}/Am_all_bcftools_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz
outDIR=./validation

for snpID in {1..2}
do
    ## The snpID-th line mentioning this chromosome in the coordinates file;
    ## tab-separated field 3 is used as marker ID, field 4 as position
    kaspRow=$(grep -w "$chrom" "$KASPcoords" | sed -n "${snpID}p")
    markerID=$(cut -f3 <<< "$kaspRow")
    pos=$(cut -f4 <<< "$kaspRow")
    outVCF=mpileup-${markerID}-${chrom}-${pos}
    outGT=${markerID#*_}-${chrom}-${pos}
    echo -e "\n${chrom}\t${markerID}\t${pos}\t${outGT}\n"

    ## No Filter: the record as it is in the call set
    echo -e '\nNo Filter'
    time bcftools view -r "${chrom}:${pos}" -Oz -o "$outDIR/$outVCF.noFilter.vcf.gz" "$vcf"
    bcftools tabix -f "$outDIR/$outVCF.noFilter.vcf.gz"
    vcftools --gzvcf "$outDIR/$outVCF.noFilter.vcf.gz" --chr "$chrom" --from-bp $((pos-2)) --to-bp $((pos+2)) --012 --out "$outDIR/${outGT}_noFilter"

    ## Softmask: genotypes with FORMAT/DP below the cut-off are set to "." (-S .)
    for minDP in 6 10
    do
        echo -e "\nSoftmask FORMAT/DP<${minDP}"
        time bcftools filter -r "${chrom}:${pos}" -S . -e "FMT/DP<${minDP}" -Oz -o "$outDIR/$outVCF.DP${minDP}.vcf.gz" "$vcf"
        bcftools tabix -f "$outDIR/$outVCF.DP${minDP}.vcf.gz"
        vcftools --gzvcf "$outDIR/$outVCF.DP${minDP}.vcf.gz" --chr "$chrom" --from-bp $((pos-2)) --to-bp $((pos+2)) --012 --out "$outDIR/${outGT}_DP${minDP}"
    done

    ## Strip the listed prefixes from the sample names (taken from the DP10 run)
    echo -e '\nChange longFormat sample names to shortFormat'
    sed 's/\(10x_\|10x2_\|10xNEW_\|Pla_\|Ave_\|GH_\|60x_\|60x2_\|2x_\)//g; s/\(TRIO-\|n96-\)//g' \
        "$outDIR/${outGT}_DP10.012.indv" > "$outDIR/${outGT}.shortFormat.indv"

    ## One row per sample: short name, then column 2 of each 012 matrix
    ## (noFilter, DP6, DP10)
    echo -e '\nCreate combined GT file'
    paste "$outDIR/${outGT}.shortFormat.indv" \
        <(cut -f2 "$outDIR/${outGT}_noFilter.012") \
        <(cut -f2 "$outDIR/${outGT}_DP6.012") \
        <(cut -f2 "$outDIR/${outGT}_DP10.012") > "$outDIR/mpileup_$outGT.gt"
done


Chr2	2_1	55473945	1-Chr2-55473945


No Filter

real	0m0.088s
user	0m0.074s
sys	0m0.012s

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf ./validation/mpileup-2_1-Chr2-55473945.noFilter.vcf.gz
	--chr Chr2
	--to-bp 55473947
	--012
	--out ./validation/1-Chr2-55473945_noFilter
	--from-bp 55473943

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run Time = 0.00 seconds

Softmask FORMAT/DP<6

real	0m0.090s
user	0m0.080s
sys	0m0.008s

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf ./validation/mpileup-2_1-Chr2-55473945.DP6.vcf.gz
	--chr Chr2
	--to-bp 55473947
	--012
	--out ./validation/1-Chr2-55473945_DP6
	--from-bp 55473943

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 S

### Chr3

In [34]:
## For each Chr3 KASP marker: extract the record at the marker position from the
## filtered call set, once unchanged and once per per-sample depth cut-off, convert
## each to a vcftools 012 matrix, and paste the genotype columns side by side.
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_Am_all
KASPcoords=./KASPcoords.pos

chrom=Chr3
vcf=./${chrom}/Am_all_bcftools_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz
outDIR=./validation

for snpID in {1..3}
do
    ## The snpID-th line mentioning this chromosome in the coordinates file;
    ## tab-separated field 3 is used as marker ID, field 4 as position
    kaspRow=$(grep -w "$chrom" "$KASPcoords" | sed -n "${snpID}p")
    markerID=$(cut -f3 <<< "$kaspRow")
    pos=$(cut -f4 <<< "$kaspRow")
    outVCF=mpileup-${markerID}-${chrom}-${pos}
    outGT=${markerID#*_}-${chrom}-${pos}
    echo -e "\n${chrom}\t${markerID}\t${pos}\t${outGT}\n"

    ## No Filter: the record as it is in the call set
    echo -e '\nNo Filter'
    time bcftools view -r "${chrom}:${pos}" -Oz -o "$outDIR/$outVCF.noFilter.vcf.gz" "$vcf"
    bcftools tabix -f "$outDIR/$outVCF.noFilter.vcf.gz"
    vcftools --gzvcf "$outDIR/$outVCF.noFilter.vcf.gz" --chr "$chrom" --from-bp $((pos-2)) --to-bp $((pos+2)) --012 --out "$outDIR/${outGT}_noFilter"

    ## Softmask: genotypes with FORMAT/DP below the cut-off are set to "." (-S .)
    for minDP in 6 10
    do
        echo -e "\nSoftmask FORMAT/DP<${minDP}"
        time bcftools filter -r "${chrom}:${pos}" -S . -e "FMT/DP<${minDP}" -Oz -o "$outDIR/$outVCF.DP${minDP}.vcf.gz" "$vcf"
        bcftools tabix -f "$outDIR/$outVCF.DP${minDP}.vcf.gz"
        vcftools --gzvcf "$outDIR/$outVCF.DP${minDP}.vcf.gz" --chr "$chrom" --from-bp $((pos-2)) --to-bp $((pos+2)) --012 --out "$outDIR/${outGT}_DP${minDP}"
    done

    ## Strip the listed prefixes from the sample names (taken from the DP10 run)
    echo -e '\nChange longFormat sample names to shortFormat'
    sed 's/\(10x_\|10x2_\|10xNEW_\|Pla_\|Ave_\|GH_\|60x_\|60x2_\|2x_\)//g; s/\(TRIO-\|n96-\)//g' \
        "$outDIR/${outGT}_DP10.012.indv" > "$outDIR/${outGT}.shortFormat.indv"

    ## One row per sample: short name, then column 2 of each 012 matrix
    ## (noFilter, DP6, DP10)
    echo -e '\nCreate combined GT file'
    paste "$outDIR/${outGT}.shortFormat.indv" \
        <(cut -f2 "$outDIR/${outGT}_noFilter.012") \
        <(cut -f2 "$outDIR/${outGT}_DP6.012") \
        <(cut -f2 "$outDIR/${outGT}_DP10.012") > "$outDIR/mpileup_$outGT.gt"
done


Chr3	3_1	1055959	1-Chr3-1055959


No Filter

real	0m0.097s
user	0m0.091s
sys	0m0.004s

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf ./validation/mpileup-3_1-Chr3-1055959.noFilter.vcf.gz
	--chr Chr3
	--to-bp 1055961
	--012
	--out ./validation/1-Chr3-1055959_noFilter
	--from-bp 1055957

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run Time = 1.00 seconds

Softmask FORMAT/DP<6

real	0m0.088s
user	0m0.082s
sys	0m0.005s

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf ./validation/mpileup-3_1-Chr3-1055959.DP6.vcf.gz
	--chr Chr3
	--to-bp 1055961
	--012
	--out ./validation/1-Chr3-1055959_DP6
	--from-bp 1055957

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run T

### Chr4

In [33]:
## For each Chr4 KASP marker: extract the record at the marker position from the
## filtered call set, once unchanged and once per per-sample depth cut-off, convert
## each to a vcftools 012 matrix, and paste the genotype columns side by side.
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_Am_all
KASPcoords=./KASPcoords.pos

chrom=Chr4
vcf=./${chrom}/Am_all_bcftools_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz
outDIR=./validation

for snpID in {1..3}
do
    ## The snpID-th line mentioning this chromosome in the coordinates file;
    ## tab-separated field 3 is used as marker ID, field 4 as position
    kaspRow=$(grep -w "$chrom" "$KASPcoords" | sed -n "${snpID}p")
    markerID=$(cut -f3 <<< "$kaspRow")
    pos=$(cut -f4 <<< "$kaspRow")
    outVCF=mpileup-${markerID}-${chrom}-${pos}
    outGT=${markerID#*_}-${chrom}-${pos}
    echo -e "\n${chrom}\t${markerID}\t${pos}\t${outGT}\n"

    ## No Filter: the record as it is in the call set
    echo -e '\nNo Filter'
    time bcftools view -r "${chrom}:${pos}" -Oz -o "$outDIR/$outVCF.noFilter.vcf.gz" "$vcf"
    bcftools tabix -f "$outDIR/$outVCF.noFilter.vcf.gz"
    vcftools --gzvcf "$outDIR/$outVCF.noFilter.vcf.gz" --chr "$chrom" --from-bp $((pos-2)) --to-bp $((pos+2)) --012 --out "$outDIR/${outGT}_noFilter"

    ## Softmask: genotypes with FORMAT/DP below the cut-off are set to "." (-S .)
    for minDP in 6 10
    do
        echo -e "\nSoftmask FORMAT/DP<${minDP}"
        time bcftools filter -r "${chrom}:${pos}" -S . -e "FMT/DP<${minDP}" -Oz -o "$outDIR/$outVCF.DP${minDP}.vcf.gz" "$vcf"
        bcftools tabix -f "$outDIR/$outVCF.DP${minDP}.vcf.gz"
        vcftools --gzvcf "$outDIR/$outVCF.DP${minDP}.vcf.gz" --chr "$chrom" --from-bp $((pos-2)) --to-bp $((pos+2)) --012 --out "$outDIR/${outGT}_DP${minDP}"
    done

    ## Strip the listed prefixes from the sample names (taken from the DP10 run)
    echo -e '\nChange longFormat sample names to shortFormat'
    sed 's/\(10x_\|10x2_\|10xNEW_\|Pla_\|Ave_\|GH_\|60x_\|60x2_\|2x_\)//g; s/\(TRIO-\|n96-\)//g' \
        "$outDIR/${outGT}_DP10.012.indv" > "$outDIR/${outGT}.shortFormat.indv"

    ## One row per sample: short name, then column 2 of each 012 matrix
    ## (noFilter, DP6, DP10)
    echo -e '\nCreate combined GT file'
    paste "$outDIR/${outGT}.shortFormat.indv" \
        <(cut -f2 "$outDIR/${outGT}_noFilter.012") \
        <(cut -f2 "$outDIR/${outGT}_DP6.012") \
        <(cut -f2 "$outDIR/${outGT}_DP10.012") > "$outDIR/mpileup_$outGT.gt"
done


Chr4	4_1	7115521	1-Chr4-7115521


No Filter

real	0m0.061s
user	0m0.055s
sys	0m0.004s

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf ./validation/mpileup-4_1-Chr4-7115521.noFilter.vcf.gz
	--chr Chr4
	--to-bp 7115523
	--012
	--out ./validation/1-Chr4-7115521_noFilter
	--from-bp 7115519

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run Time = 1.00 seconds

Softmask FORMAT/DP<6

real	0m0.061s
user	0m0.048s
sys	0m0.012s

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf ./validation/mpileup-4_1-Chr4-7115521.DP6.vcf.gz
	--chr Chr4
	--to-bp 7115523
	--012
	--out ./validation/1-Chr4-7115521_DP6
	--from-bp 7115519

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run T

### Chr6

In [32]:
## For each Chr6 KASP marker: extract the record at the marker position from the
## filtered call set, once unchanged and once per per-sample depth cut-off, convert
## each to a vcftools 012 matrix, and paste the genotype columns side by side.
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_Am_all
KASPcoords=./KASPcoords.pos

chrom=Chr6
vcf=./${chrom}/Am_all_bcftools_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz
outDIR=./validation

for snpID in {1..13}
do
    ## The snpID-th line mentioning this chromosome in the coordinates file;
    ## tab-separated field 3 is used as marker ID, field 4 as position
    kaspRow=$(grep -w "$chrom" "$KASPcoords" | sed -n "${snpID}p")
    markerID=$(cut -f3 <<< "$kaspRow")
    pos=$(cut -f4 <<< "$kaspRow")
    outVCF=mpileup-${markerID}-${chrom}-${pos}
    outGT=${markerID#*_}-${chrom}-${pos}
    echo -e "\n${chrom}\t${markerID}\t${pos}\t${outGT}\n"

    ## No Filter: the record as it is in the call set
    echo -e '\nNo Filter'
    time bcftools view -r "${chrom}:${pos}" -Oz -o "$outDIR/$outVCF.noFilter.vcf.gz" "$vcf"
    bcftools tabix -f "$outDIR/$outVCF.noFilter.vcf.gz"
    vcftools --gzvcf "$outDIR/$outVCF.noFilter.vcf.gz" --chr "$chrom" --from-bp $((pos-2)) --to-bp $((pos+2)) --012 --out "$outDIR/${outGT}_noFilter"

    ## Softmask: genotypes with FORMAT/DP below the cut-off are set to "." (-S .)
    for minDP in 6 10
    do
        echo -e "\nSoftmask FORMAT/DP<${minDP}"
        time bcftools filter -r "${chrom}:${pos}" -S . -e "FMT/DP<${minDP}" -Oz -o "$outDIR/$outVCF.DP${minDP}.vcf.gz" "$vcf"
        bcftools tabix -f "$outDIR/$outVCF.DP${minDP}.vcf.gz"
        vcftools --gzvcf "$outDIR/$outVCF.DP${minDP}.vcf.gz" --chr "$chrom" --from-bp $((pos-2)) --to-bp $((pos+2)) --012 --out "$outDIR/${outGT}_DP${minDP}"
    done

    ## Strip the listed prefixes from the sample names (taken from the DP10 run)
    echo -e '\nChange longFormat sample names to shortFormat'
    sed 's/\(10x_\|10x2_\|10xNEW_\|Pla_\|Ave_\|GH_\|60x_\|60x2_\|2x_\)//g; s/\(TRIO-\|n96-\)//g' \
        "$outDIR/${outGT}_DP10.012.indv" > "$outDIR/${outGT}.shortFormat.indv"

    ## One row per sample: short name, then column 2 of each 012 matrix
    ## (noFilter, DP6, DP10)
    echo -e '\nCreate combined GT file'
    paste "$outDIR/${outGT}.shortFormat.indv" \
        <(cut -f2 "$outDIR/${outGT}_noFilter.012") \
        <(cut -f2 "$outDIR/${outGT}_DP6.012") \
        <(cut -f2 "$outDIR/${outGT}_DP10.012") > "$outDIR/mpileup_$outGT.gt"
done


Chr6	6_1	15125731	1-Chr6-15125731


No Filter

real	0m0.043s
user	0m0.039s
sys	0m0.000s

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf ./validation/mpileup-6_1-Chr6-15125731.noFilter.vcf.gz
	--chr Chr6
	--to-bp 15125733
	--012
	--out ./validation/1-Chr6-15125731_noFilter
	--from-bp 15125729

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run Time = 0.00 seconds

Softmask FORMAT/DP<6

real	0m0.050s
user	0m0.031s
sys	0m0.008s

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf ./validation/mpileup-6_1-Chr6-15125731.DP6.vcf.gz
	--chr Chr6
	--to-bp 15125733
	--012
	--out ./validation/1-Chr6-15125731_DP6
	--from-bp 15125729

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 S

### Chr7

In [31]:
## For the Chr7 KASP marker: extract the record at the marker position from the
## filtered call set, once unchanged and once per per-sample depth cut-off, convert
## each to a vcftools 012 matrix, and paste the genotype columns side by side.
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_Am_all
KASPcoords=./KASPcoords.pos

chrom=Chr7
vcf=./${chrom}/Am_all_bcftools_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz
outDIR=./validation

for snpID in 1
do
    ## The snpID-th line mentioning this chromosome in the coordinates file;
    ## tab-separated field 3 is used as marker ID, field 4 as position
    kaspRow=$(grep -w "$chrom" "$KASPcoords" | sed -n "${snpID}p")
    markerID=$(cut -f3 <<< "$kaspRow")
    pos=$(cut -f4 <<< "$kaspRow")
    outVCF=mpileup-${markerID}-${chrom}-${pos}
    outGT=${markerID#*_}-${chrom}-${pos}
    echo -e "\n${chrom}\t${markerID}\t${pos}\t${outGT}\n"

    ## No Filter: the record as it is in the call set
    echo -e '\nNo Filter'
    time bcftools view -r "${chrom}:${pos}" -Oz -o "$outDIR/$outVCF.noFilter.vcf.gz" "$vcf"
    bcftools tabix -f "$outDIR/$outVCF.noFilter.vcf.gz"
    vcftools --gzvcf "$outDIR/$outVCF.noFilter.vcf.gz" --chr "$chrom" --from-bp $((pos-2)) --to-bp $((pos+2)) --012 --out "$outDIR/${outGT}_noFilter"

    ## Softmask: genotypes with FORMAT/DP below the cut-off are set to "." (-S .)
    for minDP in 6 10
    do
        echo -e "\nSoftmask FORMAT/DP<${minDP}"
        time bcftools filter -r "${chrom}:${pos}" -S . -e "FMT/DP<${minDP}" -Oz -o "$outDIR/$outVCF.DP${minDP}.vcf.gz" "$vcf"
        bcftools tabix -f "$outDIR/$outVCF.DP${minDP}.vcf.gz"
        vcftools --gzvcf "$outDIR/$outVCF.DP${minDP}.vcf.gz" --chr "$chrom" --from-bp $((pos-2)) --to-bp $((pos+2)) --012 --out "$outDIR/${outGT}_DP${minDP}"
    done

    ## Strip the listed prefixes from the sample names (taken from the DP10 run)
    echo -e '\nChange longFormat sample names to shortFormat'
    sed 's/\(10x_\|10x2_\|10xNEW_\|Pla_\|Ave_\|GH_\|60x_\|60x2_\|2x_\)//g; s/\(TRIO-\|n96-\)//g' \
        "$outDIR/${outGT}_DP10.012.indv" > "$outDIR/${outGT}.shortFormat.indv"

    ## One row per sample: short name, then column 2 of each 012 matrix
    ## (noFilter, DP6, DP10)
    echo -e '\nCreate combined GT file'
    paste "$outDIR/${outGT}.shortFormat.indv" \
        <(cut -f2 "$outDIR/${outGT}_noFilter.012") \
        <(cut -f2 "$outDIR/${outGT}_DP6.012") \
        <(cut -f2 "$outDIR/${outGT}_DP10.012") > "$outDIR/mpileup_$outGT.gt"
done


Chr7	7_1	49828363	1-Chr7-49828363


No Filter

real	0m0.107s
user	0m0.083s
sys	0m0.013s

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf ./validation/mpileup-7_1-Chr7-49828363.noFilter.vcf.gz
	--chr Chr7
	--to-bp 49828365
	--012
	--out ./validation/1-Chr7-49828363_noFilter
	--from-bp 49828361

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run Time = 0.00 seconds

Softmask FORMAT/DP<6

real	0m0.085s
user	0m0.075s
sys	0m0.009s

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf ./validation/mpileup-7_1-Chr7-49828363.DP6.vcf.gz
	--chr Chr7
	--to-bp 49828365
	--012
	--out ./validation/1-Chr7-49828363_DP6
	--from-bp 49828361

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 S

### Chr8

In [30]:
## For each Chr8 KASP marker: extract the record at the marker position from the
## filtered call set, once unchanged and once per per-sample depth cut-off, convert
## each to a vcftools 012 matrix, and paste the genotype columns side by side.
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_Am_all
KASPcoords=./KASPcoords.pos

chrom=Chr8
vcf=./${chrom}/Am_all_bcftools_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz
outDIR=./validation

for snpID in {1..2}
do
    ## The snpID-th line mentioning this chromosome in the coordinates file;
    ## tab-separated field 3 is used as marker ID, field 4 as position
    kaspRow=$(grep -w "$chrom" "$KASPcoords" | sed -n "${snpID}p")
    markerID=$(cut -f3 <<< "$kaspRow")
    pos=$(cut -f4 <<< "$kaspRow")
    outVCF=mpileup-${markerID}-${chrom}-${pos}
    outGT=${markerID#*_}-${chrom}-${pos}
    echo -e "\n${chrom}\t${markerID}\t${pos}\t${outGT}\n"

    ## No Filter: the record as it is in the call set
    echo -e '\nNo Filter'
    time bcftools view -r "${chrom}:${pos}" -Oz -o "$outDIR/$outVCF.noFilter.vcf.gz" "$vcf"
    bcftools tabix -f "$outDIR/$outVCF.noFilter.vcf.gz"
    vcftools --gzvcf "$outDIR/$outVCF.noFilter.vcf.gz" --chr "$chrom" --from-bp $((pos-2)) --to-bp $((pos+2)) --012 --out "$outDIR/${outGT}_noFilter"

    ## Softmask: genotypes with FORMAT/DP below the cut-off are set to "." (-S .)
    for minDP in 6 10
    do
        echo -e "\nSoftmask FORMAT/DP<${minDP}"
        time bcftools filter -r "${chrom}:${pos}" -S . -e "FMT/DP<${minDP}" -Oz -o "$outDIR/$outVCF.DP${minDP}.vcf.gz" "$vcf"
        bcftools tabix -f "$outDIR/$outVCF.DP${minDP}.vcf.gz"
        vcftools --gzvcf "$outDIR/$outVCF.DP${minDP}.vcf.gz" --chr "$chrom" --from-bp $((pos-2)) --to-bp $((pos+2)) --012 --out "$outDIR/${outGT}_DP${minDP}"
    done

    ## Strip the listed prefixes from the sample names (taken from the DP10 run)
    echo -e '\nChange longFormat sample names to shortFormat'
    sed 's/\(10x_\|10x2_\|10xNEW_\|Pla_\|Ave_\|GH_\|60x_\|60x2_\|2x_\)//g; s/\(TRIO-\|n96-\)//g' \
        "$outDIR/${outGT}_DP10.012.indv" > "$outDIR/${outGT}.shortFormat.indv"

    ## One row per sample: short name, then column 2 of each 012 matrix
    ## (noFilter, DP6, DP10)
    echo -e '\nCreate combined GT file'
    paste "$outDIR/${outGT}.shortFormat.indv" \
        <(cut -f2 "$outDIR/${outGT}_noFilter.012") \
        <(cut -f2 "$outDIR/${outGT}_DP6.012") \
        <(cut -f2 "$outDIR/${outGT}_DP10.012") > "$outDIR/mpileup_$outGT.gt"
done


Chr8	8_1	27807304	1-Chr8-27807304


No Filter

real	0m0.048s
user	0m0.043s
sys	0m0.004s

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf ./validation/mpileup-8_1-Chr8-27807304.noFilter.vcf.gz
	--chr Chr8
	--to-bp 27807306
	--012
	--out ./validation/1-Chr8-27807304_noFilter
	--from-bp 27807302

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run Time = 1.00 seconds

Softmask FORMAT/DP<6

real	0m0.052s
user	0m0.046s
sys	0m0.005s

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf ./validation/mpileup-8_1-Chr8-27807304.DP6.vcf.gz
	--chr Chr8
	--to-bp 27807306
	--012
	--out ./validation/1-Chr8-27807304_DP6
	--from-bp 27807302

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 S