# Variant calling and filtering with n96 samples

In [1]:
## Load modules
module load bcftools/1.18 python3

## Reset PYTHONPATH so that it holds only the user-level site-packages directory.
## `unset` expects a variable NAME: `unset $PYTHONPATH` expands the value instead
## and leaves PYTHONPATH itself untouched.
unset PYTHONPATH
## Assign directly rather than appending to the (now empty) old value; appending
## would leave a leading ':' i.e. an empty path entry.
export PYTHONPATH="${HOME}/.local/lib/python3.9/site-packages"
module list


Lmod is automatically replacing "JupyterPython/2.0" with "python/3.9.7".


Currently Loaded Modules:
  1) git/2.35.1   3) htslib/1.18   5) bcftools/1.18   7) python3
  2) GSL/2.7      4) perl/5.36.0   6) python/3.9.7

 



In [2]:
## Work from the directory that holds the n96 bcftools VCFs
cd -- "/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96"

## *bcftools* mpileup ... | *bcftools* call ...

In [3]:
## Read and set variables
##   baseDIR : directory holding the BAM list and the output VCFs
##   chrom   : chromosome to process
##   ref     : reference genome FASTA
##   bamlist : text file under baseDIR (bams_n96.txt)
##   outVCF  : output VCF for $chrom
baseDIR=/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96

chrom=Chr1
ref=/nfs/scistore18/bartogrp/apal/snap_hap/ref_genome/v3.5/Amajus_v3.5.fa
bamlist="${baseDIR}/bams_n96.txt"
## Derive the file name from $chrom (instead of hard-coding Chr1) so that
## changing chrom cannot leave outVCF pointing at the wrong chromosome.
outVCF="${baseDIR}/n96_${chrom}.vcf.gz"

## Print variables
printf 'chrom: %s\n' "$chrom"
printf 'Reference Genome: %s\n' "$ref"
printf 'BAM Files: %s\n' "$bamlist"
printf 'outVCF: %s\n' "$outVCF"
## Two trailing newlines, same as `echo -e "\n"`
printf '\n\n'

chrom: Chr1
Reference Genome: /nfs/scistore18/bartogrp/apal/snap_hap/ref_genome/v3.5/Amajus_v3.5.fa
BAM Files: /nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/bams_n96.txt
outVCF: /nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/n96_Chr1.vcf.gz




In [None]:
## Submit job on cluster: one sbatch submission per chromosome (Chr1 ... Chr8),
## each running job-bcftools_mpileup-n96.sbatch
cd -- "/nfs/scistore18/bartogrp/apal/snap_hap/variants/jobs/bcftools-n96"

# chrom=Chr1
baseDIR=/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96
ref=/nfs/scistore18/bartogrp/apal/snap_hap/ref_genome/v3.5/Amajus_v3.5.fa
bamlist="${baseDIR}/bams_n96.txt"

for chrom in Chr{1..8}; do
    outVCF="${baseDIR}/n96_${chrom}.vcf.gz"
    ## Positional arguments passed to the job script: chrom, ref, bamlist, outVCF
    sbatch --job-name="n96-${chrom}_bcftools" \
        ~/snap_hap/_scripts/sbatch/vcf-utils/job-bcftools_mpileup-n96.sbatch \
        "$chrom" "$ref" "$bamlist" "$outVCF"
done

## *bcftools* filter

Average per-sample coverage = 0.86x  
Total expected coverage across 96 samples = 0.86 × 96 ≈ 82.5x  
Maximum total depth allowed when filtering = 2 × 82.5 = 165 (sites with INFO/DP > 165 are excluded)  

In [None]:
# chrom=Chr1
## Stage 1 (bcftools filter): exclude sites where INFO/DP>165, QUAL<20, MQ<30 or
##   F_MISSING>0.50, plus monomorphic sites (AC==0 or AC==AN); --SnpGap 5 also
##   filters SNPs within 5 bp of an INDEL.
## Stage 2 (bcftools view): keep bi-allelic SNPs only, write a bgzipped VCF and its index.
## NOTE(review): no explicit MAF threshold is applied by this command; only
##   monomorphic sites are dropped via the AC/AN tests.
## Expects $chrom to be set already (e.g. by uncommenting the line above).
bcftools filter \
    --threads 5 \
    --exclude "INFO/DP>165 | QUAL<20 | MQ<30 | F_MISSING>0.50 | AC==0 | AC==AN" \
    --SnpGap 5 \
    "n96_${chrom}.vcf.gz" \
  | bcftools view \
    --threads 5 \
    --write-index \
    --min-alleles 2 --max-alleles 2 \
    --types snps \
    --output-type z \
    --output "n96_${chrom}_biSNPs_filtered.vcf.gz" \
    -

In [None]:
## Submit job in cluster: one sbatch submission per chromosome (Chr1 ... Chr8),
## with job names ending in "_filter".
## NOTE(review): the script submitted here is job-bcftools_mpileup-n96.sbatch, the
##   same one used for the mpileup step. Given the "_filter" job name this looks
##   like a copy-paste leftover; confirm which sbatch script runs the filter step.
cd -- "/nfs/scistore18/bartogrp/apal/snap_hap/variants/jobs/bcftools-n96"

baseDIR=/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96
ref=/nfs/scistore18/bartogrp/apal/snap_hap/ref_genome/v3.5/Amajus_v3.5.fa
bamlist="${baseDIR}/bams_n96.txt"

for chrom in Chr{1..8}; do
    outVCF="${baseDIR}/n96_${chrom}.vcf.gz"
    ## Positional arguments passed to the job script: chrom, ref, bamlist, outVCF
    sbatch --job-name="n96-${chrom}_filter" \
        ~/snap_hap/_scripts/sbatch/vcf-utils/job-bcftools_mpileup-n96.sbatch \
        "$chrom" "$ref" "$bamlist" "$outVCF"
done

## *bcftools* concat

In [14]:
## Write the absolute paths of the per-chromosome filtered VCFs to ./n96-vcfList.txt
## and show them; tee saves the list and prints it in a single step.
realpath /nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_filtered/*.vcf.gz | tee ./n96-vcfList.txt

/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_filtered/n96_Chr1_biSNPs_filtered.vcf.gz
/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_filtered/n96_Chr2_biSNPs_filtered.vcf.gz
/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_filtered/n96_Chr3_biSNPs_filtered.vcf.gz
/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_filtered/n96_Chr4_biSNPs_filtered.vcf.gz
/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_filtered/n96_Chr5_biSNPs_filtered.vcf.gz
/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_filtered/n96_Chr6_biSNPs_filtered.vcf.gz
/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_filtered/n96_Chr7_biSNPs_filtered.vcf.gz
/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_filtered/n96_Chr8_biSNPs_filtered.vcf.gz


In [23]:
## Concatenate the VCFs listed in n96-vcfList.txt into one bgzipped VCF
## (10 threads, overlaps allowed), timing the step.
printf '\nConcatinating chromosome VCF\n'
time bcftools concat \
    --allow-overlaps \
    --threads 10 \
    --output-type z \
    --output n96_biSNPs.filtered.vcf.gz \
    --file-list n96-vcfList.txt

## Index the result, overwriting any existing index
printf '%s\n' 'Indexing concatenated VCF'
time bcftools index --force n96_biSNPs.filtered.vcf.gz


Concatinating chromosome VCF
Checking the headers and starting positions of 8 files

real	2m15.494s
user	4m9.415s
sys	0m6.083s
Indexing concatenated VCF

real	0m15.604s
user	0m15.444s
sys	0m0.148s


In [15]:
## Print the sample names stored in the concatenated VCF, one per line
bcftools query --list-samples "n96_biSNPs.filtered.vcf.gz"

n96_Am_Ave_x3318_v3.5
n96_Am_Ave_x3327_v3.5
n96_Am_Ave_x3333_v3.5
n96_Am_Ave_x3394_v3.5
n96_Am_Ave_x4101_v3.5
n96_Am_Ave_x4102_v3.5
n96_Am_Ave_x4116_v3.5
n96_Am_Ave_x4128_v3.5
n96_Am_Ave_x4129_v3.5
n96_Am_Ave_x4134_v3.5
n96_Am_Ave_x4137_v3.5
n96_Am_Ave_x4140_v3.5
n96_Am_Ave_x4148_v3.5
n96_Am_Ave_x4149_v3.5
n96_Am_Ave_x4150_v3.5
n96_Am_Ave_x4153_v3.5
n96_Am_Ave_x4155_v3.5
n96_Am_Ave_x4158_v3.5
n96_Am_Ave_x4159_v3.5
n96_Am_Ave_x4161_v3.5
n96_Am_Ave_x4178_v3.5
n96_Am_Ave_x4359_v3.5
n96_Am_Ave_x4392_v3.5
n96_Am_Ave_x4429_v3.5
n96_Am_Ave_x4459_v3.5
n96_Am_Ave_x4463_v3.5
n96_Am_Ave_x4477_v3.5
n96_Am_Ave_x4491_v3.5
n96_Am_Ave_x4570_v3.5
n96_Am_Ave_x4579_v3.5
n96_Am_Ave_x4580_v3.5
n96_Am_Ave_x4585_v3.5
n96_Am_Ave_x4593_v3.5
n96_Am_Ave_x4608_v3.5
n96_Am_Ave_x4624_v3.5
n96_Am_Ave_x4786_v3.5
n96_Am_Ave_x4837_v3.5
n96_Am_Ave_x4867_v3.5
n96_Alatifgolium_Des_E373-3_v3.5
n96_Amolle_Cadi_x4248_v3.5
n96_Amolle_Cadi_x4251_v3.5
n96_Amolle_Cadi_x4258_v3.5
n96_Amolle_Cadi_x4260_v3.5
n96_Amolle_Des_D357-4_v

In [29]:
## Count the records (non-header lines) in the concatenated VCF
printf '%s\n' 'Total no. of variant sites'
bcftools view --no-header "n96_biSNPs.filtered.vcf.gz" | wc -l

Total no. of variant sites
1521204


## Convert VCF to phylip

In [34]:
## Convert the concatenated VCF with vcf2phylip.py (timed);
## the sample n96_Morotium_Des_P168-1_v3.5 is passed as the outgroup.
time python ~/_softwares/vcf2phylip.py \
    --input ./n96_biSNPs.filtered.vcf.gz \
    --outgroup n96_Morotium_Des_P168-1_v3.5


Converting file './n96_biSNPs.filtered.vcf.gz':

Number of samples in VCF: 96
500000 genotypes processed.
1000000 genotypes processed.
1500000 genotypes processed.
Total of genotypes processed: 1521204
Genotypes excluded because they exceeded the amount of missing data allowed: 0
Genotypes that passed missing data filter but were excluded for being MNPs: 0
SNPs that passed the filters: 1521204

Outgroup, 'n96_Morotium_Des_P168-1_v3.5', added to the matrix(ces).
Sample 1 of 96, 'n96_Am_Ave_x3318_v3.5', added to the nucleotide matrix(ces).
Sample 2 of 96, 'n96_Am_Ave_x3327_v3.5', added to the nucleotide matrix(ces).
Sample 3 of 96, 'n96_Am_Ave_x3333_v3.5', added to the nucleotide matrix(ces).
Sample 4 of 96, 'n96_Am_Ave_x3394_v3.5', added to the nucleotide matrix(ces).
Sample 5 of 96, 'n96_Am_Ave_x4101_v3.5', added to the nucleotide matrix(ces).
Sample 6 of 96, 'n96_Am_Ave_x4102_v3.5', added to the nucleotide matrix(ces).
Sample 7 of 96, 'n96_Am_Ave_x4116_v3.5', added to the nucleotide 

## Subset n96 *Amajus* samples from STITCH vcf

In [39]:
## subset n96 individuals from stitch vcfs
# for chrom in Chr{1..8}
# do
#     stitchVcf=~/snap_hap/variants/stitch/${chrom}/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.PL.vcf.gz
#     outVcf=~/snap_hap/variants/vcf_bcftools_n96/chromVCFs_stitch/stitch-n96_${chrom}.vcf.gz
#     echo -e $chrom $stitchVcf
#     time bcftools view -S ~/snap_hap/variants/vcf_bcftools_n96/n96-samples.txt --force-samples -Oz -o $outVcf $stitchVcf                                                   
#     bcftools tabix -f $outVcf
# done

In [None]:
## Concat the chrom stitch vcfs to produce 1 n96 stitch vcf
## 1) save the absolute paths of the per-chromosome files to n96-stitchVcfList.txt
##    and print them (tee writes the file and echoes it in a single step)
realpath ./chromVCFs_stitch/*vcf.gz | tee n96-stitchVcfList.txt

## 2) concatenate the listed files into one bgzipped VCF (10 threads, overlaps allowed)
printf '\nConcatinating chromosome VCF\n'
time bcftools concat \
    --allow-overlaps \
    --threads 10 \
    --output-type z \
    --output ./n96_stitch.vcf.gz \
    --file-list ./n96-stitchVcfList.txt

## 3) index the result, overwriting any existing index
printf '%s\n' 'Indexing concatenated VCF'
time bcftools index --force ./n96_stitch.vcf.gz

/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_stitch/stitch-n96_Chr1.vcf.gz
/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_stitch/stitch-n96_Chr2.vcf.gz
/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_stitch/stitch-n96_Chr3.vcf.gz
/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_stitch/stitch-n96_Chr4.vcf.gz
/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_stitch/stitch-n96_Chr5.vcf.gz
/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_stitch/stitch-n96_Chr6.vcf.gz
/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_stitch/stitch-n96_Chr7.vcf.gz
/nfs/scistore18/bartogrp/apal/snap_hap/variants/vcf_bcftools_n96/chromVCFs_stitch/stitch-n96_Chr8.vcf.gz

Concatinating chromosome VCF
Checking the headers and starting positions of 8 files


In [None]:
## Convert stitch vcf to phylip (timed; no outgroup is passed here)
time python3 ~/_softwares/vcf2phylip.py \
    --input ./n96_stitch.vcf.gz