The purpose of this code is to subset the gnomAD VCF files so that they include only the variants of interest, i.e. those that fall within the guide ranges listed in the filtering.bed file.

In [1]:
# Arguments/Parameters ----

# Root directory that holds the raw data, software and filtered output.
working_dir      <- "/home/jupyter/notebooks/Ancestry"

# Bucket location, read from the WORKSPACE_BUCKET environment variable.
workspace_bucket <- Sys.getenv("WORKSPACE_BUCKET")

# Name of the bed file used to subset the gnomAD data.
# This file must be uploaded to the workspace bucket.
filtering_bed    <- "minilibcas9_filtering.bed"

# Name of the output sub-directory created for this run.
out_directory    <- "minilibcas9"


In [2]:
# Build the directory structure
#
# Creates <working_dir>/filtered_output/<out_directory>, including the
# "filtered_output" master directory when it does not exist yet.
# The cell is safe to re-run: an already existing directory is not an error.

# Refuse to continue if the working directory is missing, rather than
# creating the output tree somewhere unexpected.
if (!dir.exists(working_dir)) {
  stop("working_dir does not exist: ", working_dir, call. = FALSE)
}

filtered_output_dir <- file.path(working_dir, "filtered_output", out_directory)

# recursive = TRUE builds the master directory and the sample-specific
# sub-directory in one call; showWarnings = FALSE silences the
# "already exists" warning on re-runs.
dir.create(filtered_output_dir, recursive = TRUE, showWarnings = FALSE)

if (!dir.exists(filtered_output_dir)) {
  stop("Could not create output directory: ", filtered_output_dir,
       call. = FALSE)
}

In [3]:
# Install/load required packages/software

# Download R packages (only when missing, so re-running the cell does not
# reinstall tictoc every time)
if (!requireNamespace("tictoc", quietly = TRUE)) {
  install.packages("tictoc")
}

# Load R packages
library(dplyr)
library(tictoc)

# Install bcftools
# The build is skipped when the bcftools binary is already present under
# <working_dir>/software/bcftools.
bcftools_dir <- file.path(working_dir, "software", "bcftools")
step_install_bcftools <- !file.exists(file.path(bcftools_dir, "bcftools"))

if (step_install_bcftools) {
  # Notes on the shell script below:
  #  * set -e aborts at the first failing step so the exit status is meaningful
  #  * mkdir -p creates the software directory on a fresh machine
  #  * https:// is used because GitHub no longer serves the git:// protocol
  #  * an existing checkout (e.g. from an interrupted build) is reused
  install_status <- system(glue::glue("
set -e
mkdir -p '{working_dir}/software'
cd '{working_dir}/software'
[ -d htslib ] || git clone --recurse-submodules https://github.com/samtools/htslib.git
[ -d bcftools ] || git clone https://github.com/samtools/bcftools.git
cd bcftools
autoheader
autoconf
./configure --enable-libgsl --enable-perl-filters
make
"))
  if (install_status != 0) {
    stop("bcftools installation failed (exit status ", install_status, ")",
         call. = FALSE)
  }
} else {
  print("bcftools is already installed")
}

# Point bcftools at its plugin directory. Setting the variable in the R
# session (instead of `export` inside a system() call, which only affects that
# one subshell) makes it visible to every later system()/system2() call.
Sys.setenv(BCFTOOLS_PLUGINS = file.path(bcftools_dir, "plugins"))



Installing package into ‘/home/jupyter/notebooks/packages’
(as ‘lib’ is unspecified)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




[1] "bcftools is already installed"


In [4]:
# Load the filtering.bed file into the filtering output directory
#
# The bed file is copied from the workspace bucket (workspace_bucket, read
# from the WORKSPACE_BUCKET environment variable) instead of a hard-coded
# bucket ID, so the notebook works unchanged in other workspaces.
# NOTE(review): assumes WORKSPACE_BUCKET already includes the "gs://" scheme
# (as it does on Terra) -- confirm for other environments.

if (!nzchar(workspace_bucket)) {
  stop("WORKSPACE_BUCKET is not set; cannot locate ", filtering_bed,
       call. = FALSE)
}

# `&&` prevents gsutil from running in the wrong directory if the cd fails.
copy_status <- system(glue::glue("
cd '{working_dir}/filtered_output/{out_directory}' &&
gsutil cp '{workspace_bucket}/{filtering_bed}' .
"))

if (copy_status != 0) {
  stop("Failed to copy ", filtering_bed, " from ", workspace_bucket,
       " (exit status ", copy_status, ")", call. = FALSE)
}

In [None]:
# Loop through all of the files and subset them
#
# For every *.vcf.bgz file in <working_dir>/raw_data/hgdp, run
# `bcftools view -R <bed>` and write an uncompressed VCF (-Ov) next to the
# input, named <input>.<out_directory>.
# NOTE: per the bcftools manual, -R/--regions-file needs indexed input files.

tic()

# All paths are derived from working_dir so the cell follows the parameters
# at the top of the notebook.
bcftools_bin <- file.path(working_dir, "software", "bcftools", "bcftools")
bed_path <- file.path(working_dir, "filtered_output", out_directory,
                      filtering_bed)
vcf_files <- list.files(
  file.path(working_dir, "raw_data", "hgdp"),
  pattern = "\\.vcf\\.bgz$",
  full.names = TRUE
)

# An empty directory is reported instead of invoking bcftools on a literal
#, unexpanded glob pattern.
if (length(vcf_files) == 0) {
  warning("No .vcf.bgz files found in ",
          file.path(working_dir, "raw_data", "hgdp"), call. = FALSE)
}

for (vcf_file in vcf_files) {
  subset_file <- paste0(vcf_file, ".", out_directory)
  # shQuote() protects paths containing spaces or shell metacharacters.
  exit_status <- system2(
    bcftools_bin,
    c("view", "-R", shQuote(bed_path), shQuote(vcf_file),
      "-o", shQuote(subset_file), "-Ov")
  )
  # Keep going on failure so one bad file does not block the rest, but make
  # the failure visible.
  if (exit_status != 0) {
    warning("bcftools view failed for ", vcf_file,
            " (exit status ", exit_status, ")", call. = FALSE)
  }
}

toc()

In [None]:
# Move the files into a different directory for the filtered output
#
# Moves every <name>.vcf.bgz.<out_directory> file from raw_data/hgdp into
# <working_dir>/filtered_output/<out_directory>/.
#  * `&&` stops mv from running in the wrong directory if the cd fails
#  * the glob is restricted to the subset files' naming pattern so unrelated
#    entries that merely end in <out_directory> are left alone
#  * the trailing slash on the destination makes mv fail, rather than rename
#    a file, when the destination directory is missing
move_status <- system(glue::glue("
cd '{working_dir}/raw_data/hgdp' &&
mv -- *.vcf.bgz.{out_directory} '{working_dir}/filtered_output/{out_directory}/'
"))

if (move_status != 0) {
  warning("Moving filtered files failed (exit status ", move_status, ")",
          call. = FALSE)
}