# Set up the environment

In [13]:
# Arguments / parameters used throughout the notebook
# NOTE(review): the two download passwords below are stored in plain text in
# this notebook -- consider loading them from outside the file instead.

# Home directory for this workspace
working_dir <- '/home/jupyter/notebooks/Ancestry'

# Workspace bucket that the output is uploaded to (read from the environment)
workspace_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Number of CPUs to use; some steps take a lot of compute
num.threads <- 64

# Password to download the imputed data
# (the doubled backslashes put literal backslashes in front of `{` and `$`)
imputation.server.password <- '6\\{VDgXClmd\\$iB6'

# Password to download the phased (but not imputed) data
phased.data.password <- '16HUsumDrRBWvu'

In [2]:
#Load packages
library(tidyverse)

# Helper functions

# show_msg: print `x`, then flush the console output buffer.
#   x: any printable R object.
show_msg <- function(x) {
  print(x)
  utils::flush.console()
}

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.4     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.0.1     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



In [3]:
# Create {working_dir}/raw_data and {working_dir}/raw_data/new_ccle.
# recursive = TRUE builds both levels in one call, and showWarnings = FALSE
# makes this a quiet no-op when the directories already exist, so the cell
# can be re-run safely.
dir.create(
  file.path(working_dir, "raw_data", "new_ccle"),
  recursive = TRUE,
  showWarnings = FALSE
)

In [None]:
# Download and build bcftools (plus the htslib checkout it is built against)
# under {working_dir}/software, unless a bcftools binary is already there.
bcftools_dir <- file.path(working_dir, "software", "bcftools")
step_install_bcftools <- !file.exists(file.path(bcftools_dir, "bcftools"))

if (step_install_bcftools) {
  # - `set -e` stops at the first failing command instead of carrying on
  #   (e.g. running `make` in the wrong directory after a failed `cd`).
  # - The software directory is created first so that `cd` cannot fail on a
  #   fresh workspace.
  # - Repositories are cloned over https because GitHub no longer serves the
  #   unauthenticated git:// protocol; an existing checkout is reused.
  system(glue::glue("
set -e
mkdir -p {working_dir}/software
cd {working_dir}/software
[ -d htslib ] || git clone --recurse-submodules https://github.com/samtools/htslib.git
[ -d bcftools ] || git clone https://github.com/samtools/bcftools.git
cd bcftools
autoheader && autoconf && ./configure --enable-libgsl --enable-perl-filters
make
"))
} else {
  print("bcftools is already installed")
}

# Tell bcftools where its plugins live. This is set from R (rather than with
# `export` inside the shell snippet) because every system() call starts a new
# shell, so an exported variable would be gone before bcftools is next used.
Sys.setenv(BCFTOOLS_PLUGINS = file.path(bcftools_dir, "plugins"))

In [None]:
#Install tabix

# Download the VCF files 

These files are from the CCLEv2 dataset.

There are more BAM files than VCF files, so I need to ask Jeremie/Javad how to get the additional VCFs.

First download the sample manifest from the terra notebook, then use the gsutil links in that manifest to download the data.

Some samples have both WES and WGS data, and we want to use the WGS data in these cases. So we also need to do a little work in R to build the list of files that we want to download.

In [5]:
# Download the sample manifest into the raw-data directory
system(glue::glue("
cd {working_dir}/raw_data/new_ccle
gsutil cp gs://fc-45c0e148-0b1c-4244-9bfc-feb559bbc514/sample.tsv .
"))

# Directory that holds the manifest and the files written below. Full paths
# are used instead of setwd() so the R session's working directory is left
# untouched.
new_ccle_dir <- file.path(working_dir, "raw_data", "new_ccle")

# Load the sample manifest into R (tab-separated, with a header row).
# stringsAsFactors = FALSE keeps the link columns as plain character vectors
# regardless of the R version's default.
sample.manifest <- read.table(
  file.path(new_ccle_dir, "sample.tsv"),
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Subset the manifest to the samples we are interested in downloading:
#  - keep (and rename) the ID, cell line name and the WGS/WES VCF links,
#  - turn empty strings into NA (column-wise, on character columns only;
#    calling na_if() on a whole data frame is not supported by newer dplyr),
#  - drop samples that have neither a WGS nor a WES VCF,
#  - prefer the WGS link and fall back to WES when there is no WGS link.
samples.to.download <- sample.manifest %>%
  select(
    ach_id = entity.sample_id,
    cell_line = stripped_cell_line_name,
    wgs.vcf = hg38_wgs_hc_cnn_filtered_vcf,
    wes.vcf = hg38_wes_hc_cnn_filtered_vcf
  ) %>%
  mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
  filter(!is.na(wgs.vcf) | !is.na(wes.vcf)) %>%
  mutate(link.to.use = ifelse(is.na(wgs.vcf), wes.vcf, wgs.vcf))

# Extract the list of links to download. Writing it out as a plain list means
# one fewer line of bash is needed to fetch the data.
download.list <- samples.to.download %>%
  pull(link.to.use)

# Export the full data frame (with header) and the bare list of links
write.table(
  samples.to.download,
  file.path(new_ccle_dir, "samples.to.download"),
  row.names = FALSE,
  col.names = TRUE,
  sep = "\t",
  quote = FALSE
)
write.table(
  download.list,
  file.path(new_ccle_dir, "download.list"),
  row.names = FALSE,
  col.names = FALSE,
  sep = "\t",
  quote = FALSE
)

In [44]:
# Download every VCF listed in download.list (one gsutil link per line).
# - `IFS= read -r` reads each line verbatim (no backslash processing or
#   trimming of whitespace).
# - Blank lines are skipped so gsutil is never called without a source.
# - "$line" is quoted so a link is always passed to gsutil as one argument.
system(glue::glue("
cd {working_dir}/raw_data/new_ccle
while IFS= read -r line
do
[ -n \"$line\" ] || continue
echo \"$line\"
gsutil -u cclfbilling cp \"$line\" .
done < download.list
"))

# Process the vcf files

In [47]:
# Run tabix (VCF preset) on every *.vcf.gz file in the raw-data directory
index_cmd <- glue::glue("
cd {working_dir}/raw_data/new_ccle

for vcffile in *.vcf.gz
do
{working_dir}/software/tabix-0.2.6/tabix -p vcf $vcffile
done
")
system(index_cmd)

In [3]:
# Filter all of the vcf files so that they only include variants in exons.
# The exon list was downloaded from the UCSC Genome Browser table viewer.
# It is UCSC RefSeq refGene, coding exons only.

# Download the file with all of the human exon positions
system(glue::glue("
cd {working_dir}/raw_data/new_ccle
gsutil cp {workspace_bucket}/exon_positions .
"))

# Filter the files
# Note to self: Filtering with -T is much faster than with -R.
# Second note to self: Running this command with 96 CPUs has ~8 CPUs at full load. Running it with 16 only has ~2.
# Need to learn more about multi-threading with Bcftools. I suspect that there may be more efficient ways to run things.
# NOTE(review): the bcftools manual describes --threads as applying to output
# compression only, which may explain the low CPU usage -- confirm.
#
# Only the downloaded per-sample files (CDS*.vcf.gz) are used as input, so
# re-running this cell does not filter the exon.* outputs (or any other
# derived *.vcf.gz file in this directory) a second time.
system(glue::glue("
cd {working_dir}/raw_data/new_ccle

for vcffile in CDS*.vcf.gz
do
{working_dir}/software/bcftools/bcftools view \"$vcffile\" -T exon_positions --threads {num.threads} -o \"exon.$vcffile\" -Oz
done
"))

# And then index the filtered files. The pattern ends in .vcf.gz so that
# index files (.tbi) left by an earlier run are not themselves passed to tabix.
system(glue::glue("
cd {working_dir}/raw_data/new_ccle

for vcffile in exon.CDS*.vcf.gz
do
{working_dir}/software/tabix-0.2.6/tabix -p vcf \"$vcffile\"
done
"))

In [3]:
#Combine all of the exon-filtered vcf files into a single vcf.gz file
#The input list is every path found under the current directory whose name
#contains 'exon' and '.vcf.gz' but not '.tbi' (the dots in these grep patterns
#are unescaped, so they match any character).
#NOTE(review): -0 appears to be the short form of --missing-to-ref, so passing
#both looks redundant -- confirm against the bcftools merge documentation.
#NOTE(review): the files are passed to bcftools merge in whatever order `find`
#lists them. The sample-renaming step later in this notebook rebuilds the
#sample names from a second, identical `find` listing and relies on both
#listings being in the same order -- keep the two commands in sync.
system(glue::glue("
cd {working_dir}/raw_data/new_ccle
filenames=$(find | grep 'exon' | grep '.vcf.gz' | grep -v '.tbi') 
{working_dir}/software/bcftools/bcftools merge $filenames -0 --missing-to-ref -Oz --threads {num.threads} -o hg38.new.ccle.vcf.gz
"))

In [6]:
# Re-name the samples in the merged vcf file so that they are the ACH IDs

# Reduce a download link or file path to its CDS ID: drop everything in front
# of the last "CDS", then drop everything from the first underscore onwards.
extract_cds_id <- function(path) {
  gsub("_..*", "", gsub("..*CDS", "CDS", path))
}

# Full paths are used instead of setwd() so the R session's working directory
# is left untouched.
new_ccle_dir <- file.path(working_dir, "raw_data", "new_ccle")

# First create a bridging table to convert from the CDS ID to the ACH ID
ach.to.cds <- samples.to.download %>%
  select(ach_id, cds_id = link.to.use, cell_line) %>%
  mutate(cds_id = extract_cds_id(cds_id))

# List the exon-filtered vcf files; the sample names are taken from the file
# names.
# NOTE(review): this listing must stay identical to the one used to build the
# merged vcf -- bcftools reheader assigns the new names by position, so the
# approach assumes both `find` calls return the files in the same order.
system(glue::glue("
cd {working_dir}/raw_data/new_ccle
find | grep 'exon' | grep '.vcf.gz' | grep -v '.tbi' > old.sample.names
"))

old.sample.names <- read.table(
  file.path(new_ccle_dir, "old.sample.names"),
  sep = "\t",
  header = FALSE,
  stringsAsFactors = FALSE
) %>%
  pull(V1) %>%
  extract_cds_id()

# Now convert the old sample names into the ACH IDs and write them out.
# IDs without an entry in the bridging table keep their old name; a warning
# is raised so that this does not go unnoticed.
id_index <- match(old.sample.names, ach.to.cds$cds_id)
unmatched <- is.na(id_index)
if (any(unmatched)) {
  warning(
    sum(unmatched), " sample name(s) have no ACH ID and were left unchanged: ",
    paste(old.sample.names[unmatched], collapse = ", "),
    call. = FALSE
  )
}
new.sample.names <- ifelse(
  unmatched,
  old.sample.names,
  as.character(ach.to.cds$ach_id[id_index])
)
write.table(
  new.sample.names,
  file.path(new_ccle_dir, "new.sample.names"),
  sep = "\t",
  col.names = FALSE,
  row.names = FALSE,
  quote = FALSE
)

# Rename the samples in the compiled vcf.gz file
system(glue::glue("
cd {working_dir}/raw_data/new_ccle
{working_dir}/software/bcftools/bcftools reheader --samples new.sample.names -o renamed.hg38.new.ccle.vcf.gz hg38.new.ccle.vcf.gz
"))




Variants at the same position are grouped together right now. So we should un-group them.

For example:

Current format: chr1 // pos123456789 // ref=A // alt=G/T

New format (two separate records): chr1 // pos123456789 // ref=A // alt=G and chr1 // pos123456789 // ref=A // alt=T

In [7]:
# Split multi-allelic records into one record per alternate allele, then
# index the result.
# -Oz is given explicitly so that the output really is bgzip-compressed:
# the file is named .vcf.gz and tabix can only index compressed files.
system(glue::glue("
cd {working_dir}/raw_data/new_ccle
{working_dir}/software/bcftools/bcftools norm -m - renamed.hg38.new.ccle.vcf.gz -Oz --threads {num.threads} -o split.hg38.new.ccle.vcf.gz
{working_dir}/software/tabix-0.2.6/tabix -p vcf split.hg38.new.ccle.vcf.gz
"))

In [8]:
# Keep only the variants that pass the 'MAF > 0.01' filter, then index the
# filtered file with tabix
maf_filter_cmd <- glue::glue("
cd {working_dir}/raw_data/new_ccle
{working_dir}/software/bcftools/bcftools view -i 'MAF > 0.01' --threads {num.threads} split.hg38.new.ccle.vcf.gz -Oz -o maf.split.hg38.new.ccle.vcf.gz
{working_dir}/software/tabix-0.2.6/tabix -p vcf maf.split.hg38.new.ccle.vcf.gz
")
system(maf_filter_cmd)

# Pre/Post-Imputation Server

For this project we are also interested in intronic SNPs. Unfortunately, only half of our samples have WGS data. To get around that problem, we filtered the original dataset so that it only includes exons. Now we can perform SNP imputation to uncover many more SNPs. The best way to do that is with the TOPMed Imputation Server.

Another option is to phase the data locally with Eagle2.4.

In [9]:
# The imputation server takes one file per chromosome, so write a separate
# vcf.gz for each of chr1..chr22 and chrX
chromosomes <- c(as.character(1:22), "X")

for (chr in chromosomes) {
    split_cmd <- glue::glue("
    cd {working_dir}/raw_data/new_ccle
    {working_dir}/software/bcftools/bcftools view -r chr{chr} maf.split.hg38.new.ccle.vcf.gz -Oz -o chr{chr}.hg38.new.ccle.vcf.gz;
")
    system(split_cmd)
}

Sadly, there is no way to interact with the TOPMed Imputation Server from the command line (that I know of). So we need to bring all of the data off the cloud, submit it to the server by hand, and then bring the imputed results back.

Access the imputation server at: https://imputation.biodatacatalyst.nhlbi.nih.gov/#!

In [None]:
# ---- Bring the data off the cloud ----
# Copy the per-chromosome files (chr*) from the raw-data directory to the
# workspace bucket
upload_cmd <- glue::glue("
cd {working_dir}/raw_data/new_ccle
gsutil cp chr* {workspace_bucket}
")
system(upload_cmd)

# ---- Bring the phased/imputed data back to the cloud ----
# NOTE(review): this pipes whatever the server returns for this URL straight
# into bash; it is only as trustworthy as that download link.
fetch_imputed_cmd <- glue::glue("
cd {working_dir}/raw_data/new_ccle
curl -sL https://imputation.biodatacatalyst.nhlbi.nih.gov/get/430843/3f3bb6adc564ab74d7915bc04fc0533b5abe70b53b2bdb6efe51f58b412b482f | bash")
system(fetch_imputed_cmd)


# Unpack the password-protected chr_<N>.zip archive of every chromosome
chromosomes <- c(as.character(1:22), "X")
for (chr in chromosomes) {
    unzip_cmd <- glue::glue("
    cd {working_dir}/raw_data/new_ccle
    unzip -P {imputation.server.password} chr_{chr}.zip")
    system(unzip_cmd)
}

In [None]:
# Merge all of the chromosomes back together into a single bcf file,
# then index that file.
# - The 'dose' files are collected with index files (.tbi/.csi) excluded and
#   put in natural chromosome order (chr1, chr2, ..., chr22, chrX) with
#   `sort -V`, so bcftools concat always receives them in the same order.
# - The output is written as compressed BCF (-Ob) because bcftools index
#   cannot index an uncompressed (-Ou) file.
system(glue::glue("
cd {working_dir}/raw_data/new_ccle
vcffiles=$(find . -name '*dose*' ! -name '*.tbi' ! -name '*.csi' | sort -V)
{working_dir}/software/bcftools/bcftools concat $vcffiles --threads {num.threads} -Ob -o imputed.hg38.ccle.new.bcf
{working_dir}/software/bcftools/bcftools index imputed.hg38.ccle.new.bcf --threads {num.threads} -c -f
"))

# Create a phased (but not imputed) dataset

RFMix requires phased data, but the fully imputed dataset is far too large and takes too long to process.
Instead, just create a phased version of the exon data without imputation. That should be good enough for ancestry inference.

In [14]:
# ---- Bring the phased data back to the cloud ----
# Download into a temporary phased_only directory.
# NOTE(review): this pipes whatever the server returns for this URL straight
# into bash; it is only as trustworthy as that download link.
fetch_phased_cmd <- glue::glue("
mkdir {working_dir}/raw_data/new_ccle/phased_only
cd {working_dir}/raw_data/new_ccle/phased_only
curl -sL https://imputation.biodatacatalyst.nhlbi.nih.gov/get/431074/27cb2de603dcd046c7485af0c24c21eefe1e874757f8c89f461150105469e88e | bash
")
system(fetch_phased_cmd)

# For every chromosome: unpack the password-protected archive, move the
# phased vcf.gz up one directory and index it there
chromosomes <- c(as.character(1:22), "X")
for (chr in chromosomes) {
    unpack_cmd <- glue::glue("
    cd {working_dir}/raw_data/new_ccle/phased_only
    unzip -P {phased.data.password} chr_{chr}.zip
    mv chr{chr}.phased.vcf.gz {working_dir}/raw_data/new_ccle/chr{chr}.phased.vcf.gz
    {working_dir}/software/tabix-0.2.6/tabix -p vcf {working_dir}/raw_data/new_ccle/chr{chr}.phased.vcf.gz
")
    system(unpack_cmd)
}

# Remove the temporary download directory
cleanup_cmd <- glue::glue("
rm -rf {working_dir}/raw_data/new_ccle/phased_only
")
system(cleanup_cmd)

# Notes

Some blocks of the genome were unphased because the mismatch rate is too high (I still need to figure out what this actually means). In most cases this is relatively unimportant since few regions of the genome are cut. But for the Ancestry analysis it ends up being a big deal since CLSPN is in one of these regions.

As such, I think it is better to use the dataset that was 'exon filtered' -> 'unphased/unimputed', rather than the fully phased/imputed dataset, at least for most analyses in this project. The phased/imputed data will still be useful for other analyses that I am doing.

I need to figure out why the error rate is so high. Maybe I need to include a pre-filtering step before giving the data to the imputation server.

Note: pre-filtering by low MAF significantly improves phasing accuracy.