The purpose of this notebook is to phase genotypes using Eagle. The reason we want to do this is because we want to run RFMix to impute the ancestry fraction of unknown cell lines.

# Set up the environment and install all of the software

In [1]:
#Arguments/Parameters

working_dir = "/home/jupyter/Ancestry"
workspace_bucket = Sys.getenv('WORKSPACE_BUCKET')
num.threads = 64 #This is the number of threads that will be used to phase the genotypes

In [2]:
#Set up the environment

#load packages
library(tidyverse)
library(reshape2)

#Define functions
show_msg <- function(x){ 
    print(x)
    flush.console()
}

#Make directories
system(glue::glue("
cd {working_dir}

#Create the directory to store the liftover files
if [ ! -d '{working_dir}/LiftOver' ] 
then
mkdir LiftOver
fi

if [ ! -d '{working_dir}/LiftOver/hg38_to_hg19' ] 
then
mkdir LiftOver/hg38_to_hg19
fi

if [ ! -d '{working_dir}/rfmix_output' ]
then
mkdir {working_dir}/rfmix_output
fi
"))



“running command 'timedatectl' had status 1”
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.1     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘reshape2’


The following object is masked from ‘package:tidyr’:

    smiths




In [25]:
#Download and install software

#Download and install Samtools
system(glue::glue("
cd {working_dir}/software
wget https://github.com/samtools/samtools/releases/download/1.9/samtools-1.9.tar.bz2
tar -vxjf samtools-1.9.tar.bz2
cd samtools-1.9
make
cd {working_dir}/software
rm samtools-1.9.tar.bz2
"))

#Download and install Bcftools


#Set the bcftools plugin path and add bcftools to PATH
system(glue::glue("
export PATH=$PATH:{working_dir}/software/bcftools
export BCFTOOLS_PLUGINS='/home/jupyter/notebooks/Ancestry/software/bcftools/plugins'
"))


#Download and install tabix


Install RFMix and process all of the dependencies

In [None]:
#Download and instal RFMix
#I first installed RFMixv2 on the UGER cluster, then zipped and transferred the directory over to the google bucket for this project
system(glue::glue("
cd {working_dir}/software

#Create the directory to store the rfmix files
if [ ! -d '{working_dir}/software/rfmix' ] 
then
mkdir {working_dir}/software/rfmix
cd {working_dir}/software/rfmix

#Download the rfmix script
gsutil cp gs://fc-45c0e148-0b1c-4244-9bfc-feb559bbc514/rfmix.zip .
unzip rfmix.zip
rm rfmix.zip

"))


Also install and process the reference panel for the hg38 data.

The reference panel is from: http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/

The genetic map is from a previous install of Eagle v2.4.1. So I just copied it over to the new dir.
https://alkesgroup.broadinstitute.org/Eagle/#x1-250005.1.2

The sample map was already downloaded for the hg19 version, so I just used the same one

In [5]:
#Download the hg38 1000 genomes VCF files and their index files


#Download the reference panel
system(glue::glue("
if [ ! -d '{working_dir}/software/rfmix/hg38' ]
then
mkdir {working_dir}/software/rfmix/hg38
fi
"))

chromosomes = seq(from = 1, to = 22, by = 1)
for(chr in chromosomes){
system(glue::glue("
cd {working_dir}/software/rfmix/hg38
wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr{chr}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz &
wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr{chr}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz.tbi &
"))
}


#Download the genetic map
system(glue::glue("
cp {working_dir}/software/Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt {working_dir}/software/rfmix/hg38/genetic_map_hg38_withX.txt
"))


#Format the genetic map
#Load in the data and format it nicely
paste(working_dir, "/software/rfmix/hg38", sep = "") %>% setwd()
genetic.map = read.table("genetic_map_hg38_withX.txt", sep = " ", header = T) %>%
rename("chr" = 1, "pos" = 2, "combined" = 3, "genetic_pos" = 4) %>%
select(chr, pos, genetic_pos) %>%
mutate(chr = paste("chr", chr, sep = ""))

#Get a list of all of the chromosomes
unique.chromosomes = genetic.map %>% pull(chr) %>% unique()

#Loop through all of the chromosomes, format the data and then write the output file.
for(chrom in unique.chromosomes){
    
    isolated.chromosome = genetic.map %>%
    filter(chr %in% chrom) %>%
    mutate(genetic_pos = format(genetic_pos, scientific=F)) %>%
    mutate(genetic_pos = as.numeric(genetic_pos)) %>%
    arrange(genetic_pos)
    
    file.name = paste(chrom, "_genetic_map.txt", sep = "")
    
    write.table(isolated.chromosome, file.name, sep = "\t", col.names = T, row.names = F, quote = F)

}

#Download the sample map
#the sample map is a two column text file. the first column is the sample names. the second column is the ancestry group.
system(glue::glue("
cd {working_dir}/software/rfmix/hg38
gsutil cp {workspace_bucket}/1kg_sample_map .
"))

ERROR: Error in filter(., chr %in% chrom): object '*tmp*' not found


In [None]:
#Recode the chromosome names in the reference panel from "8" to "chr8"

chromosomes = seq(from = 1, to = 22, by = 1)

for(chr in chromosomes){
    
system(glue::glue("
cd {working_dir}/software/rfmix/hg38
{working_dir}/software/bcftools/bcftools annotate ALL.chr{chr}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz --rename-chrs {working_dir}/LiftOver/hg19_to_hg38/hg19_to_hg38_chr_bridge -Oz -o 1kg_chr{chr}.recodeChr.vcf.gz
{working_dir}/software/tabix-0.2.6/tabix -p vcf 1kg_chr{chr}.recodeChr.vcf.gz

"))
}

In [5]:
#Remove unwanted samples from the reference panel.
#The unwanted samples are related, so they are probably going to skew our data.

code.chunk ='{print $1}'
system(glue::glue("
cd {working_dir}/software/rfmix/hg38
awk '{code.chunk}' 1kg_sample_map > 1kg.samples.to.keep
"))


#Now filter all of the bcf files so that they only contain these samples
chromosomes = seq(from = 22, to = 1, by = -1)

for(chr in chromosomes){
system(glue::glue("
cd {working_dir}/software/rfmix/hg38
/home/jupyter/notebooks/Ancestry/software/bcftools/bcftools view -S 1kg.samples.to.keep --force-samples 1kg_chr{chr}.recodeChr.vcf.gz -o 1kg.ogsamples.chr{chr}.vcf.gz -Oz;
"))
}


# Pre-process the CCLE VCFs

Aaron has done most of the processing for this VCF file. What was done is:

-Take the SNP6 birdseed files and convert to VCF
-Phase/Impute with the topmed reference panel
-Filter with a MAF cutoff of 1%
-re-header to ACH ID
-split multi-allelic sites


To make RFMix run a little bit faster, the final thing we want to do is split it into different chromosomes so that we can run each chromosome on its own CPU.

In [4]:
#Split the vcf file into each chromosome

cd {working_dir}/rfmix

for num in {1..22};
do
{working_dir}/software/bcftools/bcftools view split.all_chroms.maf.subset.reheader.vcf.gz --regions chr$num -Oz -o chr$num.split.all_chroms.maf.subset.reheader.vcf.gz
done

{working_dir}/software/bcftools/bcftools view split.all_chroms.maf.subset.reheader.vcf.gz --regions chrX -Oz -o chrX.split.all_chroms.maf.subset.reheader.vcf.gz
{working_dir}/software/bcftools/bcftools view split.all_chroms.maf.subset.reheader.vcf.gz --regions chrY -Oz -o chrY.split.all_chroms.maf.subset.reheader.vcf.gz


ERROR: Error in parse(text = text, keep.source = FALSE): <text>:1:3: unexpected numeric constant
1: 1..22
      ^


# Run RFMix 

Run RFMix to calculate the ancestry fractions for all of the cell lines.

We are running this on a big machine right now, but it looks like rfmix is only using one core, and really not that much memory. So in the future we should probabl try to modify the code to either take advantage of multithreading or just plan to run this on a smaller machine to save a bit of $.

In [2]:
#chromosomes = seq(from = 22, to = 1, by = -1)
chromosomes = c(2, 3, 5, 6, 7, 8, 9, 10, 11, 12, "X")

for(chr in chromosomes){
    
system(glue::glue("
cd {working_dir}/rfmix
{working_dir}/software/rfmix/rfmix/rfmix -f chr{chr}.split.all_chroms.maf.subset.reheader.vcf.gz -r {working_dir}/software/rfmix/hg38/1kg_chr{chr}.recodeChr.vcf.gz -m {working_dir}/software/rfmix/hg38/1kg_sample_map -g {working_dir}/software/rfmix/hg38/chr{chr}_genetic_map.txt -o chr{chr}.rfmix.output --chromosome=chr{chr}
"))
    
}

In [None]:
#temp re-run some chromosomes
system(glue::glue("
cd {working_dir}/rfmix
{working_dir}/software/rfmix/rfmix/rfmix -f chr2.split.all_chroms.maf.subset.reheader.vcf.gz -r {working_dir}/software/rfmix/hg38/1kg_chr2.recodeChr.vcf.gz -m {working_dir}/software/rfmix/hg38/1kg_sample_map -g {working_dir}/software/rfmix/hg38/chr2_genetic_map.txt -o chr2.rfmix.output --chromosome=chr2 
{working_dir}/software/rfmix/rfmix/rfmix -f chr3.split.all_chroms.maf.subset.reheader.vcf.gz -r {working_dir}/software/rfmix/hg38/1kg_chr3.recodeChr.vcf.gz -m {working_dir}/software/rfmix/hg38/1kg_sample_map -g {working_dir}/software/rfmix/hg38/chr3_genetic_map.txt -o chr3.rfmix.output --chromosome=chr3 
{working_dir}/software/rfmix/rfmix/rfmix -f chrX.split.all_chroms.maf.subset.reheader.vcf.gz -r {working_dir}/software/rfmix/hg38/1kg_chrX.recodeChr.vcf.gz -m {working_dir}/software/rfmix/hg38/1kg_sample_map -g {working_dir}/software/rfmix/hg38/chrX_genetic_map.txt -o chrX.rfmix.output --chromosome=chrX 
{working_dir}/software/rfmix/rfmix/rfmix -f chr5.split.all_chroms.maf.subset.reheader.vcf.gz -r {working_dir}/software/rfmix/hg38/1kg_chr5.recodeChr.vcf.gz -m {working_dir}/software/rfmix/hg38/1kg_sample_map -g {working_dir}/software/rfmix/hg38/chr5_genetic_map.txt -o chr5.rfmix.output --chromosome=chr5 
{working_dir}/software/rfmix/rfmix/rfmix -f chr6.split.all_chroms.maf.subset.reheader.vcf.gz -r {working_dir}/software/rfmix/hg38/1kg_chr6.recodeChr.vcf.gz -m {working_dir}/software/rfmix/hg38/1kg_sample_map -g {working_dir}/software/rfmix/hg38/chr6_genetic_map.txt -o chr6.rfmix.output --chromosome=chr6 
{working_dir}/software/rfmix/rfmix/rfmix -f chr7.split.all_chroms.maf.subset.reheader.vcf.gz -r {working_dir}/software/rfmix/hg38/1kg_chr7.recodeChr.vcf.gz -m {working_dir}/software/rfmix/hg38/1kg_sample_map -g {working_dir}/software/rfmix/hg38/chr7_genetic_map.txt -o chr7.rfmix.output --chromosome=chr7 
{working_dir}/software/rfmix/rfmix/rfmix -f chr8.split.all_chroms.maf.subset.reheader.vcf.gz -r {working_dir}/software/rfmix/hg38/1kg_chr8.recodeChr.vcf.gz -m {working_dir}/software/rfmix/hg38/1kg_sample_map -g {working_dir}/software/rfmix/hg38/chr8_genetic_map.txt -o chr8.rfmix.output --chromosome=chr8 
{working_dir}/software/rfmix/rfmix/rfmix -f chr9.split.all_chroms.maf.subset.reheader.vcf.gz -r {working_dir}/software/rfmix/hg38/1kg_chr9.recodeChr.vcf.gz -m {working_dir}/software/rfmix/hg38/1kg_sample_map -g {working_dir}/software/rfmix/hg38/chr9_genetic_map.txt -o chr9.rfmix.output --chromosome=chr9 
{working_dir}/software/rfmix/rfmix/rfmix -f chr10.split.all_chroms.maf.subset.reheader.vcf.gz -r {working_dir}/software/rfmix/hg38/1kg_chr10.recodeChr.vcf.gz -m {working_dir}/software/rfmix/hg38/1kg_sample_map -g {working_dir}/software/rfmix/hg38/chr10_genetic_map.txt -o chr10.rfmix.output --chromosome=chr10 
{working_dir}/software/rfmix/rfmix/rfmix -f chr11.split.all_chroms.maf.subset.reheader.vcf.gz -r {working_dir}/software/rfmix/hg38/1kg_chr11.recodeChr.vcf.gz -m {working_dir}/software/rfmix/hg38/1kg_sample_map -g {working_dir}/software/rfmix/hg38/chr11_genetic_map.txt -o chr11.rfmix.output --chromosome=chr11 
{working_dir}/software/rfmix/rfmix/rfmix -f chr12.split.all_chroms.maf.subset.reheader.vcf.gz -r {working_dir}/software/rfmix/hg38/1kg_chr12.recodeChr.vcf.gz -m {working_dir}/software/rfmix/hg38/1kg_sample_map -g {working_dir}/software/rfmix/hg38/chr12_genetic_map.txt -o chr12.rfmix.output --chromosome=chr12
"))

setwd('/home/jupyter/Ancestry/rfmix')
finished.files = list.files(path = ".", pattern = ".Q")
while(length(finished.files) != 23){
    Sys.sleep(5)
}
show_msg("All files are now complete!")
finished.files