# Set up the environment

In [6]:
#Notebook-wide arguments/parameters

#Root folder that the software, raw data and analysis folders are addressed from
working_dir <- "/home/jupyter/notebooks/Ancestry"
#Workspace bucket, read from the WORKSPACE_BUCKET environment variable
workspace_bucket <- Sys.getenv("WORKSPACE_BUCKET")

In [7]:
#Set up the environment

#load packages
library(tidyverse)
library(reshape2)

#Define functions
#' Print a value and flush the console straight away
#'
#' @param x Object handed to `print()`.
#' @return The value of `flush.console()`.
show_msg <- function(x) {
  print(x)
  utils::flush.console()
}


In [None]:
#Download and unpack PLINK
#This may error if the install link changes. You may need to manually update it by searching for it here: https://www.cog-genomics.org/plink/2.0/
#`set -e` stops the script at the first failing step, so a failed mkdir/cd can no longer
#cause wget/unzip to run in whatever directory the shell happens to be in.
#`mkdir -p` also creates the parent software folder and does not fail when the folder already exists.
plink_status <- system(glue::glue("
set -e
mkdir -p {working_dir}/software/plink2
cd {working_dir}/software/plink2
wget https://s3.amazonaws.com/plink2-assets/plink2_linux_avx2_20210920.zip
unzip plink2_linux_avx2_20210920.zip
"))

#Surface a failed download/unpack instead of silently continuing
if (plink_status != 0) {
  warning("PLINK download/unpack exited with status ", plink_status, call. = FALSE)
}

In [3]:
#Download and install ADMIXTURE
#`set -e` stops the script at the first failing step, so a failed cd or download
#can no longer be followed by mv/rm running against the wrong directory.
admixture_status <- system(glue::glue("
set -e
cd {working_dir}/software
wget https://dalexander.github.io/admixture/binaries/admixture_linux-1.3.0.tar.gz
tar -xvzf admixture_linux-1.3.0.tar.gz
mv dist admixture
rm admixture_linux-1.3.0.tar.gz
"))

#Surface a failed download/install instead of silently continuing
if (admixture_status != 0) {
  warning("ADMIXTURE download/install exited with status ", admixture_status, call. = FALSE)
}

In [4]:
#Create directory structure

#Create the directory that holds the ADMIXTURE inputs and outputs.
#dir.create() with showWarnings = FALSE is a no-op when the directory already exists,
#which replaces the shell `if [ ! -d ... ]` test.
dir.create(file.path(working_dir, "admixture"), showWarnings = FALSE, recursive = TRUE)

# Prepare the data

The data were already downloaded and pre-processed in the New_Download_Process_CCLE_VCF notebook. We want the version that has split genotypes but has not been phased, so we simply transfer that data from the rfmix dataset to this notebook.

Also, use the data that has not undergone a MAF filter step. Instead, first pre-filter the 1000 genomes data to only include SNPs with a MAF > 0.01. The logic for this is that if we perform a MAF filtering step on the CCLE data, we are affecting rarer ancestry groups (AFR, AMR, SAS) more than common ancestry groups (EAS, EUR), since the rare ancestry groups only make up a minor subset of the data. If we filter the whole dataset by MAF, then we will clip off more AFR, AMR, and SAS SNPs than EAS and EUR SNPs.

ADMIXTURE requires the reference and the target data to contain the same (intersecting) set of SNPs. Use bcftools to calculate the intersection.

In [5]:
#Stage the per-chromosome 1000 genomes VCFs and their .tbi indexes in the admixture folder.
#The files come from the rfmix software folder and are copied unchanged.
#NOTE(review): no MAF filter is applied in this cell - confirm that the filtering happens upstream.
chromosomes <- seq(1, 22, by = 1)
for (chr in chromosomes) {
  copy_1kg_cmd <- glue::glue("
cp {working_dir}/software/rfmix/hg38/1kg.ogsamples.chr{chr}.vcf.gz {working_dir}/admixture
cp {working_dir}/software/rfmix/hg38/1kg.ogsamples.chr{chr}.vcf.gz.tbi {working_dir}/admixture
")
  system(copy_1kg_cmd)
}

In [6]:
#Copy the CCLE data to the same directory
#The CCLE calls are a single genome-wide VCF plus its index, so one copy is enough;
#the file name does not depend on the chromosome, so no per-chromosome loop is needed.
system(glue::glue("
cp {working_dir}/raw_data/new_ccle/split.hg38.new.ccle.vcf.gz {working_dir}/admixture
cp {working_dir}/raw_data/new_ccle/split.hg38.new.ccle.vcf.gz.tbi {working_dir}/admixture
"))

In [None]:
#Split the data into different chromosomes to speed things up
chromosomes <- c(seq(from = 1, to = 22, by = 1), "X")

for (chr in chromosomes) {
  #bcftools writes chr{chr}.hg38.new.ccle.vcf.gz into the admixture folder (the current directory),
  #so tabix must index that same file; the per-chromosome file does not exist under raw_data/new_ccle.
  system(glue::glue("
cd {working_dir}/admixture
{working_dir}/software/bcftools/bcftools view -r chr{chr} split.hg38.new.ccle.vcf.gz -Oz -o chr{chr}.hg38.new.ccle.vcf.gz
{working_dir}/software/tabix-0.2.6/tabix -p vcf chr{chr}.hg38.new.ccle.vcf.gz
"))
}



In [None]:
#Now calculate the intersection between the ccle variant calls and the 1kg variant calls
#For every autosome, bcftools isec writes the records present in both inputs (-n=2) to
#isec_chr{chr}/0000.vcf.gz (from the 1kg file) and isec_chr{chr}/0001.vcf.gz (from the CCLE file).
chromosomes <- seq(from = 1, to = 22, by = 1)

for (chr in chromosomes) {
  system(glue::glue("
cd {working_dir}/admixture
{working_dir}/software/bcftools/bcftools isec -p {working_dir}/admixture/isec_chr{chr} -n=2 -Oz 1kg.ogsamples.chr{chr}.vcf.gz chr{chr}.hg38.new.ccle.vcf.gz 
"))
}

In [None]:
#Merge all of the outputs back together into a single dataset

#Number of threads handed to bcftools; only set here when it has not been defined earlier
if (!exists("num.threads")) {
  num.threads <- max(1L, parallel::detectCores(), na.rm = TRUE)
}

#Build the per-chromosome file lists in R. system() runs the command through /bin/sh, which
#is not guaranteed to support {1..22} brace expansion, and glue would try to evaluate the braces.
reference_parts <- paste0("isec_chr", 1:22, "/0000.vcf.gz", collapse = " ")
target_parts <- paste0("isec_chr", 1:22, "/0001.vcf.gz", collapse = " ")

#The two concatenations run in parallel; `wait` keeps system() from returning
#before the backgrounded reference merge has finished writing its output.
system(glue::glue("
cd {working_dir}/admixture
{working_dir}/software/bcftools/bcftools concat {reference_parts} --threads {num.threads} -Oz -o merged.1kg.admixture.reference.vcf.gz &
{working_dir}/software/bcftools/bcftools concat {target_parts} --threads {num.threads} -Oz -o merged.ccle.admixture.target.vcf.gz
wait
"))

In [14]:
#Convert to plink format
#The reference conversion runs in the background while the target conversion runs in the foreground.
#`wait` blocks until the background job is done, so the reference .bed/.bim/.fam files are
#complete before system() returns and later cells read them.
system(glue::glue("
cd {working_dir}/admixture
{working_dir}/software/plink2/plink2 --vcf merged.1kg.admixture.reference.vcf.gz --make-bed --out merged.1kg.admixture.reference &
{working_dir}/software/plink2/plink2 --vcf merged.ccle.admixture.target.vcf.gz --make-bed --out merged.ccle.admixture.target
wait
"))

# Run ADMIXTURE!

We want to run ADMIXTURE in supervised mode, so we first need to create a sample annotation (population) file.

In [20]:
#Extract the sample names from the vcf file 
system(glue::glue("
cd {working_dir}/admixture
{working_dir}/software/bcftools/bcftools query -l merged.1kg.admixture.reference.vcf.gz > 1kg.sample.order
"))

#Work inside the admixture folder; the relative file names below depend on it.
#Set once here (the notebook's working directory stays in the admixture folder afterwards).
setwd(file.path(working_dir, "admixture"))

#Load in the sample names, in the order reported by bcftools query -l
sample.names <- read.table("1kg.sample.order", sep = "\t") %>%
  pull(1) %>%
  as.character()

#Load in the sample annotation file (column 1 = sample name, column 2 = ancestry label)
sample.map <- read.table("1kg_sample_map", sep = "\t") %>%
  rename("name" = 1, "ancestry" = 2)

#Create the popfile: one ancestry label per sample, in the same order as sample.names
pop.file <- as.character(sample.map$ancestry)[
  match(sample.names, as.character(sample.map$name))
]

#Samples that are missing from the annotation file get "-", which ADMIXTURE treats as
#unknown ancestry, instead of having their own sample ID written as a population label.
unmapped <- is.na(pop.file)
if (any(unmapped)) {
  warning(
    sum(unmapped), " sample(s) have no entry in 1kg_sample_map; writing '-' for them",
    call. = FALSE
  )
  pop.file[unmapped] <- "-"
}

write.table(pop.file, "merged.1kg.admixture.reference.pop", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)

Now we can actually run ADMIXTURE

In [23]:
#Fit ADMIXTURE on the reference panel first (supervised, with cross-validation, 60 threads)

#Pipe target. Kept in its own variable and spliced into the command by glue below,
#so that the syntax of the command string works.
code.chunk <- "tee log5.out"

reference_fit_cmd <- glue::glue("
cd {working_dir}/admixture
{working_dir}/software/admixture/admixture_linux-1.3.0/admixture --cv --supervised -j60 merged.1kg.admixture.reference.bed 5 | {code.chunk}
")
system(reference_fit_cmd)

In [24]:
#Now project the CCLE samples onto the reference panel
#(the reference files used here are the merged 1kg data fitted in the previous step)
#The .P file produced by the reference run is copied to <target>.5.P.in, and ADMIXTURE is
#then run on the CCLE target with -P and 30 threads.
#NOTE(review): -P is presumably ADMIXTURE's projection mode, which keeps the copied
#allele frequencies fixed - confirm against the ADMIXTURE manual.
system(glue::glue("
cd {working_dir}/admixture
cp merged.1kg.admixture.reference.5.P merged.ccle.admixture.target.5.P.in
{working_dir}/software/admixture/admixture_linux-1.3.0/admixture -j30 -P merged.ccle.admixture.target.bed 5
"))