The purpose of this notebook is to analyze the GWAS data for the ancestry-associated SNPs. Specifically, in this notebook I will try to identify cis-QTLs (which are most likely the result of the SNP-in-guide artifact).

# Set up the environment

In [23]:
#Arguments/Parameters

#Home directory for this workspace
working_dir <- "/home/jupyter/notebooks/Ancestry"
#Workspace bucket that the output will be uploaded to (read from the environment)
workspace_bucket <- Sys.getenv("WORKSPACE_BUCKET")
#Number of CPUs you want to use
num.threads <- 8

In [2]:
#Load packages
library(tidyverse)
library(genefu)
library(qqman)
library(ggtext)
library(biomaRt)

#Define functions
#Print a value and then flush the console so the message is not held back
#in an output buffer.
#  x: the value to print (passed straight to print()).
#Returns the result of flush.console(), i.e. NULL invisibly.
show_msg <- function(x) {
    print(x)
    utils::flush.console()
}

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.4     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.0.1     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Loading required package: survcomp

Loading required package: survival

Loading required package: prodlim

Loading required package: biomaRt

Loading required package: iC10

Loading required package: pamr

Loading required package: cluster

Loading required package: impute

Loading required package: iC10TrainingData

Loading required package: AIMS

Loadi

# Load in the data

First, we want to load in all of the data and merge it into a single data frame.

In [7]:
#Get a list of all of the GWAS output files
#The working directory is changed on purpose: the following cells read the
#GWAS files by their bare file name.
setwd(file.path(working_dir, "gwas", "gwas_output"))
#The pattern is a regular expression, so the dots are escaped and the match is
#anchored to the end of the name. This keeps only files ending in ".glm.linear"
#and skips side files whose names merely contain that text.
gwas.files = list.files(path = ".", pattern = "\\.glm\\.linear$")
gwas.files

In [8]:
#Load a single GWAS output file so its layout can be inspected
test = "gwas_out.WDR82.glm.linear" %>% read.table()

In [14]:
#Preview the first rows of the example file, then report its dimensions
test %>% head()
test %>% dim()

Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13
Unnamed: 0_level_1,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,1,69270,.,A,G,A,ADD,402,-0.0254363,0.0190201,-1.33733,0.181895,.
2,1,69428,.,T,G,G,ADD,402,-0.0727138,0.0799725,-0.909235,0.363789,.
3,1,69511,.,A,G,A,ADD,402,-0.0234028,0.0196899,-1.18857,0.235335,.
4,1,69761,.,A,T,T,ADD,402,-0.0757628,0.0656665,-1.15375,0.249311,.
5,1,69897,.,T,C,C,ADD,402,-0.00968426,0.020531,-0.471689,0.637413,.
6,1,926250,.,G,A,A,ADD,402,0.0347149,0.0333173,1.04195,0.298082,.


In [18]:
#Loop through all of the files, load them in, and extract the important information.
#Each file contributes one table with the columns gene, chr, pos, pval and fdr.
#The per-file tables are collected in a preallocated list and bound together
#once at the end; calling rbind() inside the loop would re-copy the whole
#accumulated table on every iteration.

pval.list = vector("list", length(gwas.files))
for(i in seq_along(gwas.files)){

    file = gwas.files[i]

    #print the status of the loop
    show_msg(file)

    #Load in the file and process it
    #Columns 1, 2 and 12 are kept and named chr, pos and pval.
    #The gene column holds the name of the file the row came from.
    #The fdr column is the Benjamini-Yekutieli adjusted p-value, computed
    #within each file.
    gwas.data = read.table(file) %>%
    dplyr::select(1, 2, 12) %>%
    dplyr::rename("chr" = 1, "pos" = 2, "pval" = 3) %>%
    mutate(gene = file, .before = chr) %>%
    mutate(fdr = p.adjust(pval, method = "BY", n = length(pval)))

    #store it so that everything can be bound together in one step
    pval.list[[i]] = gwas.data
}

#bind all of the per-file tables into the primary matrix
#(this gives NULL when there were no files, as before)
merged.pval.output = do.call(rbind, pval.list)

[1] "gwas_out.ADCY3.glm.linear"
[1] "gwas_out.BAALC.glm.linear"
[1] "gwas_out.C2orf80.glm.linear"
[1] "gwas_out.C7orf25.glm.linear"
[1] "gwas_out.CBFA2T2.glm.linear"
[1] "gwas_out.CCDC74B.glm.linear"
[1] "gwas_out.CHGB.glm.linear"
[1] "gwas_out.CLSPN.glm.linear"
[1] "gwas_out.CSDC2.glm.linear"
[1] "gwas_out.DEFB108B.glm.linear"
[1] "gwas_out.DGKA.glm.linear"
[1] "gwas_out.DND1.glm.linear"
[1] "gwas_out.ECE1.glm.linear"
[1] "gwas_out.GMDS.glm.linear"
[1] "gwas_out.GTPBP2.glm.linear"
[1] "gwas_out.INCENP.glm.linear"
[1] "gwas_out.KCNV1.glm.linear"
[1] "gwas_out.LIME1.glm.linear"
[1] "gwas_out.PKDREJ.glm.linear"
[1] "gwas_out.POP4.glm.linear"
[1] "gwas_out.QRICH2.glm.linear"
[1] "gwas_out.RESP18.glm.linear"
[1] "gwas_out.RHD.glm.linear"
[1] "gwas_out.RHOB.glm.linear"
[1] "gwas_out.RIMS2.glm.linear"
[1] "gwas_out.SLC2A4RG.glm.linear"
[1] "gwas_out.SPATA6L.glm.linear"
[1] "gwas_out.SRP14.glm.linear"
[1] "gwas_out.TNFRSF1B.glm.linear"
[1] "gwas_out.TNFRSF6B.glm.linear"
[1] "gwas_out.TOB1.glm

In [19]:
#Report the size of the merged table, then preview its first rows
merged.pval.output %>% dim()
merged.pval.output %>% head()

Unnamed: 0_level_0,gene,chr,pos,pval,fdr
Unnamed: 0_level_1,<chr>,<int>,<int>,<dbl>,<dbl>
1,gwas_out.ADCY3.glm.linear,1,69270,0.317436,1
2,gwas_out.ADCY3.glm.linear,1,69428,0.238294,1
3,gwas_out.ADCY3.glm.linear,1,69511,0.872524,1
4,gwas_out.ADCY3.glm.linear,1,69761,0.1601,1
5,gwas_out.ADCY3.glm.linear,1,69897,0.385175,1
6,gwas_out.ADCY3.glm.linear,1,926250,0.742348,1


# Get gene positions

The purpose of this analysis is to test whether the top SNP for each gene is a cis-QTL or a trans-QTL. The first step for this is to get the positions of each gene in the genome.

In [59]:
#Download a bridging file to convert the RefSeq ID to the HGNC symbol
#This bridging file was downloaded from genenames.org
setwd(file.path(working_dir, "raw_data", "new_ccle"))

#system() returns the exit status of the command (0 means success).
#It is checked so that a failed copy is not silently ignored.
copy.status = system("gsutil cp gs://fc-45c0e148-0b1c-4244-9bfc-feb559bbc514/refseq_to_hgnc.txt .")
if (copy.status != 0) {
    if (!file.exists("refseq_to_hgnc.txt")) {
        stop("gsutil could not copy refseq_to_hgnc.txt (exit status ", copy.status, ") and no local copy exists", call. = FALSE)
    }
    warning("gsutil could not copy refseq_to_hgnc.txt (exit status ", copy.status, "); using the existing local copy", call. = FALSE)
}

#Read the tab-separated table and keep only the two columns needed for the mapping.
#fill = TRUE pads rows that have fewer fields than the header.
#NOTE(review): read.table() treats ' and " as quote characters by default - confirm
#that no field in this file contains them.
bridging.file = read.table("refseq_to_hgnc.txt", sep = "\t", fill = TRUE, header = TRUE) %>%
dplyr::select(Approved.symbol, RefSeq.IDs)

head(bridging.file)

Unnamed: 0_level_0,Approved.symbol,RefSeq.IDs
Unnamed: 0_level_1,<chr>,<chr>
1,A1BG,NM_130786
2,A1BG-AS1,NR_015380
3,A1CF,NM_014576
4,A1S9T,
5,A2M,NM_000014
6,A2M-AS1,NR_026971


In [62]:
#Load in the exon positions and format the file correctly
#The first four columns are named chr, start, end and refseq.string.
#The gene column starts as the leading "<prefix>_<number>" part of refseq.string
#(e.g. NM_000299) and is then translated with the bridging file. IDs that are
#not found in the bridging file are left as they are.
setwd(paste0(working_dir, "/raw_data/new_ccle"))
exon.positions = "exon_positions" %>%
    read.table() %>%
    dplyr::rename(chr = V1, start = V2, end = V3, refseq.string = V4) %>%
    mutate(
        gene = refseq.string %>%
            stringr::str_extract("[^_]*_[^_]*") %>%
            plyr::mapvalues(
                from = bridging.file$RefSeq.IDs,
                to = bridging.file$Approved.symbol,
                warn_missing = FALSE
            )
    )

exon.positions %>% head()

Unnamed: 0_level_0,chr,start,end,refseq.string,V5,V6,gene
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>,<int>,<chr>,<chr>
1,chr1,201283702,201283904,NM_000299_cds_0_0_chr1_201283703_f,0,+,PKP1
2,chr1,201293941,201294045,NM_000299_cds_1_0_chr1_201293942_f,0,+,PKP1
3,chr1,201313165,201313560,NM_000299_cds_2_0_chr1_201313166_f,0,+,PKP1
4,chr1,201316552,201316697,NM_000299_cds_3_0_chr1_201316553_f,0,+,PKP1
5,chr1,201317571,201317779,NM_000299_cds_4_0_chr1_201317572_f,0,+,PKP1
6,chr1,201318617,201318795,NM_000299_cds_5_0_chr1_201318618_f,0,+,PKP1
