-
Notifications
You must be signed in to change notification settings - Fork 0
/
combine_v8colocs_sqtl.R
122 lines (103 loc) · 4.16 KB
/
combine_v8colocs_sqtl.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
library(data.table)
library(tidyverse)
# some functions
"%&%" <- function(a,b) paste(a,b,sep="")
cf <- function(method, x) method %&% x
smrf <- function(x) {
"%&%" <- function(a,b) paste(a,b,sep="")
ll = unlist(strsplit(x, ":"))
ll[1] = gsub("chr","",ll[1])
intron = "intron_" %&% ll[1] %&% "_" %&% ll[2] %&% "_" %&% ll[3]
return(intron)
}
# set working directory
wd <- "/scratch/users/raoa/consortium"
setwd(wd)
# these are downloaded from Zenodo and Google Drive
multi <- "/sqtl_smultixcan" # performed across tissues, so same across all dbs
predix <- "/sqtl_spredixcan"
enloc <- "/results_sqtl_eur_enloc"
smr <- "/smr_gtex_v8_sqtl_results"
# coloc <- "/results_enloc_priors"
combined_output <- "/combined/sqtl"
if (!file.exists(combined_output)) {
system("mkdir combined")
system("mkdir combined/sqtl")
}
# read in v8 traits and tissues
traits <- fread(wd %&% "/traits.txt", data.table = F, header = F)
traits <- traits$V1
tissues <- fread(wd %&% "/tissues.txt", data.table = F, header = F)
tissues <- tissues$V1
# make temp files and have a separate python script put all of them into a db using os.listdir()
for (tissue in tissues) {
for (trait in traits) {
to_combine = vector()
if (file.exists(wd %&% combined_output %&% "/" %&% trait %&% "__PM__" %&% tissue %&% "_combined.csv")) {
next
}
# read in smultixcan
mfile <- wd %&% multi %&% '/' %&% trait %&% "_smultixcan_imputed_gwas_gtexv8mashr_ccn30.txt"
if (file.exists(mfile)) {
m <- fread(mfile, data.table = F, header = T)
colnames(m) = sapply(colnames(m), cf, method = "smultix_")
colnames(m)[1] = "gene" # column to merge on
to_combine = append(to_combine, "m")
}
# read in spredixcan
pfile <- wd %&% predix %&% '/spredixcan_igwas_gtexmashrv8_' %&% trait %&% "__PM__" %&% tissue %&% ".csv"
if (file.exists(pfile)) {
p <- fread(pfile, data.table = F, header = T)
colnames(p) = sapply(colnames(p), cf, method = "spredix_")
colnames(p)[1] = "gene" # column to merge on
to_combine = append(to_combine, "p")
}
# read in enloc
efile <- wd %&% enloc %&% '/' %&% trait %&% "__PM__" %&% tissue %&% ".enloc.rst.gz"
if (file.exists(efile)) {
e <- fread(efile, data.table = F, header = T) # read in gzip file
colnames(e) = sapply(colnames(e), cf, method = "enloc_")
colnames(e)[2] = "gene" # column to merge on
to_combine = append(to_combine, "e")
}
# read in smr
# probeID chr1:1090428:1091472:clu_35118:ENSG00000131591.17
sfile <- wd %&% smr %&% '/' %&% tissue %&% "/" %&% trait %&% "_" %&% tissue %&% ".smr"
if (file.exists(sfile)) {
s <- fread(sfile, data.table = F, header = T) # read in gzip file
colnames(s) = sapply(colnames(s), cf, method = "smr_")
s <- try(s %>% rowwise %>% mutate(intron = smrf(smr_probeID)))
if (class(s) == "try-error") break
colnames(s)[which(colnames(s) == "intron")] = "gene" # column to merge on
to_combine = append(to_combine, "s")
}
# read in coloc enloc priors
#cfile <- wd %&% coloc %&% '/' %&% trait %&% "__PM__" %&% tissue %&% ".txt.gz"
#if (file.exists(cfile)) {
# c <- fread(cfile, data.table = F, header = T) # read in gzip file
# colnames(c) = sapply(colnames(c), cf, method = "coloc_")
# colnames(c)[1] = "gene" # column to merge on
# to_combine = append(to_combine, "c")
#}
# combine dataframes (Reduce is more efficient, but issue with row numbers)
if (length(to_combine) == 1) {
cmd <- "merged <- " %&% to_combine[1]
} else if (length(to_combine) > 1) {
cmd <- "merge(" %&% to_combine[1] %&% ", " %&% to_combine[2] %&% ", by = \"gene\", all = TRUE)"
if (length(to_combine) > 2) {
for (i in 3:length(to_combine)) {
cmd <- "merge(" %&% cmd %&% ", " %&% to_combine[i] %&% ", by = \"gene\", all = TRUE)"
}
}
cmd <- "merged <- " %&% cmd
} else if (length(to_combine) == 0) {
#next
}
eval(parse(text = cmd))
# remove rows with all NAs in rows other than gene name
merged <- merged[rowSums(is.na(merged[ , 2:ncol(merged)])) < ncol(merged) - 1, ]
# write merged dataframe to intermediate file
write.table(merged, wd %&% combined_output %&% "/" %&% trait %&% "__PM__" %&% tissue %&% "_combined.csv", sep=",", row.names = F, col.names = T)
cat(tissue, trait, "DONE\n")
}
}