-
Notifications
You must be signed in to change notification settings - Fork 2
/
bioplex.R
313 lines (285 loc) · 11.9 KB
/
bioplex.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
############################################################
#
# author: Ludwig Geistlinger
# date: 2021-02-26 08:22:09
#
# descr: obtain BioPlex PPIs
#
############################################################
#' @title Obtain BioPlex protein-protein interaction data
#' @description Functionality for retrieving the BioPlex protein-protein
#' interaction data. Available networks include:
#' \itemize{
#' \item BioPlex 293T cells (versions 1.0, 2.0, and 3.0),
#' \item BioPlex HCT116 cells (version 1.0).
#' } See references.
#' @param cell.line character. Valid options include:
#' \itemize{
#' \item \code{"293T"}: derivative of human embryonic kidney 293 cell line,
#' \item \code{"HCT116"}: human colon cancer cell line 116.
#' }
#' Defaults to \code{"293T"}.
#' @param version character. Valid options include \code{"1.0"}, \code{"2.0"},
#' and \code{"3.0"} for 293T cells. For HCT116 cells, only \code{"1.0"} is
#' available.
#' Defaults to \code{"3.0"}.
#' @param remap.uniprot.ids logical. Should the protein-to-gene mappings from BioPlex
#' (i.e. UNIPROT-to-SYMBOL and UNIPROT-to-ENTREZID) be updated using Bioc annotation
#' functionality?
#' Defaults to \code{FALSE} which will then keep the mappings provided by BioPlex.
#' @param cache logical. Should a locally cached version used if available?
#' Defaults to \code{TRUE}.
#' @return A \code{data.frame}.
#' @references BioPlex: \url{https://bioplex.hms.harvard.edu/interactions.php}
#' @examples
#' # (1) Obtain the latest version of the 293T PPI network
#' bp.293t <- getBioPlex(cell.line = "293T", version = "3.0")
#'
#' # (2) Obtain the latest version of the HCT116 PPI network
#' bp.hct116 <- getBioPlex(cell.line = "HCT116", version = "1.0")
#' @importFrom utils read.delim
#' @export
getBioPlex <- function(cell.line = c("293T", "HCT116"),
version = c("3.0", "1.0", "2.0"),
remap.uniprot.ids = FALSE,
cache = TRUE)
{
bioplex.url <- "https://bioplex.hms.harvard.edu/data/BioPlex_"
cell.line <- match.arg(cell.line)
version <- match.arg(version)
# we only have version 1.0 for HCT116 cells currently
if(cell.line == "HCT116") version <- "1.0"
clver <- paste(cell.line, version, sep = ".")
rname <- paste("bioplex", clver, sep = ".")
# should a cache version be used?
if(cache) ppi.file <- .getResourceFromCache2(rname)
if(!cache || is.null(ppi.file))
{
# get the data
file.ext <- switch(clver,
`293T.1.0` = "interactionList_v2",
`293T.2.0` = "interactionList_v4a",
`293T.3.0` = "293T_Network_10K_Dec_2019",
`HCT116.1.0` = "HCT116_Network_5.5K_Dec_2019",
`293T.1.0d` = "1.0_293T_" )
file.ext <- paste(file.ext, "tsv", sep = ".")
ppi.file <- paste0(bioplex.url, file.ext)
ppi.file <- .cacheResource2(rname, ppi.file)
}
bioplex <- read.delim(ppi.file)
colnames(bioplex) <- c("GeneA", "GeneB",
"UniprotA", "UniprotB",
"SymbolA", "SymbolB",
"pW", "pNI", "pInt")
bioplex$GeneA <- as.character(bioplex$GeneA)
bioplex$GeneB <- as.character(bioplex$GeneB)
# remap gene ids
if(remap.uniprot.ids) bioplex <- .remapUniprotIdsBP(bioplex)
# clean up & cache
return(bioplex)
}
#' @title Representation of BioPlex PPIs in a graph data structure
#' @description Representation of BioPlex PPIs in a \code{graphNEL} object
#' from the \code{graph} package.
#' @param bioplex.df a \code{data.frame} storing the Bioplex PPIs in a flat
#' from-to format. Typically obtained via \code{\link{getBioPlex}}.
#' @return An object of class \code{graphNEL}.
#' @references BioPlex: \url{https://bioplex.hms.harvard.edu/interactions.php}
#' @seealso \code{\link{getBioPlex}}, \code{\link{ftM2graphNEL}}
#' @examples
#' # (1) Obtain the latest version of the 293T PPI network
#' bp.293t <- getBioPlex(cell.line = "293T", version = "3.0")
#'
#' # (2) Turn the data into a graph
#' bp.gr <- bioplex2graph(bp.293t)
#'
#' @export
bioplex2graph <- function(bioplex.df)
{
stopifnot(is.data.frame(bioplex.df))
node.cols <- paste0("Uniprot", c("A", "B"))
ftm <- as.matrix(bioplex.df[,node.cols])
ftm <- sub("-[0-9]+$", "", ftm)
ind <- !duplicated(ftm)
ftm <- ftm[ind,]
bioplex.df <- bioplex.df[ind,]
gr <- graph::ftM2graphNEL(ftm, edgemode = "directed")
gr <- .annotateBioplexGraph(gr, bioplex.df)
return(gr)
}
#' @title Annotate PFAM domains to BioPlex PPI graph
#' @description This function adds PFAM domain annotations to the node metadata
#' of the BioPlex PPI graph.
#' @param bp.gr an object of class \code{\linkS4class{graph}} storing the
#' BioPlex PPIs. Typically obtained via \code{\link{bioplex2graph}}.
#' @param orgdb an \code{orgdb} object storing annotation data for human.
#' @return An object of class \code{graphNEL} containing PFAM domain annotations
#' in the \code{nodeData}.
#' @references
#' BioPlex: \url{https://bioplex.hms.harvard.edu/interactions.php}
#'
#' PFAM: \url{http://pfam.xfam.org}
#' @seealso \code{\link{nodeData}}
#' @examples
#' # (1) Obtain the latest version of the 293T PPI network
#' bp.293t <- getBioPlex(cell.line = "293T", version = "3.0")
#'
#' # (2) Turn the data into a graph
#' bp.gr <- bioplex2graph(bp.293t)
#'
#' # (3) Obtain orgdb package from AnnotationHub
#' ah <- AnnotationHub::AnnotationHub()
#' orgdb <- AnnotationHub::query(ah, c("orgDb", "Homo sapiens"))
#' orgdb <- orgdb[[1]]
#'
#' # (4) Annotate PFAM domains
#' bp.gr <- annotatePFAM(bp.gr, orgdb)
#'
#' @export
annotatePFAM <- function(bp.gr, orgdb)
{
up2pfam <- suppressMessages(AnnotationDbi::mapIds(orgdb,
keys = graph::nodes(bp.gr),
keytype = "UNIPROT",
column = "PFAM",
multiVals = "list"))
graph::nodeDataDefaults(bp.gr, "PFAM") <- NA
graph::nodeData(bp.gr, graph::nodes(bp.gr), "PFAM") <- up2pfam
return(bp.gr)
}
# we will want to annotate these somewhat
# ‘edgeData’: An ‘attrData’ instance for edge attributes.
#
# ‘nodeData’: An ‘attrData’ instance for node attributes.
#
# ‘graphData’: A ‘list’ for graph-level attributes. Only mandatory
# list item is ‘edgemode’ which indicates whether edges are
# ‘"directed"’ or ‘"undirected"’
#
.annotateBioplexGraph <- function(gr, df)
{
# create maps to annotate node data:
# 1. uniprot -> symbol
# 2. uniprot -> entrez
ucols <- paste0("Uniprot", c("A", "B"))
scols <- paste0("Symbol", c("A", "B"))
ecols <- paste0("Gene", c("A", "B"))
iids <- unlist(df[,ucols])
sids <- unlist(df[,scols])
eids <- unlist(df[,ecols])
ind <- !duplicated(iids)
iids <- iids[ind]
uids <- sub("-[0-9]+$", "", iids)
up2sym <- sids[ind]
up2eg <- eids[ind]
names(iids) <- names(up2eg) <- names(up2sym) <- uids
# graph data annotation
# when starting off with a ordinary dfs we'll be losing the ability
# to annotate graph-level annotation such as cell.line, version, PMID, ...
# we might need to work with DataFrames where we have mcols and metadata
# node data annotation
graph::nodeDataDefaults(gr, "ENTREZID") <- NA
graph::nodeData(gr, graph::nodes(gr), "ENTREZID") <- up2eg[graph::nodes(gr)]
graph::nodeDataDefaults(gr, "SYMBOL") <- NA
graph::nodeData(gr, graph::nodes(gr), "SYMBOL") <- up2sym[graph::nodes(gr)]
graph::nodeDataDefaults(gr, "ISOFORM") <- NA
graph::nodeData(gr, graph::nodes(gr), "ISOFORM") <- iids[graph::nodes(gr)]
# edge data annotation
for(col in ucols) df[,col] <- sub("-[0-9]+$", "", df[,col])
for(col in c("pW", "pNI", "pInt"))
{
graph::edgeDataDefaults(gr, col) <- numeric(0L)
graph::edgeData(gr, df[,ucols[1]], df[,ucols[2]], col) <- df[,col]
}
return(gr)
}
#' @title Map experimental data onto a graph
#' @description Functionality for mapping experimental data stored in
#' a \code{\linkS4class{SummarizedExperiment}} onto a
#' \code{\linkS4class{graph}} object.
#' @param gr an object of class \code{\linkS4class{graph}}.
#' @param se an object of class \code{\linkS4class{SummarizedExperiment}}.
#' @param col.names character. Column names of \code{se} for which assay
#' data should be mapped onto the nodes of \code{gr}. Defaults to \code{NULL}
#' which will then use all column names of \code{se}.
#' @param rowdata.cols character. Column names of \code{rowData(se)} which
#' should be mapped onto the nodes of \code{gr}. Defaults to \code{NULL}
#' which will then use all column names of \code{rowData(se)}.
#' @param prefix character. Informative prefix that should be pasted together
#' with the selected \code{col.names} and \code{rowdata.cols} to allow easy
#' identification of columns of interest when mapping from multiple experimental
#' datasets.
#' @return An object of class \code{\linkS4class{graph}}.
#' @examples
#' # (1) Obtain the latest version of the 293T PPI network ...
#' bp.293t <- getBioPlex(cell.line = "293T", version = "3.0")
#'
#' # (2) ... and turn into a graph
#' bp.gr <- bioplex2graph(bp.293t)
#'
#' # (3) Obtain the BioPlex3 proteome data ...
#' se <- getBioplexProteome()
#'
#' # (4) ... and map onto the graph
#' bp.gr <- mapSummarizedExperimentOntoGraph(bp.gr, se)
#' @export
mapSummarizedExperimentOntoGraph <- function(gr, se,
col.names = NULL,
rowdata.cols = NULL,
prefix = "")
{
isect <- intersect(rownames(se), graph::nodes(gr))
if(!length(isect)) stop("SummarizedExperiment (se)", " and ",
"graph (gr) have no node IDs in common")
if(is.null(col.names)) col.names <- colnames(se)
else if(!all(col.names %in% colnames(se)))
stop("Invalid col.names provided")
if(is.null(rowdata.cols)) rowdata.cols <- colnames(rowData(se))
else if(!all(rowdata.cols %in% colnames(rowData(se))))
stop("Invalid rowdata.cols provided")
# map assay data onto nodes
for(n in col.names)
{
gn <- paste0(prefix, n)
graph::nodeDataDefaults(gr, gn) <- NA
graph::nodeData(gr, isect, gn) <- assay(se)[isect, n]
}
# map rowData onto nodes
for(n in rowdata.cols)
{
gn <- paste0(prefix, n)
graph::nodeDataDefaults(gr, gn) <- NA
graph::nodeData(gr, isect, gn) <- rowData(se)[isect, n]
}
return(gr)
}
.remapUniprotIdsBP <- function(df)
{
stopifnot(is.data.frame(df))
uids1 <- sub("-[0-9]+$", "", df$UniprotA)
uids2 <- sub("-[0-9]+$", "", df$UniprotB)
suppressMessages({
ah <- AnnotationHub::AnnotationHub()
orgdb <- AnnotationHub::query(ah, c("orgDb", "Homo sapiens"))
orgdb <- orgdb[[1]]
})
suppressMessages({
df$GeneA <- AnnotationDbi::mapIds(orgdb,
keys = uids1,
keytype = "UNIPROT",
column = "ENTREZID")
df$GeneB <- AnnotationDbi::mapIds(orgdb,
keys = uids2,
keytype = "UNIPROT",
column = "ENTREZID")
df$SymbolA <- AnnotationDbi::mapIds(orgdb,
keys = uids1,
keytype = "UNIPROT",
column = "SYMBOL")
df$SymbolB <- AnnotationDbi::mapIds(orgdb,
keys = uids2,
keytype = "UNIPROT",
column = "SYMBOL")
})
return(df)
}