Skip to content
Permalink
master
Go to file
 
 
Cannot retrieve contributors at this time
475 lines (407 sloc) 20.3 KB
##################################################
## Description: mapping b/w names & database identifiers
##
## Author: Jeff Xia, jeff.xia@mcgill.ca
## McGill University, Canada
## License: GNU GPL (>= 2)
###################################################
# given a list of compound names or ids, find matched name or ids from selected databases
CrossReferencing <- function(q.type, hmdb=T, pubchem=T, chebi=F, kegg=T, metlin=F){
# record the filter for 8 major databases
return.cols <<- c(hmdb, pubchem, chebi, kegg, metlin);
if(is.null(cmpd.db)){
load("../../libs/cmpd_name.rda", .GlobalEnv);
#how to save image from the CSV
#cmpd.db <<- read.csv("../../libs/cmpd_name.csv", as.is=T, header=T);
#com.nms <- cmpd.db$name
#syns.vec <- cmpd.db$synonym;
#Encoding (syns.vec) <- "UTF-8"; # otherwise, encoding warnings
# syn.db <- list();
# syn.db$syns.vec <- cmpd.db$synonym;
# syn.db$syns.list <- strsplit(syns.vec, "; *");
#save(syn.db, file="syn_nms_db.rda");
#cmpd.db$synonym <- NULL;
#save(cmpd.db, file="cmpd_db.rda");
}
# record all the data
if(!exists("name.map")){
name.map <<- list();
}
# distribute job
dataSet$q.type <<- q.type;
if(q.type == "name"){
NameMappingExact();
}else{
IDMapping(q.type);
}
# do some sanity check
todo.inx <-which(is.na(name.map$hit.inx));
if(length(todo.inx)/length(name.map$hit.inx) > 0.5){
nmcheck.msg <<- c(0, "Over half of the compound IDs could not be matched to our database. Please make
sure that correct compound IDs or common compound names are used.");
}else if (length(todo.inx) > 15){
nmcheck.msg <<- c(2, "There are >15 compounds without matches. You can either proceed or if necessary, update these compound IDs and upload again.");
}else{
nmcheck.msg <<- c(1, "Name matching OK, please inspect (and manual correct) the results then proceed.");
}
}
# Mapping from compound names to other id, can do exact or approximate match
NameMappingExact<-function(){
qvec <- dataSet$cmpd;
# variables to record results
hit.inx = vector(mode='numeric', length=length(qvec)); # record hit index, initial 0
match.values = vector(mode='character', length=length(qvec)); # the best matched values (hit names), initial ""
match.state = vector(mode='numeric', length=length(qvec)); # match status - 0, no match; 1, exact match; initial 0
# first find exact match to the common compound names
hit.inx <- match(tolower(qvec), tolower(cmpd.db$name));
match.values <- cmpd.db$name[hit.inx];
match.state[!is.na(hit.inx)] <- 1;
# then try to find exact match to synanyms for the remaining unmatched query names one by one
if(!exists("syn.db")){
load("../../libs/syn_nms_db.rda", .GlobalEnv);
}
syns.list <- syn.db$syns.list;
todo.inx <-which(is.na(hit.inx));
if(length(todo.inx) > 0){
for(i in 1:length(syns.list)){
syns <- syns.list[[i]];
hitInx <- match(tolower(qvec[todo.inx]), tolower(syns));
hitPos <- which(!is.na(hitInx));
if(length(hitPos)>0){
# record matched ones
orig.inx<-todo.inx[hitPos];
hit.inx[orig.inx] <- i;
# match.values[orig.inx] <- syns[hitInx[hitPos]]; # show matched synnames
match.values[orig.inx] <- cmpd.db$name[i]; # show common name
match.state[orig.inx] <- 1;
# update unmatched list
todo.inx<-todo.inx[is.na(hitInx)];
}
if(length(todo.inx) == 0) break;
}
}
# empty memory
gc();
name.map$hit.inx <<- hit.inx;
name.map$hit.values <<- match.values;
name.map$match.state <<- match.state;
}
# only exact match will be performed
IDMapping<-function(q.type){
qvec <- dataSet$cmpd;
# variables to record results
hit.inx = vector(mode='numeric', length=length(qvec)); # record hit index, initial 0
match.values = vector(mode='character', length=length(qvec)); # the best matched values (hit names), initial ""
match.state = vector(mode='numeric', length=length(qvec)); # match status - 0, no match; 1, exact match; 2 approximate match. initial 0
if(q.type == "hmdb"){
hit.inx <- match(tolower(qvec), tolower(cmpd.db$hmdb));
}else if(q.type == "pubchem"){
hit.inx <- match(tolower(qvec), tolower(cmpd.db$pubchem));
}else if(q.type == "chebi"){
hit.inx <- match(tolower(qvec), tolower(cmpd.db$chebi));
}else if(q.type == "metlin"){
hit.inx <- match(tolower(qvec), tolower(cmpd.db$metlin));
}else if(q.type == "kegg"){
hit.inx <- match(tolower(qvec), tolower(cmpd.db$kegg));
}else{ # hmdb + kegg
hit.inx <- match(tolower(qvec), tolower(cmpd.db$hmdb));
hit.inx2 <- match(tolower(qvec), tolower(cmpd.db$kegg));
nohmdbInx <- is.na(hit.inx);
hit.inx[nohmdbInx]<-hit.inx2[nohmdbInx]
}
match.values <- cmpd.db$name[hit.inx];
match.state[!is.na(hit.inx)] <- 1;
name.map$hit.inx <<- hit.inx;
name.map$hit.values <<- match.values;
name.map$match.state <<- match.state;
}
GetMappingResultTable<-function(){
qvec <- dataSet$cmpd;
if(is.null(qvec)){
return();
}
# style for highlighted background for unmatched names
pre.style<-NULL;
post.style<-NULL;
# style for no matches
if(dataSet$q.type == "name"){
no.prestyle<-"<strong style=\"background-color:yellow; font-size=125%; color=\"black\">";
no.poststyle<-"</strong>";
}else{
no.prestyle<-"<strong style=\"background-color:red; font-size=125%; color=\"black\">";
no.poststyle<-"</strong>";
}
hit.inx<-name.map$hit.inx;
hit.values<-name.map$hit.values;
match.state<-name.map$match.state;
# contruct the result table with cells wrapped in html tags
# the unmatched will be highlighted in different background
html.res<-matrix("", nrow=length(qvec), ncol=8);
csv.res<-matrix("", nrow=length(qvec), ncol=8);
colnames(csv.res)<-c("Query", "Match", "HMDB", "PubChem", "ChEBI", "KEGG", "METLIN", "Comment");
for (i in 1:length(qvec)){
if(match.state[i]==1){
pre.style<-"";
post.style="";
}else{ # no matches
pre.style<-no.prestyle;
post.style<-no.poststyle;
}
hit <-cmpd.db[hit.inx[i], ,drop=F];
html.res[i, ]<-c(paste(pre.style, qvec[i], post.style, sep=""),
paste(ifelse(match.state[i]==0, "", hit.values[i]), sep=""),
paste(ifelse(match.state[i]==0 || is.na(hit$hmdb_id) || hit$hmdb_id=="" || hit$hmdb_id=="NA","-", paste("<a href=http://www.hmdb.ca/metabolites/", hit$hmdb_id, " target='_blank'>",hit$hmdb_id,"</a>", sep="")), sep=""),
paste(ifelse(match.state[i]==0 || is.na(hit$pubchem_id) || hit$pubchem_id=="" || hit$pubchem_id=="NA", "-", paste("<a href=http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=", hit$pubchem_id," target='_blank'>", hit$pubchem_id,"</a>", sep="")), sep=""),
paste(ifelse(match.state[i]==0 || is.na(hit$chebi_id) || hit$chebi_id==""|| hit$chebi_id=="NA","-", paste("<a href=http://www.ebi.ac.uk/chebi/searchId.do?chebiId=", hit$chebi_id, " target='_blank'>",hit$chebi_id,"</a>", sep="")), sep=""),
paste(ifelse(match.state[i]==0 || is.na(hit$kegg_id) || hit$kegg_id==""|| hit$kegg_id=="NA","-",paste("<a href=http://www.genome.jp/dbget-bin/www_bget?", hit$kegg_id, " target='_blank'>", hit$kegg_id,"</a>", sep="")), sep=""),
paste(ifelse(match.state[i]==0 || is.na(hit$metlin_id) || hit$metlin_id==""|| hit$metlin_id=="NA","-",paste("<a href=http://metlin.scripps.edu/metabo_info.php?molid=", hit$metlin_id," target='_blank'>",hit$metlin_id,"</a>", sep="")), sep=""),
ifelse(match.state[i]!=1,"View",""));
csv.res[i, ]<-c(qvec[i],
ifelse(match.state[i]==0, "NA", hit.values[i]),
ifelse(match.state[i]==0, "NA", hit$hmdb_id),
ifelse(match.state[i]==0, "NA", hit$pubchem_id),
ifelse(match.state[i]==0, "NA", hit$chebi_id),
ifelse(match.state[i]==0, "NA", hit$kegg_id),
ifelse(match.state[i]==0, "NA", hit$metlin_id),
match.state[i]);
}
# return only columns user selected
# add query and match columns at the the beginning, and 'Detail' at the end
return.cols <- c(TRUE, TRUE, return.cols, TRUE);
html.res <- html.res[,return.cols, drop=F];
csv.res <- csv.res[,return.cols, drop=F];
# store the value for report
dataSet$map.table <<- csv.res;
write.csv(csv.res, file="name_map.csv", row.names=F);
return(as.vector(html.res));
}
GetHitsRowNumber<-function(){
return (length(name.map$hit.inx));
}
PerformApproxMatch<-function(inx){
q <- dataSet$cmpd[inx];
# only for none lipids
nonLipidInx <- cmpd.db$lipid == 0;
com.nms <- cmpd.db$name[nonLipidInx]
syns.vec <- syn.db$syns.vec[nonLipidInx];
syns.list <- syn.db$syns.list[nonLipidInx];
matched.dist <- NULL;
q.length <- nchar(q);
s <- c(0, 0.1, 0.2);
for (j in s) {
new.q <- q;
if(q.length > 32){ # note: agrep fail for exact match when length over 32 characters
new.q<-substr(q, 1, 32);
}
matched <- FALSE;
matched.inx <- agrep(new.q, syns.vec, ignore.case=T, max.distance=j, useBytes=T);
if(length(matched.inx) > 0) {
# record all the candidates,
# don't use cbind, since all will be converted to character mode
# for data.frame specify "stringsAsFactors" to prevent convert value col into factor
candidates <- data.frame(index=vector(mode = "numeric", length=length(matched.inx)),
value=vector(mode = "character", length=length(matched.inx)),
score=vector(mode = "numeric", length=length(matched.inx)),
stringsAsFactors = FALSE);
for(n in 1:length(matched.inx)){
nm.vec<-syns.list[[matched.inx[n]]];
# try approximate match, note: in some cases, split into element will break match using whole string
hit3.inx <- agrep(q,nm.vec,ignore.case=T, max.distance=j, useBytes=T);
if(length(hit3.inx)>0){
hit3.nm <- vector(mode = "character", length=length(hit3.inx));
hit3.score <- vector(mode = "numeric", length=length(hit3.inx));
for(k in 1:length(hit3.inx)){
idx <- hit3.inx[k];
hit3.nm[k] <- nm.vec[idx];
hit3.score[k] <- j + abs(nchar(nm.vec[idx])-nchar(q))/(10*nchar(q));
}
# now get the best match, the rule is that the first two character should matches
# first check if first two character are digits or characters, otherwise will cause error
matches2 <- c();
if(length(grep("^[1-9a-z]{2}", q, ignore.case=T))>0){
matches2 <- grep(paste("^", substr(q, 1, 2), sep=""), hit3.nm);
}else if (length(grep("^[1-9a-z]", q, ignore.case=T))>0){
matches2 <- grep(paste("^", substr(q, 1, 1), sep=""), hit3.nm);
}
if(length(matches2)>0){
hit3.score[matches2] <- hit3.score[matches2] - 0.05;
}
best.inx<-which(hit3.score==min(hit3.score))[1];
candidates[n,1]<-matched.inx[n];
# candidates[n,2]<-hit3.nm[best.inx]; # show matched syn names
candidates[n,2]<-com.nms[matched.inx[n]] # show common names
candidates[n,3]<-hit3.score[best.inx];
}
}
rm.inx <- is.na(candidates[,2]) | candidates[,2]=="NA" | candidates[,2]=="";
candidates<-candidates[!rm.inx, ];
candidates<-candidates[order(candidates[,3], decreasing=F), , drop=F];
if(nrow(candidates) > 10){
candidates<-candidates[1:10,];
}
candidates <<- candidates;
return();
}
}
candidates <<- NULL;
}
# Get all candidate compound names for a given index - 3 col - inx, name, score
GetCandidateList<-function(){
# contruct the result table with cells wrapped in html tags
# the unmatched will be highlighted in different background
can.mat<-matrix("", nrow=nrow(candidates)+1, ncol= 6);
for (i in 1:nrow(candidates)){
hit.inx <- candidates[i, 1];
hit.name <- candidates[i, 2];
hit <-cmpd.db[hit.inx, ,drop=F];
can.mat[i, ]<-c(hit.name,
paste(ifelse(hit$hmdb_id=="NA","", paste("<a href=http://www.hmdb.ca/metabolites/", hit$hmdb_id, " target='_blank'>",hit$hmdb_id,"</a>", sep="")), sep=""),
paste(ifelse(hit$pubchem_id=="NA", "", paste("<a href=http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=", hit$pubchem_id," target='_blank'>", hit$pubchem_id,"</a>", sep="")), sep=""),
paste(ifelse(hit$chebi_id=="NA","", paste("<a href=http://www.ebi.ac.uk/chebi/searchId.do?chebiId=", hit$chebi_id, " target='_blank'>",hit$chebi_id,"</a>", sep="")), sep=""),
paste(ifelse(hit$kegg_id=="NA","",paste("<a href=http://www.genome.jp/dbget-bin/www_bget?", hit$kegg_id, " target='_blank'>", hit$kegg_id,"</a>", sep="")), sep=""),
paste(ifelse(hit$metlin_id=="NA","",paste("<a href=http://metlin.scripps.edu/metabo_info.php?molid=", hit$metlin_id," target='_blank'>",hit$metlin_id,"</a>", sep="")), sep=""));
}
# add "none" option
can.mat[nrow(candidates)+1,] <- c("None of the above", "", "", "", "", "");
# add the hit columns
return.cols <- c(TRUE, return.cols);
return(as.vector(can.mat[,return.cols, drop=F]));
}
GetCanListRowNumber<-function(inx){
return(nrow(candidates)+1); # include the "none" row
}
GetQuery<-function(inx){
return(dataSet$cmpd[inx]);
}
# set matched name based on user selection from all potential hits
# note: to change object in the enclosing enviroment, use "<<-"
SetCandidate<-function(query_inx, can_inx){
can_mat<- candidates;
if(can_inx <= nrow(can_mat)){
name.map$hit.inx[query_inx] <<- can_mat[can_inx,1];
name.map$hit.values[query_inx] <<- can_mat[can_inx,2];
name.map$match.state[query_inx] <<- 1;
# re-generate the CSV file
hit <-cmpd.db[name.map$hit.inx[query_inx], ,drop=F];
csv.res <- dataSet$map.table;
if(ncol(csv.res) > 6){ # general utilities
csv.res[query_inx, ]<-c(csv.res[query_inx, 1],
name.map$hit.values[query_inx],
hit$hmdb_id,
hit$pubchem_id,
hit$chebi_id,
hit$kegg_id,
hit$metlin_id,
1);
}else{
csv.res[query_inx, ]<-c(csv.res[query_inx, 1],
name.map$hit.values[query_inx],
hit$hmdb_id,
hit$pubchem_id,
hit$kegg_id,
1);
}
write.csv(csv.res, file="name_map.csv", row.names=F);
dataSet$map.table <<- csv.res;
}else{ #no match
name.map$hit.inx[query_inx] <<- 0;
name.map$hit.values[query_inx] <<- "";
name.map$match.state[query_inx] <<- 0;
}
}
# return the final (after user selection) map as dataframe
# three col, original name, HMDB name and KEGG ID,
# for Enrichment and pathway analysis, respectively
GetFinalNameMap<-function(){
hit.inx<-name.map$hit.inx;
hit.values<-name.map$hit.values;
match.state<-name.map$match.state;
qvec <- dataSet$cmpd;
nm.mat<-matrix(nrow=length(qvec), ncol=3);
colnames(nm.mat)<-c("query", "hmdb", "kegg");
for (i in 1:length(qvec)){
hit <-cmpd.db[hit.inx[i], ,drop=F];
if(match.state[i]==0){
hmdb.hit <- NA;
kegg.hit <- NA;
}else{
hmdb.hit <- ifelse(nchar(hit.values[i])==0, NA, hit.values[i]);
kegg.hit <- ifelse(nchar(hit$kegg_id)==0, NA, hit$kegg_id);
}
nm.mat[i, ]<-c(qvec[i],hmdb.hit, kegg.hit);
}
return(as.data.frame(nm.mat));
}
GetMapTable<-function(){
require(xtable);
print(xtable(dataSet$map.table, caption="Result from Compound Name Mapping"),
tabular.environment = "longtable", caption.placement="top", size="\\scriptsize");
}
PathMapping<-function(qvec){
qvec <- strsplit(qvec, "; *")[[1]];
if(is.null(cmpd.db)){
load("../../libs/cmpd_name.rda", .GlobalEnv);
}
if(!exists("syn.db")){
load("../../libs/syn_nms_db.rda", .GlobalEnv);
}
if(!exists('path.list')){
LoadSmpLib();
}
# first find exact match to the common compound names
hit.inx <- match(tolower(qvec), tolower(cmpd.db$name));
match.values <- cmpd.db$name[hit.inx];
match.ids <- cmpd.db$hmdb[hit.inx];
# then try to find exact match to synanyms for the remaining unmatched query names one by one
todo.inx <-which(is.na(hit.inx));
if(length(todo.inx) > 0){
syns.list <- syn.db$syns.list;
for(i in 1:length(syns.list)){
syns <- syns.list[[i]];
hitInx <- match(tolower(qvec[todo.inx]), tolower(syns));
hitPos <- which(!is.na(hitInx));
if(length(hitPos)>0){
# record matched ones
orig.inx<-todo.inx[hitPos];
match.values[orig.inx] <- cmpd.db$name[i]; # show common name
match.ids[orig.inx] <- cmpd.db$hmdb[i];
# update unmatched list
todo.inx<-todo.inx[is.na(hitInx)];
}
}
}
na.inx<-is.na(match.values);
qvec.nm <- match.values[!na.inx];
qvec.id <- match.ids[!na.inx];
# then used the matched.values to search the pathway database
hits<-lapply(path.list, match, qvec.nm);
set.size<-length(path.list);
res.mat<-matrix(NA, nrow=set.size, ncol=4);
path.nms<- names(path.list);
for(i in 1:set.size){
path <- path.list[[i]];
m.inx <- hits[[i]];
m.inx <- m.inx[!is.na(m.inx)];
if(length(m.inx)>0){
# note: need to set cmpd highlights in SMPDB using the hmdbid
# syntax: http://pathman.smpdb.ca/pathways/SMP00055/pathway?reset=true&highlight[HMDB00243]=true
# changed URL--> http://www.smpdb.ca/view/SMP00055
pathUrl <- paste(path.link[i], "?reset=true", sep="");
highLights <- paste("highlight[", qvec.id[m.inx], "]=true", sep="", collapse="&");
res.mat[i, 1] <- paste("<a href=", paste(pathUrl, "&", highLights, sep=""), ">",path.nms[i],"</a>", sep="");
res.mat[i, 2] <- paste("<a href=http://www.hmdb.ca/metabolites/", qvec.id[m.inx], ">",qvec.nm[m.inx],"</a>", sep="", collapse="; ");
res.mat[i, 3] <- length(m.inx);
res.mat[i, 4] <- 1/length(path);
}
}
res.mat<-res.mat[!is.na(res.mat[,1]),];
ord.inx <- order(res.mat[,3], res.mat[,4], decreasing = T);
res.mat <- res.mat[ord.inx, -c(3,4)];
path.res <<- res.mat;
}
GetPathNames<-function(){
return(path.res[,1]);
}
GetMatchedCompounds<-function(){
return(path.res[,2]);
}
You can’t perform that action at this time.