Addressing the "no visible binding for global variable" warning

- Changes to ensure that the warning is stopped - Improved other issues, including variable names, provided data types, made internal functions #2 #28 #27
biodavidjm · Sep 17, 2018 · 6683b63 · 6683b63
1 parent 0f7c7f6
commit 6683b63
Show file tree

Hide file tree

Showing 11 changed files with 170 additions and 45 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: artMS
 Type: Package
 Title: Analytical R tools for Mass Spectrometry
-Version: 0.99.5
+Version: 0.99.6
 Date: 2018-09-13
 Author: David Jimenez-Morales, John Von Dollen, Alexandre Rosa Campos
 Maintainer: David Jimenez-Morales <biodavidjm@gmail.com>
@@ -53,6 +53,7 @@ Imports:
   org.Pf.plasmo.db,
   org.Pt.eg.db,
   org.Rn.eg.db,
+  org.Sc.sgd.db,
   org.Ss.eg.db,
   org.Xl.eg.db,
   PerformanceAnalytics,

diff --git a/NAMESPACE b/NAMESPACE
@@ -61,6 +61,7 @@ import(org.Mmu.eg.db)
 import(org.Pf.plasmo.db)
 import(org.Pt.eg.db)
 import(org.Rn.eg.db)
+import(org.Sc.sgd.db)
 import(org.Ss.eg.db)
 import(org.Xl.eg.db)
 import(pheatmap)
@@ -105,6 +106,7 @@ importFrom(stats,order.dendrogram)
 importFrom(stats,phyper)
 importFrom(tidyr,unnest)
 importFrom(utils,combn)
+importFrom(utils,globalVariables)
 importFrom(utils,head)
 importFrom(utils,read.delim)
 importFrom(utils,setTxtProgressBar)

diff --git a/R/MSstats_functions.R b/R/MSstats_functions.R
@@ -193,9 +193,9 @@ artms_mergeMaxQDataWithKeys <- function(data, keys, by=c('RawFile')){
 #' @return (data.frame) with both evidence and keys files merged by raw.files
 #' @keywords internal, merge, evidence, keys
 #' @examples \donttest{
-#' evidenceKeys <- artms_mergeEvidenceKeysByFiles(
-#' 	evidence_file = "FLU-THP1-H1N1-AB-evidence.txt", 
-#' 	keys_file = "FLU-THP1-H1N1-AB-keys.txt")
+#'   evidenceKeys <- artms_mergeEvidenceKeysByFiles(
+#'                   evidence_file = "FLU-THP1-H1N1-AB-evidence.txt", 
+#'                   keys_file = "FLU-THP1-H1N1-AB-keys.txt")
 #' }
 #' @export
 artms_mergeEvidenceKeysByFiles <- function(evidence_file, keys_file) {
@@ -400,8 +400,8 @@ artms_resultsWide <- function(evidence_file, output_file){
 .artms_significantHits <- function(mss_results, labels='*', LFC=c(-2,2), FDR=0.05){
   ## get subset based on labels
   selected_results = mss_results[grep(labels,mss_results$Label), ]
-  cat(sprintf('>> AVAILABLE LABELS FOR HEATMAP:\n %s\n',paste(unique(mss_results$Label), collapse=', ')))
-  cat(sprintf('>> SELECTED LABELS FOR HEATMAP:\n %s\n',paste(unique(selected_results$Label), collapse=', ')))
+  # cat(sprintf('>> AVAILABLE LABELS FOR HEATMAP:\n %s\n',paste(unique(mss_results$Label), collapse=', ')))
+  # cat(sprintf('>> SELECTED LABELS FOR HEATMAP:\n %s\n',paste(unique(selected_results$Label), collapse=', ')))
   significant_proteins = selected_results[(!is.na(selected_results$log2FC) & selected_results$adj.pvalue <= FDR & (selected_results$log2FC >= LFC[2] | selected_results$log2FC <= LFC[1])) , 'Protein']
   significant_results = selected_results[selected_results$Protein %in% significant_proteins, ]
   return(significant_results)

diff --git a/R/analysisQuantifications.R b/R/analysisQuantifications.R
@@ -293,8 +293,8 @@ artms_analysisQuantifications <- function(log2fc_file,
   abundancesName <- paste0("plot.",abundancesName)
   abundancesName <- paste0(output_dir,"/",abundancesName)
   pdf(abundancesName)
-    .artms_plotAbundanceBoxplots(dfmq)
-    .artms_plotNumberProteinsAbundance(dfmq)
+    .artms_plotAbundanceBoxplots(data = dfmq)
+    .artms_plotNumberProteinsAbundance(data = dfmq)
   garbage <- dev.off()
 
   # Reproducibility plots based on normalized abundance
@@ -336,7 +336,7 @@ artms_analysisQuantifications <- function(log2fc_file,
   # TECHNICAL REPLICAS: if there are technical replicas means that we will find
   # two values for the same protein in the same bioreplica, therefore we need to 
   # aggregate first just in case:
-  abundance <- aggregate(Abundance~Prey+Bait+Bioreplica, data = abundance, FUN = mean)
+  abundance <- aggregate(Abundance~Prey+Bait+BioReplicate, data = abundance, FUN = mean)
 
   # Let's aggregate to get the sum of the abundance, we will use it later.
   abundance_dcsum <- data.table::dcast(abundance, Prey~Bait, value.var = 'Abundance', fun.aggregate = sum, fill = 0 )
@@ -1221,10 +1221,10 @@ artms_generatePhSiteExtended <- function(df, pathogen, specie, ptmType){
   # Take the abundance values for all the proteins
   abu2imp <- .artms_loadModelqcBasic(dfmq)
   # Aggregate the technical replica by choosing the maximum value
-  abu2imp2 <- aggregate(Abundance~Protein+Condition+Bioreplica, data = abu2imp, FUN = mean)
+  abu2imp2 <- aggregate(Abundance~Protein+Condition+BioReplicate, data = abu2imp, FUN = mean)
 
   # Check things that will be imputed
-  # dfdc.ni <- data.table::dcast(data=abu2imp2, Protein~Bioreplica, value.var = "Abundance")  
+  # dfdc.ni <- data.table::dcast(data=abu2imp2, Protein~BioReplicate, value.var = "Abundance")  
 
   # Two possible options here. 
   # 1. Select based on the bottom x%
@@ -1247,13 +1247,13 @@ artms_generatePhSiteExtended <- function(df, pathogen, specie, ptmType){
   numbers2sample <- seq(from=theMin, to=theMax, by=.00001)
 
   # dcast on abundance and fill with random numbers between the minimum and q05
-  suppressWarnings(dfdc.im <- data.table::dcast(data=abu2imp2, Protein~Bioreplica, value.var = "Abundance", fill = sample(numbers2sample, replace = F)  ) )
+  suppressWarnings(dfdc.im <- data.table::dcast(data=abu2imp2, Protein~BioReplicate, value.var = "Abundance", fill = sample(numbers2sample, replace = F)  ) )
 
   # Needs to aggregate on biological replicas
   # 1. Melt on biological replicas
-  dfdc.melt <- reshape2::melt(dfdc.im, id.vars = c('Protein'), value.name = 'Abundance', variable.name = 'Bioreplica')
+  dfdc.melt <- reshape2::melt(dfdc.im, id.vars = c('Protein'), value.name = 'Abundance', variable.name = 'BioReplicate')
   # 2. Get the condition
-  dfdc.melt$Condition <- gsub("(.*)(-)(.*)","\\1",dfdc.melt$Bioreplica)
+  dfdc.melt$Condition <- gsub("(.*)(-)(.*)","\\1",dfdc.melt$BioReplicate)
   # 3. Dcast and Aggregate on the condition, taking the mean
   dfdc.final <- data.table::dcast(data=dfdc.melt, Protein~Condition, value.var = 'Abundance', fun.aggregate = mean)
   # 4. Filter by proteins to impute
@@ -1326,9 +1326,10 @@ artms_generatePhSiteExtended <- function(df, pathogen, specie, ptmType){
 
 #------------------------------------------------------------------------------
 # @title Load the basic ModelQC file
+# 
 # @param data (data.frame) of the ModelQC file
 # @return (data.frame) of the modelqc file with the columns Protein, Abundance, 
-# Condition, Bioreplica
+# Condition, BioReplicate
 # @keywords internal, loading
 .artms_loadModelqcBasic <- function(data){
   if( length(grep(";",data$PROTEIN))>0 ) data <- data[-grep(";",data$PROTEIN),] # NOTE!!! We lose a lot of entries this way.
@@ -1351,12 +1352,12 @@ artms_generatePhSiteExtended <- function(df, pathogen, specie, ptmType){
     stop("Abort mission\n!")
   }
   if("SUBJECT_ORIGINAL" %in% colnames(data)){
-    names(data)[grep("SUBJECT_ORIGINAL", names(data))] <- 'Bioreplica'
+    names(data)[grep("SUBJECT_ORIGINAL", names(data))] <- 'BioReplicate'
   }else{
     cat("ERROR: you should check the abundance file because something is seriously wrong!\n") 
     stop("Abort mission\n!")
   }
-  data <- subset(data, select = c(Protein, Abundance, Condition, Bioreplica))
+  data <- subset(data, select = c(Protein, Abundance, Condition, BioReplicate))
   return(data)
 }
 

diff --git a/R/main.R b/R/main.R
@@ -34,6 +34,7 @@
 #' @import org.Pf.plasmo.db
 #' @import org.Pt.eg.db
 #' @import org.Rn.eg.db
+#' @import org.Sc.sgd.db
 #' @import org.Ss.eg.db
 #' @import org.Xl.eg.db
 #' @importFrom PerformanceAnalytics chart.Correlation
@@ -47,10 +48,23 @@
 #' @importFrom stats aggregate as.dendrogram cor dist fisher.test hclust kmeans median order.dendrogram phyper as.dist complete.cases
 #' @import stringr
 #' @importFrom tidyr unnest
-#' @importFrom utils combn read.delim write.table setTxtProgressBar txtProgressBar head
+#' @importFrom utils combn read.delim write.table setTxtProgressBar txtProgressBar head globalVariables
 #' @import VennDiagram
 #' @import yaml
 
+utils::globalVariables(c("Modifications", "Protein", "Abundance", "Condition", 
+"BioReplicate", "Comparison", "Intensity", "log2FC", "Label", "RawFile",
+"iLog2FC", "pvalue", "adj.pvalue", "iPvalue", "Prey", "ReproBioreplicaCount", 
+"ReproConditionCount", "category", "Specie", "AbMean", "Gene", "imputedDFext",
+"pathogen.ids", "Contaminant", "TR", "MODIFICATION", "Proteins", "ComplexName",
+"PTMone", "PTMsite", "output_dir", "isPtm", "log2fc_file",
+"artms_data_corum_mito_database",
+"SUBJECT_ORIGINAL", "ABUNDANCE", "IsotopeLabelType", 
+"GROUP_ORIGINAL", "SYMBOL", "GENENAME", "PROTEIN", "FEATURE", 
+"pearson", "..count..", "tr1", "tr2", "sample_name", "value", "prot_names",
+"cluster", "ymax", "ymin", "uniprot_ac", "uniprot_id", "res_index", "ptm_site"
+))
+
 # ------------------------------------------------------------------------------
 #' @title Relative quantification using MSstats
 #' 
@@ -212,7 +226,7 @@ artms_quantification <- function(yaml_config_file){
     .artms_writeExtras(results$ComparisonResult, config)
   }
 
-  cat(">> ANALYSIS COMPLETE! HAVE A NICE DAY :)\n")
+  cat("\nANALYSIS COMPLETE! HAVE A NICE DAY :)\n")
 }
 
 
diff --git a/R/plots.R b/R/plots.R
@@ -8,8 +8,10 @@
 #' @return A ggplot2 correlation plot
 #' @keywords plot, correlation
 .artms_plotCorrelationDistribution <- function(MatrixCorrelations){
-  cor.data <- MatrixCorrelations[upper.tri(MatrixCorrelations, diag = FALSE)]  # we're only interested in one of the off-diagonals, otherwise there'd be duplicates
-  cor.data <- as.data.frame(cor.data)  # that's how ggplot likes it
+  # we're only interested in one of the off-diagonals, 
+  # otherwise there'd be duplicates
+  cor.data <- MatrixCorrelations[upper.tri(MatrixCorrelations, diag = FALSE)]  
+  cor.data <- as.data.frame(cor.data)
   colnames(cor.data) <- "pearson"
 
   g <- ggplot(data = cor.data, mapping = aes(x = pearson))
@@ -65,6 +67,76 @@ artms_dataPlots <- function(input_file, output_file){
   garbarge <- dev.off()
 }
 
+# ------------------------------------------------------------------------------
+#' @title Heatmap of significant values
+#' 
+#' @description heatmap plot to represent proteins with significant changes
+#' @param mss_F (data.frame) with the significant values (log2fc, pvalues)
+#' @param out_file (char) Name for the output
+#' @param labelOrder (vector) Vector with the particular order for the IDs 
+#' (default, `NULL` no order)
+#' @param names (char) Type of ID used. Default is `Protein` (uniprot entry id). 
+#' Soon will be possible to use 'Gene' name ids.
+#' @param cluster_cols (logical) Select whether to cluster the columns. 
+#' Options: `T` or `F`. Default `T`.
+#' @param display (char) Value used to genarate the heatmaps. Options: 
+#' - `log2FC` (default)
+#' - `adj.pvalue`
+#' - `pvalue`
+#' @return A heatmap of significant values
+#' @keywords significant, heatmap
+.artms_plotHeat <- function(mss_F, out_file, labelOrder=NULL, names='Protein', cluster_cols=F, display='log2FC'){
+  heat_data = data.frame(mss_F, names=names)
+
+  ## create matrix from log2FC or p-value as user defined
+  if(display=='log2FC'){
+    # Issues with extreme_val later if we have Inf/-Inf values.
+    if( sum(is.infinite(heat_data$log2FC)) > 0 ){
+      idx <- is.infinite(heat_data$log2FC)
+      heat_data$log2FC[ idx ] <- NA
+    }
+    heat_data_w = dcast(names ~ Label, data=heat_data, value.var='log2FC') 
+  }else if(display=='adj.pvalue'){
+    heat_data$adj.pvalue = -log10(heat_data$adj.pvalue+10^-16)  
+    heat_data_w = dcast(names ~ Label, data=heat_data, value.var='adj.pvalue')  
+  }else if(display=='pvalue'){
+    heat_data$pvalue = -log10(heat_data$pvalue+10^-16)  
+    heat_data_w = dcast(names ~ Label, data=heat_data, value.var='pvalue')  
+  }
+
+  ## try
+  #gene_names = uniprot_to_gene_replace(uniprot_ac=heat_data_w$Protein)
+  rownames(heat_data_w) = heat_data_w$names
+  heat_data_w = heat_data_w[,-1]
+  heat_data_w[is.na(heat_data_w)]=0
+  max_val = ceiling(max(heat_data_w))
+  min_val = floor(min(heat_data_w))
+  extreme_val = max(max_val, abs(min_val))
+  if(extreme_val %% 2 != 0) extreme_val=extreme_val+1
+  bin_size=2
+  signed_bins = (extreme_val/bin_size)
+  colors_neg = rev(colorRampPalette(brewer.pal("Blues",n=extreme_val/bin_size))(signed_bins))
+  colors_pos = colorRampPalette(brewer.pal("Reds",n=extreme_val/bin_size))(signed_bins)
+  colors_tot = c(colors_neg, colors_pos)
+
+  if(is.null(labelOrder)){
+    pheatmap(heat_data_w, scale="none", cellheight=10, 
+             cellwidth=10, filename =out_file, color=colors_tot, 
+             breaks=seq(from=-extreme_val, to=extreme_val, by=bin_size), 
+             cluster_cols=cluster_cols, fontfamily="mono")  
+    cat("--- Heatmap is out\n")
+  }else{
+    heat_data_w <- heat_data_w[,labelOrder]
+    pheatmap(heat_data_w, scale="none", cellheight=10, cellwidth=10, 
+             filename=out_file, color=colors_tot, 
+             breaks=seq(from=-extreme_val, to=extreme_val, by=bin_size), 
+             cluster_cols=cluster_cols, fontfamily="mono")
+    cat("--- Heatmap is out\n")
+  }
+
+  return(heat_data_w)
+}
+
 # ------------------------------------------------------------------------------
 #' @title Outputs a heatmap of the MSStats results created using the log2fold 
 #' changes
@@ -317,12 +389,11 @@ artms_plotHeatmapQuant <- function(input_file,
 #' @param data (data.frame) modelqc
 #' @return (pdf) Barplots with the number of proteins per br / condition
 #' @keywords internal, plots, abundance, counts
-#' .artms_plotNumberProteinsAbundance()
-artms_plotNumberProteinsAbundance <- function(data) {
+.artms_plotNumberProteinsAbundance <- function(data) {
   x <- data[c('PROTEIN','SUBJECT_ORIGINAL')]
   y <- unique(x)
-  names(y)[grep('SUBJECT_ORIGINAL', names(y))] <- 'BioReplicates'
-  z <- ggplot2::ggplot(y, aes(x = BioReplicates, fill = BioReplicates))
+  names(y)[grep('SUBJECT_ORIGINAL', names(y))] <- 'BioReplicate'
+  z <- ggplot2::ggplot(y, aes(x = BioReplicate, fill = BioReplicate))
   z <- z + geom_bar(stat = "count")
   z <- z + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
   z <- z + geom_text(stat='count', aes(label=..count..), vjust=-0.5, size = 2.7)
@@ -331,8 +402,8 @@ artms_plotNumberProteinsAbundance <- function(data) {
 
   a <- data[c('PROTEIN','GROUP_ORIGINAL')]
   b <- unique(a)
-  names(b)[grep('GROUP_ORIGINAL', names(b))] <- 'Conditions'
-  c <- ggplot2::ggplot(b, aes(x = Conditions, fill = Conditions))
+  names(b)[grep('GROUP_ORIGINAL', names(b))] <- 'Condition'
+  c <- ggplot2::ggplot(b, aes(x = Condition, fill = Condition))
   c <- c + geom_bar(stat = "count")
   c <- c + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
   c <- c + geom_text(stat='count', aes(label=..count..), vjust=-0.5, size = 2.7)

diff --git a/R/runMSstats.R b/R/runMSstats.R
@@ -19,6 +19,7 @@
 #' - MSstats `designSampleSize` power experiment
 #' @keywords internal, run, MSstats, contrast, intensity, plots, QC
 .artms_runMSstats <- function(dmss, contrasts, config){
+  cat(">> MSstats STARTS RUNNING:\n")
   # plot the data BEFORE normalization
   if(grepl('before', config$msstats$profilePlots)){
     mssquant = dataProcess(raw = dmss, 
@@ -51,7 +52,7 @@
                             MBimpute = config$msstats$MBimpute,
                             featureSubset=config$msstats$feature_subset)
   }else{
-    cat(sprintf('\n>> NORMALIZATION\t%s\n',config$msstats$normalization_method))
+    cat(sprintf('\n--- NORMALIZATION METHOD: %s\n',config$msstats$normalization_method))
     #mssquant = dataProcess(dmss, normalization=config$msstats$normalization_method , fillIncompleteRows = F, betweenRunInterferenceScore = F)
     mssquant = dataProcess(raw = dmss, 
                            normalization=config$msstats$normalization_method,
@@ -81,17 +82,17 @@
 
   cat(sprintf('\tFITTING CONTRASTS:\t%s\n',paste(rownames(contrasts),collapse=',')))
   write.table(mssquant$ProcessedData, file=gsub('.txt','-mss-normalized.txt',config$files$output), eol="\n", sep="\t", quote=F, row.names=F, col.names=T)
-  results = groupComparison(data = mssquant, contrast.matrix = contrasts)
+  results <- groupComparison(data = mssquant, contrast.matrix = contrasts)
   write.table(results$ComparisonResult, file=config$files$output, eol="\n", sep="\t", quote=F, row.names=F, col.names=T)  
   write.table(results$ModelQC, file=gsub(".txt","_ModelQC.txt",config$files$output), eol="\n", sep="\t", quote=F, row.names=F, col.names=T) 
-  cat(sprintf(">> WRITTEN\t%s\n",config$files$output))
+  cat(">> MSstats IS DONE!\n")
 
   #(1) Minimal number of biological replicates per condition
   cat(">> CALCULATING SAMPLE SIZE FOR FUTURE EXPERIMENTS\n" )
   results.ss1 <- designSampleSize(data=results$fittedmodel,numSample=TRUE,desiredFC=c(1.25,2),FDR=0.05,power=0.95)
   results.ss2 <- designSampleSize(data=results$fittedmodel,numSample=TRUE,desiredFC=c(1.25,2),FDR=0.05,power=0.9)
   results.ss3 <- designSampleSize(data=results$fittedmodel,numSample=TRUE,desiredFC=c(1.25,2),FDR=0.05,power=0.8)
-  results.ss = rbind( results.ss1, results.ss2, results.ss3)
+  results.ss <- rbind( results.ss1, results.ss2, results.ss3)
   write.table(results.ss, file=gsub(".txt","_sampleSize.txt",config$files$output), eol="\n", sep="\t", quote=F, row.names=F, col.names=T) 
 
   #(2) Power calculation

diff --git a/R/writeExtras.R b/R/writeExtras.R
@@ -41,24 +41,22 @@
   sign_labels <- unique(sign_hits$Label)
   cat(sprintf("\tSELECTED HITS FOR PLOTS WITH LFC BETWEEN %s AND %s AT %s FDR:\t%s\n",lfc_lower, lfc_upper, config$output_extras$FDR, nrow(sign_hits)/length(sign_labels))) 
 
-  cat(paste("output_file: ", config$files$output, "\n"))
-
   ## REPRESENTING RESULTS AS HEATMAP
   if(config$output_extras$heatmap){
     ## plot heat map for all contrasts
-    cat(">>   PLOTTING HEATMAP FOR ALL CONTRASTS\n")
+    cat(">> PLOTTING HEATMAP FOR SIGNIFICANT CHANGES\n")
     heat_labels <- .artms_prettyPrintHeatmapLabels(uniprot_acs=sign_hits$Protein,uniprot_ids=sign_hits$name, gene_names=sign_hits$Gene.names)
-    heat_data_w <- plotHeat(mss_F = sign_hits, out_file =  gsub('.txt','-sign.pdf',config$files$output), names=heat_labels, cluster_cols=config$output_extras$heatmap_cluster_cols, display = config$output_extras$heatmap_display)  
+    heat_data_w <- .artms_plotHeat(mss_F = sign_hits, out_file =  gsub('.txt','-sign.pdf',config$files$output), names=heat_labels, cluster_cols=config$output_extras$heatmap_cluster_cols, display = config$output_extras$heatmap_display)  
   }
 
   if(config$output_extras$volcano){
-    cat(">>   PLOTTING VOLCANO PLOT\n")
+    cat(">> PLOTTING VOLCANO PLOT\n")
     file_name <- gsub('.txt','-volcano.pdf',config$files$output)
     artms_volcanoPlot(results_ann[grep(selected_labels,results_ann$Label),], lfc_upper, lfc_lower, FDR=config$output_extras$FDR, file_name=file_name)  
   }
 }
 
-# 
+
 # ------------------------------------------------------------------------------
 #' @title Annotate the files based on the Uniprot accession id
 #' @description Annotate the files based on the Uniprot accession id
@@ -81,7 +79,7 @@
   Uniprot = NULL
   for(org in species_split){
     cat(sprintf("\tLOADING %s\n",org))
-    tmp <- read.delim(sprintf("%s/uniprot_protein_descriptions_%s.txt",uniprot_dir,org), stringsAsFactors=F, quote="")    
+    tmp <- read.delim(sprintf("%s/uniprot_protein_descriptions_%s.txt", uniprot_dir,org), stringsAsFactors=F, quote="")    
     if(is.null(Uniprot)){
       Uniprot = as.data.frame(tmp)
     }else{

diff --git a/man/artms_mergeEvidenceKeysByFiles.Rd b/man/artms_mergeEvidenceKeysByFiles.Rd