## ChemProp2
Authors: Abzer Kelminal (abzer.shah@uni-tuebingen.de) <br>
Edited by: Daniel Petras (daniel.petras@uni-tuebingen.de) <br>
Input file format: .txt files or .csv files <br>
Outputs: .csv files  <br>
Dependencies: library(ggplot2), library(dplyr)

### About Input files:

- **Feature_file** is obtained by performing Feature based Molecular Networking on the data using MZmine software.
- **Nw_edge file** has the information of Feature IDs that are similar (not the same) in the columns 'Feature_ID_1' & 'Feature_ID_2'
- **Nw_edge file** is an output of GNPS. 
- **Clusterinfo file** is an output of FBMN

In [None]:
# Ask for the folder that holds the input files and make it the working
# directory; the later cells read the input files relative to it.
input_path <- readline("Enter the path of the folder with input files: ")
Directory <- normalizePath(input_path, winslash = "/", mustWork = FALSE)
setwd(Directory)

In [None]:
# install the package if not present

#install.packages('ggplot2')
#install.packages('dplyr')

In [None]:
library('ggplot2')
library('dplyr')

In [None]:
# Getting all the files in the folder (sub-folders are searched as well)
dirs <- dir(path = getwd(), full.names = TRUE, recursive = TRUE)
folders <- unique(dirname(dirs))
files <- list.files(folders, full.names = TRUE)
# files_1: file names without their path; files_2: the folder of each file
files_1 <- basename(files)
files_2 <- dirname(files)

In [None]:
# Creating a Result folder: the folder of the first listed file with the
# suffix "_Results" appended to its path
fName <- paste0(files_2[[1]], "_Results")
dir.create(path = fName, showWarnings = TRUE)

# Listing the file names; the position of a file in this list is what the
# prompts of the next cell ask for
print(files_1)

In [None]:
# Every input is chosen by typing its position in the printed file list.

feature_name <- files_1[as.double(readline("Number of Feature file:"))]
# With 'row.names = 1' the 1st column ('ID') becomes the row names
Feature_file <- read.csv(feature_name, header = TRUE, row.names = 1, check.names = FALSE)

meta_name <- files_1[as.double(readline("Number of Meta file:"))]
Meta_File <- read.table(meta_name, sep = "\t", header = TRUE, row.names = 1, check.names = FALSE)

edge_name <- files_1[as.double(readline("Number of Network edge file:"))]
Nw_edge <- read.table(edge_name, sep = "\t", header = TRUE, check.names = FALSE)

cluster_name <- files_1[as.double(readline("Number of Cluster_info file:"))]
cluster_info <- read.csv(cluster_name, sep = "\t", dec = ".", header = TRUE, check.names = FALSE, fill = TRUE)

# For .csv files, use read.csv
# For .txt files, use read.table and include sep="\t" (i.e. tab separated)
# For .tsv files, use read.csv and include sep="\t" (i.e. tab separated)

In [None]:
# Show the column names so the number of leading columns to skip can be chosen
print(colnames(Feature_file))
n_info_columns <- as.double(readline("No.of columns to skip:"))
# Positions 1..n of the columns that are dropped before the scores are computed
Info_columns <- as.double(1):n_info_columns

In [None]:
# head() returns the first rows (up to 6) of the metadata table, which gives
# an idea of the content of the file.
head(Meta_File, n = 6L)

In [None]:
# The metadata table is transposed when the answer is "NO"
column_wise_answer <- readline("Is Metadata information given column-wise? YES/NO:")
if (column_wise_answer == "NO") {
  Meta_File <- as.data.frame(t(Meta_File))
}

In [None]:

# Keep only the metadata rows whose ATTRIBUTE_species is "E.coli"
Meta_Filtered <- filter(Meta_File, ATTRIBUTE_species == "E.coli")
MetaData_Name <- readline("Enter the MetaData Name:")
# Keep the column(s) whose name contains the entered text
# (replace Meta_Filtered by Meta_File to work on the unfiltered metadata)
Meta_Data <- select(Meta_Filtered, contains(MetaData_Name))

### Chemical Proportionality score:

- The code below adds a **Chemical Proportionality (ChemProp) score** column to the Nw_edge file. Columns holding the absolute values of the ChemProp scores and the sign of the ChemProp2 score are added as well.
- Besides the ChemProp score based on Pearson correlation (which is suited to linear relationships), the code below also computes scores using Spearman correlation, a natural-log transformation and a square-root transformation, to support non-linear data.

In [None]:
# ChemProp scores for every edge (row) of Nw_edge.
#
# Reads:  Feature_file (row names = feature IDs), Nw_edge (columns 1 and 2 =
#         the two feature IDs of an edge), Meta_Data (row names = sample names,
#         column 1 = the metadata variable), Info_columns, fName.
# Writes: one scatter plot (.png) per scored edge into fName, and the data
#         frame ChemProp2_file = Nw_edge + the four scores + their absolute
#         values + the sign of ChemProp2, sorted by decreasing ChemProp2.

# Score of one edge. `tbl` has 3 columns: metadata variable, feature 1, feature 2.
# Score = (cor(metadata, feature 2) - cor(metadata, feature 1)) / 2
score_from_correlation <- function(tbl, method = "pearson") {
  corr <- cor(tbl, method = method)
  (corr[1, 3] - corr[1, 2]) / 2
}

n_edges <- NROW(Nw_edge)

# One slot per edge, filled inside the loop; edges that cannot be scored stay NA
ChemProp2 <- rep(NA_real_, n_edges)
ChemProp_spearman <- rep(NA_real_, n_edges)
ChemProp_log <- rep(NA_real_, n_edges)
ChemProp_sqrt <- rep(NA_real_, n_edges)
edge_skipped <- logical(n_edges)

# Feature intensities only (the leading information columns are removed)
intensities <- Feature_file[, -(Info_columns), drop = FALSE]

# match() gives, for every sample of the metadata, the position of its column
# in the intensity table. This is the same for every edge, so it is done once.
reorder_id <- match(rownames(Meta_Data), colnames(intensities))
if (anyNA(reorder_id)) {
  stop("Samples of the metadata without a column in the feature table: ",
       paste(rownames(Meta_Data)[is.na(reorder_id)], collapse = ", "),
       call. = FALSE)
}

for (i in seq_len(n_edges)) {

  # x holds the rows of 'Feature ID 1' and 'Feature ID 2' of the current edge
  x <- rbind(
    subset(intensities, rownames(intensities) == Nw_edge[i, 1]),
    subset(intensities, rownames(intensities) == Nw_edge[i, 2])
  )

  # A feature ID that is absent from the feature table cannot be scored
  if (nrow(x) != 2) {
    edge_skipped[i] <- TRUE
    next
  }

  # Samples in the order of the metadata, transposed so that each feature is a column
  reordered_x <- data.frame(t(x[reorder_id]))
  # Resulting columns: metadata variable (e.g. timepoint), 'Feature ID 1', 'Feature ID 2'
  reordered_x <- cbind(Meta_Data[, 1], reordered_x)

  ChemProp_score <- score_from_correlation(reordered_x, method = "pearson")
  Score_spearman <- score_from_correlation(reordered_x, method = "spearman")

  # Natural log (of intensity + 1) and square root of both features, then Pearson
  log_reorderedX <- cbind(reordered_x[, 1], log(reordered_x[, 2:3] + 1))
  Score_log <- score_from_correlation(log_reorderedX)

  sqrt_reorderedX <- cbind(reordered_x[, 1], sqrt(reordered_x[, 2:3]))
  Score_sqrt <- score_from_correlation(sqrt_reorderedX)

  ChemProp2[i] <- ChemProp_score
  ChemProp_spearman[i] <- Score_spearman
  ChemProp_log[i] <- Score_log
  ChemProp_sqrt[i] <- Score_sqrt

  Max_Y_Axis <- max(reordered_x[, 2], reordered_x[, 3])

  # Plotting scatterplots (only for edges with a defined ChemProp2 score)
  if (!is.na(ChemProp_score)) {
    Col_names <- colnames(reordered_x)
    png(filename = paste0(fName, "/ScatterPlot_", i, "_ChemProp_", ChemProp_score, ".png"),
        width = 20, height = 20, units = "cm", bg = "white", res = 600)
    layout(matrix(1:1, nrow = 1, byrow = FALSE))
    # Wide right margin and xpd = TRUE leave room for the legend outside the plot
    par(mar = c(5, 4, 4, 8), mgp = c(2, 1, 0), cex.axis = 1, cex.lab = 1, cex.main = 1, xpd = TRUE)
    plot(reordered_x[, 1], reordered_x[, 2],
         main = paste0("Scatter Plot of Cluster IDs: ", Col_names[2], " vs ", Col_names[3]),
         sub = paste0("ChemProp2 score: ", ChemProp_score),
         col = "red", xlab = "Hours", ylab = "Abundance", ylim = c(0, Max_Y_Axis))
    points(reordered_x[, 1], reordered_x[, 3], col = "blue")
    # NOTE(review): axis label and ticks assume the metadata is a time of 0-50 hours
    xtick <- seq(0, 50, by = 5)
    axis(side = 1, at = xtick, labels = TRUE)

    legend("topright", inset = c(-0.2, 0), legend = c(Col_names[2], Col_names[3]),
           col = c("red", "blue"), lty = 1:2, cex = 0.8, pch = 1)
    dev.off()
  }
}

if (any(edge_skipped)) {
  warning(sum(edge_skipped), " edge(s) were not scored because a feature ID is ",
          "missing from the feature table; their scores are NA", call. = FALSE)
}
print(paste0("No.of Scatter Plots in the Results Folder will be: ", sum(!is.na(ChemProp2))))

Nw_edge_new <- cbind(Nw_edge, ChemProp2, ChemProp_spearman, ChemProp_log, ChemProp_sqrt)
rownames(Nw_edge_new) <- NULL
# Rearranging Nw_edge_new in the decreasing order of ChemProp2 score
Nw_edge_new <- Nw_edge_new[order(Nw_edge_new[["ChemProp2"]], decreasing = TRUE), ]

# The score columns are picked by name, so the number of columns of Nw_edge does not matter
score_columns <- c("ChemProp2", "ChemProp_spearman", "ChemProp_log", "ChemProp_sqrt")
Abs_values <- abs(Nw_edge_new[, score_columns])
colnames(Abs_values) <- paste("abs", colnames(Abs_values), sep = "_")

Sign_ChemProp2 <- sign(Nw_edge_new[["ChemProp2"]]) # sign of ChemProp2: 1, -1 (or 0)

ChemProp2_file <- cbind(Nw_edge_new, Abs_values, Sign_ChemProp2)

<div class="alert alert-block alert-warning">
<b>Combining the information from clusterinfo file onto ChemProp file:</b>
</div> 

In [None]:
# Appends, for both features of every edge, the cluster_info columns from
# position 31 onwards to ChemProp2_file, then writes two .csv files into fName.
#
# The lookup is done for all edges at once with match() instead of growing a
# data frame row by row. A feature ID that is not present in cluster_info
# yields NA in the appended columns (the first match is used if an ID repeats).

cluster_ids <- cluster_info[["cluster index"]]
if (is.null(cluster_ids)) {
  stop("cluster_info has no column named 'cluster index'", call. = FALSE)
}
annotation_columns <- 31:ncol(cluster_info)

# cluster_info columns of the given feature IDs, column names prefixed with `prefix`
lookup_compound <- function(feature_ids, prefix) {
  hits <- cluster_info[match(feature_ids, cluster_ids), annotation_columns, drop = FALSE]
  colnames(hits) <- paste(prefix, colnames(hits), sep = "_")
  rownames(hits) <- NULL
  hits
}

# Columns 1 and 2 of ChemProp2_file are the two feature IDs of an edge
ChemProp_new <- cbind(
  ChemProp2_file,
  lookup_compound(ChemProp2_file[[1]], "Compound1"),
  lookup_compound(ChemProp2_file[[2]], "Compound2")
)

# NAs of the numeric columns are replaced by 0; other column types are left as they are
ChemProp_NAs_replaced <- ChemProp_new %>% mutate_if(is.numeric, ~replace(., is.na(.), 0))

write.csv(ChemProp2_file, paste0(fName, '/20220601_ChemProp2_score_Result_ohne_t0_Ecoli.csv'), row.names = FALSE)
write.csv(ChemProp_NAs_replaced, paste0(fName, '/20220602_ChemProp2_replaced_NAs.csv'), row.names = FALSE)

<div class="alert alert-block alert-warning">
<b>Visualizing the distribution of different ChemProp scores of the sample data:</b>
</div>

In [1]:
# Frequency plot of the four ChemProp scores, binned in steps of 0.1 over [-1, 1].
# Reads ChemProp2_file; builds SCORES (columns scores, Freq, Condition) and Barplot.

bins <- seq(-1, 1, 0.1)
# The score columns are picked by name rather than by position
score_columns <- c("ChemProp2", "ChemProp_spearman", "ChemProp_log", "ChemProp_sqrt")

SCORES <- do.call(rbind, lapply(score_columns, function(column) {
  # cut() stores the data into the appropriate bins; each bin is labelled with
  # its upper bound. include.lowest = TRUE keeps a score of exactly -1 in the
  # first bin instead of turning it into NA.
  scores <- cut(ChemProp2_file[[column]], bins,
                labels = as.character(seq(-0.9, 1, 0.1)), include.lowest = TRUE)
  cbind(as.data.frame(table(scores)), Condition = paste0("Freq_", column))
}))

Barplot <- ggplot(SCORES, aes(scores, Freq, fill = Condition)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.8) +
  scale_fill_brewer(palette = "Set1") +
  ggtitle(label = "Frequency plot") +
  xlab("Range") + ylab("Frequency") + labs(fill = "Frequency scores:") +
  # theme_bw() (white background and gray grid lines) is a complete theme: it
  # has to come BEFORE theme(), otherwise it discards the settings made there
  theme_bw() +
  theme(text = element_text(size = 12, face = "bold"),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),    # setting the angle for the x label
        axis.text.y = element_text(angle = 45, vjust = 0.5, hjust = 1))    # setting the angle for the y label

Barplot + theme(plot.title = element_text(hjust = 0.5, size = 16, face = "bold")) # centering the plot title

ERROR: Error in as.matrix(ChemProp2_file[, i]): object 'ChemProp2_file' not found
