**Selecting columns from cluster summary files, analysing them, and storing the results as new files**<br>
Authors: Abzer Kelminal (abzer.shah@uni-tuebingen.de)<br>
Edited by:  <br>
Input file format: .clustersummary files from GNPS Classic MN output<br>
Outputs: Combined .csv files<br>
Dependencies: library(dplyr)

In [None]:
getwd()  # Show the current working directory.

# Install dplyr only when it is missing, so that re-running this cell does not
# download and reinstall the package every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library("dplyr")

**STEP 1 : Setting the working directory:**
 Copy the path of the folder that contains your files and paste it into the `setwd()` call below.<br> 
 For example: `C:\Users\Nutzer\Desktop\Test_Data`. Make sure to replace every `\` with `/` when pasting the path into `setwd()`.

In [None]:
# Make the data folder the working directory: STEP 2 searches for the input
# files below getwd(). Replace this example path with your own folder.
setwd("C:/Users/Nutzer/Desktop/Test_Data") 

**STEP 2 :** Initially, get the path of the data folder and its files.

In [None]:
# File-name pattern used to find the input files. Note that STEP 4 only reads
# files when this is exactly ".clustersummary".
pattern <- ".clustersummary"

# Full path of every matching file below the working directory (sub-folders
# included).
dirs <- list.files(
  path = getwd(),
  pattern = pattern,
  full.names = TRUE,
  recursive = TRUE
)

# Distinct folders that contain at least one matching file.
folders <- unique(dirname(dirs))

# Matching files inside those folders, with their full paths.
files <- list.files(path = folders, pattern = pattern, full.names = TRUE)

files_1 <- basename(files)  # file names only
files_2 <- dirname(files)   # folder of each file

**STEP 3 :** Creating a Result folder to store all the result files

In [None]:
# Stop early with a readable message when STEP 2 found nothing; otherwise the
# indexing below fails with an obscure "subscript out of bounds" error.
if (length(files) == 0) {
  stop(
    "No files matching '", pattern, "' were found below ", getwd(),
    call. = FALSE
  )
}

# Drop the file extension from every file name in one vectorised call. The
# stripped names are used in STEP 4 to build the output file names.
files_1 <- tools::file_path_sans_ext(files_1)

# The results folder is a sibling of the first data folder, named
# "<data folder>_Results". It is only created when it does not exist yet, so
# re-running the cell does not raise an "already exists" warning.
fName <- paste0(files_2[[1]], "_Results")
if (!dir.exists(fName)) {
  dir.create(fName)
}

**STEP 4 :** Reading the files and selecting the columns needed for analysis and storing them as new files in Result folder.

In [None]:

# temp[[j]] keeps the raw table read from files[j]; final[[j]] keeps the
# assembled result that is also written to disk.
temp <- vector("list", length(files))
final <- vector("list", length(files))

# The reader below only understands tab-separated .clustersummary files.
# Fail with a clear message instead of a later "subscript out of bounds".
if (pattern != ".clustersummary") {
  stop(
    "Only '.clustersummary' input is supported, but pattern is '", pattern, "'",
    call. = FALSE
  )
}

for (j in seq_along(files)) {
  temp[[j]] <- read.csv(file = files[j], header = TRUE, sep = "\t")

  # Pick the columns of interest by the start and end of their names.
  clusterID <- temp[[j]] %>%
    select(starts_with("cluster") & ends_with("index"))
  PrecursorMass <- temp[[j]] %>%
    select(starts_with("precursor") & ends_with("mass"))
  ComponentIndex <- temp[[j]] %>%
    select(starts_with("component") & ends_with("index"))
  LibraryID <- temp[[j]] %>%
    select(starts_with("Library") & ends_with("ID"))
  # All GNPSGROUP columns; their number varies from file to file.
  GNPS <- temp[[j]] %>% select(contains("GNPSGROUP"))

  # The calculations below need a cluster index, exactly one component index,
  # exactly one library ID and at least one GNPSGROUP column.
  if (ncol(clusterID) < 1 || ncol(ComponentIndex) != 1 ||
        ncol(LibraryID) != 1 || ncol(GNPS) < 1) {
    stop(
      "File '", files[j], "' does not contain the expected cluster index, ",
      "component index, LibraryID and GNPSGROUP columns",
      call. = FALSE
    )
  }

  # Group names without the "GNPSGROUP." prefix, e.g. "GNPSGROUP.Top3" -> "Top3".
  group_names <- gsub("GNPSGROUP.", "", colnames(GNPS))
  n_rows <- nrow(GNPS)
  n_groups <- ncol(GNPS)

  # Repeats a per-row vector once per group column, giving a matrix with the
  # same shape as the group matrices so both can be combined element-wise.
  per_group <- function(v) {
    matrix(as.vector(v), nrow = n_rows, ncol = n_groups)
  }

  # 0 where LibraryID is "N/A", 1 otherwise.
  BinID <- ifelse(LibraryID == "N/A", 0, 1)
  colnames(BinID) <- "Binary_Library_ID"

  # 1 where the group value is > 0, else 0. Columns: "Binary_Present-<group>".
  BinPresent <- ifelse(GNPS > 0, 1, 0)
  colnames(BinPresent) <- paste("Binary_Present", group_names, sep = "-")

  # Presence flag kept only on rows that have a library ID (BinID == 1);
  # every other row gets 0.
  BinLibID <- ifelse(per_group(BinID == 1), BinPresent, 0)
  colnames(BinLibID) <- paste("Binary_Library_ID", group_names, sep = "-")

  # Presence flag kept only on rows whose component index is -1 (the rows this
  # script labels as self-loops); every other row gets 0.
  BinSelfloop <- ifelse(per_group(ComponentIndex[[1]] == -1), BinPresent, 0)
  colnames(BinSelfloop) <- paste("Binary_Selfloop", group_names, sep = "-")

  # Cluster index where the group is present, else 0. Columns: "<group>".
  var <- ifelse(BinPresent == 1, per_group(clusterID[[1]]), 0)
  colnames(var) <- group_names

  # Put the selected and the derived columns side by side.
  result <- cbind(
    clusterID, PrecursorMass, ComponentIndex, LibraryID, BinID,
    GNPS, BinPresent, BinLibID, BinSelfloop, var
  )

  final[[j]] <- as.data.frame(result)

  # One comma-separated file per input, named "NewFiles_<input name>.csv", in
  # the results folder. write.csv() always writes commas, so no 'sep' is given
  # (passing one is ignored with a warning).
  write.csv(
    final[[j]],
    file = file.path(fName, paste0("NewFiles_", files_1[[j]], ".csv")),
    row.names = FALSE
  )
}