##### `If your files originate from a Thermo instrument:`

Copy all your *.raw files to data/raw directory and you can use the following script to generate a samples.tsv file automatically:

In [None]:
import pandas as pd
import os
df = pd.DataFrame()
df["sample_name"] = [file for file in os.listdir(os.path.join("data", "raw")) if file.endswith(".raw")]
df["sample_name"]=df["sample_name"].replace(".raw", value="", regex=True)
df["comment"] = " "
df["MAPnumber"] = " "
df.to_csv(os.path.join("config", "dataset.tsv"), sep="\t")
df.to_csv(os.path.join("config", "samples.tsv"), sep="\t")
df

##### `If your files originate from another instrument:`

Copy all your already converted files *.mzML to data/mzML directory and you can use the following script to generate a samples.tsv file automatically:

In [1]:
import pandas as pd
import os
df = pd.DataFrame()
df["sample_name"] = [file for file in os.listdir(os.path.join("data", "mzML")) if file.endswith(".mzML")]
df["sample_name"]=df["sample_name"].replace(".mzML", value="", regex=True)
df["comment"] = " "
df["MAPnumber"] = " "
df.to_csv(os.path.join("config", "dataset.tsv"), sep="\t")
df.to_csv(os.path.join("config", "samples.tsv"), sep="\t")
df

Unnamed: 0,sample_name,comment,MAPnumber
0,blank5,,
1,SDCe0686,,
2,SDCe0687,,
3,SDCe0681_1ul,,
4,blank4,,
5,blank3,,
6,blank2,,
7,SDCe0681,,
8,SDCe0682,,
9,MDCe0218,,


##### `Define blanks/QCs/controls:` 

In [3]:
fList =input("Please enter a list of comma separated filenames for your blanks, QCs or control samples from the filelist: ").split(",")
blank_DF = pd.DataFrame({"sample_name":fList})
for i, blank in enumerate(zip(blank_DF.index, blank_DF["sample_name"])):
    for i, filename in zip(df.index, df["sample_name"]):
        if blank==filename:
            blank_DF["sample_name"][i] = df["sample_name"][i]
    blank_DF= blank_DF.dropna(how="all")
    blank_DF.to_csv(os.path.join("config", "blanks.tsv"), sep="\t")

blank_DF= pd.read_csv(os.path.join("config", "blanks.tsv"), sep="\t", index_col="Unnamed: 0")
blank_DF["sample_name"]= blank_DF["sample_name"].astype(str)
sample_DF= pd.DataFrame()
if len(blank_DF)==0:
    df.to_csv(os.path.join("config", "samples.tsv"), sep="\t")
else:
    sample_DF= df[df["sample_name"].str.contains("|".join(blank_DF["sample_name"]))==False]
    sample_DF.to_csv(os.path.join("config", "samples.tsv"), sep="\t")

##### `Create a detailed GNPS metadata table:`
This is datafile-dependent so it is preferable to do it interactively through a Jupyter notebook

In [None]:
# Create a metadata table from the list of mzML files compatible for GNPS

metadata = pd.DataFrame()
metadata["filename"] = [file for file in os.listdir(os.path.join("data", "mzML")) if file.endswith(".mzML")]
metadata["ATTRIBUTE_MAPID"]= ["MAP" + str(i) for i in range(len(metadata))]
metadata['ATTRIBUTE_compound'] = metadata['filename'].replace(".mzML", value="", regex=True)
metadata['ATTRIBUTE_genomeID']=metadata['filename'].str.extract(r'(NBC_?\d*|NBC?\d*)')
metadata['ATTRIBUTE_genomeID']= metadata['ATTRIBUTE_genomeID'].fillna("blank")
metadata['ATTRIBUTE_genomeIDMDNA']=metadata['filename'].str.extract(r'(MDNAWGS?\d*|MDNA_WGS_?\d*)')
metadata['ATTRIBUTE_genomeID']=metadata['ATTRIBUTE_genomeID'].fillna(metadata['ATTRIBUTE_genomeIDMDNA'])
metadata["ATTRIBUTE_media"]= metadata['filename'].str.extract(r'(ISP2|DNPM|FPY12|MA|soyM\d*)')
metadata["ATTRIBUTE_comment"]= metadata['ATTRIBUTE_genomeID'].astype(str) +"_" + metadata["ATTRIBUTE_media"].astype(str)
metadata=metadata.drop(columns="ATTRIBUTE_genomeIDMDNA")
metadata.to_csv(os.path.join("results", "GNPSexport", "metadata.tsv"), sep='\t', index= None)
metadata