## `Formula, structural and compound class predictions of the preprocessed data`

Import libraries:

In [None]:
import os
import glob
from pyopenms import *
import pandas as pd
import shutil
import subprocess
import threading
from pathlib import Path

# Working directory for the SIRIUS input and output files written by this notebook
path = os.path.join("results", "interim", "SiriusCSI")

# SIRIUS

De novo metabolite identification from MS2 spectra.

**SIRIUS**: Identify molecular formula for each compound individually using fragmentation trees and isotope patterns. Output from this tool can be used to generate an OpenSwathAssay library with the AssayGeneratorMetabo TOPP tool.

**CSI:FingerID**: This subtool is dedicated to predicting molecular structures based on tandem mass spectrometry (MS/MS) data. It utilizes a fragmentation tree approach for the annotation of fragment spectra.

**CANOPUS**: Predict compound categories for each compound individually based on its predicted molecular fingerprint (CSI:FingerID) using CANOPUS.

###### Documentation: https://boecker-lab.github.io/docs.sirius.github.io/

###### Citation: Kai Dührkop, Huibin Shen, Marvin Meusel, Juho Rousu, and Sebastian Böcker, Searching molecular structure databases with tandem mass spectra using CSI:FingerID, PNAS October 13, 2015 112 (41) 12580-12585, https://doi.org/10.1073/pnas.1509788112

# SIRIUS Login (required!)
Uncomment the first line and enter your user email and password, otherwise SIRIUS will not run.

In [None]:
# Login template: fill in your e-mail and password between the quotes, then uncomment the line.
# !sirius login --email="" --password=""
# NOTE(review): presumably prints the current login state — confirm with `sirius login --help`.
!sirius login --show

In [None]:
# Start from an empty SIRIUS working directory
if os.path.exists(path):
    shutil.rmtree(path)
os.mkdir(path)

# Feature maps are taken from Requantification when that directory exists, otherwise from Preprocessing
interim_dir = os.path.join("results", "interim")
feature_source = (
    "Requantification"
    if os.path.exists(os.path.join(interim_dir, "Requantification"))
    else "Preprocessing"
)
feature_files = sorted(glob.glob(os.path.join(interim_dir, feature_source, "MFD_*.featureXML")))

input_mzml_files = sorted(glob.glob(os.path.join(interim_dir, "mzML", "MapAligned_*.mzML")))

# Load every feature map once; each one is matched to its mzML file below
feature_maps = []
for feature_file in feature_files:
    fmap = FeatureMap()
    FeatureXMLFile().load(feature_file, fmap)
    feature_maps.append(fmap)

# SIRIUS CLI commands are only collected here so that they can be run in parallel afterwards
sirius_commands = []
for filename in input_mzml_files:
    exp = MSExperiment()
    MzMLFile().load(filename, exp)
    exp.sortSpectra(True)
    print(exp.getNrSpectra())

    mzml_name = os.path.basename(filename)
    sample_file = mzml_name[11:]  # file name without the "MapAligned_" prefix
    sample = mzml_name[11:-5]  # same, additionally without the ".mzML" extension

    for fmap in feature_maps:
        # A feature map belongs to this mzML file when its "spectra_data" base name
        # (minus its first 7 characters) equals the mzML name without prefix.
        # NOTE(review): the 7-character prefix is assumed from the slicing only — verify.
        spectra_source = os.path.basename(fmap.getMetaValue("spectra_data")[0].decode())
        if spectra_source[7:] != sample_file:
            continue

        # Copy of the map whose features are re-added without convex hulls and subordinates
        fm_no_sub = FeatureMap(fmap)
        fm_no_sub.clear(False)
        for feature in fmap:
            feature.setConvexHulls([])
            feature.setSubordinates([])
            fm_no_sub.push_back(feature)

        featureinfo = os.path.join(path, f"MFD_ncv_{sample}.featureXML")
        FeatureXMLFile().store(featureinfo, fm_no_sub)

        export = SiriusExportAlgorithm()
        export_par = export.getDefaults()
        # Number of mass traces each feature has to have to be included
        export_par.setValue("filter_by_num_masstraces", 2)
        # Uses the feature information from in_fm_no_sub to reduce the search space to MS2
        export_par.setValue("feature_only", "true")
        export.setParameters(export_par)

        fm_info = FeatureMapping_FeatureMappingInfo()
        feature_mapping = FeatureMapping_FeatureToMs2Indices()
        export.preprocessing(featureinfo, exp, fm_info, feature_mapping)

        # Each sample gets its own freshly created project directory
        project_dir = os.path.join(path, sample)
        if os.path.exists(project_dir):
            shutil.rmtree(project_dir)
        os.mkdir(project_dir)

        ms_file = os.path.join(project_dir, "sirius.ms")
        export.run(
            mzML_files=[filename.encode()],
            featureXML_files=[featureinfo.encode()],
            out_ms=ms_file,
            out_compoundinfo=os.path.join(project_dir, "compoundinfo.tsv"),
        )

        # SIRIUS CLI command as an argument list for Python subprocess; modify according to your data.
        # The project directory must stay at index 4: run_command writes its log there.
        global_options = [
            "sirius",
            "--input", ms_file,
            "--project", os.path.join(project_dir, "sirius"),
            "--no-compression",
            "--maxmz", "300",
        ]
        # FORMULA PREDICTION
        formula_options = [
            "formula",
            "--profile", "default",
            "--database", "none",
            "--ions-considered", "[M+H]+,[M+K]+,[M+Na]+,[M+H-H2O]+,[M+H-H4O2]+,[M+NH4]+,[M-H]-,[M+Cl]-,[M-H2O-H]-,[M+Br]-",
            "--elements-considered", "SBrClBSe",
            "--elements-enforced", "CHNOP",
            "--ppm-max", "10.0",
            "--ppm-max-ms2", "10.0",
            "--candidates", "1",
        ]
        # STRUCTURE PREDICTION CSI:FingerId
        structure_options = ["fingerprint", "structure", "--database", "BIO"]
        remaining_tools = ["canopus", "write-summaries"]

        command = global_options + formula_options + structure_options + remaining_tools
        sirius_commands.append(command)

# Run all SIRIUS commands in parallel via threading
def run_command(command):
    """Run one SIRIUS CLI command and write its console output to a log file.

    The long SIRIUS output is kept out of the Jupyter notebook: stdout followed by
    stderr is written to ``log.log`` inside the SIRIUS project directory
    (results/interim/SiriusCSI/FileName/sirius/log.log).

    Args:
        command: Argument list for ``subprocess``. Element 4 must be the SIRIUS
            project directory (the value that follows ``--project``).

    Returns:
        The exit code of the process.
    """
    print(f"\nExecuting command:\n{' '.join(command)} \n Please wait...")
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()

    log_dir = command[4]
    # If the process failed before creating the project directory, the log would be
    # lost to a FileNotFoundError raised inside the worker thread; create it instead.
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, "log.log")
    with open(log_file, "w", encoding="utf-8") as f:
        # Replace undecodable bytes rather than crash on non-UTF-8 console output
        f.write(stdout.decode(errors="replace"))
        f.write(stderr.decode(errors="replace"))

    if process.returncode != 0:
        print(f"\nWARNING: command exited with code {process.returncode}, see {log_file}")
    return process.returncode

# One worker thread per collected SIRIUS command
threads = [threading.Thread(target=run_command, args=(cmd,)) for cmd in sirius_commands]
for thread in threads:
    thread.start()
# Block until every command has finished
for thread in threads:
    thread.join()
print("\nDONE")
 

# Annotation of FeatureMatrix with SIRIUS results

In [None]:
# Feature matrix that receives the SIRIUS annotation columns (read here, overwritten at the end)
feature_matrix_file = os.path.join("results", "features", "FeatureMatrix.tsv")
df = pd.read_csv(feature_matrix_file, sep="\t")

# A "sirius" project folder is looked up inside every sample directory below `path`
sirius_projects_dirs = [Path(p, "sirius") for p in Path(path).iterdir() if p.is_dir()]

# (tool label, summary file inside the project folder, columns copied from that file)
annotation_sources = [
    # NOTE(review): the original comment on this file name read "for negative mode:" and was left unfinished
    ("SIRIUS", "formula_identifications.tsv", ["molecularFormula", "explainedIntensity"]),
    ("CSI:FingerID", "compound_identifications.tsv", ["molecularFormula", "name", "InChI", "smiles"]),
    (
        "CANOPUS",
        "canopus_compound_summary.tsv",
        ["NPC#pathway", "NPC#superclass", "NPC#class", "ClassyFire#most specific class"],
    ),
]

for p in sirius_projects_dirs:
    sample = p.parent.name
    for tool, annotation_file, cols in annotation_sources:
        file = p / annotation_file
        # Summary files that were not written are skipped silently
        if not file.exists():
            continue

        df_tmp = pd.read_csv(file, sep="\t")
        # Keep only the part of each id between "_0_" and the next "-"; this is the key matched below
        df_tmp["id"] = df_tmp["id"].apply(lambda x: x.split("_0_")[1].split("-")[0])

        # Both lookups are the same for every column of this file, so build them once
        feature_ids = df[f"{sample}.mzML_IDs"].astype(str)
        annotations_by_id = df_tmp.set_index("id")

        for col in cols:
            # Column names drop the "NPC#" / "ClassyFire#" prefixes
            label = col.replace("NPC#", "").replace("ClassyFire#", "")
            df[f"{sample}_{tool}_{label}"] = feature_ids.map(annotations_by_id[col].to_dict())

df.to_csv(feature_matrix_file, sep="\t", index=False)