In [None]:
import numpy as np
import pandas as pd
import glob
import os
from pyteomics import mgf, auxiliary

In [None]:
# Output directory for interim analysis results.
path = os.path.join("results", "interim", "analysis")
# makedirs also creates missing parent directories ("results", "results/interim"),
# and exist_ok=True makes the cell safe to re-run.
os.makedirs(path, exist_ok=True)

First, import the feature table:

In [None]:
# Load the feature matrix, drop the columns that are not needed for annotation
# and replace missing values with 0.
FeatureMatrix = os.path.join("results", "features", "FeatureMatrix.tsv")
DF_features = pd.read_csv(FeatureMatrix, sep="\t")
DF_features = DF_features.drop(columns=["charge", "quality"])
DF_features = DF_features.fillna(0)

# "feature_ids" is stored as text with square brackets around comma-separated ids.
# Strip the brackets as literal characters (regex=False, so the result does not
# depend on the pandas version's default) and split every row into a Python list
# in one vectorised pass instead of a chained-assignment loop.
# NOTE(review): the split is on "," only, so any whitespace after the commas is
# kept inside the ids - confirm the on-disk format has none.
DF_features["feature_ids"] = (
    DF_features["feature_ids"]
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.split(",")
)
DF_features

#### `1) SIRIUS and CSI:FingerID annotations`

Create a matrix with all SIRIUS and CSI:FingerID formula and structural predictions. Keep only the rank #1 predictions, then combine the dataframes to annotate formula and structural predictions according to RT and mz:

df_formulas will likely contain duplicate formulas that could be either isomeric, isobaric compounds, or identical compounds (with identical RT and mz). Here, we want to collapse the identical, repeating compounds

In [None]:
# Collect the rank-1 SIRIUS formula prediction tables into one dataframe.
# sorted() makes the row order (and thus the order of joined annotations later)
# independent of the file-system listing order.
input_formulas = sorted(glob.glob(os.path.join("results", "SiriusCSI", "formulas_*.tsv")))
if not input_formulas:
    raise FileNotFoundError("No formulas_*.tsv files found under results/SiriusCSI")

list_of_df = []
for formula_table in input_formulas:
    df = pd.read_csv(formula_table, sep="\t", index_col="Unnamed: 0")
    # Compare on the numeric rank so the filter also works when the column
    # was parsed as text.
    rank = pd.to_numeric(df["opt_global_rank"])
    df = df.loc[rank == 1]
    df = df.rename(columns={"opt_global_featureId": "featureId"})
    # Drop every column whose name contains "Score", then every remaining
    # column whose name contains "opt" (featureId was renamed above, so it stays).
    df = df.drop(columns=df.filter(regex="Score").columns)
    df = df.drop(columns=df.filter(regex="opt").columns)
    list_of_df.append(df)

# ignore_index=True discards the per-file row index and numbers the rows 0..n-1.
DF_SIRIUS = pd.concat(list_of_df, ignore_index=True)
DF_SIRIUS = DF_SIRIUS.rename(columns={"chemical_formula": "formulas", "exp_mass_to_charge": "mz", "retention_time": "RT"})
# Remove the literal "id_" prefix from the feature ids.
DF_SIRIUS["featureId"] = DF_SIRIUS["featureId"].str.replace("id_", "", regex=False)
DF_SIRIUS

Repeat for structural predictions (remove duplicates with the same inchi_keys, which means they represent the same structure):

In [None]:
# Collect the rank-1 CSI:FingerID structure prediction tables into one dataframe.
# sorted() makes the row order (and thus the order of joined annotations later)
# independent of the file-system listing order.
input_structures = sorted(glob.glob(os.path.join("results", "SiriusCSI", "structures_*.tsv")))
if not input_structures:
    raise FileNotFoundError("No structures_*.tsv files found under results/SiriusCSI")

list_of_df = []
for structure_table in input_structures:
    df = pd.read_csv(structure_table, sep="\t", index_col="Unnamed: 0")
    # Compare on the numeric rank so the filter also works when the column
    # was parsed as text.
    rank = pd.to_numeric(df["opt_global_rank"])
    df = df.loc[rank == 1]
    df = df.rename(columns={"opt_global_featureId": "featureId"})
    # Drop every column whose name contains "Score", then every remaining
    # column whose name contains "opt" (featureId was renamed above, so it stays).
    df = df.drop(columns=df.filter(regex="Score").columns)
    df = df.drop(columns=df.filter(regex="opt").columns)
    list_of_df.append(df)

# ignore_index=True discards the per-file row index and numbers the rows 0..n-1.
DF_CSI = pd.concat(list_of_df, ignore_index=True)
DF_CSI = DF_CSI.rename(columns={"chemical_formula": "formulas", "exp_mass_to_charge": "mz", "retention_time": "RT", "description": "name"})
# Remove the literal "id_" prefix from the feature ids.
DF_CSI["featureId"] = DF_CSI["featureId"].str.replace("id_", "", regex=False)
DF_CSI

Annotate the formulas and structural predictions to the feature matrix according to SIRIUS and CSI:

In [None]:
# Annotate every consensus feature with the SIRIUS formulas predicted for any of
# its member feature ids.

# Materialise the (formula, feature id) pairs once instead of re-reading the
# two columns for every feature row.
sirius_pairs = [
    (f"{formula}", pred_id)
    for formula, pred_id in zip(DF_SIRIUS["formulas"], DF_SIRIUS["featureId"])
]

sirius_predictions = []
for member_ids in DF_features["feature_ids"]:
    # dict.fromkeys removes duplicate formulas while keeping first-seen order.
    hits = dict.fromkeys(
        formula for formula, pred_id in sirius_pairs if pred_id in member_ids
    )
    sirius_predictions.append(" ## ".join(hits))

# Assign the whole column at once (no chained `df[col][i] = ...` writes, which
# are unreliable under pandas Copy-on-Write). Dropping a stale column first
# keeps the cell re-runnable while still placing the column at position 0.
if "SIRIUS_predictions" in DF_features.columns:
    DF_features = DF_features.drop(columns="SIRIUS_predictions")
DF_features.insert(0, "SIRIUS_predictions", sirius_predictions)
DF_features

In [None]:
# Annotate every consensus feature with the CSI:FingerID structures (name,
# formula, SMILES) predicted for any of its member feature ids, keeping only
# structures whose formula agrees with one of the feature's SIRIUS formulas.

# Materialise the CSI rows once instead of re-reading four columns per feature.
csi_rows = [
    (f"{name}", f"{formula}", f"{smiles}", pred_id)
    for name, formula, smiles, pred_id in zip(
        DF_CSI["name"], DF_CSI["formulas"], DF_CSI["smiles"], DF_CSI["featureId"]
    )
]

csi_names, csi_formulas, csi_smiles = [], [], []
for member_ids, sirius in zip(DF_features["feature_ids"], DF_features["SIRIUS_predictions"]):
    # Compare against the individual SIRIUS formulas. A substring test on the
    # joined string would wrongly accept e.g. "C6H12O" for "C6H12O6".
    sirius_formulas = set(sirius.split(" ## "))
    # Keyed by structure name: the first hit for a given name wins and
    # insertion order is preserved.
    hits = {}
    for name, formula, smiles, pred_id in csi_rows:
        if pred_id in member_ids and formula in sirius_formulas and name not in hits:
            hits[name] = (formula, smiles)
    csi_names.append(" ## ".join(hits))
    csi_formulas.append(" ## ".join(formula for formula, _ in hits.values()))
    csi_smiles.append(" ## ".join(smiles for _, smiles in hits.values()))

# Each column is inserted at position 0, so the final left-to-right order is
# smiles, formula, name. Dropping a stale column first keeps the cell re-runnable.
for column, values in (
    ("CSI_predictions_name", csi_names),
    ("CSI_predictions_formula", csi_formulas),
    ("CSI_predictions_smiles", csi_smiles),
):
    if column in DF_features.columns:
        DF_features = DF_features.drop(columns=column)
    DF_features.insert(0, column, values)
DF_features

In [None]:
# Make sure the output directory exists before writing, so that the notebook
# also runs top to bottom on a fresh checkout.
annotations_dir = os.path.join("results", "annotations")
os.makedirs(annotations_dir, exist_ok=True)
DF_features.to_csv(os.path.join(annotations_dir, "SiriusCSI_annotated_FeatureMatrix.tsv"), sep="\t", index=False)

#### `2) GNPS annotations`

This step requires the file under the DB_result directory (downloaded cytoscape data) after FBMN. Move the .TSV table under the resources directory and annotate the FeatureMatrix according to SCAN numbers:

In [None]:
# Load the GNPS library-match table(s) from the resources directory.
# (The builtin name `list` is deliberately not reused as a variable here.)
gnps_tables = sorted(glob.glob(os.path.join("resources", "*.tsv")))
if not gnps_tables:
    raise FileNotFoundError("No GNPS .tsv table found under the resources directory")

library_hits = []
for gnps_table in gnps_tables:
    df = pd.read_csv(gnps_table, sep="\t")
    # Discard negative-mode hits and hits with a mass error above 10 ppm.
    # Written as a negated ">" so rows with a missing MZErrorPPM are kept.
    keep = (df["IonMode"] != "negative") & ~(df["MZErrorPPM"] > 10.0)
    library_hits.append(df.loc[keep])

# Combine every table (rather than silently keeping only the last one read),
# keep the first hit per compound name, and store scan numbers as strings so
# they can be compared with the SCANS labels of the feature matrix.
GNPS = (
    pd.concat(library_hits, ignore_index=True)
    .drop_duplicates(subset="Compound_Name", keep="first")
    .assign(**{"#Scan#": lambda hits: hits["#Scan#"].astype(str)})
)
GNPS

Annotate the features detected by GNPS according to mz and RT (mz tolerance 10 ppm and RT tolerance 20 seconds: instrument and method-dependent)

In [None]:
# Output directory for the annotated feature matrices.
path = os.path.join("results", "annotations")
# makedirs also creates a missing "results" parent, and exist_ok=True makes the
# cell safe to re-run.
os.makedirs(path, exist_ok=True)

Add the SCAN number information to the annotated FeatureMatrix in order to match the GNPS MSMS library matching results:

In [None]:
# Reload the SIRIUS/CSI-annotated feature matrix.
# "id" is parsed as text straight from the file so that large integer ids are
# never routed through a float dtype (which would lose digits).
Matrix = pd.read_csv(
    os.path.join("results", "annotations", "SiriusCSI_annotated_FeatureMatrix.tsv"),
    sep="\t",
    dtype={"id": str},
)
# Keep missing ids as the string "nan", as a plain astype(str) cast would.
Matrix["id"] = Matrix["id"].astype(str)
Matrix

Convert the MGF file into a dataframe:

In [None]:
# Read the spectrum parameter blocks of the exported MGF file into a dataframe
# (one row per spectrum).
path = os.path.join("results", "GNPSexport", "MSMS.mgf")
# The context manager closes the underlying file once all spectra are read.
with mgf.MGF(source=path, use_header=True, convert_arrays=2, read_charges=True, read_ions=False, dtype=None, encoding=None) as reader:
    parameters = [spectrum["params"] for spectrum in reader]
mgf_file = pd.DataFrame(parameters)
# Remove the literal "e_" marker from the feature ids.
mgf_file["feature_id"] = mgf_file["feature_id"].str.replace("e_", "", regex=False)
mgf_file

Add SCANS column to the Feature Matrix

In [None]:
# Attach the MGF scan number(s) of every feature to the feature matrix.

# Index the scans by feature id once (single pass over the MGF table) instead
# of rescanning the whole table for every matrix row. The inner dict is used as
# an ordered set: duplicates are dropped, first-seen order is kept.
scans_by_feature = {}
for scan, feature_id in zip(mgf_file["scans"], mgf_file["feature_id"]):
    if pd.isna(feature_id):
        # A missing feature id can never equal a matrix id.
        continue
    scans_by_feature.setdefault(feature_id, {})[f"{scan}"] = None

scan_labels = [
    " ## ".join(scans_by_feature.get(feature_id, {}))
    for feature_id in Matrix["id"]
]

# Assign the whole column at once (no chained `df[col][i] = ...` writes).
# Dropping a stale column first keeps the cell re-runnable while still placing
# the column at position 0.
if "SCANS" in Matrix.columns:
    Matrix = Matrix.drop(columns="SCANS")
Matrix.insert(0, "SCANS", scan_labels)
Matrix

In [None]:
# Annotate every feature with the GNPS library matches of its scan number(s).

# Index the compound names by scan number once. The inner dict is used as an
# ordered set: duplicates are dropped, first-seen order is kept.
names_by_scan = {}
for name, scan_number in zip(GNPS["Compound_Name"], GNPS["#Scan#"]):
    names_by_scan.setdefault(scan_number, {})[f"{name}"] = None

gnps_labels = []
for scans in Matrix["SCANS"]:
    hits = {}
    # SCANS may hold several scan numbers joined by " ## ", so look each one up
    # individually. Comparing the joined string as a whole would never match.
    for scan in scans.split(" ## "):
        hits.update(names_by_scan.get(scan, {}))
    gnps_labels.append(" ## ".join(hits))

# Assign the whole column at once (no chained `df[col][i] = ...` writes).
# Dropping a stale column first keeps the cell re-runnable while still placing
# the column at position 0.
if "GNPS" in Matrix.columns:
    Matrix = Matrix.drop(columns="GNPS")
Matrix.insert(0, "GNPS", gnps_labels)

Matrix.to_csv(os.path.join("results", "annotations", "GNPS_annotated_FeatureMatrix.tsv"), sep="\t", index=False)
Matrix

Keep the unannotated features only

In [None]:
# Keep only the features without a GNPS library match and index them by (RT, mz).
# NOTE: this rebinds `Matrix`, so the cell cannot be re-run without re-running
# the GNPS annotation cell first.
Matrix = (
    Matrix.loc[Matrix["GNPS"] == ""]
    .drop(columns="GNPS")
    .set_index(["RT", "mz"])
)

# Write a flat copy (RT and mz moved back to ordinary leading columns) to disk.
unknowns_path = os.path.join("results", "annotations", "Matrix_unknowns.tsv")
Matrix_tocsv = Matrix.reset_index()
Matrix_tocsv.to_csv(unknowns_path, sep="\t", index=None)
Matrix