In [18]:
### Preprocessing requirements
import pandas as pd
# 1. Copy all .h5 and .mzML files from the job folders to a single output directory for easier access.
# 2. Run the deconvolution to make <filename>_deconvoluted.parquet files.
import os
import numpy as np
from scipy.stats import ttest_ind
# import ztest
from statsmodels.stats.weightstats import ztest
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches

import sys  # needed by the sys.path handling below; without it a fresh kernel raises NameError

# Make the local envnet checkout importable before importing from it.
PYTHONPATH = "/global/homes/b/bpb/repos/envnet"
if PYTHONPATH not in sys.path:
    sys.path.insert(0, PYTHONPATH)

from envnet.annotation.core import AnnotationEngine

# Load the ENVnet reference (network GraphML plus the MGF spectra files).
annotation_engine = AnnotationEngine()
ref_dir = '/global/homes/b/bpb/repos/envnet/results/full_build_20250908_181404/'
# NOTE(review): the log printed by this call reports the MGF files under
# ".../envnet/data//global/homes/...", i.e. the absolute mgf_base_name seems to be
# appended to a data directory. Confirm the intended files are the ones being read.
node_data = annotation_engine.load_envnet_reference(
    graphml_file=os.path.join(ref_dir, "network_with_sirius.graphml"),
    mgf_base_name=os.path.join(ref_dir, "envnet")
)

# Column list that was previously assigned to `cols` and then immediately
# overwritten; it is not used by the read below. Kept under its own name so the
# information is not lost and the overwrite is no longer silent.
descriptor_cols = [ 'dbe', 'dbe_ai',
       'dbe_ai_mod', 'ai_mod', 'ai', 'nosc', 'h_to_c', 'o_to_c', 'n_to_c',
       'p_to_c', 'c', 'h', 'o', 'n', 's', 'p', 'original_index','precursor_mz','inchi_key', 'compound_name', 'smiles','NPC#pathway', 'NPC#superclass', 'NPC#class','predicted_formula']

# Only these columns are read from the prediction table and joined onto the nodes.
cols = ['original_index','predicted_unchanged_in_soil_prob', 'predicted_unchanged_in_soil']
model_data = pd.read_csv('../envnet/data/node_data_with_predicted_unchanged_20251107.csv', usecols=cols)
# Left join keeps every node row; nodes without a prediction get NaN in the new columns.
node_data['nodes'] = node_data['nodes'].merge(model_data, on='original_index', how='left')

Loading ENVnet reference data...
  GraphML file: /global/homes/b/bpb/repos/envnet/results/full_build_20250908_181404/network_with_sirius.graphml
  Deconvoluted MGF: /global/u2/b/bpb/repos/envnet/data//global/homes/b/bpb/repos/envnet/results/full_build_20250908_181404/envnet_deconvoluted_spectra.mgf
  Original MGF: /global/u2/b/bpb/repos/envnet/data//global/homes/b/bpb/repos/envnet/results/full_build_20250908_181404/envnet_original_spectra.mgf
Loaded 22128 ENVnet nodes


In [19]:
# Pull the two reference collections used by the FASST queries below:
# `pmz` is indexed per node to obtain the precursor m/z, `spectra` to obtain the peaks.
pmz, spectra = node_data['reference_pmzs'], node_data['deconvoluted_spectra']

In [None]:
import pandas as pd
import argparse
import os
import json
import requests
import time
from tqdm import tqdm

# This is the synchronous, low-performance endpoint.
def query_fasst_usi(usi, database, host="https://fasst.gnps2.org",
                    analog=False, precursor_mz_tol=0.05,
                    fragment_mz_tol=0.05, min_cos=0.7,
                    cache="Yes"):
    """Run a FASST search for a spectrum identified by a USI and return the response.

    Sends a single GET request to ``<host>/search`` and returns the decoded JSON
    body once the server answers.

    Args:
        usi: Spectrum identifier, sent as the ``usi`` query parameter.
        database: Name of the index to search, sent as ``library``.
        host: Base URL of the FASST web service.
        analog: When truthy, ``analog`` is sent as "Yes", otherwise "No".
        precursor_mz_tol: Sent as ``pm_tolerance``.
        fragment_mz_tol: Sent as ``fragment_tolerance``.
        min_cos: Sent as ``cosine_threshold``.
        cache: Sent unchanged as ``cache``.

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no answer arrives within 50 seconds.
    """
    params = {
        "usi": usi,
        "library": database,
        "analog": "Yes" if analog else "No",
        "pm_tolerance": precursor_mz_tol,
        "fragment_tolerance": fragment_mz_tol,
        "cosine_threshold": min_cos,
        "cache": cache
    }

    # Build the URL with plain string formatting: os.path.join is a filesystem
    # helper and would insert a backslash on Windows.
    search_url = "{}/search".format(host.rstrip("/"))

    r = requests.get(search_url, params=params, timeout=50)
    r.raise_for_status()

    return r.json()

# High-performance version: submits a task to the API host and (optionally) polls for it.
def query_fasst_api_usi(usi, database, host="https://api.fasst.gnps2.org",
                    analog=False, precursor_mz_tol=0.05,
                    fragment_mz_tol=0.05, min_cos=0.7,
                    cache="Yes",
                    lower_delta=100,
                    upper_delta=100,
                    blocking=True):
    """Submit a USI-based FASST search to the task API.

    POSTs the query as JSON to ``<host>/search``, reads the task ``id`` from the
    response and stores it (plus a "PENDING" status) in the parameter dict.

    Args:
        usi: Spectrum identifier, sent as ``usi``.
        database: Name of the index to search, sent as ``library``.
        host: Base URL of the FASST API.
        analog: When truthy, ``analog`` is sent as "Yes", otherwise "No".
        precursor_mz_tol: Sent as ``pm_tolerance``.
        fragment_mz_tol: Sent as ``fragment_tolerance``.
        min_cos: Sent as ``cosine_threshold``.
        cache: Accepted for signature compatibility but currently NOT forwarded;
            the request always sends ``"cache": "No"``.
            NOTE(review): confirm whether this is intentional.
        lower_delta: Sent as ``lower_delta``.
        upper_delta: Sent as ``upper_delta``.
        blocking: If exactly ``False``, return right after submission; otherwise
            wait for the task via ``get_results``.

    Returns:
        With ``blocking=False``: the parameter dict extended with ``task_id`` and
        ``status``. Otherwise: whatever ``get_results`` returns.

    Raises:
        requests.HTTPError: If the submission is rejected with an error status.
        requests.Timeout: If the submission is not answered within 5 seconds.
    """
    params = {
        "library": database,
        "usi": usi,
        "analog": "Yes" if analog else "No",
        "cache": "No",
        "lower_delta": lower_delta,
        "upper_delta": upper_delta,
        "pm_tolerance": precursor_mz_tol,
        "fragment_tolerance": fragment_mz_tol,
        "cosine_threshold": min_cos
    }

    # Plain string formatting instead of os.path.join, which is meant for
    # filesystem paths and is platform-dependent.
    search_url = "{}/search".format(host.rstrip("/"))

    r = requests.post(search_url, json=params, timeout=5)
    r.raise_for_status()

    # Remember the task handle so the caller (or get_results) can poll for it.
    params["task_id"] = r.json()["id"]
    params["status"] = "PENDING"

    if blocking is False:
        return params

    return get_results(params, host=host)

def query_fasst_peaks(precursor_mz, peaks, database, host="https://fasst.gnps2.org", analog=False, precursor_mz_tol=0.05, fragment_mz_tol=0.05, min_cos=0.7):
    """Run a FASST search for an explicit peak list and return the response.

    The spectrum is serialised to JSON and POSTed as form data to
    ``<host>/search``; the call returns once the server answers.

    Args:
        precursor_mz: Precursor m/z of the query spectrum.
        peaks: Peak list of the query spectrum; must be JSON-serialisable.
        database: Name of the index to search, sent as ``library``.
        host: Base URL of the FASST web service.
        analog: When truthy, ``analog`` is sent as "Yes", otherwise "No".
        precursor_mz_tol: Sent as ``pm_tolerance``.
        fragment_mz_tol: Sent as ``fragment_tolerance``.
        min_cos: Sent as ``cosine_threshold``.

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no answer arrives within 50 seconds.
    """
    spectrum_query = {
        "peaks": peaks,
        "precursor_mz": precursor_mz
    }

    params = {
        "query_spectrum": json.dumps(spectrum_query),
        "library": database,
        "analog": "Yes" if analog else "No",
        "pm_tolerance": precursor_mz_tol,
        "fragment_tolerance": fragment_mz_tol,
        "cosine_threshold": min_cos,
    }

    # Plain string formatting instead of os.path.join, which is meant for
    # filesystem paths and is platform-dependent.
    search_url = "{}/search".format(host.rstrip("/"))

    r = requests.post(search_url, data=params, timeout=50)
    r.raise_for_status()

    return r.json()



def query_fasst_api_peaks(precursor_mz, peaks, database, 
                          host="https://api.fasst.gnps2.org", 
                          analog=False, precursor_mz_tol=0.05, 
                          fragment_mz_tol=0.05, 
                          min_cos=0.7, 
                          lower_delta=100,
                          upper_delta=100,
                          blocking=True):
    """Submit a peak-list FASST search to the task API and return the task handle.

    POSTs the query as JSON to ``<host>/search``, reads the task ``id`` from the
    response and stores it (plus a "PENDING" status) in the parameter dict.

    Args:
        precursor_mz: Precursor m/z of the query spectrum.
        peaks: Peak list of the query spectrum; must be JSON-serialisable.
        database: Name of the index to search, sent as ``library``.
        host: Base URL of the FASST API.
        analog: When truthy, ``analog`` is sent as "Yes", otherwise "No".
        precursor_mz_tol: Sent as ``pm_tolerance``.
        fragment_mz_tol: Sent as ``fragment_tolerance``.
        min_cos: Sent as ``cosine_threshold``.
        lower_delta: Sent as ``lower_delta``.
        upper_delta: Sent as ``upper_delta``.
        blocking: Accepted for signature compatibility but has no effect here:
            this function never waits for the task. Pass the returned dict to
            ``get_results`` to obtain the matches.

    Returns:
        The parameter dict extended with ``task_id`` and ``status == "PENDING"``.

    Raises:
        requests.HTTPError: If the submission is rejected with an error status.
        requests.Timeout: If the submission is not answered within 5 seconds.
    """
    spectrum_query = {
        "peaks": peaks,
        "precursor_mz": precursor_mz
    }

    params = {
        "library": database,
        "query_spectrum": json.dumps(spectrum_query),
        "analog": "Yes" if analog else "No",
        "cache": "No",
        "lower_delta": lower_delta,
        "upper_delta": upper_delta,
        "pm_tolerance": precursor_mz_tol,
        "fragment_tolerance": fragment_mz_tol,
        "cosine_threshold": min_cos
    }

    # Plain string formatting instead of os.path.join, which is meant for
    # filesystem paths and is platform-dependent.
    query_url = "{}/search".format(host.rstrip("/"))

    r = requests.post(query_url, json=params, timeout=5)
    r.raise_for_status()

    params["task_id"] = r.json()["id"]
    params["status"] = "PENDING"

    # The original had an `if blocking is False` branch whose two paths both
    # returned `params`; the single return keeps that behaviour, which callers
    # that poll with get_results themselves depend on.
    return params


def get_results(query_parameters_dictionary, host="https://api.fasst.gnps2.org", blocking=True):
    """Poll the FASST API for the result of a previously submitted task.

    Repeatedly GETs ``<host>/search/result/<task_id>``. A poll counts as "not
    ready" when the server answers with an HTTP error status or when the JSON
    body carries ``"status": "PENDING"``.

    Args:
        query_parameters_dictionary: Dict holding at least ``task_id``.
        host: Base URL of the FASST API.
        blocking: If exactly ``False``, a not-ready poll returns the string
            "PENDING" immediately instead of waiting and retrying.

    Returns:
        The parsed JSON result once it is ready, or the string "PENDING" when
        ``blocking`` is ``False`` and the result is not ready yet.

    Raises:
        Exception: After 120 unsuccessful polls (about one second apart). This
            now also covers persistent HTTP errors, which previously retried
            forever.
        requests.Timeout: If a single poll is not answered within 30 seconds.
    """
    task_id = query_parameters_dictionary["task_id"]

    # Plain string formatting instead of os.path.join, which is meant for
    # filesystem paths and is platform-dependent.
    result_url = "{}/search/result/{}".format(host.rstrip("/"), task_id)

    retries_max = 120
    current_retries = 0
    while True:
        print("WAITING FOR RESULTS", current_retries, task_id)

        r = requests.get(result_url, timeout=30)

        try:
            r.raise_for_status()
        except requests.exceptions.HTTPError:
            # An error status is treated like "not ready yet". Only HTTPError is
            # caught, so KeyboardInterrupt and programming errors propagate.
            pass
        else:
            # Decode the body once instead of re-parsing it for every check.
            payload = r.json()
            if not ("status" in payload and payload["status"] == "PENDING"):
                return payload

        # Not ready: either report that right away or wait and poll again.
        if blocking is False:
            return "PENDING"

        time.sleep(1)
        current_retries += 1

        # The cap applies to both not-ready paths, so the loop always terminates.
        if current_retries >= retries_max:
            raise Exception("Timeout waiting for results from FASST API")

def get_databases(host="https://fasst.gnps2.org", timeout=30):
    """Return the list of searchable libraries reported by ``<host>/libraries``.

    Args:
        host: Base URL of the FASST web service.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on an unresponsive host.

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status, matching
            the other request helpers in this notebook.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    url = "{}/libraries".format(host)

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    return r.json()

In [None]:
# Convert the first reference spectrum into a plain nested list, the form passed
# as `peaks` to the FASST query helpers. The printed result is a list of
# two-element rows (presumably [m/z, intensity] — TODO confirm).
peaks = spectra[0].T.tolist()

[[199.20680236816406, 2683.6484375],
 [207.17523193359375, 11063.068359375],
 [225.18592834472656, 43173.1484375],
 [241.18096923828125, 9991.2373046875]]

In [None]:
# Example: search one library spectrum (by USI) against the nightly
# metabolomics pan-repository index using the synchronous endpoint.
database_label = 'metabolomicspanrepo_index_nightly'
usi = "mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00005883671"
query_fasst_usi(
    usi,
    database_label,
    host="https://fasst.gnps2.org",
    analog=False,
    precursor_mz_tol=0.05,
    fragment_mz_tol=0.05,
    min_cos=0.7,
    cache="Yes",
)



{'results': [{'Delta Mass': -0.04,
   'USI': 'mzspec:MSV000095708:peak/mzML/WB23L029.mzML:scan:969',
   'Charge': 1,
   'Cosine': 0.73,
   'Matching Peaks': 3,
   'Unit Delta Mass': 0,
   'Dataset': 'MSV000095708',
   'Status': 'NoID',
   'Query Filename': 'temp/queries/5d/5db576874736475eb3119e7e34246616/5db576874736475eb3119e7e34246616.mgf',
   'Query Scan': 1,
   'Index UnitPM': 104,
   'Index IdxInUnitPM': 985404,
   'Filtered Input Spectrum Path': 'temp/queries/5d/5db576874736475eb3119e7e34246616/0.json'},
  {'Delta Mass': -0.04,
   'USI': 'mzspec:MSV000095708:peak/mzML/TL23L007.mzML:scan:1056',
   'Charge': 1,
   'Cosine': 0.73,
   'Matching Peaks': 3,
   'Unit Delta Mass': 0,
   'Dataset': 'MSV000095708',
   'Status': 'NoID',
   'Query Filename': 'temp/queries/5d/5db576874736475eb3119e7e34246616/5db576874736475eb3119e7e34246616.mgf',
   'Query Scan': 1,
   'Index UnitPM': 104,
   'Index IdxInUnitPM': 987637,
   'Filtered Input Spectrum Path': 'temp/queries/5d/5db576874736475eb31

In [None]:
# Get the file list for various MASST databases here
# https://github.com/robinschmid/microbe_masst/tree/master/data

In [None]:

database_label = 'metabolomicspanrepo_index_nightly'
outdir = '/pscratch/sd/b/bpb/envnet_masst_results'
# Create the output directory up front so the first write cannot fail on a
# missing folder.
os.makedirs(outdir, exist_ok=True)

for i in range(len(pmz)):
    outfile = os.path.join(outdir, 'fasst_query_result_%d.json' % i)
    # Resume support: spectra that already have a result file are skipped, so the
    # cell can simply be re-run after an interruption or a request error.
    if os.path.exists(outfile):
        continue
    peaks = spectra[i].T.tolist()
    precursor_mz = pmz[i]

    # Submit only. query_fasst_api_peaks returns the task handle and never
    # waits, so the results are polled explicitly below.
    params = query_fasst_api_peaks(precursor_mz, peaks, database_label,
                            host="https://api.fasst.gnps2.org",
                            analog=False, precursor_mz_tol=0.05,
                            fragment_mz_tol=0.05,
                            min_cos=0.7,
                            lower_delta=100,
                            upper_delta=100,
                            blocking=False)

    results = get_results(params, host="https://api.fasst.gnps2.org", blocking=True)
    # Record which ENVnet spectrum (position in `pmz` / `spectra`) this result is for.
    results['envnet_index'] = i

    # Write under a temporary name and rename afterwards: an interrupted run then
    # never leaves a truncated JSON file that the exists() check would skip.
    tmpfile = outfile + '.tmp'
    with open(tmpfile, 'w') as f:
        json.dump(results, f)
    os.replace(tmpfile, outfile)


In [56]:
# Display whatever `results` currently holds (presumably the response for the
# last spectrum processed by the query loop — this depends on cell execution order).
results

{'results': [{'Delta Mass': -0.04,
   'USI': 'mzspec:MTBLS10526:FILES/DERIVED_FILES/FA12a_72.mzML:scan:30396',
   'Charge': 1,
   'Cosine': 1.0,
   'Matching Peaks': 4,
   'Unit Delta Mass': 0,
   'Dataset': 'MTBLS10526',
   'Status': 'NoID',
   'Query Scan': 1,
   'Index UnitPM': 217,
   'Index IdxInUnitPM': 7522489},
  {'Delta Mass': -0.03,
   'USI': 'mzspec:MSV000089289:peak/209_MT_CCE21.mzML:scan:448',
   'Charge': 1,
   'Cosine': 0.97,
   'Matching Peaks': 4,
   'Unit Delta Mass': 0,
   'Dataset': 'MSV000089289',
   'Status': 'NoID',
   'Query Scan': 1,
   'Index UnitPM': 217,
   'Index IdxInUnitPM': 3248736},
  {'Delta Mass': -0.03,
   'USI': 'mzspec:MSV000089289:ccms_peak/209_MT_CCE21.mzML:scan:448',
   'Charge': 1,
   'Cosine': 0.97,
   'Matching Peaks': 4,
   'Unit Delta Mass': 0,
   'Dataset': 'MSV000089289',
   'Status': 'NoID',
   'Query Scan': 1,
   'Index UnitPM': 217,
   'Index IdxInUnitPM': 3247116},
  {'Delta Mass': -0.03,
   'USI': 'mzspec:MSV000084738:ccms_peak/Delic

In [None]:
out = []


3
5
8


ReadTimeout: HTTPSConnectionPool(host='fasst.gnps2.org', port=443): Read timed out. (read timeout=50)

In [None]:
def make_spectra_for_fasst(pmz, spectra):
    """Pair each precursor m/z with the spectrum stored at the same position.

    Args:
        pmz: Indexable collection of precursor m/z values; its length sets the
            number of records produced.
        spectra: Indexable collection of spectra, looked up with the same
            integer positions as ``pmz``.

    Returns:
        A list with one ``{'precursor_mz': ..., 'spectrum': ...}`` dict per
        entry of ``pmz``, in order.
    """
    return [
        {'precursor_mz': pmz[idx], 'spectrum': spectra[idx]}
        for idx in range(len(pmz))
    ]

22128