# Performance of MetFrag and SIRIUS on the MassBank Data

For the performance evaluation, we are looking at the case where performance when only tandem mass-spectrometry (MS2) information is used.

In [22]:
import sqlite3 
import numpy as np

from matchms import Spectrum

from ssvm.data_structures import CandSQLiteDB_Massbank
from ssvm.evaluation_tools import get_topk_performance_csifingerid

In [8]:
DB_FN = "../db/massbank__with_metfrag.sqlite"
DB_URI = "file:" + DB_FN + "?mode=ro"
DB_CONN_RO = sqlite3.connect(DB_URI, uri=True)

In [44]:
cands = CandSQLiteDB_Massbank(db_fn=DB_FN, molecule_identifier="inchikey1")

for ds, in DB_CONN_RO.execute("SELECT distinct(dataset) FROM scored_spectra_meta"): 
    print(ds)
    
    d_mf = {}
    d_sir = {}
        
    for idx, (acc, mol) in enumerate(DB_CONN_RO.execute(
        "SELECT accession, inchikey1 FROM scored_spectra_meta " \
        "   INNER JOIN molecules m ON m.cid = scored_spectra_meta.molecule " \
        "   INNER JOIN datasets d ON d.name = scored_spectra_meta.dataset " \
        "   WHERE dataset IS ? AND retention_time >= 3 * column_dead_time_min",
        (ds, )
    )): 
        #print(acc)
        
        d_mf[idx] = {}
        d_sir[idx] = {}
        
        spec = Spectrum(np.array([]), np.array([]), metadata={"spectrum_id": acc, "molecule_identifier": mol})
        
        # Get the label-space for the spectrum
        _lab_space = cands.get_labelspace(spec)
        d_mf[idx]["n_cand"] = len(_lab_space)
        d_sir[idx]["n_cand"] = len(_lab_space)
        #print(".. got labelspace")
        
        # Find the correct structure on the label-space
        _index_of_correct_structure = _lab_space.index(mol)
        d_mf[idx]["index_of_correct_structure"] = _index_of_correct_structure
        d_sir[idx]["index_of_correct_structure"] = _index_of_correct_structure
        #print(".. got correct index")
        
        # Get the MS2 scores
        d_mf[idx]["score"] = cands.get_ms2_scores(spec, "metfrag", scale_scores_to_range=False, return_as_ndarray=True)
        d_sir[idx]["score"] = cands.get_ms2_scores(spec, "sirius__sd__correct_mf", scale_scores_to_range=False, return_as_ndarray=True)
        #print(".. got ms2 scores")
    
    print(np.median([d_mf[i]["n_cand"] for i in range(idx)]))
    print(np.round(get_topk_performance_csifingerid(d_mf)[1][[0, 4, 9, 19]], 1))
    print(np.round(get_topk_performance_csifingerid(d_sir)[1][[0, 4, 9, 19]], 1))

AC_003
320.0
[12.8 32.3 40.2 45.3]
[26.8 55.  61.2 62.6]
AU_003
1319.0
[ 3.8 16.2 32.2 40.7]
[39.4 66.7 73.3 77.8]
AU_002
1009.0
[ 4.4 20.8 31.7 41.2]
[41.8 64.5 72.1 77.5]
AU_001
800.0
[ 2.8 18.2 31.3 42.4]
[37. 72. 76. 78.]
BML_001
1037.0
[ 2.5 12.3 19.  28. ]
[30.8 65.6 72.  77.2]
CE_001
836.5
[ 2.1  6.9 16.7 25.2]
[52.6 74.4 82.1 94.9]
EA_003
1313.0
[ 5.8 29.  43.6 56.9]
[55.  80.8 85.  89.9]
EA_002
1381.0
[ 3.7 21.2 36.8 46.2]
[38.9 64.8 72.8 79.7]
EQ_006
1046.5
[ 6.7 26.3 33.3 44.1]
[48.8 71.1 77.7 83.9]
EQ_004


KeyboardInterrupt: 