# MS2DeepScore

In [2]:
!pip install matchms ms2deepscore tqdm

Collecting matchms
  Downloading matchms-0.31.0-py3-none-any.whl.metadata (21 kB)
Collecting ms2deepscore
  Downloading ms2deepscore-2.6.0-py3-none-any.whl.metadata (9.1 kB)
Collecting deprecated>=1.2.14 (from matchms)
  Downloading deprecated-1.3.1-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting lxml<6.0.0,>=5.4.0 (from matchms)
  Downloading lxml-5.4.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.5 kB)
Collecting numba<0.62.0,>=0.61.0 (from matchms)
  Downloading numba-0.61.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.8 kB)
Collecting pandas<3.0.0,>=2.2.3 (from matchms)
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pickydict>=0.4.0 (from matchms)
  Downloading pickydict-0.5.0-py3-none-any.whl.metadata (4.7 kB)
Collecting pubchempy>=1.0.5 (from matchms)
  Dow

In [1]:
import os
import numpy as np

from matchms.importing import load_from_mgf
from matchms.exporting import save_as_mgf
from matchms.filtering import (
    normalize_intensities,
    select_by_mz,
    reduce_to_number_of_peaks,
    require_minimum_number_of_peaks,
)
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from ms2deepscore import MS2DeepScore
from matchms import calculate_scores

In [3]:
from matchms.importing import load_from_mgf
from matchms.exporting import save_as_mgf
import os

# Input MGF files
mgf_files = [
    "negative_training_spectra.mgf",
    "negative_validation_spectra.mgf",
    "negative_testing_spectra.mgf",
    "positive_training_spectra.mgf",
    "positive_validation_spectra.mgf",
    "positive_testing_spectra.mgf"
]

# Output directory
out_dir = "mgf_1000"
os.makedirs(out_dir, exist_ok=True)

MAX_SPECTRA = 1000

for mgf_file in mgf_files:
    print(f"\nProcessing {mgf_file}")

    # Load spectra
    spectra = list(load_from_mgf(mgf_file))
    print(f"  Total spectra in file: {len(spectra)}")

    # Take first 1000
    spectra_1000 = spectra[:MAX_SPECTRA]
    print(f"  Keeping: {len(spectra_1000)}")

    # Output filename
    base = os.path.basename(mgf_file)
    out_file = os.path.join(out_dir, base.replace(".mgf", "_1000.mgf"))

    # Save
    save_as_mgf(spectra_1000, out_file)
    print(f"  Saved to: {out_file}")



Processing negative_training_spectra.mgf
  Total spectra in file: 130901
  Keeping: 1000
  Saved to: mgf_1000/negative_training_spectra_1000.mgf

Processing negative_validation_spectra.mgf
  Total spectra in file: 7551
  Keeping: 1000
  Saved to: mgf_1000/negative_validation_spectra_1000.mgf

Processing negative_testing_spectra.mgf
  Total spectra in file: 7142
  Keeping: 1000
  Saved to: mgf_1000/negative_testing_spectra_1000.mgf

Processing positive_training_spectra.mgf
  Total spectra in file: 469257
  Keeping: 1000
  Saved to: mgf_1000/positive_training_spectra_1000.mgf

Processing positive_validation_spectra.mgf
  Total spectra in file: 25412
  Keeping: 1000
  Saved to: mgf_1000/positive_validation_spectra_1000.mgf

Processing positive_testing_spectra.mgf
  Total spectra in file: 24911
  Keeping: 1000
  Saved to: mgf_1000/positive_testing_spectra_1000.mgf


# Everything Below Was Not Used in the Final Product

In [None]:
original_mgf = "ms2deepscore_data/positive_training_spectra.mgf"
spectra = list(load_from_mgf(original_mgf))
print(len(spectra))

469257


In [None]:
import os
os.makedirs("ms2deepscore_small", exist_ok=True)

In [None]:
from matchms.exporting import save_as_mgf
save_as_mgf(spectra, "ms2deepscore_small/small_training_valid.mgf")

In [None]:
fixed_spectra = []

for s in spectra:
    if s is None:
        continue

    meta = s.metadata
    if "inchikey" not in meta:
        if "inchi" in meta:
            mol = Chem.MolFromInchi(meta["inchi"])
            if mol:
                meta["inchikey"] = Chem.inchi.MolToInchiKey(mol)
            else:
                continue
        else:
            continue
    if "precursor_mz" not in meta:
        if "parent_mass" in meta:
            meta["precursor_mz"] = meta["parent_mass"]
        else:
            continue
    if "ionmode" not in meta:
        continue
    if meta["ionmode"].lower() != "positive":
        continue
    fixed_spectra.append(s)
print(len(fixed_spectra))

469257


In [None]:
def fix_metadata(s):
    if "parent_mass" in s.metadata:
        try:
            s.metadata["parent_mass"] = float(s.metadata["parent_mass"])
        except:
            del s.metadata["parent_mass"]
    if "precursor_mz" in s.metadata:
        try:
            s.metadata["precursor_mz"] = float(s.metadata["precursor_mz"])
        except:
            del s.metadata["precursor_mz"]
    return s
fixed_spectra = [fix_metadata(s) for s in fixed_spectra if s is not None]

In [None]:
#from rdkit import Chem
#from rdkit.Chem import inchi

#def ensure_inchikey(s):
#    if "inchikey" in s.metadata and s.metadata["inchikey"].startswith("InChI="):
#        mol = Chem.MolFromInchi(s.metadata["inchikey"])
#        if mol:
#            s.metadata["inchikey"] = inchi.MolToInchiKey(mol)
#    return s
#spectra = [ensure_inchikey(s) for s in spectra]

In [None]:
seen = set()
unique_spectra = []

for s in fixed_spectra:
    ik = s.metadata["inchikey"]
    if ik not in seen:
        unique_spectra.append(s)
        seen.add(ik)

print(f"Unique InChIKeys: {len(unique_spectra)}")

Unique InChIKeys: 36852


In [None]:
os.makedirs("ms2deepscore_small", exist_ok=True)
small_mgf_file = "ms2deepscore_small/small_training_valid.mgf"
save_as_mgf(unique_spectra, small_mgf_file)
print("Saved:", small_mgf_file)


Saved: ms2deepscore_small/small_training_valid.mgf


In [None]:
check = list(load_from_mgf(small_mgf_file))
print("Reloaded spectra:", len(check))
print("Example metadata:")
print(check[0].metadata)


Reloaded spectra: 36852
Example metadata:
{'charge': 1, 'description': 'Enamine and Molport', 'formula': 'C7H5N3O2', 'inchi': 'InChI=1S/C7H5N3O2/c11-7(12)5-6-8-2-1-3-10(6)4-9-5/h1-4H,(H,11,12)', 'feature_id': '339', 'adduct': '[M-H2O+H]+', 'feature_ms1_height': '1.279E6', 'collision_energy': '60.0', 'fragmentation_method': 'HCD', 'isolation_window': '1.200000047684', 'acquisition': 'Crude', 'instrument_type': 'Orbitrap', 'ims_type': 'none', 'ion_source': 'ESI', 'ionmode': 'positive', 'dataset_id': 'MSVPLACEHOLDERID', 'usi': 'mzspec:MSVPLACEHOLDERID:20240405_pluskal_enamine_5002_A4_id_MSn_positive_2uL.mzML:339', 'scans': '339', 'precursor_purity': '0.37398476724755064', 'quality_chimeric': 'CHIMERIC', 'quality_explained_intensity': '0.61782515', 'quality_explained_signals': '0.48387095', 'num_peaks': '31', 'compound_name': 'imidazo[1,5-a]pyrimidine-8-carboxylic acid (Chimeric precursor selection)', 'parent_mass': '163.03818', 'inchi_aux': 'WXRSEUXBXNDCKE-UHFFFAOYSA-N', 'ms_level': '2', 

In [None]:
from matchms.importing import load_from_mgf
from matchms.exporting import save_as_mgf
import numpy as np
import os

input_mgf = "ms2deepscore_data/positive_training_spectra.mgf"
output_mgf = "ms2deepscore_small/small_training_valid.mgf"
os.makedirs("ms2deepscore_small", exist_ok=True)

max_spectra = 1000

all_spectra = []
count = 0
for s in load_from_mgf(input_mgf):
    inchi = s.metadata.get("INCHI") or s.metadata.get("inchikey")
    mz = s.metadata.get("precursor_mz")
    ion = s.metadata.get("ionmode")
    if inchi and mz and ion == "positive":
        all_spectra.append(s)
        count += 1
    if count >= max_spectra:
        break

print(f"Total spectra saved for small MGF: {len(all_spectra)}")
save_as_mgf(all_spectra, output_mgf)


Total spectra saved for small MGF: 1000


In [None]:
import requests
import os
from tqdm import tqdm

def download_file(link, file_name):
    response = requests.get(link, stream=True)
    if os.path.exists(file_name):
        print(f"The file {file_name} already exists, the file won't be downloaded")
        return
    total_size = int(response.headers.get('content-length', 0))

    with open(file_name, "wb") as f, tqdm(desc="Downloading file", total=total_size, unit='B', unit_scale=True, unit_divisor=1024,) as bar:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
                bar.update(len(chunk))
model_file_name = "ms2deepscore_model.pt"
download_file("https://zenodo.org/records/14290920/files/settings.json?download=1", "ms2deepscore_settings.json")
download_file("https://zenodo.org/records/14290920/files/ms2deepscore_model.pt?download=1", model_file_name)

Downloading file: 2.18kB [00:00, 16.5kB/s]
Downloading file: 100%|██████████| 397M/397M [01:14<00:00, 5.59MB/s]


In [None]:
from ms2deepscore.models import load_model
from matchms.Pipeline import Pipeline, create_workflow
from matchms.filtering.default_pipelines import DEFAULT_FILTERS
from ms2deepscore import MS2DeepScore
from matchms.importing import load_from_mgf

model_file_name = "ms2deepscore_model.pt"
model = load_model(model_file_name, allow_legacy=True)

spectra_file_name = "ms2deepscore_small/small_training_valid.mgf"
spectra = list(load_from_mgf(spectra_file_name))

pipeline = Pipeline(
    create_workflow(
        query_filters=DEFAULT_FILTERS,
        score_computations=[[MS2DeepScore, {"model": model}]]
    )
)

report = pipeline.run(spectra_file_name)
similarity_matrix = pipeline.scores.to_array()

ms2ds_model = MS2DeepScore(model)
embeddings = ms2ds_model.get_embedding_array(report.spectra_queries)














Processing spectra: 24it [00:00, 76.23it/s]



Processing spectra: 512it [00:07, 69.94it/s]



Processing spectra: 552it [00:07, 74.79it/s]



Processing spectra: 1361it [00:20, 74.98it/s]



Processing spectra: 2012it [00:31, 72.24it/s]



Processing spectra: 6178it [01:37, 82.03it/s]



Processing spectra: 6187it [01:37, 83.53it/s]







Processing spectra: 7349it [01:53, 88.39it/s]



Processing spectra: 12215it [03:31, 52.70it/s]







Processing spectra: 12451it [03:36, 54.09it/s]



Processing spectra: 12457it [03:36, 51.10it/s]











Processing spectra: 12827it [03:45, 31.58it/s]



Processing spectra: 14560it [04:21, 52.01it/s]



Processing spectra: 14566it [04:21, 50.49it/s]



Processing spectra: 14704it [04:24, 55.09it/s]







Processing spectra: 14728it [04:24, 51.88it/s]



Processing spectra: 14734it [04:24, 51.17it/s]



Processing spectra: 16207it [04:52, 67.27it/s]



Processing spectra: 18656it [05:37, 66.46it/s]



Processing spectra: 20101it [06:02, 62.98it/s]



Processing spectra: 20108it [06:02, 62.79it/s]



Processing spectra: 20375it [06:06, 61.46it/s]



Processing spectra: 20410it [06:07, 65.19it/s]



Processing spectra: 20452it [06:08, 64.12it/s]



Processing spectra: 20494it [06:08, 66.19it/s]



Processing spectra: 21088it [06:19, 63.72it/s]



Processing spectra: 21539it [06:27, 42.47it/s]



Processing spectra: 21554it [06:28, 38.22it/s]



Processing spectra: 22006it [06:36, 65.37it/s]



Processing spectra: 22205it [06:39, 66.33it/s]



Processing spectra: 22226it [06:39, 65.10it/s]



Processing spectra: 22594it [06:47, 62.88it/s]



Processing spectra: 23635it [07:05, 62.48it/s]



Processing spectra: 23974it [07:11, 38.72it/s]



Processing spectra: 24324it [07:18, 65.21it/s]



Processing spectra: 24520it [07:21, 64.30it/s]



Processing spectra: 24804it [07:26, 39.70it/s]



Processing spectra: 24921it [07:29, 44.95it/s]



Processing spectra: 24931it [07:29, 45.33it/s]



Processing spectra: 24936it [07:29, 42.74it/s]



Processing spectra: 24941it [07:29, 42.16it/s]



Processing spectra: 24986it [07:30, 66.47it/s]



Processing spectra: 25003it [07:30, 67.69it/s]



Processing spectra: 25019it [07:30, 70.73it/s]







Processing spectra: 25036it [07:31, 73.97it/s]



Processing spectra: 25060it [07:31, 75.29it/s]



Processing spectra: 25109it [07:32, 70.57it/s]



Processing spectra: 25141it [07:32, 76.00it/s]



Processing spectra: 25157it [07:32, 72.83it/s]



Processing spectra: 25240it [07:33, 74.16it/s]



Processing spectra: 25248it [07:34, 72.78it/s]



Processing spectra: 25637it [07:39, 73.57it/s]



Processing spectra: 25678it [07:39, 76.68it/s]







Processing spectra: 25711it [07:40, 73.19it/s]



Processing spectra: 25727it [07:40, 61.99it/s]



Processing spectra: 26067it [07:46, 73.30it/s]



Processing spectra: 26245it [07:49, 74.47it/s]



Processing spectra: 26302it [07:49, 72.97it/s]



Processing spectra: 26481it [07:52, 74.79it/s]



Processing spectra: 26489it [07:52, 73.71it/s]



Processing spectra: 27244it [08:04, 76.24it/s]



Processing spectra: 27416it [08:06, 78.57it/s]







Processing spectra: 27448it [08:06, 73.77it/s]



Processing spectra: 27481it [08:07, 75.79it/s]



Processing spectra: 27563it [08:08, 78.84it/s]



Processing spectra: 27571it [08:08, 77.11it/s]



Processing spectra: 27579it [08:08, 74.16it/s]



Processing spectra: 27587it [08:08, 70.72it/s]



Processing spectra: 27595it [08:08, 69.94it/s]







Processing spectra: 27603it [08:09, 69.33it/s]



Processing spectra: 27644it [08:09, 77.15it/s]



Processing spectra: 27660it [08:09, 72.13it/s]



Processing spectra: 27668it [08:10, 57.69it/s]



Processing spectra: 28109it [08:18, 66.42it/s]



Processing spectra: 31779it [09:12, 44.09it/s]



Processing spectra: 32080it [09:17, 71.27it/s]



Processing spectra: 32866it [09:30, 67.65it/s]



Processing spectra: 33388it [09:36, 78.46it/s]



Processing spectra: 33685it [09:42, 39.95it/s]



Processing spectra: 33882it [09:45, 75.80it/s]



Processing spectra: 33898it [09:46, 74.51it/s]



Processing spectra: 33906it [09:46, 71.01it/s]



Processing spectra: 35245it [10:05, 75.58it/s]



Processing spectra: 35270it [10:06, 78.43it/s]



Processing spectra: 35311it [10:06, 74.43it/s]



Processing spectra: 35319it [10:06, 74.25it/s]



Processing spectra: 35344it [10:07, 75.07it/s]



Processing spectra: 35352it [10:07, 74.76it/s]



Processing spectra: 35360it [10:07, 74.44it/s]



Processing spectra: 35377it [10:07, 76.60it/s]



Processing spectra: 35394it [10:07, 75.73it/s]



Processing spectra: 35436it [10:08, 79.41it/s]



Processing spectra: 35466it [10:09, 55.14it/s]



Processing spectra: 35484it [10:09, 52.77it/s]



Processing spectra: 35525it [10:10, 50.25it/s]



Processing spectra: 35549it [10:10, 52.22it/s]



Processing spectra: 35572it [10:11, 46.32it/s]



Processing spectra: 35593it [10:11, 47.01it/s]



Processing spectra: 35628it [10:12, 43.62it/s]



Processing spectra: 35638it [10:12, 44.87it/s]



Processing spectra: 35658it [10:13, 44.12it/s]



Processing spectra: 35673it [10:13, 43.17it/s]



Processing spectra: 35678it [10:13, 42.55it/s]



Processing spectra: 35712it [10:14, 68.79it/s]



Processing spectra: 35737it [10:14, 72.27it/s]



Processing spectra: 35997it [10:17, 70.82it/s]







Processing spectra: 36005it [10:17, 65.46it/s]







Processing spectra: 36071it [10:18, 80.09it/s]



Processing spectra: 37025it [10:34, 71.48it/s]











Processing spectra: 37852it [10:48, 58.41it/s]
Computing spectral embeddings ...: 100%|██████████| 37852/37852 [21:52<00:00, 28.83it/s]


In [7]:
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
import numpy as np
from ms2deepscore.models import load_model
from matchms.Pipeline import Pipeline, create_workflow
from matchms.filtering.default_pipelines import DEFAULT_FILTERS
from ms2deepscore import MS2DeepScore
from matchms.importing import load_from_mgf

model_file_name = "ms2deepscore_model.pt"
model = load_model(model_file_name, allow_legacy=True)

pipeline = Pipeline(
    create_workflow(
        query_filters=DEFAULT_FILTERS,
        score_computations=[[MS2DeepScore, {"model": model}]]
    )
)

report = pipeline.run("ms2deepscore_small/small_training_valid.mgf")
similarity_matrix = pipeline.scores.to_array()

ms2ds_model = MS2DeepScore(model)
embeddings = ms2ds_model.get_embedding_array(report.spectra_queries)

def compute_tanimoto_similarity(spectra):
    fps = []
    for s in spectra:
        inchi_str = s.metadata.get("inchikey") or s.metadata.get("INCHI")
        if inchi_str:
            mol = Chem.MolFromInchi(inchi_str)
            if mol:
                fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
                fps.append(fp)
            else:
                fps.append(None)
        else:
            fps.append(None)

    n = len(fps)
    tanimoto_matrix = np.zeros((n, n))

    for i in range(n):
        for j in range(n):
            if fps[i] is not None and fps[j] is not None:
                tanimoto_matrix[i, j] = DataStructs.TanimotoSimilarity(fps[i], fps[j])
            else:
                tanimoto_matrix[i, j] = np.nan
    return tanimoto_matrix


tanimoto_matrix = compute_tanimoto_similarity(report.spectra_queries)
mask = ~np.isnan(tanimoto_matrix)
rmse = np.sqrt(np.mean((similarity_matrix[mask] - tanimoto_matrix[mask])**2))
print("RMSE between MS2DeepScore and Tanimoto similarities:", rmse)











Processing spectra: 20it [00:00, 69.39it/s]



Processing spectra: 512it [00:08, 43.33it/s]



Processing spectra: 552it [00:09, 42.16it/s]



Processing spectra: 1366it [00:21, 46.18it/s]



Processing spectra: 1626it [00:26, 62.08it/s]


KeyboardInterrupt: 