In [3]:
# Install into the running kernel's environment (the %pip magic targets it explicitly).
# Pinned to the release that the recorded run of this notebook downloaded.
%pip install -q ms2deepscore==2.6.0

Collecting ms2deepscore
  Downloading ms2deepscore-2.6.0-py3-none-any.whl.metadata (9.1 kB)
Collecting matchms>=0.19.0 (from ms2deepscore)
  Downloading matchms-0.31.0-py3-none-any.whl.metadata (21 kB)
Collecting deprecated>=1.2.14 (from matchms>=0.19.0->ms2deepscore)
  Downloading deprecated-1.3.1-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting lxml<6.0.0,>=5.4.0 (from matchms>=0.19.0->ms2deepscore)
  Downloading lxml-5.4.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.5 kB)
Collecting numba (from ms2deepscore)
  Downloading numba-0.61.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.8 kB)
Collecting pandas (from ms2deepscore)
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pickydict>=0.4.0 (from matchms>=0.19.0->ms2deepscore)
  Downloading pickydict-0.5.0-py3-none-

In [2]:
import requests
import os
from tqdm import tqdm

def download_file(link, file_name, timeout=60):
    """Download ``link`` to ``file_name`` with a progress bar, unless the file already exists.

    Args:
        link: URL to fetch.
        file_name: Destination path. If it already exists, nothing is downloaded
            and no network request is made.
        timeout: Seconds to wait for the server to connect or to send more data
            before giving up.

    Returns:
        None.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Check for the local copy first so an existing file costs no network round-trip.
    if os.path.exists(file_name):
        print(f"The file {file_name} already exists, the file won't be downloaded")
        return

    # Stream into a temporary sibling file and rename only once complete; an
    # interrupted download would otherwise leave a truncated file that the
    # existence check above would accept on every later run.
    partial_name = f"{file_name}.part"
    with requests.get(link, stream=True, timeout=timeout) as response:
        # Fail loudly instead of saving an HTTP error page as the payload.
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))

        # total=None lets tqdm show plain counters when the size is unknown.
        with open(partial_name, "wb") as f, tqdm(desc="Downloading file", total=total_size or None, unit='B', unit_scale=True, unit_divisor=1024) as bar:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    bar.update(len(chunk))
    os.replace(partial_name, file_name)

# Local file names; later cells refer to these two variables.
model_file_name = "ms2deepscore_model.pt"
spectrum_file_name = "pesticides.mgf"

# (source URL, local destination) pairs, fetched in the listed order.
downloads = [
    ("https://zenodo.org/records/14290920/files/settings.json?download=1", "ms2deepscore_settings.json"),
    ("https://zenodo.org/records/14290920/files/ms2deepscore_model.pt?download=1", model_file_name),
    ("https://raw.githubusercontent.com/matchms/ms2deepscore/refs/heads/main/tests/resources/pesticides_processed.mgf", spectrum_file_name),
]
for url, destination in downloads:
    download_file(url, destination)

The file ms2deepscore_settings.json already exists, the file won't be downloaded
The file ms2deepscore_model.pt already exists, the file won't be downloaded
The file pesticides.mgf already exists, the file won't be downloaded


In [3]:
# Creating the Model
# Load the MS2DeepScore model from the file downloaded earlier (model_file_name).
# NOTE(review): allow_legacy=True presumably permits loading checkpoints saved in an
# older format — confirm against the ms2deepscore docs, and only load trusted files.
from ms2deepscore.models import load_model
model = load_model(model_file_name, allow_legacy=True)



In [4]:
# Calculating MS2Deepscore scores
from matchms.Pipeline import Pipeline, create_workflow
from matchms.filtering.default_pipelines import DEFAULT_FILTERS
from ms2deepscore import MS2DeepScore

# Describe the workflow first: the default matchms filters for the query spectra,
# followed by a single score computation that uses the loaded MS2DeepScore model.
workflow = create_workflow(
    query_filters=DEFAULT_FILTERS,
    score_computations=[[MS2DeepScore, {"model": model}]],
)
pipeline = Pipeline(workflow)

# Only query spectra are supplied here. To get scores against a reference
# library, additionally pass reference_files="path_to_spectrum_file.mgf".
report = pipeline.run(spectrum_file_name)

# Pull the computed scores out of the pipeline as an array.
similarity_matrix = pipeline.scores.to_array()











Processing spectra: 76it [00:01, 70.18it/s]
Computing spectral embeddings ...: 100%|██████████| 76/76 [00:03<00:00, 25.25it/s]


In [5]:
# Show the processing report returned by pipeline.run().
print(report)

----- Spectrum Processing Report -----
Number of spectra processed: 76
Number of spectra removed: 0
Changes during processing:
                              removed spectra  changed metadata  changed mass spectrum
filter                                                                                
add_parent_mass                             0                76                      0
add_retention_index                         0                76                      0
add_retention_time                          0                76                      0
clean_adduct                                0                76                      0
derive_formula_from_smiles                  0                76                      0
make_charge_int                             0                 0                      0
add_compound_name                           0                 0                      0
derive_adduct_from_name                     0                 0                      0
der

In [6]:
# Dimensions of the score array.
print(similarity_matrix.shape)

(76, 76)


In [7]:
# Print the score array itself.
print(similarity_matrix)

[[1.         0.39567415 0.31721568 ... 0.16670259 0.19357387 0.15074877]
 [0.39567415 1.         0.3567378  ... 0.16240767 0.21237039 0.13035795]
 [0.31721568 0.3567378  1.         ... 0.38725827 0.3921025  0.56782875]
 ...
 [0.16670259 0.16240767 0.38725827 ... 1.         0.99078505 0.47407308]
 [0.19357387 0.21237039 0.3921025  ... 0.99078505 1.         0.45381763]
 [0.15074877 0.13035795 0.56782875 ... 0.47407308 0.45381763 1.        ]]


In [8]:
# Calculate embeddings
# Query spectra held by the pipeline after its run
# (presumably the filtered versions — confirm against matchms Pipeline docs).
cleaned_spectra = pipeline.spectra_queries
# Wrap the loaded model so embeddings can be requested from it directly.
ms2ds_model = MS2DeepScore(model)
# Embedding array for the spectra; the recorded run shows shape (76, 500).
ms2ds_embeddings = ms2ds_model.get_embedding_array(cleaned_spectra)

Computing spectral embeddings ...: 100%|██████████| 76/76 [00:02<00:00, 28.54it/s]


In [9]:
# Display the embedding array (the recorded output is abbreviated with "...").
ms2ds_embeddings

array([[-0.86432904, -0.99871385,  0.23028606, ..., -0.20689027,
         0.8267501 , -0.69716865],
       [-0.45379511, -0.94319844, -0.2845467 , ..., -0.02428324,
        -0.59131169,  0.42027235],
       [-0.8610484 , -0.32745886,  0.39620909, ...,  0.62359434,
         0.81631714, -0.36466593],
       ...,
       [-0.88536602,  0.93186814,  0.06120966, ..., -0.45469359,
         0.0580497 ,  0.40203047],
       [-0.87119734,  0.95781434,  0.18429664, ..., -0.43923515,
        -0.13216279,  0.28982326],
       [-0.44632268, -0.09382542,  0.75702566, ..., -0.0366261 ,
         0.07153989, -0.1326734 ]], shape=(76, 500))

In [10]:
# Save Embeddings
# Write the embedding array to ms2ds_embeddings.npy (relative path, so it lands
# in the current working directory).
import numpy as np
np.save("ms2ds_embeddings.npy", ms2ds_embeddings)

In [11]:
# Download Embeddings
# google.colab can only be imported inside Google Colab. Elsewhere the saved
# file is already on local disk, so report that instead of raising ImportError
# and stopping a full notebook run.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; ms2ds_embeddings.npy remains in the working directory.")
else:
    # Hand the saved array to the browser as a file download.
    files.download("ms2ds_embeddings.npy")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
# Gather the "smiles" metadata entry of every cleaned spectrum, in order.
smiles = [spectrum.get("smiles") for spectrum in cleaned_spectra]
print(len(smiles))

76


In [23]:
import pandas as pd

# Write the collected SMILES strings as a one-column CSV without the index.
pd.DataFrame({"smiles": smiles}).to_csv("smiles.csv", index=False)

# google.colab can only be imported inside Google Colab. Elsewhere the CSV is
# already on local disk, so report that instead of raising ImportError.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; smiles.csv remains in the working directory.")
else:
    # Hand the CSV to the browser as a file download.
    files.download("smiles.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>