In [39]:
import os
import sys
# Make sure all code is in the PATH.
sys.path.append("../src/")

In [40]:
import functools
import lzma
import re

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numba as nb
import numpy as np
import pandas as pd
import pyteomics.mgf
import seaborn as sns
import tqdm.notebook as tqdm
from matplotlib.colors import LogNorm
import spectrum_utils.spectrum as sus
import similarity


In [41]:
# Plot styling.
plt.style.use(["seaborn-white", "seaborn-paper"])
plt.rc("font", family="sans-serif")
sns.set_palette(["#9e0059", "#6da7de", "#ee266d", "#dee000", "#eb861e"])
sns.set_context("paper")

## Setup parameters for the comparison


In [42]:
abs_mz_tolerance = 0.02
max_mz_delta = 200
# if defined, we will only search for specific delta m/z between two spectra
# 16 oxygen 15.994914
# specific_mod_mz = 15.994914
# otherwise define as -1
specific_mod_mz = -1

In [43]:
# check for profile spectra in the library file - contains zero intensity values
def is_centroid(spectrum_dict):
    return all(i > 0 for i in spectrum_dict["intensity array"])

In [None]:
# Read all spectra from the ALL_GNPS.MGF from April 13. 2022
# library contains propagated spectra, that will be filtered out
# GNPS library from https://gnps-external.ucsd.edu/gnpslibrary
c_error = 0
c_propagated = 0
c_multi_charged = 0
c_removed = 0
c_profile_spec = 0
spectra = []
# filename = "../data/BILELIB19.mgf"
filename = "../data/20220418_ALL_GNPS_NO_PROPOGATED.mgf"
with pyteomics.mgf.MGF(filename) as f_in:
    for spectrum_dict in tqdm.tqdm(f_in):
        # ignore propagated spectra with LIBRARYQUALITY==4
        # ignore multiple charged molecules
        try:
            if int(spectrum_dict["params"]["libraryquality"])<=3:
                c_propagated += 1
            elif int(spectrum_dict["params"]["charge"][0])==1:
                c_multi_charged +=1
            elif not is_centroid(spectrum_dict):
                c_profile_spec += 1
            else:
                spectra.append(
                    sus.MsmsSpectrum(
                        spectrum_dict["params"]["seq"],
                        float(spectrum_dict["params"]["pepmass"][0]),
                        int(spectrum_dict["params"]["charge"][0]),
                        spectrum_dict["m/z array"],
                        spectrum_dict["intensity array"],
                        #IONMODE=Positive
                        #LIBRARYQUALITY=4
                        #SPECTRUMID
                        # NAME
                        # SMILES
                    )
                )
        except:
            c_error += 1

c_removed = c_propagated + c_error + c_multi_charged + c_profile_spec
print("total spectra={};  total removed={};  error={};  multi charge={};  propagated spec={};  profile spec={}".format(len(spectra), c_removed, c_error, c_multi_charged, c_propagated, c_profile_spec))

0it [00:00, ?it/s]

In [34]:
# sort spectra by precursor mz
spectra.sort(key=lambda spec: spec.precursor_mz)
# Extract precursor mz as filter argument
precursor_mz = nb.typed.List()
for spectrum in spectra:
    precursor_mz.append(spectrum.precursor_mz)

In [35]:
@nb.njit
def generate_pairs(precursor_mz):
    """
    Create pairs of spectra that are compared.
    Maximum precursor mz difference is 200 to limit the search space to a range from smaller modifications (e.g., +O) and
    larger modifications (e.g., hexose)

    Parameters
    ----------
    spectra a list of all spectra

    Returns
    -------

    """
    for i in range(len(precursor_mz)):
        j = i + 1
        while (j < len(precursor_mz)) and (precursor_mz[j] <= precursor_mz[i] + max_mz_delta):
            # list is sorted by precursor mz so j always > i
            # select only one specific precursor mz differance or include all
            if (specific_mod_mz < 0) or (abs(precursor_mz[j]-precursor_mz[i]-specific_mod_mz) <= abs_mz_tolerance):
                yield i
                yield j
            j += 1

In [36]:

pairs = [np.fromiter(
    generate_pairs(precursor_mz),
    np.uint32).reshape((-1, 2))]
pairs = np.vstack(pairs)

In [37]:
print ("Comparing {} pairs".format(len(pairs)))

Comparing 41379 paris


In [38]:
cosines, modified_cosines, neutral_losses = [], [], []
for i, j in tqdm.tqdm(pairs):
    cosines.append(
        similarity.cosine(spectra[i], spectra[j], abs_mz_tolerance)[0]
    )
    modified_cosines.append(
        similarity.modified_cosine(
            spectra[i], spectra[j], abs_mz_tolerance
        )[0]
    )
    neutral_losses.append(
        similarity.neutral_loss(
            spectra[i], spectra[j], abs_mz_tolerance
        )[0]
    )
similarities = pd.DataFrame(
    {
        "cosine": cosines,
        "modified_cosine": modified_cosines,
        "neutral_loss": neutral_losses
    }
)
similarities.to_parquet("gnps_lib_results.parquet")

  0%|          | 0/41379 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(7.2, 7.2 / 1.618))

bins = 50
tick_locators = mticker.FixedLocator(np.arange(0, bins + 1, bins / 4))
tick_labels = np.asarray([f"{a:.2f}" for a in np.arange(0, 1.01, 0.25)])
for i, (xlabel, ylabel) in enumerate(
        [
            ["cosine", "modified_cosine"],
            ["neutral_loss", "cosine"],
            ["neutral_loss", "modified_cosine"]
        ]
    ):
    hist, _, _ = np.histogram2d(
        similarities[xlabel],
        similarities[ylabel],
        bins=bins,
        range=[[0, 1], [0, 1]],
    )
    heatmap = sns.heatmap(
        np.rot90(hist),
        cmap="viridis",
        cbar=False,
        square=True,
        xticklabels=False,
        yticklabels=False,
        ax=axes[i],
        norm=LogNorm(),
    )
    axes[i].yaxis.set_major_locator(tick_locators)
    axes[i].set_yticklabels(tick_labels[::-1])
    axes[i].xaxis.set_major_locator(tick_locators)
    axes[i].set_xticklabels(tick_labels)
    for _, spine in heatmap.spines.items():
        spine.set_visible(True)
    axes[i].set_xlabel(xlabel.replace("_", " ").capitalize())
    axes[i].set_ylabel(ylabel.replace("_", " ").capitalize())
    
plt.tight_layout()

plt.savefig("gnps_lib.png", dpi=300, bbox_inches="tight")
plt.show()
plt.close()