In [None]:
import os
import sys
# Make sure all code is in the PATH.
# `os.path.expanduser("~")` is used rather than `os.environ["HOME"]` so that
# the lookup does not raise a KeyError when HOME is not set (e.g. on Windows).
src_dir = os.path.normpath(
    os.path.join(
        os.path.expanduser("~"), "Projects", "cosine_neutral_loss", "src"
    )
)
# Only add the directory once, so that re-running this cell does not keep
# growing `sys.path` with duplicate entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
import functools
import lzma
import re

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numba as nb
import numpy as np
import pandas as pd
import pyteomics.mgf
import seaborn as sns
import spectrum_utils.spectrum as sus
import tqdm.notebook as tqdm
from matplotlib.colors import LogNorm

import similarity

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-<name>"
# and later removed the old "seaborn-<name>" aliases. Prefer the new name
# when this installation provides it and fall back to the legacy name
# otherwise, so the notebook runs on both old and new matplotlib versions.
style_names = []
for style in ("white", "paper"):
    versioned_name = f"seaborn-v0_8-{style}"
    if versioned_name in plt.style.available:
        style_names.append(versioned_name)
    else:
        style_names.append(f"seaborn-{style}")
plt.style.use(style_names)
plt.rc("font", family="sans-serif")
sns.set_palette(["#9e0059", "#6da7de", "#ee266d", "#dee000", "#eb861e"])
sns.set_context("paper")

In [None]:
# Matches every run of one or more characters that are not ASCII letters.
regex_non_alpha = re.compile(r"[^A-Za-z]+")


@functools.lru_cache(maxsize=None)
def remove_mod(sequence):
    """
    Strip all non-letter characters from a sequence string.

    Results are memoized without a size limit, so repeated sequences are
    only processed once.

    Parameters
    ----------
    sequence : str
        The sequence string, possibly containing non-letter characters.

    Returns
    -------
    str
        The input with every character outside A-Z and a-z removed.
    """
    # Splitting on the non-letter runs and gluing the remaining pieces back
    # together leaves only the letters, in their original order.
    letter_chunks = regex_non_alpha.split(sequence)
    return "".join(letter_chunks)

In [None]:
# Read all spectra from the MGF.
# MassIVE-KB (version 2018-06-15) downloaded from
# https://massive.ucsd.edu/ProteoSAFe/static/massive-kb-libraries.jsp
filename = ("../data/external/LIBRARY_CREATION_AUGMENT_LIBRARY_TEST-82c0124b-"
            "download_filtered_mgf_library-main.mgf.xz")
spectra = []
# The xz archive is opened in text mode and handed to the MGF reader, so the
# file is decompressed while it is being parsed.
with lzma.open(filename, "rt") as xz_in, pyteomics.mgf.MGF(xz_in) as f_in:
    for spectrum_dict in tqdm.tqdm(f_in):
        params = spectrum_dict["params"]
        # The "seq" entry is passed as the first argument; only the first
        # element of the "pepmass" and "charge" entries is used.
        spectrum = sus.MsmsSpectrum(
            params["seq"],
            float(params["pepmass"][0]),
            int(params["charge"][0]),
            spectrum_dict["m/z array"],
            spectrum_dict["intensity array"],
        )
        spectra.append(spectrum)

In [None]:
# Extract the metadata (peptide sequence and charge).
sequences = [spectrum.identifier for spectrum in spectra]
charges = [spectrum.precursor_charge for spectrum in spectra]
metadata = (
    pd.DataFrame({"sequence": sequences, "charge": charges})
    # Replace every "I" by "L" in the sequence strings (presumably so that
    # isoleucine and leucine are treated as the same residue).
    .assign(sequence=lambda df: df["sequence"].str.replace("I", "L"))
    # Additional column holding the sequence with only its letters kept.
    .assign(sequence_no_mod=lambda df: df["sequence"].apply(remove_mod))
)

In [None]:
@nb.njit
def generate_pairs(spectrum_indexes, sequences, sequences_no_mod):
    """
    Yield the spectrum indexes of entries that share an unmodified sequence
    but differ in their full sequence.

    For every position, the scan moves forward only while the following
    entries have the same unmodified sequence and stops at the first
    mismatch. Matching entries therefore have to be stored contiguously
    for all pairs to be found.

    Parameters
    ----------
    spectrum_indexes : sequence of int
        The spectrum index belonging to each position.
    sequences : sequence of str
        The full sequence at each position.
    sequences_no_mod : sequence of str
        The unmodified sequence at each position.

    Yields
    ------
    int
        A flat stream of indexes: two consecutive values form one pair.
    """
    n_sequences = len(sequences)
    for first in range(len(spectrum_indexes)):
        for second in range(first + 1, n_sequences):
            # End of the run of identical unmodified sequences.
            if sequences_no_mod[first] != sequences_no_mod[second]:
                break
            # Identical full sequences do not form a pair.
            if sequences[first] != sequences[second]:
                yield spectrum_indexes[first]
                yield spectrum_indexes[second]

In [None]:
# Extract indexes for pairs of spectra whose peptides differ by a
# modification.
pairs = []
# Iterate over the charge states that actually occur (in ascending order), so
# that charge values without any spectra are skipped instead of handing
# empty inputs to `generate_pairs`.
for charge, metadata_charge in metadata.groupby("charge"):
    # `generate_pairs` only scans forward while the unmodified sequence stays
    # the same, so all rows sharing an unmodified sequence must be
    # contiguous. Sorting on the modified sequence alone does not guarantee
    # this (e.g. "AAM+15.995K" < "AAMAK" < "AAMK"), hence sort on the
    # unmodified sequence first and use the full sequence as a tie-breaker.
    metadata_charge = (metadata_charge
                       .sort_values(["sequence_no_mod", "sequence"])
                       .reset_index())
    pairs.append(
        np.fromiter(
            generate_pairs(
                # After `reset_index`, the "index" column holds the row
                # labels of `metadata`.
                metadata_charge["index"].values,
                nb.typed.List(metadata_charge["sequence"]),
                nb.typed.List(metadata_charge["sequence_no_mod"]),
            ),
            np.uint32)
        # The generator yields a flat stream; every two values are one pair.
        .reshape((-1, 2))
    )
# `np.vstack` raises on an empty list, so fall back to an empty (0, 2) array
# when there is no metadata at all.
pairs = np.vstack(pairs) if pairs else np.empty((0, 2), dtype=np.uint32)

In [None]:
# Compute similarities between spectrum pairs.
fragment_mz_tolerance = 0.05

# Column name -> similarity function, in the order in which the functions
# are evaluated for each pair.
similarity_functions = {
    "cosine": similarity.cosine,
    "modified_cosine": similarity.modified_cosine,
    "neutral_loss": similarity.neutral_loss,
}
scores = {name: [] for name in similarity_functions}
for i, j in tqdm.tqdm(pairs):
    spectrum_i, spectrum_j = spectra[i], spectra[j]
    for name, similarity_function in similarity_functions.items():
        # Only the first element of each function's return value is kept.
        result = similarity_function(
            spectrum_i, spectrum_j, fragment_mz_tolerance
        )
        scores[name].append(result[0])
cosines, modified_cosines, neutral_losses = scores.values()
similarities = pd.DataFrame(scores)
similarities.to_parquet("massivekb_peptide_mods.parquet")

In [None]:
# Plot 2D histograms (as log-scaled heatmaps) comparing the similarity
# measures against each other, one panel per combination.
fig, axes = plt.subplots(1, 3, figsize=(7.2, 7.2 / 1.618))

bins = 50
# Tick positions are expressed in heatmap cell coordinates (0 .. bins) and
# labelled with the corresponding similarity values (0.00 .. 1.00).
tick_positions = np.arange(0, bins + 1, bins / 4)
tick_labels = np.asarray([f"{a:.2f}" for a in np.arange(0, 1.01, 0.25)])
for i, (xlabel, ylabel) in enumerate(
        [
            ["cosine", "modified_cosine"],
            ["neutral_loss", "cosine"],
            ["neutral_loss", "modified_cosine"]
        ]
    ):
    hist, _, _ = np.histogram2d(
        similarities[xlabel],
        similarities[ylabel],
        bins=bins,
        range=[[0, 1], [0, 1]],
    )
    heatmap = sns.heatmap(
        # Rotate so that x runs along the columns and y increases upwards.
        np.rot90(hist),
        cmap="viridis",
        cbar=False,
        square=True,
        xticklabels=False,
        yticklabels=False,
        ax=axes[i],
        norm=LogNorm(),
    )
    # Matplotlib locator instances should not be shared between axes, so a
    # fresh locator is created for every axis.
    axes[i].yaxis.set_major_locator(mticker.FixedLocator(tick_positions))
    # Row 0 of the heatmap is drawn at the top, hence the reversed labels.
    axes[i].set_yticklabels(tick_labels[::-1])
    axes[i].xaxis.set_major_locator(mticker.FixedLocator(tick_positions))
    axes[i].set_xticklabels(tick_labels)
    for spine in heatmap.spines.values():
        spine.set_visible(True)
    axes[i].set_xlabel(xlabel.replace("_", " ").capitalize())
    axes[i].set_ylabel(ylabel.replace("_", " ").capitalize())

plt.tight_layout()

plt.savefig("massivekb_peptide_mods.png", dpi=300, bbox_inches="tight")
plt.show()
plt.close()