In [6]:
import os
import sys
from pathlib import Path
# Make sure all project code in ../src/ is importable.
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if "../src/" not in sys.path:
    sys.path.append("../src/")

In [7]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numba as nb
import numpy as np
import pandas as pd
import pyteomics.mgf
import seaborn as sns
import tqdm.notebook as tqdm
from matplotlib.colors import LogNorm
import spectrum_utils.spectrum as sus
import similarity
import bile_mods
# import create_similarity_matrix as main_utils

AttributeError: module 'scipy' has no attribute '_lib'

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# "seaborn-v0_8-<name>" and later releases dropped the old aliases, so use
# whichever spelling the installed Matplotlib actually provides.
available_styles = set(plt.style.available)
plt.style.use(
    [
        style_name
        if style_name in available_styles
        else style_name.replace("seaborn-", "seaborn-v0_8-", 1)
        for style_name in ("seaborn-white", "seaborn-paper")
    ]
)
plt.rc("font", family="sans-serif")
sns.set_palette(["#9e0059", "#6da7de", "#ee266d", "#dee000", "#eb861e"])
sns.set_context("paper")

## Setup parameters for the comparison


In [None]:
# ---- Public parameters ----
# Spectral library to analyse.
library_file = "../data/BILELIB19.mgf"
# library_file = "../data/20220418_ALL_GNPS_NO_PROPOGATED.mgf"

# Name of the analysis; it becomes part of the analysis ID below.
analysis_name = "oxygen"

# A square root transformation of the intensities is often applied to limit
# the impact of highly abundant signals.
apply_sqrt = True

# Size of the subset of spectral pairs.
n_spectral_pairs = 10

# Spectra with fewer signals than this minimum are removed.
min_n_signals = 6

# Signal alignment tolerance.
abs_mz_tolerance = 0.02
# Only allow a precursor m/z difference of at most:
max_mz_delta = 200
# If defined, only pairs with this specific delta m/z between the two spectra
# are searched (e.g. oxygen: 15.994914). Set to -1 to disable.
specific_mod_mz = 15.9949
# specific_mod_mz = -1

# Modification list: free bile acids compared to amino acid conjugates, or
# the delta m/z between different conjugated bile acids.
mod_list = np.empty(0, dtype=float)
# mod_list = bile_mods.get_as_mods()
# mod_list = bile_mods.get_as_exchange()

library_file_name_without_ext = Path(library_file).stem

# The analysis ID is used for file export. It encodes either the maximum
# precursor delta (no specific modification requested) or the specific delta.
if specific_mod_mz <= 0:
    id_template = "{}_{}_sqrt_{}_{}pairs_{}min_signals_{}maxdelta_{}mods"
    id_delta = max_mz_delta
else:
    id_template = "{}_{}_sqrt_{}_{}pairs_{}min_signals_{}specific_delta_{}mods"
    id_delta = specific_mod_mz

# Dots (e.g. from the float delta) are replaced by "i" in the ID.
analysis_id = id_template.format(
    library_file_name_without_ext,
    analysis_name,
    apply_sqrt,
    n_spectral_pairs,
    min_n_signals,
    id_delta,
    len(mod_list),
).replace(".", "i")

# Output filename of the pairs, containing only the parameters relevant to
# pair selection: the sqrt flag is stripped from the name.
pairs_filename = "temp/{}_pairs.parquet".format(analysis_id)
for sqrt_tag in ("sqrt_True_", "sqrt_False_"):
    pairs_filename = pairs_filename.replace(sqrt_tag, "")

# Output filename of the spectra.
spectra_filename = "tempspectra/spectra_{}_{}min_signals_sqrt_{}.parquet".format(
    library_file_name_without_ext,
    min_n_signals,
    apply_sqrt,
)

In [None]:
# Profile spectra in the library file contain zero intensity values; this
# check is used to tell them apart from centroided spectra.
def is_centroid(spectrum_dict):
    """Return True if every value in the "intensity array" entry is > 0.

    An empty intensity array counts as centroided.
    """
    for intensity in spectrum_dict["intensity array"]:
        if not (intensity > 0):
            return False
    return True

In [None]:
# Load the library spectra and their precursor m/z values, but only when at
# least one of the two cached files (pairs or spectra) is missing.
# NOTE(review): `main_utils` is only imported in a commented-out line of the
# import cell (`import create_similarity_matrix as main_utils`); it has to be
# in scope for this cell to run - confirm.
spectra = None
precursor_mz_list = None
if not (os.path.isfile(pairs_filename) and os.path.isfile(spectra_filename)):
    spectra = main_utils.import_from_mgf()
    # Extract the precursor m/z values as filter argument. They are stored in
    # `precursor_mz_list`, the name the pair computation cell passes on;
    # previously they went into a separate `precursor_mz` variable, so
    # `precursor_mz_list` always stayed None.
    precursor_mz_list = nb.typed.List()
    for spectrum in spectra:
        precursor_mz_list.append(spectrum.precursor_mz)
    # Keep the old name as an alias in case other cells still refer to it.
    precursor_mz = precursor_mz_list

In [None]:
# Obtain the subset of spectrum pairs via main_utils.load_or_compute_pairs_df
# and report how many pairs will be compared.
pairs_df = main_utils.load_or_compute_pairs_df(precursor_mz_list)
n_pairs = len(pairs_df)
print("Comparing {} pairs".format(n_pairs))

In [None]:
# Compute the similarities for the selected pairs, hand them to
# main_utils.save_results, and print how many entries were produced.
similarities = main_utils.compute_similarity(spectra, pairs_df)
main_utils.save_results(similarities)
n_similarities = len(similarities)
print(n_similarities)

In [None]:
# Pairwise 2D histograms (log-scaled colour map) of the three similarity
# scores, drawn side by side and exported as a PNG.
fig, axes = plt.subplots(1, 3, figsize=(7.2, 7.2 / 1.618))

bins = 50
# Heatmap coordinates run from 0 to `bins`; a tick at every quarter of that
# range is labelled with the matching score value 0.00, 0.25, ..., 1.00.
tick_positions = np.arange(0, bins + 1, bins / 4)
tick_labels = np.asarray([f"{a:.2f}" for a in np.arange(0, 1.01, 0.25)])

# (x score, y score) column pairs, one per subplot.
score_pairs = [
    ("cos_score", "mod_score"),
    ("nl_score", "cos_score"),
    ("nl_score", "mod_score"),
]
for ax, (xlabel, ylabel) in zip(axes, score_pairs):
    hist, _, _ = np.histogram2d(
        similarities[xlabel],
        similarities[ylabel],
        bins=bins,
        range=[[0, 1], [0, 1]],
    )
    # np.rot90 puts the y bins on the rows, highest bin first, which is why
    # the y tick labels below are applied in reversed order.
    heatmap = sns.heatmap(
        np.rot90(hist),
        cmap="viridis",
        cbar=False,
        square=True,
        xticklabels=False,
        yticklabels=False,
        ax=ax,
        norm=LogNorm(),
    )
    # A Locator keeps a reference to the Axis it is attached to and must not
    # be shared between axes, so every axis gets its own FixedLocator.
    ax.yaxis.set_major_locator(mticker.FixedLocator(tick_positions))
    ax.set_yticklabels(tick_labels[::-1])
    ax.xaxis.set_major_locator(mticker.FixedLocator(tick_positions))
    ax.set_xticklabels(tick_labels)
    # Draw a visible frame around each heatmap.
    for spine in heatmap.spines.values():
        spine.set_visible(True)
    ax.set_xlabel(xlabel.replace("_", " ").capitalize())
    ax.set_ylabel(ylabel.replace("_", " ").capitalize())

plt.tight_layout()

# Make sure the output directory exists before saving the figure.
Path("results/img/").mkdir(parents=True, exist_ok=True)
plt.savefig("results/img/{}.png".format(analysis_name), dpi=300, bbox_inches="tight")
plt.show()
plt.close()