# BLINK Diagnostic Plots

In [None]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# the `IPython.core.display` location is deprecated for these names.
from IPython.display import display, HTML
# Stretch the notebook's `.container` element to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

import sys
sys.path.insert(0, '../')

import blink

import matchms as mms
from matchms.exporting import save_as_mgf
from matchms.similarity import CosineGreedy, ModifiedCosine

import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from matplotlib import pyplot as plt
from matplotlib.colors import LogNorm
import seaborn as sns

import matplotlib
# NOTE(review): fonttype 42 is understood to make the PDF/PS backends embed text
# as TrueType (Type 42) fonts so saved figures keep editable text -- confirm
# against the matplotlib rcParams documentation.
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [None]:
def create_mms_spectra(row):
    """
    build a MatchMS-formatted Spectrum from one row of a spectra table

    row['spectrum'][0] supplies the m/z values and row['spectrum'][1] the
    intensities (both cast to float); the name, precursor_mz, inchi, smiles
    and spectrumid entries of the row are passed along as metadata.
    """
    metadata_fields = ['name', 'precursor_mz','inchi', 'smiles', 'spectrumid']
    spectrum_metadata = row[metadata_fields].to_dict()

    mz_values = np.array(row['spectrum'][0], dtype="float")
    intensity_values = np.array(row['spectrum'][1], dtype="float")

    return mms.Spectrum(
        mz=mz_values,
        intensities=intensity_values,
        metadata=spectrum_metadata,
    )

def remove_noise_ions(s):
    """
    remove ions at or below 1% of base peak intensity

    s: 2 x N numpy array, s[0] holds m/z values and s[1] holds intensities
    returns: 2 x M numpy array containing only the ions whose intensity is
             strictly greater than 1% of the largest intensity in s

    An empty spectrum, or one whose base peak intensity is zero, yields an
    empty 2 x 0 array instead of raising on .max() or dividing by zero.
    """
    mz, intensity = s[0], s[1]

    # .max() raises ValueError on a zero-size array, so short-circuit here
    if len(intensity) == 0:
        return np.array([mz, intensity])

    base_peak = intensity.max()
    # a zero base peak makes every ratio nan/-inf, so no ion would pass the
    # threshold below; return the empty result without the divide warning
    if base_peak == 0:
        return np.array([mz[:0], intensity[:0]])

    keep = (intensity / base_peak) > 0.01

    return np.array([mz[keep], intensity[keep]])

def filter_spectra(row, decimal=3):
    """
    drop ions within 14 m/z of the precursor, remove noise ions, then round
    every value of the remaining spectrum array to `decimal` places
    """
    spectrum = row['spectrum']
    # ions farther than 14 m/z from the precursor are the ones that survive
    far_from_precursor = abs(spectrum[0] - row['precursor_mz']) > 14
    kept_columns = np.flatnonzero(far_from_precursor)

    denoised = remove_noise_ions(spectrum[:, kept_columns])

    return denoised.round(decimal)

def round_precursor_mz(row, decimal=3):
    """
    return row['precursor_mz'] rounded to `decimal` places
    """
    return round(row['precursor_mz'], decimal)

# Generate Test Data

In [None]:
#load test spectra with blink
#NOTE: site-specific absolute path; point this at a local copy of the library when running elsewhere
BERKELEY_LAB_MGF = '/global/homes/t/tharwood/spectral_libraries/BERKELEY-LAB.mgf'
berkeley_lab_spectra = blink.open_msms_file(BERKELEY_LAB_MGF)

#sample spectra for 10 million comparisons (1,000 x 10,000)
#fixed seeds so a fresh Restart & Run All regenerates the same test sets;
#the two seeds differ so the small draw is not simply a prefix of the medium draw
SMALL_SAMPLE_SEED = 0
MEDIUM_SAMPLE_SEED = 1
small_mgf = berkeley_lab_spectra.sample(1000, random_state=SMALL_SAMPLE_SEED)
medium_mgf = berkeley_lab_spectra.sample(10000, random_state=MEDIUM_SAMPLE_SEED)

#format spectra for MatchMS
small_spectra = small_mgf.apply(create_mms_spectra, axis=1).tolist()
medium_spectra = medium_mgf.apply(create_mms_spectra, axis=1).tolist()

#save spectra
save_as_mgf(small_spectra, 'accuracy_test_data/small.mgf')
save_as_mgf(medium_spectra, 'accuracy_test_data/medium.mgf')

# Load & Filter Test Data

In [None]:
small  = blink.open_msms_file('accuracy_test_data/small.mgf')
medium = blink.open_msms_file('accuracy_test_data/medium.mgf')

#remove all zero intensity ions
small['spectrum'] = blink.filter_spectra(small.spectrum, small.precursor_mz)
medium['spectrum'] = blink.filter_spectra(medium.spectrum, medium.precursor_mz)

# #remove ions within m/z tolerance of one another in spectrum
# #note: This is for MatchMS BLINK parity. BLINK factors all ions into score, while MatchMS only selects one ion within tolerance. 
# small.spectrum = blink.remove_duplicate_ions(small.spectrum, min_diff= 0.01)
# medium.spectrum = blink.remove_duplicate_ions(medium.spectrum, min_diff= 0.01)

#filter and round m/z values using function defined in cell 2
small['spectrum'] = small.apply(filter_spectra, axis=1)
medium['spectrum'] = medium.apply(filter_spectra, axis=1)

#round precursor m/z of BOTH tables; column (bracket) assignment is used so a
#misspelled name cannot silently create a stray attribute instead of updating the column
small['precursor_mz'] = small.apply(round_precursor_mz, axis=1)
medium['precursor_mz'] = medium.apply(round_precursor_mz, axis=1)

#convert spectra into BLINK sparse matrix format
S1 = blink.discretize_spectra(small.spectrum,  small.precursor_mz,  intensity_power=0.5, bin_width=0.001)
S2 = blink.discretize_spectra(medium.spectrum, medium.precursor_mz, intensity_power=0.5, bin_width=0.001)

# Make 10 Million Comparisons Using BLINK

In [None]:
%%time
# Score the discretized spectra S1 against S2 with BLINK at a tolerance of 0.01.
# The plotting cells further down read the 'mzi', 'mzc', 'nli' and 'nlc' entries of S12.
S12 = blink.score_sparse_spectra(S1, S2, tolerance=0.01)

# Make Same Comparisons Using MatchMS

In [None]:
#build MatchMS Spectrum objects from the filtered tables
MMS1 = small.apply(create_mms_spectra, axis=1)
MMS2 = medium.apply(create_mms_spectra, axis=1)

#MatchMS scorers; both are configured with tolerance 0.0099 and intensity_power 0.5
cos = CosineGreedy(
    tolerance=0.0099,
    intensity_power=0.5,
)
mod = ModifiedCosine(
    tolerance=0.0099,
    intensity_power=0.5,
)

In [None]:
%%time
MMS12 = {}
MMS12['cos'] = cos.matrix(references=MMS1, queries=MMS2)
# MMS12['mod'] = mod.matrix(references=MMS1, queries=MMS2)

# MMS12['cos'] = cos.matrix(references=MMS1, queries=MMS1)

## Diagnostic Plots

In [None]:
%matplotlib inline
fig, ax = plt.subplots(figsize=(12, 6),nrows=1,ncols=2)
ax = ax.flatten()

x = MMS12['cos']['score'].flatten()
y = S12['mzi'].toarray().flatten()

# one or both have to be non-zero
idx = (x*y)>0
x = x[idx]
y = y[idx]
nonzero_score = len(x)

# indices that are more than 0.001 different
idx_diff = abs(x-y)>0.001
different_score = sum(idx_diff)

# median difference of scores score more than 0.001 different
if different_score > 0:
    diff = (x[idx_diff] - y[idx_diff])
    median_diff_score = np.median(diff)
else:
    median_diff_score = 0

ax[0].plot(x[idx_diff], y[idx_diff],'.', markersize=20, alpha=0.6,label='Unequal: %.4f%%'%(100*different_score/nonzero_score), color='#ff7f0e')
ax[0].plot(x[~idx_diff], y[~idx_diff], '.',markersize=20, alpha=0.6, label='Equal: %.4f%%'%(100*(nonzero_score-different_score)/nonzero_score), color='#1f77b4')
ax[0].set_xlabel('MatchMS Score',fontsize=20)
ax[0].set_ylabel('BLINK Score',fontsize=20)
ax[0].legend()

x = MMS12['cos']['matches'].flatten()
y = S12['mzc'].toarray().flatten()

# one or both have to be non-zero
idx = (x*y)>0
x = x[idx]
y = y[idx]
nonzero_matches = len(x)

# indices where match counts don't agree
idx_diff = abs(x-y)>0
different_matches = sum(idx_diff)

# median difference of different matches 
if different_matches > 0:
    diff = (x[idx_diff] - y[idx_diff])
    median_diff_matches = np.median(diff)
else:
    median_diff_matches = 0

ax[1].plot(x[idx_diff], y[idx_diff],'.', markersize=20, alpha=0.6,label='Unequal: %.4f%%'%(100*different_matches/nonzero_matches), color='#ff7f0e')
ax[1].plot(x[~idx_diff], y[~idx_diff], '.',markersize=20, alpha=0.6, label='Equal: %.4f%%'%(100*(nonzero_matches-different_matches)/nonzero_matches), color='#1f77b4')
ax[1].set_xlabel('MatchMS Matching Ions',fontsize=20)
ax[1].set_ylabel('BLINK Matching Ions',fontsize=20)
ax[1].legend()

# fig.savefig('cos_accuracy_benchmarking.pdf')

In [None]:
%matplotlib inline
fig, ax = plt.subplots(figsize=(12, 6),nrows=1,ncols=2)
ax = ax.flatten()

x = MMS12['mod']['score'].flatten()
y = np.maximum(S12['mzi'].toarray().flatten(),S12['nli'].toarray().flatten())

# one or both have to be non-zero
idx = (x*y)>0
x = x[idx]
y = y[idx]
nonzero_score = len(x)

# indices that are more than 0.001 different
idx_diff = abs(x-y)>0.001
different_score = sum(idx_diff)

# median difference of scores score more than 0.001 different
if different_score > 0:
    diff = (x[idx_diff] - y[idx_diff])
    median_diff_score = np.median(diff)
else:
    median_diff_score = 0

ax[0].plot(x[idx_diff], y[idx_diff],'.', markersize=20, alpha=0.6,label='Unequal: %.4f%%'%(100*different_score/nonzero_score), color='#ff7f0e')
ax[0].plot(x[~idx_diff], y[~idx_diff], '.',markersize=20, alpha=0.6, label='Equal: %.4f%%'%(100*(nonzero_score-different_score)/nonzero_score), color='#1f77b4')
ax[0].set_xlabel('MatchMS Score',fontsize=20)
ax[0].set_ylabel('BLINK Score',fontsize=20)
ax[0].legend()

x = MMS12['mod']['matches'].flatten()
y = np.maximum(S12['mzc'].toarray().flatten(),S12['nlc'].toarray().flatten())

# one or both have to be non-zero
idx = (x*y)>0
x = x[idx]
y = y[idx]
nonzero_matches = len(x)

# indices where match counts don't agree
idx_diff = abs(x-y)>0
different_matches = sum(idx_diff)

# median difference of different matches 
if different_matches > 0:
    diff = (x[idx_diff] - y[idx_diff])
    median_diff_matches = np.median(diff)
else:
    median_diff_matches = 0

ax[1].plot(x[idx_diff], y[idx_diff],'.', markersize=20, alpha=0.6,label='Unequal: %.4f%%'%(100*different_matches/nonzero_matches), color='#ff7f0e')
ax[1].plot(x[~idx_diff], y[~idx_diff], '.',markersize=20, alpha=0.6, label='Equal: %.4f%%'%(100*(nonzero_matches-different_matches)/nonzero_matches), color='#1f77b4')
ax[1].set_xlabel('MatchMS Matching Ions',fontsize=20)
ax[1].set_ylabel('BLINK Matching Ions',fontsize=20)
ax[1].legend()

# fig.savefig('mod_accuracy_benchmarking.pdf')