# BLINK Diagnostic Plots

In [None]:
# IPython.display is the public import location for display/HTML
# (importing display from IPython.core.display is deprecated).
from IPython.display import display, HTML

# Inject CSS that sets the notebook's .container element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

import sys
sys.path.insert(0, '../')

import blink

import matchms as mms
from matchms.exporting import save_as_mgf
from matchms.similarity import CosineGreedy, CosineHungarian

import pandas as pd
import numpy as np
import pickle
import time
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from matplotlib import pyplot as plt
from matplotlib.colors import LogNorm
import seaborn as sns

import matplotlib
from matplotlib.ticker import FormatStrFormatter

In [None]:
def create_mms_spectra(row):
    """
    Build a MatchMS-formatted Spectrum from one row of a spectra table.

    Rows whose index contains both 'spectrumid' and 'name' contribute the
    'name'/'spectrumid' set of metadata columns; every other row contributes
    the 'compound_name' set. row['spectrum'][0] supplies the m/z values and
    row['spectrum'][1] the intensities; both are cast to float arrays.
    """
    has_library_ids = 'spectrumid' in row.index and 'name' in row.index
    cols = (
        ['name', 'precursor_mz','inchi', 'smiles', 'spectrumid', 'scans', 'ionmode']
        if has_library_ids
        else ['compound_name', 'precursor_mz','inchi', 'smiles', 'scans', 'ionmode']
    )
    metadata = row[cols].to_dict()

    mz_values = np.array(row['spectrum'][0], dtype="float")
    peak_intensities = np.array(row['spectrum'][1], dtype="float")

    return mms.Spectrum(mz=mz_values, intensities=peak_intensities, metadata=metadata)

def remove_noise_ions(s):
    """
    Remove ions whose intensity is not above 1% of the base peak intensity.

    Parameters
    ----------
    s : indexable with s[0] = m/z values and s[1] = intensities,
        both numpy arrays of equal length.

    Returns
    -------
    numpy.ndarray with two rows (m/z, intensity) holding only the ions whose
    intensity / max(intensity) is strictly greater than 0.01. An empty
    spectrum is returned unchanged (as a 2 x 0 array) instead of raising.
    """
    mz, intensity = s[0], s[1]

    # max() of a zero-size array raises ValueError; there is nothing to filter anyway
    if intensity.size == 0:
        return np.array([mz, intensity])

    keep = (intensity / intensity.max()) > 0.01

    return np.array([mz[keep], intensity[keep]])

def filter_spectra(row, decimal=4):
    """
    Clean one spectrum: drop ions near the precursor, strip noise ions, round.

    Ions whose m/z is within 14 of row['precursor_mz'] are discarded first,
    the remainder goes through remove_noise_ions, and the resulting array
    (m/z and intensity rows alike) is rounded to `decimal` places.
    """
    spectrum = row['spectrum']
    mz_offset = abs(spectrum[0] - row['precursor_mz'])
    far_from_precursor = np.argwhere(mz_offset > 14).flatten()

    denoised = remove_noise_ions(spectrum[:, far_from_precursor])

    return denoised.round(decimal)

def round_precursor_mz(row, decimal=4):
    """
    Return row['precursor_mz'] rounded to `decimal` decimal places.
    """
    return round(row['precursor_mz'], decimal)

# Generate Test Data

In [None]:
#load test spectra with blink
berkeley_lab_spectra = blink.open_msms_file('/global/cfs/cdirs/metatlas/projects/spectral_libraries/BERKELEY-LAB.mgf')

#seeded generator so that re-running the notebook regenerates the same test sets;
#one shared generator keeps the two draws independent of each other
SAMPLE_SEED = 0
sample_rng = np.random.RandomState(SAMPLE_SEED)

#sample spectra for 10 million comparisons (1,000 x 10,000)
small_mgf = berkeley_lab_spectra.sample(1000, random_state=sample_rng)
medium_mgf = berkeley_lab_spectra.sample(10000, random_state=sample_rng)

#format spectra for MatchMS
small_spectra = small_mgf.apply(create_mms_spectra, axis=1).tolist()
medium_spectra = medium_mgf.apply(create_mms_spectra, axis=1).tolist()

In [None]:
#write each sampled set to its own MGF file
#NOTE(review): confirm whether save_as_mgf overwrites or appends when the target file already exists
for spectra_to_save, mgf_path in (
    (small_spectra, 'accuracy_test_data/small.mgf'),
    (medium_spectra, 'accuracy_test_data/medium.mgf'),
):
    save_as_mgf(spectra_to_save, mgf_path)

# Set Comparison Parameters

In [None]:
# m/z tolerance: passed to blink.discretize_spectra, used as min_diff when
# removing duplicate ions, and used to derive the MatchMS tolerance
tolerance = 0.01
# bin width passed to blink.discretize_spectra for the main comparison run
blink_bin_width = 0.0001

# Load & Filter Test Data

In [None]:
small  = blink.open_msms_file('accuracy_test_data/small.mgf')
medium = blink.open_msms_file('accuracy_test_data/medium.mgf')

#remove all zero intensity ions
small.spectrum = blink.spectral_normalization._filter_spectra(small.spectrum)
medium.spectrum = blink.spectral_normalization._filter_spectra(medium.spectrum)

#filter and round m/z values using filter_spectra (defined in cell 2)
small.spectrum = small.apply(filter_spectra, axis=1)
medium.spectrum = medium.apply(filter_spectra, axis=1)

#round precursor m/z on BOTH sets; the column name must be spelled precursor_mz,
#because assigning to a misspelled attribute leaves the real column unrounded
small.precursor_mz = small.apply(round_precursor_mz, axis=1)
medium.precursor_mz = medium.apply(round_precursor_mz, axis=1)

#reconcile duplicate noise ions
small.spectrum = blink.spectral_normalization._remove_duplicate_ions(small.spectrum, min_diff = tolerance)
medium.spectrum = blink.spectral_normalization._remove_duplicate_ions(medium.spectrum, min_diff = tolerance)

In [None]:
# sanity check: dimensions of the filtered small set
small.shape

In [None]:
# sanity check: dimensions of the filtered medium set
medium.shape

# Make 10 Million Comparisons Using BLINK

In [None]:
#discretize the small and medium sets with BLINK using the shared comparison parameters
S1 = blink.discretize_spectra(
    small.spectrum,
    medium.spectrum,
    small.precursor_mz,
    medium.precursor_mz,
    intensity_power=0.5,
    bin_width=blink_bin_width,
    tolerance=tolerance,
)

In [None]:
%%time
# score the discretized spectra; later cells read the 'mzi' entry as the
# BLINK score and the 'mzc' entry as the BLINK match count
S12 = blink.score_sparse_spectra(S1)

# Make Same Comparisons Using MatchMS

In [None]:
#MatchMS tolerance is the BLINK tolerance reduced by one BLINK bin width
#NOTE(review): presumably compensates for BLINK's binning - confirm
matchms_tol = round(tolerance - blink_bin_width, 4)

#MatchMS parameters (shared by both scorers)
scorer_options = dict(tolerance=matchms_tol, intensity_power=0.5)
cos = CosineGreedy(**scorer_options)
cos_hungarian = CosineHungarian(**scorer_options)

#format spectra for MatchMS
MMS1 = small.apply(create_mms_spectra, axis=1)
MMS2 = medium.apply(create_mms_spectra, axis=1)

In [None]:
%%time
#greedy cosine results for every (small, medium) pair, stored under 'greedy_cos'
MMS12 = {'greedy_cos': cos.matrix(references=MMS1, queries=MMS2)}

## Comparison Plots

In [None]:
# Scatter of MatchMS greedy cosine scores against BLINK scores for the same pairs.
fig, ax = plt.subplots(1, figsize=(12, 12))

x = MMS12['greedy_cos']['score'].flatten()
y = S12['mzi'].toarray().flatten()

# keep only pairs where BOTH methods report a non-zero score
# (the product is zero as soon as either score is zero)
idx = (x*y)>0
x = x[idx]
y = y[idx]
nonzero_score = len(x)

# pairs whose scores differ by more than 0.001
idx_diff = abs(x-y)>0.001
# vectorised count; the builtin sum() would iterate the array element by element
different_score = idx_diff.sum()

ax.plot(x[idx_diff], y[idx_diff],'.', markersize=25, alpha=0.6,label='Unequal: %.3f%%'%(100*different_score/nonzero_score), color='#ff7f0e', rasterized=True)
ax.plot(x[~idx_diff], y[~idx_diff], '.',markersize=25, alpha=0.6, label='Equal: %.1f%%'%(100*(nonzero_score-different_score)/nonzero_score), color='#1f77b4', rasterized=True)
ax.set_xlabel('MatchMS (Greedy) Score',fontsize=40)
ax.set_ylabel('BLINK Score',fontsize=40)

# red guide lines at a score of 0.7 on each axis
ax.hlines(y=0.7, xmin=0, xmax=0.7, color='r', linestyle='-', linewidth=4)
ax.vlines(x=0.7, ymin=0, ymax=0.7, color='r', linestyle='-', linewidth=4)
ax.tick_params(axis='both', labelsize=36)
ax.legend(loc=2, prop={'size': 36})
ax.set_ylim(bottom=0)
ax.set_xlim(left=0)

ax.set_xticks(np.arange(0, 1.2, 0.2))
ax.set_yticks(np.arange(0, 1.2, 0.2))

# explicit labels avoid floating-point noise from np.arange in the tick text
labels = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
ax.set_yticklabels(labels)
ax.set_xticklabels(labels)

ax.set_aspect('equal')

ax.grid()

# fig.savefig('cos_score-accuracy_benchmarking.pdf', bbox_inches="tight")
plt.show()

In [None]:
# Scatter of MatchMS greedy match counts against BLINK match counts for the same pairs.
fig, ax = plt.subplots(1, figsize=(12, 12))

x = MMS12['greedy_cos']['matches'].flatten()
y = S12['mzc'].toarray().flatten()

# keep only pairs where BOTH methods report a non-zero match count
idx = (x*y)>0
x = x[idx]
y = y[idx]
nonzero_matches = len(x)

# indices where match counts don't agree
idx_diff = abs(x-y)>0
# vectorised count; the builtin sum() would iterate the array element by element
different_matches = idx_diff.sum()

# legend percentages are computed from the MATCH statistics of this cell,
# not from the score statistics left over from the previous plot
ax.plot(x[idx_diff], y[idx_diff],'.', markersize=25, alpha=0.6,label='Unequal: %.3f%%'%(100*different_matches/nonzero_matches), color='#ff7f0e', rasterized=True)
ax.plot(x[~idx_diff], y[~idx_diff], '.',markersize=25, alpha=0.6, label='Equal: %.1f%%'%(100*(nonzero_matches-different_matches)/nonzero_matches), color='#1f77b4', rasterized=True)
ax.set_xlabel('MatchMS (Greedy) Matches',fontsize=40)
ax.set_ylabel('BLINK Matches',fontsize=40)

# red guide lines at 6 matches on each axis
ax.hlines(y=6, xmin=0, xmax=6, color='r', linestyle='-', linewidth=4)
ax.vlines(x=6, ymin=0, ymax=6, color='r', linestyle='-', linewidth=4)
ax.tick_params(axis='both', labelsize=36)
ax.legend(loc=2, prop={'size': 36})
ax.set_ylim(bottom=0, top=150)
ax.set_xlim(left=0, right=150)
ax.set_xticks(np.arange(0, 175, 25))
ax.set_yticks(np.arange(0, 175, 25))

ax.set_aspect('equal')

ax.grid()

# fig.savefig('cos_matches-accuracy_benchmarking.pdf', bbox_inches="tight")
plt.show()

# Benchmark BLINK Across Bin Widths

In [None]:
#BLINK bin widths to benchmark, smallest to largest
blink_bin_sizes = [1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2]

#flattened MatchMS greedy results, reused as the reference for every bin width
greedy_reference = MMS12['greedy_cos']
x_init_score = greedy_reference['score'].flatten()
x_init_matches = greedy_reference['matches'].flatten()

In [None]:
# Per-bin-width results: scoring time plus agreement counts against MatchMS.
bin_size_benchmark = {'scoring_time':[], 'bin_width':[], 'different_score':[], 'nonzero_score':[], 'different_matches':[], 'nonzero_matches':[]}

for bin_width in blink_bin_sizes:
    print(bin_width)
    S1 = blink.discretize_spectra(small.spectrum, medium.spectrum, small.precursor_mz, medium.precursor_mz, intensity_power=0.5, bin_width=bin_width, tolerance=tolerance)

    # time only the scoring step; perf_counter is the monotonic high-resolution clock
    t0 = time.perf_counter()
    S12 = blink.score_sparse_spectra(S1)
    t1 = time.perf_counter()

    score_time = t1 - t0

    y_score = S12['mzi'].toarray().flatten()
    y_matches = S12['mzc'].toarray().flatten()

    # keep only pairs where BOTH methods report a non-zero score
    idx = (x_init_score*y_score)>0
    x_score = x_init_score[idx]
    y_score = y_score[idx]
    # count the pairs kept for THIS bin width (not a leftover array from a plotting cell)
    nonzero_scores = len(x_score)

    # indices that are more than 0.001 different
    idx_diff = abs(x_score-y_score)>0.001
    different_scores = idx_diff.sum()

    bin_size_benchmark['different_score'].append(different_scores)
    bin_size_benchmark['nonzero_score'].append(nonzero_scores)

    # keep only pairs where BOTH methods report a non-zero match count
    idx = (x_init_matches*y_matches)>0
    x_matches = x_init_matches[idx]
    y_matches = y_matches[idx]
    nonzero_matches = len(x_matches)

    # indices where match counts don't agree
    idx_diff = abs(x_matches-y_matches)>0
    different_matches = idx_diff.sum()

    bin_size_benchmark['different_matches'].append(different_matches)
    bin_size_benchmark['nonzero_matches'].append(nonzero_matches)

    bin_size_benchmark['scoring_time'].append(score_time)
    bin_size_benchmark['bin_width'].append(bin_width)

In [None]:
# Tabulate the benchmark and derive the fraction of non-zero scores that agree with MatchMS.
plot_df = pd.DataFrame.from_dict(bin_size_benchmark, orient='columns')
plot_df['score_agreement_ratio'] = (plot_df['nonzero_score'] - plot_df['different_score']) / plot_df['nonzero_score']

bin_widths = plot_df['bin_width'].values
scoring_times = plot_df['scoring_time'].values
agreement = plot_df['score_agreement_ratio'].values

fig, ax = plt.subplots(2, figsize=(18, 10), sharex = True)
time_ax, agreement_ax = ax

# top panel: scoring time per bin width on log-log axes
time_ax.plot(bin_widths, scoring_times, '-o', linewidth=3, markersize=15)

time_ax.set_ylabel('Compute Time', fontsize=40)
time_ax.tick_params(axis='both', labelsize=36)
time_ax.set_yscale('log')
time_ax.set_xscale('log')
# dashed red marker at bin width 0.001
# NOTE(review): the lower end is min-0.01 on a log axis - check behaviour if a timing is below 0.01
time_ax.vlines(x=0.001, ymin=scoring_times.min()-.01, ymax=scoring_times.max()+50, color='r', linestyle='--', linewidth=4)
time_ax.grid()

# bottom panel: agreement ratio per bin width
agreement_ax.plot(bin_widths, agreement, '-o', linewidth=3, markersize=15, color = 'tab:orange')

agreement_ax.set_ylabel('Equivalence', fontsize=40)
agreement_ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
agreement_ax.set_xlabel('BLINK Bin Width', fontsize=40)
agreement_ax.tick_params(axis='both', labelsize=36)
agreement_ax.vlines(x=0.001, ymin=agreement.min(), ymax=1, color='r', linestyle='--', linewidth=4)
agreement_ax.grid()

fig.savefig('cos_across-bins_benchmark.pdf', bbox_inches="tight")
plt.show()