# BLINK / MatchMS Speed Benchmarking

In [1]:
# Stretch the notebook's HTML container to the full browser width.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Put the parent directory first on the module search path before importing
# blink -- presumably so the local checkout of BLINK is picked up; verify
# against the repository layout.
import sys
sys.path.insert(0, '../')

import blink

# Timing, numerics, tabulation and plotting used by the benchmark cells below.
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# MatchMS: the implementation BLINK is compared against in this notebook.
import matchms as mms
from matchms.similarity import CosineGreedy, ModifiedCosine

# Load Test Data

In [2]:
# Read the GNPS spectral library with BLINK's MS/MS file reader. The result is
# used below as a pandas DataFrame (column access, .sample, .apply(axis=1)).
# NOTE(review): the path is specific to one filesystem; adjust when running elsewhere.
gnps_all = blink.open_msms_file('/global/cfs/cdirs/metatlas/projects/spectral_libraries/ALL_GNPS_20221017.mgf')

# Keep only library entries whose precursor m/z is above 60.
# The rows themselves are filtered: assigning the masked column back
# (df['col'] = df['col'][mask]) would align on the index and merely turn the
# excluded precursor values into NaN, leaving those spectra in the table where
# they could still be drawn by the random sampling further down.
gnps_all = gnps_all[gnps_all['precursor_mz'] > 60]

In [3]:
def create_mms_spectra(row):
    """
    Convert one library row into a MatchMS ``Spectrum``.

    The row's 'spectrum' entry is read as a two-element sequence: element 0
    supplies the m/z values and element 1 the intensities, both cast to float
    arrays. The name, precursor m/z, InChI, SMILES and spectrum id fields of
    the row are passed along as the spectrum's metadata dictionary.
    """
    metadata_fields = ['name', 'precursor_mz','inchi', 'smiles', 'spectrumid']
    spectrum_metadata = row[metadata_fields].to_dict()

    peaks = row['spectrum']
    mz_values = np.array(peaks[0], dtype="float")
    intensity_values = np.array(peaks[1], dtype="float")

    return mms.Spectrum(mz=mz_values, intensities=intensity_values, metadata=spectrum_metadata)

def generate_sample_spectra(query_size, ref_size, msms_library=None, random_state=None):
    """
    Draw a random query sample and a random reference sample from a library.

    Parameters
    ----------
    query_size : int
        Number of rows to draw for the query set.
    ref_size : int
        Number of rows to draw for the reference set.
    msms_library : pandas.DataFrame, optional
        Table to sample from. When omitted, the module-level ``gnps_all`` is
        looked up at call time, so re-running the loading cell is picked up
        here (a ``msms_library=gnps_all`` default would freeze whatever object
        existed when this function was defined).
    random_state : int, optional
        Seed for reproducible sampling. When omitted, sampling is unseeded.

    Returns
    -------
    tuple
        ``(query_sample, ref_sample)``, each the result of
        ``msms_library.sample(...)``.
    """
    if msms_library is None:
        msms_library = gnps_all

    # A single generator is shared by both draws so that, with a seed, the two
    # samples come from successive states instead of being identical picks.
    rng = None if random_state is None else np.random.RandomState(random_state)

    query_sample = msms_library.sample(query_size, random_state=rng)
    ref_sample = msms_library.sample(ref_size, random_state=rng)

    return query_sample, ref_sample

# Speed Comparison

### Define Speed Benchmarking Parameters

In [4]:
# --- Benchmark loop settings ---
# iteration_num: how many size steps each replicate runs through
# replicate_num: how many times the whole size sweep is repeated
# increment_mult: factor applied to both sample sizes after every step
iteration_num, replicate_num = 4, 3
increment_mult = 10
initial_query_size = initial_ref_size = 100

# --- MatchMS scorers (identical tolerance and intensity weighting for both) ---
cos, mod = (
    scorer(tolerance=0.00099, intensity_power=0.5)
    for scorer in (CosineGreedy, ModifiedCosine)
)

# --- BLINK settings ---
bin_width, tolerance = 0.0001, 0.001

### Compute Comparison

In [None]:
# Time BLINK and MatchMS cosine scoring on randomly sampled query/reference sets.
#
# For every replicate the sample sizes start at initial_query_size /
# initial_ref_size and are multiplied by increment_mult at each of the
# iteration_num steps. One record per (replicate, step) is collected and the
# records are assembled into `df` at the end.
#
# time.perf_counter() is used for the measurements: it is a monotonic,
# high-resolution clock intended for timing intervals, whereas time.time() is
# wall-clock time that can be adjusted while a measurement is running.
speed_test_results = []

for replicate in range(1, replicate_num + 1):
    for step in range(iteration_num):
        query_size = initial_query_size * increment_mult ** step
        ref_size = initial_ref_size * increment_mult ** step

        query_sample, ref_sample = generate_sample_spectra(query_size, ref_size)

        # Input preparation for both tools is done outside the timed regions,
        # so only the scoring calls themselves are measured.
        S1 = blink.discretize_spectra(query_sample.spectrum, query_sample.precursor_mz, intensity_power=0.5, calc_network_score=False, bin_width=bin_width)
        S2 = blink.discretize_spectra(ref_sample.spectrum, ref_sample.precursor_mz, intensity_power=0.5, calc_network_score=False, bin_width=bin_width)

        MMS1 = query_sample.apply(create_mms_spectra, axis=1)
        MMS2 = ref_sample.apply(create_mms_spectra, axis=1)

        start = time.perf_counter()
        S12 = blink.score_sparse_spectra(S1, S2, tolerance=tolerance, calc_network_score=False)
        blink_time = time.perf_counter() - start

        # NOTE(review): the query sample is passed as `references` and the
        # reference sample as `queries`. Both sets have the same size at every
        # step, so the number of comparisons is unaffected -- confirm whether
        # the orientation matters for any later use of MMS12.
        start = time.perf_counter()
        MMS12 = cos.matrix(references=MMS1, queries=MMS2)
        mms_time = time.perf_counter() - start

        speed_test_results.append({
            'query_spectra_num': query_size,
            'ref_spectra_num': ref_size,
            'blink_time': blink_time,
            'mms_time': mms_time,
            'replicate': replicate,
        })

# Explicit column list keeps the column order stable even when no records exist.
df = pd.DataFrame(
    speed_test_results,
    columns=['query_spectra_num', 'ref_spectra_num', 'blink_time', 'mms_time', 'replicate'],
)
df['comparisons'] = df['query_spectra_num'] * df['ref_spectra_num']

  inorm = np.array([1./np.linalg.norm(mzi[1]**intensity_power) for mzi in mzis])
  inorm = np.array([1./np.linalg.norm(mzi[1]**intensity_power) for mzi in mzis])
  inorm = np.array([1./np.linalg.norm(mzi[1]**intensity_power) for mzi in mzis])


### Save & Plot Results

In [None]:
# Median runtime per problem size (number of pairwise comparisons) across replicates.
plot_df = df.groupby('comparisons')[['blink_time', 'mms_time']].median().reset_index()

# Draw both tools on one log-log axis so the scaling behaviour can be compared.
fig, ax = plt.subplots()
for column, label in (('blink_time', 'BLINK'), ('mms_time', 'MatchMS')):
    ax.plot(plot_df['comparisons'], plot_df[column], '-o', label=label)

ax.set_ylabel('Compute Time (seconds)')
ax.set_xlabel('# Comparisons')
ax.set_yscale('log')
ax.set_xscale('log')
ax.legend()
plt.show()

In [None]:
# Write the full per-run benchmark table to a CSV file in the working directory.
df.to_csv(path_or_buf='cos_speed_benchmarking.csv')