# BLINK Tutorial

In [1]:
import sys
sys.path.insert(0, '../')

import blink
import pandas as pd

## Step 1: Read Input Data

#### BLINK can accept MGF files, mzML files, and lists of spectra as input. Correctly formatted lists of spectra and precursor m/zs can be fed directly into the discretization function, the other file formats can be parsed using a built-in helper function. 

__blink.open_msms_file():__
<br>
This function takes an MGF file or mzML file and converts into a Pandas DataFrame.

__Example Code:__

In [2]:
#parsing MGF:
mgf = blink.open_msms_file('example_data/small.mgf')

#parsing mzML:
mzml = blink.open_msms_file('example_data/C18_MSMS_SalicylicAcid.mzML')

In [3]:
mgf.head()

Unnamed: 0,name,precursor_mz,inchi,smiles,spectrumid,spectrum
0,quinic acid CollisionEnergy:102040 2M-2H+Na,405.101,"""InChI=1S/C7H12O6/c8-3-1-7(13,6(11)12)2-4(9)5(...",C1[C@H](C([C@@H](CC1(C(=O)O)O)O)O)O,CCMSLIB00010102414,"[[63.781349, 66.794624, 79.524162, 80.695747, ..."
1,"""6-methyl-1-[(3-nitrophenyl)amino]furano[2,3-h...",337.082,"""InChI=1S/C18H12N2O5/c1-10-7-16(21)25-18-13(10...",Cc1cc(=O)oc2c1ccc1occ(Nc3cccc([N+](=O)[O-])c3)c12,CCMSLIB00010110728,"[[54.141449, 92.382637, 104.289917, 131.048874..."
2,"""{4,5-dimethoxy-3-[(3,4,5-trimethoxyphenyl)car...",445.198,"""InChI=1S/C23H30N2O7/c1-8-25(9-2)23(27)15-10-1...",CCN(CC)C(=O)c1cc(N=C(O)c2cc(OC)c(OC)c(OC)c2)c(...,CCMSLIB00010105281,"[[66.804932, 83.8825, 92.385109, 104.302299, 1..."
3,"""methyl 7-(4-chlorophenyl)-2-methyl-5-oxo-4-(3...",498.168,"""InChI=1S/C27H28ClNO6/c1-14-23(27(31)35-5)24(1...",COC(=O)C1=C(C)NC2=C(C(=O)CC(c3ccc(Cl)cc3)C2)C1...,CCMSLIB00010109604,"[[60.13345, 82.429543, 103.265701, 136.881714,..."
4,l-isoleucine CollisionEnergy:102040 M-H,130.087,"""InChI=1S/C6H13NO2/c1-3-4(2)5(7)6(8)9/h4-5H,3,...",CCC(C)C(N)C(=O)O,CCMSLIB00010109482,"[[57.75724, 61.987881, 64.120399, 66.745888, 7..."


In [4]:
mzml.head()

Unnamed: 0,id,ms_level,rt,spectrum,precursor_mz,i,charge
0,2,2,0.012606,"[[52.607494, 69.94896, 85.77718, 90.99794, 92....",178.9776,1507624.0,
1,3,2,0.01425,"[[68.87898, 73.0291, 74.00742, 74.370346, 86.5...",116.928476,1652903.0,1.0
2,5,2,0.019465,"[[52.82731, 56.123146, 68.89473, 92.753006, 93...",174.955902,779920.6,
3,6,2,0.02111,"[[50.327473, 51.641243, 58.95839, 61.987972, 6...",146.965805,2494821.0,
4,8,2,0.026325,"[[53.51372, 57.077217, 57.33841, 61.98802, 74....",146.938599,3187073.0,


## Step 2: "Discretize" Spectra

#### Prior to scoring, each set of spectra is discretized. This process converts lists of m/z and intensity arrays and precursor m/zs into dictionary-based sparse matrices.  

__blink.discretize_spectra():__
<br>
This function takes lists of fragmentation spectra and precursor m/zs as input and outputs a dictionary that contains intensity values, binned m/z values, and precursor m/z values. 
<br>
__Parameters:__
<br>
1. bin_width (default=0.001). This value is used to convert the m/z floats into rounded integer bins. Using smaller bin widths, precision increases and speed decreases. Example of binning calculation using default values. 100.002 --> 100002
2. intensity_power (default=0.5). The intensity power parameter scales intensity values used in the scoring. 
3. trim_empty (default=False). If True, spectra and associated metadata will be removed if the spectra are empty.
4. remove_duplicates (default=False). This parameter optionally calls blink.remove_duplicate_ions, which averages m/z values and sums the intensities of fragment ions in a spectrum that are within a minumum distance. This can be useful for spectra that are noisy or poorly centroided.
5. calc_network_score (default=True). Optionally, BLINK can be used for molecular networking. If enabled in this step, the function will calculate another set of spectra where the m/z are subtracted from the precursor m/z for that particular spectrum. 
6. metadata (default=None). Any desired metadata is stored with this variable. If the default of None is used, the metadata will be the number of ions in each spectrum. 

__Example Code:__

In [5]:
discretized_mgf = blink.discretize_spectra(mgf.spectrum.tolist(), mgf.precursor_mz.tolist(),
                                         bin_width=0.001, intensity_power=0.5, trim_empty=False, remove_duplicates=False, calc_network_score=False,
                                         metadata=mgf.drop(columns=['spectrum']).to_dict(orient='records'))

discretized_mzml = blink.discretize_spectra(mzml.spectrum.tolist(), mzml.precursor_mz.tolist(),
                                         bin_width=0.001, intensity_power=0.5, trim_empty=False, remove_duplicates=False, calc_network_score=False,
                                         metadata=mzml.drop(columns=['spectrum']).to_dict(orient='records'))

Discretized spectra can be pre-computed and saved as numpy npz files using __blink.write_sparse_msms_file()__.

In [6]:
blink.write_sparse_msms_file('example_data/sparse_mgf.npz', discretized_mgf)
blink.write_sparse_msms_file('example_data/sparse_mzml.npz', discretized_mzml)

Likewise, saved pre-computed sparse spectra can be read with __blink.open_sparse_msms_file()__.

In [7]:
sparse_mgf = blink.open_sparse_msms_file('example_data/sparse_mgf.npz')
sparse_mzml = blink.open_sparse_msms_file('example_data/sparse_mzml.npz')

# Step 3: Score Spectra

#### Next, the two sets of spectra are scored against eachother. Given discretized spectra inputs, a matrix of pairwise scores is generated.

__blink.score_sprase_spectra():__
<br>
This function calculates the pairwise score and matching ion count matrices.
<br>
__Parameters:__
1. tolerance (default=0.01). The tolerance parameter is the minumum value between fragment ion m/zs to be considered "matching" and factor into the cosine based score. However, the true tolerance of the scoring algorithm is determined by both the "tolerance" parameter and the bin width such that true tolerance is "tolerance" - "bin_width".
2. mass_diffs (default=[0]). This optional parameter allows the user to score spectra against spectra shifted by user defined chemical masses. The default value of 0 does not shift the spectra. This feature is still in development. 
3. react_steps (default=1). This value expands the mass_diffs by a specified number of steps. This feature is still in development
4. calc_network_score (default=True). Whether or not to calculate the molecular network score. If True, the nuetral loss spectra computed during disrectization of scored. 

In [8]:
S12 = blink.score_sparse_spectra(sparse_mgf, sparse_mzml, tolerance=0.01, mass_diffs=[0], react_steps=1, calc_network_score=False)

In [9]:
#cosine similarity scores
S12['mzi']

<1000x3591 sparse matrix of type '<class 'numpy.float64'>'
	with 230424 stored elements in Compressed Sparse Row format>

In [10]:
#matching ion counts
S12['mzc']

<1000x3591 sparse matrix of type '<class 'numpy.int64'>'
	with 230424 stored elements in Compressed Sparse Row format>

# Step 4: Filter Scores

#### BLINK has several helper functions to convert the raw score and count matrices into human readable outputs. 

__blink.filter_hits()__
<br>
This function filters the score and count matrices to only include those above a user defined cutoff
<br>
__Optional Parameters:__
1. good_score (default=0.5). This is the lowest score that will be kept from the score matrix.
2. min_matches (default=5). The minumum number of matching ions to keep the score. 
3. good_matches (default=20). Keep scores with greater than or equal to this number of matches even if the score is lower than the good_score variable.
4. calc_network_score (default=True). If true, filter scores using the maximum of the precursor shifted and un-shifted score/matches. 

In [11]:
filtered_S12 = blink.filter_hits(S12, min_matches=5, good_matches=20, good_score=0.6, calc_network_score=False)

__blink.create_blink_matrix():__
<br>
This is a helper function that reshapes the score/matches matrices such that they can be easily associated with the spectral metadata
<br>
__Optional Parameters:__
1. calc_network_score (default=True). If true, create reshaped array with network score/matches as data

In [12]:
m = blink.create_blink_matrix_format(filtered_S12, calc_network_score=False)

In [13]:
df = pd.DataFrame(m, columns=['raveled_index', 'query', 'ref', 'score', 'matches'])
df = pd.merge(df, pd.DataFrame(S12['S1_metadata']).add_suffix('_query'), left_on='query', right_index=True, how='left')
df = pd.merge(df, pd.DataFrame(list(S12['S2_metadata'])).add_suffix('_ref'), left_on='ref', right_index=True, how='left')

In [15]:
df.head()

Unnamed: 0,raveled_index,query,ref,score,matches,0_query,id_ref,ms_level_ref,rt_ref,precursor_mz_ref,i_ref,charge_ref,num_ions_ref
0,639203.0,178.0,5.0,0.607461,8.0,{'name': '4-hydroxy-7-methoxy-3-nitrochromen-2...,9,2,0.027989,190.928309,2217636.0,1.0,20
1,639275.0,178.0,77.0,0.627672,9.0,{'name': '4-hydroxy-7-methoxy-3-nitrochromen-2...,117,2,0.279317,190.928223,4394044.0,,16
2,641608.0,178.0,2410.0,0.644279,11.0,{'name': '4-hydroxy-7-methoxy-3-nitrochromen-2...,3617,2,8.093017,190.928085,5153177.0,,23
3,641652.0,178.0,2454.0,0.619412,9.0,{'name': '4-hydroxy-7-methoxy-3-nitrochromen-2...,3683,2,8.212879,190.928146,4672048.0,,17
4,641696.0,178.0,2498.0,0.601222,8.0,{'name': '4-hydroxy-7-methoxy-3-nitrochromen-2...,3749,2,8.333472,190.928116,3062047.0,,21


# Using Task Runner

#### BLINK also includes a simple task-running function that covers many use cases.

__blink.get_blink_hits()__
<br>
This function takes input data as Pandas DataFrames that contain a "spectrum" and "precursor_mz" column, or mzML/MGF data files. Output is a filtered, formatted DataFrame with scores and matches between all MS2 spectra.
<br>
__Optional Parameters:__
1. calc_network_score (default=True): This parameter determines whether or not the network score (max of precursor mz shifted and unshifted mz score/matches) is calculated.
2. min_matches (default=5): See blink.filter_hits
3. good_matches (default=20): See blink.filter_hits
4. good_score (default=0.55): See blink.filter_hits
5. precursor_match (default=5): If not False, the output DataFrame is filtered to remove comparisons with precursor m/zs with a greater difference than the value of precursor_match in ppm.

In [19]:
df = blink.get_blink_hits(mgf, mzml, calc_network_score=False, precursor_match=False)

In [20]:
df.head()

Unnamed: 0,raveled_index,query,ref,score,matches,name_query,precursor_mz_query,inchi_query,smiles_query,spectrumid_query,...,precursor_mz_ref,i_ref,charge_ref,num_ions_ref,precursor_ppm_diff,jaccard_matches,overlap_matches,score_rank,matches_rank,jaccard_matches_rank
0,639201.0,178.0,3.0,0.570615,7.0,4-hydroxy-7-methoxy-3-nitrochromen-2-one Colli...,236.02,"""InChI=1S/C10H7NO6/c1-16-5-2-3-6-7(4-5)17-10(1...",COc1ccc2c(O)c([N+](=O)[O-])c(=O)oc2c1,CCMSLIB00010119964,...,146.965805,2494821.0,,18,377316.307712,0.233333,0.388889,27.0,4.0,11.0
1,639202.0,178.0,4.0,0.568388,7.0,4-hydroxy-7-methoxy-3-nitrochromen-2-one Colli...,236.02,"""InChI=1S/C10H7NO6/c1-16-5-2-3-6-7(4-5)17-10(1...",COc1ccc2c(O)c([N+](=O)[O-])c(=O)oc2c1,CCMSLIB00010119964,...,146.938599,3187073.0,,19,377431.579388,0.225806,0.368421,28.0,4.0,12.0
2,639203.0,178.0,5.0,0.607461,8.0,4-hydroxy-7-methoxy-3-nitrochromen-2-one Colli...,236.02,"""InChI=1S/C10H7NO6/c1-16-5-2-3-6-7(4-5)17-10(1...",COc1ccc2c(O)c([N+](=O)[O-])c(=O)oc2c1,CCMSLIB00010119964,...,190.928309,2217636.0,1.0,20,191050.298035,0.258065,0.421053,10.0,3.0,8.0
3,639207.0,178.0,9.0,0.539229,5.0,4-hydroxy-7-methoxy-3-nitrochromen-2-one Colli...,236.02,"""InChI=1S/C10H7NO6/c1-16-5-2-3-6-7(4-5)17-10(1...",COc1ccc2c(O)c([N+](=O)[O-])c(=O)oc2c1,CCMSLIB00010119964,...,102.948654,2124078.0,,11,563813.854017,0.2,0.454545,39.0,6.0,18.0
4,639237.0,178.0,39.0,0.583135,9.0,4-hydroxy-7-methoxy-3-nitrochromen-2-one Colli...,236.02,"""InChI=1S/C10H7NO6/c1-16-5-2-3-6-7(4-5)17-10(1...",COc1ccc2c(O)c([N+](=O)[O-])c(=O)oc2c1,CCMSLIB00010119964,...,146.965759,2518475.0,,21,377316.501664,0.290323,0.473684,21.0,2.0,5.0
