# BLINK Tutorial

In [1]:
import sys
sys.path.insert(0, '../')

import blink
import pandas as pd
from ftplib import FTP

## Step 0: Load Input Data From GNPS

#### BERKELEY-LAB.mgf Downloaded Here:
https://gnps-external.ucsd.edu/gnpslibrary

In [2]:
gnps_host = 'massive.ucsd.edu'
gnps_user = 'MSV000083475'
gnps_passwd = 'a'

In [3]:
filename = '7H12_9_1_song-75-s004-a04.mzML'    

with FTP(gnps_host, gnps_user, gnps_passwd) as ftp:
    ftp.cwd('/ccms_peak/RAW/PLATE7/')
    with open(filename, 'wb') as f:
        ftp.retrbinary('RETR ' + filename, f.write)

## Step 1: Read Input Data

#### BLINK can accept MGF files, mzML files, and lists of spectra as input. Correctly formatted lists of spectra and precursor m/zs can be fed directly into the discretization function, the other file formats can be parsed using a built-in helper function. 

__blink.open_msms_file():__

This function takes an MGF file or mzML file and converts into a Pandas DataFrame.

[BERKELEY-LAB.mgf](https://gnps-external.ucsd.edu/gnpslibrary)

In [4]:
#parsing MGF:
mgf = blink.open_msms_file('BERKELEY-LAB.mgf')

#parsing mzML:
mzml = blink.open_msms_file('7H12_9_1_song-75-s004-a04.mzML')

In [5]:
mgf.head()

Unnamed: 0,pepmass,charge,mslevel,source_instrument,filename,seq,ionmode,organism,name,pi,...,smiles,inchi,inchiaux,pubmed,submituser,libraryquality,spectrumid,scans,spectrum,precursor_mz
0,"(223.051, None)",[1-],2,LC-ESI-Orbitrap,tharwood/20220308_JGI-AK-TH_TN_507992_PlantStd...,*..*,Negative,BERKELEY-LAB,Phenazine-1-carboxylic acid CollisionEnergy:10...,Trent Northen,...,C1=CC=C2C(=C1)N=C3C=CC=C(C3=N2)C(=O)O,"""InChI=1S/C13H8N2O2/c16-13(17)8-4-3-7-11-12(8)...",,,mwang87,3,CCMSLIB00010101988,506,"[[66.795273, 74.768951, 79.180153, 86.134895, ...",223.051
1,"(223.051, None)",[1-],2,LC-ESI-Orbitrap,tharwood/20220308_JGI-AK-TH_TN_507992_PlantStd...,*..*,Negative,BERKELEY-LAB,Phenazine-1-carboxylic acid CollisionEnergy:20...,Trent Northen,...,C1=CC=C2C(=C1)N=C3C=CC=C(C3=N2)C(=O)O,"""InChI=1S/C13H8N2O2/c16-13(17)8-4-3-7-11-12(8)...",,,mwang87,3,CCMSLIB00010101989,503,"[[74.99028, 79.957001, 85.299744, 93.001129, 9...",223.051
2,"(405.182, None)",[1-],2,LC-ESI-Orbitrap,tharwood/20220308_JGI-AK-TH_TN_507992_PlantStd...,*..*,Negative,BERKELEY-LAB,indole-3-butyric acid CollisionEnergy:102040 2M-H,Trent Northen,...,C1=CC=C2C(=C1)C(=CN2)CCCC(=O)O,"""InChI=1S/C12H13NO2/c14-12(15)7-3-4-9-8-13-11-...",,,mwang87,3,CCMSLIB00010101990,603,"[[65.328674, 92.43383, 113.988976, 137.326645,...",405.182
3,"(185.061, None)",[1-],2,LC-ESI-Orbitrap,tharwood/20220308_JGI-AK-TH_TN_507992_PlantStd...,*..*,Negative,BERKELEY-LAB,1-Naphthaleneacetic acid CollisionEnergy:10204...,Trent Northen,...,C1=CC=C2C(=C1)C=CC=C2CC(=O)O,"""InChI=1S/C12H10O2/c13-12(14)8-10-6-3-5-9-4-1-...",,,mwang87,3,CCMSLIB00010101991,614,"[[52.970661, 54.45359, 93.706535, 97.066246, 9...",185.061
4,"(185.061, None)",[1-],2,LC-ESI-Orbitrap,tharwood/20220308_JGI-AK-TH_TN_507992_PlantStd...,*..*,Negative,BERKELEY-LAB,1-Naphthaleneacetic acid CollisionEnergy:20506...,Trent Northen,...,C1=CC=C2C(=C1)C=CC=C2CC(=O)O,"""InChI=1S/C12H10O2/c13-12(14)8-10-6-3-5-9-4-1-...",,,mwang87,3,CCMSLIB00010101992,605,"[[60.311302, 70.336533, 72.314659, 97.065468, ...",185.061


In [6]:
mzml.head()

Unnamed: 0,id,ms_level,rt,spectrum,precursor_mz,i,charge
0,29,2,0.074056,"[[51.023495, 52.39509, 53.00283, 53.039024, 56...",149.059667,2787562.0,1.0
1,30,2,0.077625,"[[57.16054, 62.25457, 89.9055, 109.89806, 111....",337.104489,1578322.0,1.0
2,31,2,0.081175,"[[53.03918, 54.165733, 64.39659, 77.03844, 82....",163.075234,759054.9,1.0
3,32,2,0.084741,"[[50.71475, 54.151115, 66.84381, 67.95492, 78....",360.180406,743840.8,1.0
4,34,2,0.094643,"[[64.33035, 84.11954, 90.76483, 103.29681, 110...",388.211635,355221.9,1.0


## Step 2: "Discretize" Spectra

#### Prior to scoring, each set of spectra is discretized. This process converts lists of m/z and intensity arrays and precursor m/zs into dictionary-based sparse matrices.  

__blink.discretize_spectra():__

This function takes lists of fragmentation spectra and precursor m/zs as input and outputs a dictionary that contains intensity values, binned m/z values, and precursor m/z values. 

__Parameters:__

1. bin_width (default=0.001). This value is used to convert the m/z floats into rounded integer bins. Using smaller bin widths, precision increases and speed decreases. Example of binning calculation using default values. 100.002 --> 100002

2. intensity_power (default=0.5). The intensity power parameter scales intensity values used in the scoring. 

3. trim_empty (default=False). If True, spectra and associated metadata will be removed if the spectra are empty.

4. remove_duplicates (default=False). This parameter optionally calls blink.remove_duplicate_ions, which averages m/z values and sums the intensities of fragment ions in a spectrum that are within a minumum distance. This can be useful for spectra that are noisy or poorly centroided.

5. calc_network_score (default=False). Optionally, BLINK can be used for molecular networking. If enabled in this step, the function will calculate another set of bins where the m/z are subtracted from the precursor m/z for that particular spectrum. This feature is still in development

6. metadata (default=None). Any desired metadata is stored with this variable. If the default of None is used, the metadata will be the number of ions in each spectrum. 

In [7]:
discretized_mgf = blink.discretize_spectra(mgf.spectrum.tolist(), mgf.precursor_mz.tolist(),
                                         bin_width=0.001, intensity_power=0.5, trim_empty=False, remove_duplicates=False, calc_network_score=False,
                                         metadata=mgf.drop(columns=['spectrum']).to_dict(orient='records'))

discretized_mzml = blink.discretize_spectra(mzml.spectrum.tolist(), mzml.precursor_mz.tolist(),
                                         bin_width=0.001, intensity_power=0.5, trim_empty=False, remove_duplicates=False, calc_network_score=False,
                                         metadata=mzml.drop(columns=['spectrum']).to_dict(orient='records'))

Discretized spectra can be pre-computed and saved as numpy npz files using __blink.write_sparse_msms_file()__.

In [8]:
blink.write_sparse_msms_file('sparse_mgf.npz', discretized_mgf)
blink.write_sparse_msms_file('sparse_mzml.npz', discretized_mzml)

Likewise, saved pre-computed sparse spectra can be read with __blink.open_sparse_msms_file()__.

In [9]:
sparse_mgf = blink.open_sparse_msms_file('sparse_mgf.npz')
sparse_mzml = blink.open_sparse_msms_file('sparse_mzml.npz')

## Step 3: Score Spectra

#### Next, the two sets of spectra are scored against eachother. Given discretized spectra inputs, a matrix of pairwise scores is generated.

__blink.score_sprase_spectra():__

This function calculates the pairwise score and matching ion count matrices.

__Parameters:__
1. tolerance (default=0.01). The tolerance parameter is the minumum value between fragment ion m/zs to be considered "matching" and factor into the cosine based score. However, the true tolerance of the scoring algorithm is determined by both the "tolerance" parameter and the bin width such that true tolerance is "tolerance" - "bin_width".

2. mass_diffs (default=[0]). This optional parameter allows the user to score spectra against spectra shifted by user defined chemical masses. The default value of 0 does not shift the spectra. This feature is still in development. 

3. react_steps (default=1). This value expands the mass_diffs by a specified number of steps. This feature is still in development

4. calc_network_score (default=False). Whether or not to calculate the molecular network score. If True, the precursor m/z shifted bins computed during disrectization are matched and scored, in addition to the unshifted. This feature is still in development

In [10]:
S12 = blink.score_sparse_spectra(sparse_mgf, sparse_mzml, tolerance=0.01, mass_diffs=[0], react_steps=1, calc_network_score=False)

In [11]:
#cosine similarity scores
S12['mzi']

<25009x3038 sparse matrix of type '<class 'numpy.float64'>'
	with 19115131 stored elements in Compressed Sparse Row format>

In [12]:
#matching ion counts
S12['mzc']

<25009x3038 sparse matrix of type '<class 'numpy.int64'>'
	with 19115131 stored elements in Compressed Sparse Row format>

## Step 4: Filter Scores

#### BLINK has several helper functions to convert the raw score and count matrices into human readable outputs. 

__blink.filter_hits()__

This function filters the score and count matrices to only include those above a user defined cutoff

__Optional Parameters:__
1. good_score (default=0.5). This is the lowest score that will be kept from the score matrix.

2. min_matches (default=5). The minumum number of matching ions to keep the score. 

3. good_matches (default=20). Keep scores with greater than or equal to this number of matches even if the score is lower than the good_score variable.

4. calc_network_score (default=False). If true, filter scores using the maximum of the precursor shifted and un-shifted score/matches. This feature is still in development

In [13]:
filtered_S12 = blink.filter_hits(S12, min_matches=5, good_matches=20, good_score=0.6, calc_network_score=False)

__blink.create_blink_matrix():__
<br>
This is a helper function that reshapes the score/matches matrices such that they can be easily associated with the spectral metadata
<br>
__Optional Parameters:__
1. calc_network_score (default=False). If true, create reshaped array with network score/matches as data. This feature is still in development

In [14]:
m = blink.create_blink_matrix_format(filtered_S12, calc_network_score=False)

In [15]:
df = pd.DataFrame(m, columns=['raveled_index', 'query', 'ref', 'score', 'matches'])
df = pd.merge(df, pd.DataFrame(S12['S1_metadata']).add_suffix('_query'), left_on='query', right_index=True, how='left')
df = pd.merge(df, pd.DataFrame(list(S12['S2_metadata'])).add_suffix('_ref'), left_on='ref', right_index=True, how='left')

In [16]:
df.head()

Unnamed: 0,raveled_index,query,ref,score,matches,0_query,id_ref,ms_level_ref,rt_ref,precursor_mz_ref,i_ref,charge_ref,num_ions_ref
0,27404.0,9.0,62.0,0.087154,28.0,"{'pepmass': (961.697, None), 'charge': [1+], '...",138,2,0.407787,246.13361,5520740.0,1.0,139
1,27469.0,9.0,127.0,0.2827,27.0,"{'pepmass': (961.697, None), 'charge': [1+], '...",219,2,0.677852,167.1066,307161.7,,62
2,27482.0,9.0,140.0,0.26976,21.0,"{'pepmass': (961.697, None), 'charge': [1+], '...",235,2,0.731299,169.085946,219491.2,1.0,47
3,27524.0,9.0,182.0,0.327233,31.0,"{'pepmass': (961.697, None), 'charge': [1+], '...",288,2,0.909346,167.0703,420028.8,,67
4,27537.0,9.0,195.0,0.244857,20.0,"{'pepmass': (961.697, None), 'charge': [1+], '...",304,2,0.964177,155.1066,266208.1,,56


## Using Task Runner

#### BLINK also includes a simple task-running function that covers many use cases.

__blink.get_blink_hits()__

This function takes input data as Pandas DataFrames that contain a "spectrum" and "precursor_mz" column, or mzML/MGF data files. Output is a filtered, formatted DataFrame with scores and number of matching ions between all MS2 spectra.

__Optional Parameters:__
1. calc_network_score (default=False): This parameter determines whether or not the network score (max of precursor mz shifted and unshifted mz score/matches) is calculated. This feature is still under development

2. min_matches (default=5): See blink.filter_hits

3. good_matches (default=20): See blink.filter_hits

4. good_score (default=0.55): See blink.filter_hits

5. precursor_match (default=5): If not False, the output DataFrame is filtered to remove comparisons with precursor m/zs with a greater difference than the value of precursor_match in ppm.

6. tolerance (default=0.01): see blink.score_sparse_matrices

In [17]:
df = blink.get_blink_hits(mgf, mzml, calc_network_score=False, precursor_match=5)

In [18]:
df.head()

Unnamed: 0,raveled_index,query,ref,score,matches,pepmass_query,charge_query,mslevel_query,source_instrument_query,filename_query,...,precursor_mz_ref,i_ref,charge_ref,num_ions_ref,precursor_ppm_diff,jaccard_matches,overlap_matches,score_rank,matches_rank,jaccard_matches_rank
5056,140569.0,46.0,821.0,0.242168,50.0,"(331.153, None)",[1+],2,LC-ESI-Orbitrap,tharwood/20220308_JGI-AK-TH_TN_507992_PlantStd...,...,331.153813,7164258.0,1.0,190,2.455207,0.204918,0.480769,1.0,1.0,1.0
5579,143607.0,47.0,821.0,0.141077,27.0,"(331.153, None)",[1+],2,LC-ESI-Orbitrap,tharwood/20220308_JGI-AK-TH_TN_507992_PlantStd...,...,331.153813,7164258.0,1.0,190,2.455207,0.116883,0.397059,1.0,1.0,1.0
5783,161176.0,53.0,162.0,0.872172,10.0,"(205.097, None)",[1+],2,LC-ESI-Orbitrap,tharwood/20220318_JGI-AK-TH_TN_507992_PlantStd...,...,205.0972,811456.8,,28,0.975148,0.227273,0.384615,1.0,1.0,1.0
5790,164214.0,54.0,162.0,0.817064,10.0,"(205.097, None)",[1+],2,LC-ESI-Orbitrap,tharwood/20220318_JGI-AK-TH_TN_507992_PlantStd...,...,205.0972,811456.8,,28,0.975148,0.204082,0.357143,1.0,1.0,1.0
5849,313605.0,103.0,691.0,0.207193,33.0,"(347.148, None)",[1+],2,LC-ESI-Orbitrap,tharwood/20220318_JGI-AK-TH_TN_507992_PlantStd...,...,347.14881,3256451.0,1.0,105,2.334075,0.166667,0.314286,1.0,1.0,1.0
