# BLINK Tutorial

In [1]:
import os
import sys
from ftplib import FTP

import pandas as pd

# Search the parent directory first when importing (presumably where blink lives).
sys.path.insert(0, '../')

import blink

## Step 0: Load Input Data From GNPS

#### BERKELEY-LAB.mgf Downloaded Here:
https://gnps-external.ucsd.edu/gnpslibrary

In [2]:
# Connection settings for the FTP download performed in the next cell.
# NOTE(review): credentials are hardcoded here; this looks like a public
# dataset accession with a placeholder password — confirm before sharing.
gnps_host, gnps_user, gnps_passwd = (
    'massive.ucsd.edu',
    'MSV000083475',
    'a',
)

In [3]:
# Name of the mzML run to fetch; it is stored under the same name locally.
filename = '7H12_9_1_song-75-s004-a04.mzML'
partial_filename = filename + '.part'

# Skip the transfer when a previous run already fetched the file, so that
# "Restart & Run All" does not download it again.
if not os.path.exists(filename):
    try:
        with FTP(gnps_host, gnps_user, gnps_passwd) as ftp:
            ftp.cwd('/ccms_peak/RAW/PLATE7/')
            # Write to a temporary name first so that an interrupted transfer
            # never leaves a truncated file under the final name.
            with open(partial_filename, 'wb') as f:
                ftp.retrbinary('RETR ' + filename, f.write)
        # Only a fully transferred file is moved to its final name.
        os.replace(partial_filename, filename)
    finally:
        # Remove the leftover partial file if the transfer failed part-way.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)

## Step 1: Read Input Data

#### BLINK can accept MGF files, mzML files, and lists of spectra as input. Correctly formatted lists of spectra and precursor m/zs can be fed directly into the discretization function; the other file formats can be parsed using a built-in helper function.

__blink.open_msms_file():__

This function takes an MGF file or mzML file and converts it into a pandas DataFrame.

[BERKELEY-LAB.mgf](https://gnps-external.ucsd.edu/gnpslibrary)

In [4]:
# Reference library in MGF format.
# NOTE(review): absolute path — adjust to wherever BERKELEY-LAB.mgf was saved.
mgf_path = '/global/cfs/cdirs/metatlas/projects/spectral_libraries/BERKELEY-LAB.mgf'
# Query run in mzML format (same file name as the FTP download above).
mzml_path = '7H12_9_1_song-75-s004-a04.mzML'

# The same helper parses both file formats.
mgf = blink.open_msms_file(mgf_path)
mzml = blink.open_msms_file(mzml_path)

In [5]:
# Preview the first rows of the table parsed from the MGF file.
mgf.head()

Unnamed: 0,pepmass,charge,mslevel,source_instrument,filename,seq,ionmode,organism,name,pi,...,smiles,inchi,inchiaux,pubmed,submituser,libraryquality,spectrumid,scans,spectrum,precursor_mz
0,"(174.088, None)",[1+],2,LC-ESI-Orbitrap,test.mgf,*..*,Negative,BERKELEY-LAB,L-citrulline M-H,Trent Northen,...,N=C(O)NCCC[C@H](N)C(=O)O,InChI=1S/C6H13N3O3/c7-4(5(10)11)2-1-3-9-6(8)12...,,,mpanitchpakdi,3,CCMSLIB00006685342,1,"[[53.298698, 58.029301, 59.013401, 65.657997, ...",174.088
1,"(104.035, None)",[1+],2,LC-ESI-Orbitrap,test.mgf,*..*,Negative,BERKELEY-LAB,L-serine M-H,Trent Northen,...,N[C@@H](CO)C(=O)O,"InChI=1S/C3H7NO3/c4-2(1-5)3(6)7/h2,5H,1,4H2,(H...",,,mpanitchpakdi,3,CCMSLIB00006685343,2,"[[53.0779, 72.008904, 74.024498, 74.718102, 92...",104.035
2,"(148.044, None)",[1+],2,LC-ESI-Orbitrap,test.mgf,*..*,Negative,BERKELEY-LAB,L-methionine M-H,Trent Northen,...,CSCC[C@H](N)C(=O)O,"InChI=1S/C5H11NO2S/c1-9-3-2-4(6)5(7)8/h4H,2-3,...",,,mpanitchpakdi,3,CCMSLIB00006685344,3,"[[51.219601, 57.429298, 68.856201, 100.040001,...",148.044
3,"(131.046, None)",[1+],2,LC-ESI-Orbitrap,test.mgf,*..*,Negative,BERKELEY-LAB,L-asparagine M-H,Trent Northen,...,N=C(O)C[C@H](N)C(=O)O,"InChI=1S/C4H8N2O3/c5-2(4(8)9)1-3(6)7/h2H,1,5H2...",,,mpanitchpakdi,3,CCMSLIB00006685345,4,"[[58.0294, 70.029503, 71.013199, 71.024803, 72...",131.046
4,"(164.072, None)",[1+],2,LC-ESI-Orbitrap,test.mgf,*..*,Negative,BERKELEY-LAB,L-phenylalanine M-H,Trent Northen,...,N[C@@H](Cc1ccccc1)C(=O)O,InChI=1S/C9H11NO2/c10-8(9(11)12)6-7-4-2-1-3-5-...,,,mpanitchpakdi,3,CCMSLIB00006685346,5,"[[58.858501, 72.008904, 85.3451, 91.055298, 10...",164.072


In [6]:
# Preview the first rows of the table parsed from the mzML file.
mzml.head()

Unnamed: 0,id,ms_level,rt,spectrum,precursor_mz,i,charge
0,29,2,0.074056,"[[51.023495, 52.39509, 53.00283, 53.039024, 56...",149.059667,2787562.0,1.0
1,30,2,0.077625,"[[57.16054, 62.25457, 89.9055, 109.89806, 111....",337.104489,1578322.0,1.0
2,31,2,0.081175,"[[53.03918, 54.165733, 64.39659, 77.03844, 82....",163.075234,759054.9,1.0
3,32,2,0.084741,"[[50.71475, 54.151115, 66.84381, 67.95492, 78....",360.180406,743840.8,1.0
4,34,2,0.094643,"[[64.33035, 84.11954, 90.76483, 103.29681, 110...",388.211635,355221.9,1.0


## Step 2: "Discretize" Spectra

#### Prior to scoring, each set of spectra is discretized. This process converts lists of m/z and intensity arrays and precursor m/zs into dictionary-based sparse matrices.  

__blink.discretize_spectra():__

This function takes lists of fragmentation spectra and precursor m/zs as input and outputs a dictionary that contains intensity values, binned m/z values, and precursor m/z values. 

__Parameters:__

1. bin_width (default=0.001). This value is used to convert the m/z floats into rounded integer bins. Smaller bin widths increase precision but decrease speed. Example of the binning calculation using the default value: 100.002 --> 100002

2. intensity_power (default=0.5). The intensity power parameter scales intensity values used in the scoring. 

3. trim_empty (default=False). If True, spectra and associated metadata will be removed if the spectra are empty.

4. remove_duplicates (default=False). This parameter optionally calls blink.remove_duplicate_ions, which averages the m/z values and sums the intensities of fragment ions in a spectrum that are within a minimum distance of each other. This can be useful for spectra that are noisy or poorly centroided.

5. network_score (default=False). Optionally, BLINK can be used for molecular networking. If enabled in this step, the function will calculate another set of bins where the m/z values are subtracted from the precursor m/z for that particular spectrum. This feature is still in development.

6. tolerance (default=0.01). The tolerance parameter is the maximum difference between fragment ion m/zs for them to be considered "matching" and factor into the cosine-based score. However, the true tolerance of the scoring algorithm is determined by both the "tolerance" parameter and the bin width, such that the true tolerance is "tolerance" - "bin_width".

7. mass_diffs (default=[0]). This optional parameter allows the user to score spectra against spectra shifted by user-defined chemical masses. The default value of 0 does not shift the spectra. This feature is still in development.

In [7]:
# Discretize both spectra sets in one call. Positional order: MGF spectra,
# mzML spectra, then the matching precursor m/z lists in the same order.
discretized_spectra = blink.discretize_spectra(
    mgf.spectrum.tolist(),
    mzml.spectrum.tolist(),
    mgf.precursor_mz.tolist(),
    mzml.precursor_mz.tolist(),
    bin_width=0.001,
    tolerance=0.01,
    intensity_power=0.5,
    trim_empty=False,
    remove_duplicates=False,
    network_score=False,
)

Discretized spectra can be pre-computed and saved as numpy npz files using __blink.write_sparse_msms_file()__.

In [8]:
# Save the discretized spectra to an .npz file so they can be reloaded later.
blink.write_sparse_msms_file('sparse_spectra.npz', discretized_spectra)

Likewise, saved pre-computed sparse spectra can be read with __blink.open_sparse_msms_file()__.

In [9]:
# Read the sparse spectra back from the .npz file written above.
discretized_spectra_test = blink.open_sparse_msms_file('sparse_spectra.npz')

## Step 3: Score Spectra

#### Next, the two sets of spectra are scored against each other. Given discretized spectra inputs, a matrix of pairwise scores is generated.

__blink.score_sparse_spectra():__

This function calculates the pairwise score and matching ion count matrices.

In [10]:
%%time
# Score the two discretized spectra sets against each other. The result is
# indexed below by 'mzi' (scores) and 'mzc' (matching ion counts).
S12 = blink.score_sparse_spectra(discretized_spectra)

CPU times: user 2.04 s, sys: 200 ms, total: 2.24 s
Wall time: 2.24 s


In [11]:
# 'mzi' entry: sparse matrix of pairwise cosine similarity scores
S12['mzi']

<24563x3038 sparse matrix of type '<class 'numpy.float64'>'
	with 18745890 stored elements in Compressed Sparse Row format>

In [12]:
# 'mzc' entry: sparse matrix of pairwise matching ion counts
S12['mzc']

<24563x3038 sparse matrix of type '<class 'numpy.float64'>'
	with 18745890 stored elements in Compressed Sparse Row format>

## Step 4: Filter Scores

#### BLINK has several helper functions to convert the raw score and count matrices into human readable outputs. 

__blink.filter_hits()__

This function filters the score and count matrices to only include those above a user-defined cutoff.

__Optional Parameters:__
1. min_score (default=0.5). This is the lowest score that will be kept from the score matrix.

2. min_matches (default=5). The minimum number of matching ions to keep the score.

3. override_matches (default=20). Keep scores with greater than or equal to this number of matches even if the score is lower than min_score.

In [13]:
# Filter the raw score/count matrices using the cutoffs described above.
filtered_S12 = blink.filter_hits(
    S12,
    min_score=0.6,  # stricter than the documented default of 0.5
    min_matches=5,
    override_matches=20,
)

__blink.reformat_score_matrix():__
<br>
This is a helper function that reshapes the score/matches matrices such that they can be easily associated with the spectral metadata

In [14]:
# Inspect the filtered score ('mzi') and count ('mzc') matrices.
filtered_S12

{'mzi': <24563x3038 sparse matrix of type '<class 'numpy.float64'>'
 	with 766805 stored elements in Compressed Sparse Row format>,
 'mzc': <24563x3038 sparse matrix of type '<class 'numpy.float64'>'
 	with 766805 stored elements in Compressed Sparse Row format>}

In [15]:
# Reshape the filtered matrices so they can be turned into an output table.
m = blink.reformat_score_matrix(filtered_S12)

In [16]:
# Build a DataFrame of hits (score, matches, query, ref) from the reshaped matrices.
df = blink.make_output_df(m)

In [17]:
# Preview the first hits before any metadata is attached.
df.head()

Unnamed: 0,score,matches,query,ref
0,0.674067,5.0,5.0,744.0
1,0.664316,5.0,72.0,889.0
2,0.718953,6.0,173.0,434.0
3,0.684335,7.0,173.0,176.0
4,0.648829,5.0,173.0,375.0


In [18]:
# Attach metadata to each hit: MGF columns are joined on the "ref" index with
# a "_ref" suffix, mzML columns on the "query" index with a "_query" suffix.
ref_metadata = mgf.add_suffix("_ref")
query_metadata = mzml.add_suffix("_query")
df = (
    df
    .merge(ref_metadata, left_on="ref", right_index=True)
    .merge(query_metadata, left_on="query", right_index=True)
)

In [19]:
# Preview the hits with the "_ref" and "_query" metadata columns attached.
df.head()

Unnamed: 0,score,matches,query,ref,pepmass_ref,charge_ref,mslevel_ref,source_instrument_ref,filename_ref,seq_ref,...,scans_ref,spectrum_ref,precursor_mz_ref,id_query,ms_level_query,rt_query,spectrum_query,precursor_mz_query,i_query,charge_query
0,0.674067,5.0,5.0,744.0,"(192.063, None)",[1+],2,LC-ESI-Orbitrap,test.mgf,*..*,...,745,"[[67.8442, 69.1605, 69.196999, 132.042007, 145...",192.063,35,2,0.09821,"[[52.869865, 65.41379, 68.40481, 71.388855, 80...",365.135762,285103.9,1.0
1,0.664316,5.0,72.0,889.0,"(249.096, None)",[1+],2,LC-ESI-Orbitrap,test.mgf,*..*,...,890,"[[50.2262, 57.385502, 59.262402, 65.981903, 73...",249.096,150,2,0.450176,"[[55.05472, 55.935287, 56.04996, 57.033894, 57...",217.154765,4494628.0,1.0
344,0.049489,21.0,632.0,889.0,"(249.096, None)",[1+],2,LC-ESI-Orbitrap,test.mgf,*..*,...,890,"[[50.2262, 57.385502, 59.262402, 65.981903, 73...",249.096,850,2,2.810151,"[[53.712605, 53.735023, 54.24978, 55.054726, 6...",344.254744,1996236.0,1.0
466,0.214473,43.0,632.0,1491.0,"(311.068, None)",[1+],2,LC-ESI-Orbitrap,test.mgf,*..*,...,1492,"[[66.25206, 71.622108, 98.376122, 104.279579, ...",311.068,850,2,2.810151,"[[53.712605, 53.735023, 54.24978, 55.054726, 6...",344.254744,1996236.0,1.0
502,0.213441,58.0,632.0,1433.0,"(295.181, None)",[1+],2,LC-ESI-Orbitrap,test.mgf,*..*,...,1434,"[[53.662491, 62.912251, 67.054619, 69.56144, 6...",295.181,850,2,2.810151,"[[53.712605, 53.735023, 54.24978, 55.054726, 6...",344.254744,1996236.0,1.0
