In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd

from massbank2db.db import MassbankDB
from massbank2db.spectrum import MBSpectrum

# Tutorial: Use a local copy of MassBank DB

This tutorial illustrates possible use cases of a local MassBank DB and how those are supported by the ```massbank2db``` package.

## Iterate over the Spectra of a Dataset

Here we illustrate how to iterate over the spectra (or MassBank entries) of a particular dataset. This might be useful when, for example, the spectra (including their meta-information, such as the molecular structure and the retention time (RT)) should be used to train a machine learning algorithm.

### Individual Spectra

In [17]:
# Open the MassBank DB that was built in 'Tutorial__Build_DB.ipynb'. The context manager
# takes care of the DB connection for the duration of the loop.
dbfn = "massbank_example.sqlite"
with MassbankDB(dbfn) as mbdb:
    # grouped=False: every yielded tuple carries a single spectrum
    spectra_iter = mbdb.iter_spectra(dataset="AU_000", grouped=False)

    for idx, (mol, spec, cands) in enumerate(spectra_iter):
        # Keep the demo output short: stop as soon as ten spectra have been shown
        if idx >= 10:
            break

        # One summary line per spectrum: accession, molecule identifier (4th field of 'mol')
        # and the number of peaks (length of the m/z list)
        accession = spec.get("accession")
        mol_label = "mol=%s" % mol[3]
        peak_label = "n_peaks=%d" % len(spec.get_mz())
        print(accession, mol_label, peak_label)

AU230360 mol=RRRXPPIDPYTNJG n_peaks=2
AU240458 mol=SNGREZUHAYWORS n_peaks=8
AU231159 mol=HTXMGVTWXZBZNC n_peaks=8
AU233059 mol=OWZPCEFYPSAJFR n_peaks=52
AU227657 mol=DJBNUMBKLMJRSA n_peaks=2
AU225154 mol=RMMXLENWKUUMAY n_peaks=4
AU218155 mol=ZQTNQVWKHCQYLQ n_peaks=5
AU116751 mol=YJQPYGGHQPGBLI n_peaks=2
AU235657 mol=RZEKVGVHFLEQIL n_peaks=4
AU234359 mol=JLKIGFTWXXRPMT n_peaks=1


### Group spectra corresponding to the same compound and setting

MassBank stores spectra corresponding to the same compound, MS and LC configuration within one (contributor, accession-prefix)-tuple as separate files, e.g. if multiple collision energies were measured. In practice we can merge those spectra, for example to give them as input to a machine learning approach. The following SQLite query is used to group the spectra:
```SQLite
GROUP BY dataset, molecule, precursor_mz, precursor_type, fragmentation_mode
```
The dataset here also encodes the ionization mode, i.e. positive or negative ionization. The iterator can be modified to return the grouped spectra:

In [19]:
# Open the MassBank DB that was built in 'Tutorial__Build_DB.ipynb'
dbfn = "massbank_example.sqlite"
with MassbankDB(dbfn) as mbdb:
    # grouped=True: every yielded tuple carries a list of spectra ('specs') instead of one
    grouped_iter = mbdb.iter_spectra(dataset="AU_000", grouped=True)

    for idx, (mol, specs, cands) in enumerate(grouped_iter):
        # Keep the demo output short: stop as soon as ten groups have been shown
        if idx >= 10:
            break

        # Accessions and peak counts of all spectra belonging to the current group
        accessions = [member.get("accession") for member in specs]
        mol_label = "mol=%s" % mol[3]
        peak_counts = ",".join(str(len(member.get_mz())) for member in specs)

        print(accessions, mol_label, "n_peaks=[%s]" % peak_counts)

['AU230660', 'AU230659', 'AU230662', 'AU230658', 'AU230657'] mol=WIIZWVCIJKGZOK n_peaks=[3,6,10,12,19]
['AU238262'] mol=YGSDEFSMJLZEOE n_peaks=[4]
['AU229857', 'AU229858'] mol=BTJIUGUIPKRLHP n_peaks=[3,3]
['AU230258', 'AU230259', 'AU230257'] mol=UFBJCMHMOXMLKC n_peaks=[7,5,4]
['AU229360', 'AU229357', 'AU229358', 'AU229359', 'AU229362'] mol=UQVKZNNCIHJZLS n_peaks=[2,2,2,4,5]
['AU228157', 'AU228159', 'AU228158', 'AU228112', 'AU228110', 'AU228111'] mol=GOEMGAFJFRBGGG n_peaks=[2,3,4,3,3,2]
['AU311151'] mol=YASYVMFAVPKPKE n_peaks=[2]
['AU231058', 'AU231057', 'AU231059', 'AU231060'] mol=FZEYVTFCMJSGMP n_peaks=[3,3,8,10]
['AU231357', 'AU325853', 'AU325854', 'AU325852', 'AU231358', 'AU231360', 'AU231359', 'AU325851'] mol=ZOMSMJKLGFBRBS n_peaks=[7,4,2,6,18,10,18,3]
['AU234957', 'AU234959', 'AU234960', 'AU234958'] mol=LKJPYSCBVHEWIU n_peaks=[8,13,8,9]


### Retrieve molecular candidates for the spectra

For some applications it is useful to extract a molecular candidate set for each spectrum, that is, a set of potential molecules that could have been measured in the spectrum. There are two common approaches to get the set of molecular candidates:

1. ```return_candidates="mf"```: Using the ground-truth / predicted molecular formula of a spectrum
2. ```return_candidates="mz"```: Using a mass-window around the exact mass / precursor mass of a spectrum

In [53]:
# We connect to the MassBank DB constructed in the 'Tutorial__Build_DB.ipynb'
dbfn = "massbank_example.sqlite"

# Path of the PubChem SQLite file handed to MassbankDB as 'pc_dbfn'. It can be overridden with the
# PUBCHEM_DB_PATH environment variable, so the notebook also runs on machines where the DB lives
# elsewhere; the fallback is the location used when this tutorial was written.
pc_dbfn = os.environ.get("PUBCHEM_DB_PATH", "/run/media/bach/EVO500GB/data/pubchem_24-06-2019/db/pubchem.sqlite")

# For illustration purposes, we only load the first 10 spectra groups
n_groups_shown = 10

# Stays None if the dataset does not yield any spectra, so the summary below cannot hit a NameError
cands = None

with MassbankDB(dbfn, pc_dbfn=pc_dbfn) as mbdb:
    for idx, (mol, specs, cands) in enumerate(mbdb.iter_spectra(dataset="AU_000", grouped=True, return_candidates="mf")):
        print([spec.get("accession") for spec in specs], "mol=%s" % mol[3], "n_peaks=[%s]" % ",".join([str(len(spec.get_mz())) for spec in specs]),
              "n_cand=%d" % len(cands))

        # Stop right after printing the last group: no additional group (and candidate set) is
        # fetched, and 'cands' keeps pointing at the candidates of the last group shown above.
        if idx + 1 >= n_groups_shown:
            break

print("\nCandidates are as Pandas DataFrame")
if cands is None:
    print("(no spectra were returned for this dataset)")
else:
    # Candidate set of the last group printed above
    print(cands.head())

['AU230660', 'AU230659', 'AU230662', 'AU230658', 'AU230657'] mol=WIIZWVCIJKGZOK n_peaks=[3,6,10,12,19] n_cand=84
['AU238262'] mol=YGSDEFSMJLZEOE n_peaks=[4] n_cand=354
['AU229857', 'AU229858'] mol=BTJIUGUIPKRLHP n_peaks=[3,3] n_cand=319
['AU230258', 'AU230259', 'AU230257'] mol=UFBJCMHMOXMLKC n_peaks=[7,5,4] n_cand=108
['AU229360', 'AU229357', 'AU229358', 'AU229359', 'AU229362'] mol=UQVKZNNCIHJZLS n_peaks=[2,2,2,4,5] n_cand=2989
['AU228157', 'AU228159', 'AU228158', 'AU228112', 'AU228110', 'AU228111'] mol=GOEMGAFJFRBGGG n_peaks=[2,3,4,3,3,2] n_cand=5270
['AU311151'] mol=YASYVMFAVPKPKE n_peaks=[2] n_cand=27
['AU231058', 'AU231057', 'AU231059', 'AU231060'] mol=FZEYVTFCMJSGMP n_peaks=[3,3,8,10] n_cand=507
['AU231357', 'AU325853', 'AU325854', 'AU325852', 'AU231358', 'AU231360', 'AU231359', 'AU325851'] mol=ZOMSMJKLGFBRBS n_peaks=[7,4,2,6,18,10,18,3] n_cand=2772
['AU234957', 'AU234959', 'AU234960', 'AU234958'] mol=LKJPYSCBVHEWIU n_peaks=[8,13,8,9] n_cand=68

Candidates are as Pandas DataFrame


## Merge Spectra

In the previous examples we have shown the support for spectra grouping. Sometimes it can be useful to merge the grouped spectra into one. For that, the peaks of all spectra are joined into one peak list. Subsequently, a hierarchical clustering is applied to group peaks that belong to the same fragment, allowing a small mass-to-charge (m/z) deviation. The retention times can also be merged, e.g. by taking the minimum retention time of all spectra.

In [52]:
# Open the MassBank DB that was built in 'Tutorial__Build_DB.ipynb'
dbfn = "massbank_example.sqlite"
with MassbankDB(dbfn) as mbdb:
    grouped_iter = mbdb.iter_spectra(dataset="AU_000", grouped=True)

    for idx, (mol, specs, cands) in enumerate(grouped_iter):
        # Keep the demo output short: stop as soon as ten groups have been shown
        if idx >= 10:
            break

        # Collapse all spectra of the group into one merged spectrum
        spec = MBSpectrum.merge_spectra(specs)

        # Line 1: accession of the merged spectrum and the accessions it was built from
        print("NEW ID:", spec.get("accession"), "ORIGINAL IDs:", spec.get("original_accessions"), "mol=%s" % mol[3])

        # Line 2: peak counts of the individual spectra next to the merged peak count
        orig_peak_counts = ",".join(str(len(member.get_mz())) for member in specs)
        print("\t", "n_peaks (original)=[%s]" % orig_peak_counts, "n_peaks (merged)=[%s]" % len(spec.get_mz()))

        # Line 3: retention times of the individual spectra next to the merged retention time
        orig_rts = ",".join(str(member.get("retention_time")) for member in specs)
        print("\t", "RTs (original)=[%s]" % orig_rts, "RTs (merged)=[%f]" % spec.get("retention_time"))

NEW ID: AU879682 ORIGINAL IDs: ['AU230660', 'AU230659', 'AU230662', 'AU230658', 'AU230657'] mol=WIIZWVCIJKGZOK
	 n_peaks (original)=[3,6,10,12,19] n_peaks (merged)=[27]
	 RTs (original)=[5.668,5.699,5.677,5.677,5.69] RTs (merged)=[5.668000]
NEW ID: AU484704 ORIGINAL IDs: ['AU238262'] mol=YGSDEFSMJLZEOE
	 n_peaks (original)=[4] n_peaks (merged)=[4]
	 RTs (original)=[3.637] RTs (merged)=[3.637000]
NEW ID: AU352166 ORIGINAL IDs: ['AU229857', 'AU229858'] mol=BTJIUGUIPKRLHP
	 n_peaks (original)=[3,3] n_peaks (merged)=[4]
	 RTs (original)=[5.616,5.612] RTs (merged)=[5.612000]
NEW ID: AU277224 ORIGINAL IDs: ['AU230258', 'AU230259', 'AU230257'] mol=UFBJCMHMOXMLKC
	 n_peaks (original)=[7,5,4] n_peaks (merged)=[7]
	 RTs (original)=[4.418,4.412,4.439] RTs (merged)=[4.412000]
NEW ID: AU708189 ORIGINAL IDs: ['AU229360', 'AU229357', 'AU229358', 'AU229359', 'AU229362'] mol=UQVKZNNCIHJZLS
	 n_peaks (original)=[2,2,2,4,5] n_peaks (merged)=[6]
	 RTs (original)=[7.241,7.251,7.244,7.253,7.265] RTs (merged

## Convert Spectra into the MetFrag Format

The library allows you to output the spectra in the format needed as input for MetFrag.

In [54]:
# We connect to the MassBank DB constructed in the 'Tutorial__Build_DB.ipynb'
dbfn = "massbank_example.sqlite"

# Path of the PubChem SQLite file handed to MassbankDB as 'pc_dbfn'. It can be overridden with the
# PUBCHEM_DB_PATH environment variable, so the notebook also runs on machines where the DB lives
# elsewhere; the fallback is the location used when this tutorial was written.
pc_dbfn = os.environ.get("PUBCHEM_DB_PATH", "/run/media/bach/EVO500GB/data/pubchem_24-06-2019/db/pubchem.sqlite")

with MassbankDB(dbfn, pc_dbfn=pc_dbfn) as mbdb:
    for idx, (mol, specs, cands) in enumerate(mbdb.iter_spectra(dataset="AU_000", grouped=True, return_candidates="mf")):
        # Spectra peaks are merged into a single spectrum.
        spec = MBSpectrum.merge_spectra(specs)

        # MetFrag settings forwarded as keyword arguments. The '/path/to/...' values are
        # placeholders and need to be replaced by real locations before running MetFrag.
        metfrag_settings = {
            "MetFragScoreWeights": [1.0],
            "MetFragScoreTypes": ["FragmenterScore"],
            "LocalDatabasePath": "/path/to/db/" + spec.get("accession") + "_cands.csv",
            "ResultsPath": "/path/to/results",
            "NumberThreads": 4,
            "PeakListPath": "/path/to/peaks",
        }

        # Output the merged spectrum in MetFrag format. The result is iterated as a mapping;
        # judging from the printed output its keys are file names and its values the file contents.
        metfrag_output = spec._to_metfrag_format(**metfrag_settings)

        for k, v in metfrag_output.items():
            print("============")
            print(k)
            print("------------")
            print(v)

        # Output candidates in MetFrag format (only the first rows, to keep the output short)
        print("============")
        print(spec.get("accession") + "_cands.csv")
        print("------------")
        print(MassbankDB._cands_to_metfrag_format(cands.head()))

        # Only the first spectra group is converted in this demonstration
        break

AU879682_peaks.csv
------------
45.992500	0.179649
82.946300	0.061754
121.029760	1.000000
122.031450	0.111597
122.036000	0.056842
148.038950	0.138950
151.027340	1.000000
152.034840	1.000000
153.037933	0.135667
159.028900	0.058246
159.034500	0.041058
166.015500	0.067368
176.034633	0.239209
177.040000	0.050424
179.046900	0.033952
194.045200	0.534893
195.049300	0.077854
219.038700	0.052037
221.054400	0.039129
237.055400	0.087939
249.050600	0.188786
257.033500	0.738604
258.037400	0.106091
259.027900	0.230738
321.003700	0.971359
322.007800	0.170633
323.002800	0.641791
AU879682_config.txt
------------
LocalDatabasePath=/path/to/db/AU879682_cands.csv
MaximumTreeDepth=2
ConsiderHydrogenShifts=True
MetFragDatabaseType=LocalInChI
MetFragScoreWeights=1.0
MetFragPreProcessingCandidateFilter=UnconnectedCompoundFilter,IsotopeFilter
MetFragScoreTypes=FragmenterScore
MetFragCandidateWriter=CSV
FragmentPeakMatchAbsoluteMassDeviation=0.001000
FragmentPeakMatchRelativeMassDeviation=5.000000
ResultsPath=/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cands_out["InChIKey1"] = cands_out["InChIKey"].apply(lambda _r: _r.split("-")[0])
