In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# a cell is executed, so edits to the package sources are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd

from massbank2db.db import MassbankDB

# Tutorial: Use a local copy of MassBank DB

This tutorial illustrates possible use cases of a local MassBank DB and how those are supported by the ```massbank2db``` package.

## Iterate over the Spectra of a Dataset

Here we illustrate how to iterate over the spectra (or MassBank entries) of a particular dataset. This is useful, for example, when the spectra (including their meta-information, such as the molecular structure and the retention time (RT)) are used to train a machine learning algorithm.

### Individual Spectra

In [17]:
# Open the MassBank DB that was built in 'Tutorial__Build_DB.ipynb'
dbfn = "massbank_example.sqlite"
with MassbankDB(dbfn) as mbdb:
    # grouped=False: every iteration yields a single spectrum
    spectra_iter = mbdb.iter_spectra(dataset="AU_000", grouped=False)

    for idx, (mol, spec, cands) in enumerate(spectra_iter):
        # Ten spectra are enough for illustration, so stop once index 10 is reached
        if idx >= 10:
            break

        # Assemble the three printed fields: accession, molecule identifier, peak count
        # NOTE(review): mol[3] looks like the first InChIKey block in the output - confirm
        accession = spec.get("accession")
        mol_label = "mol=%s" % mol[3]
        peaks_label = "n_peaks=%d" % len(spec.get_mz())
        print(accession, mol_label, peaks_label)

AU230360 mol=RRRXPPIDPYTNJG n_peaks=2
AU240458 mol=SNGREZUHAYWORS n_peaks=8
AU231159 mol=HTXMGVTWXZBZNC n_peaks=8
AU233059 mol=OWZPCEFYPSAJFR n_peaks=52
AU227657 mol=DJBNUMBKLMJRSA n_peaks=2
AU225154 mol=RMMXLENWKUUMAY n_peaks=4
AU218155 mol=ZQTNQVWKHCQYLQ n_peaks=5
AU116751 mol=YJQPYGGHQPGBLI n_peaks=2
AU235657 mol=RZEKVGVHFLEQIL n_peaks=4
AU234359 mol=JLKIGFTWXXRPMT n_peaks=1


### Group spectra corresponding to the same compound and setting

MassBank contains spectra corresponding to the same compound, MS and LC configuration within one (contributor, accession-prefix)-tuple as separate files, e.g. if multiple collision energies were measured. In practice we can merge those spectra, for example to give them as input to a machine learning approach. The following SQLite query is used to group the spectra:
```SQLite
GROUP BY dataset, molecule, precursor_mz, precursor_type, fragmentation_mode
```
The dataset here also encodes the ionization mode, i.e. positive or negative ionization. The iterator can be modified to return the grouped spectra:

In [19]:
# Open the MassBank DB that was built in 'Tutorial__Build_DB.ipynb'
dbfn = "massbank_example.sqlite"
with MassbankDB(dbfn) as mbdb:
    # grouped=True: every iteration yields a list of spectra belonging together
    grouped_iter = mbdb.iter_spectra(dataset="AU_000", grouped=True)

    for idx, (mol, specs, cands) in enumerate(grouped_iter):
        # Ten groups are enough for illustration, so stop once index 10 is reached
        if idx >= 10:
            break

        # One accession and one peak count per spectrum in the group
        accessions = [spec.get("accession") for spec in specs]
        mol_label = "mol=%s" % mol[3]
        peak_counts = ",".join(str(len(spec.get_mz())) for spec in specs)
        print(accessions, mol_label, "n_peaks=[%s]" % peak_counts)

['AU230660', 'AU230659', 'AU230662', 'AU230658', 'AU230657'] mol=WIIZWVCIJKGZOK n_peaks=[3,6,10,12,19]
['AU238262'] mol=YGSDEFSMJLZEOE n_peaks=[4]
['AU229857', 'AU229858'] mol=BTJIUGUIPKRLHP n_peaks=[3,3]
['AU230258', 'AU230259', 'AU230257'] mol=UFBJCMHMOXMLKC n_peaks=[7,5,4]
['AU229360', 'AU229357', 'AU229358', 'AU229359', 'AU229362'] mol=UQVKZNNCIHJZLS n_peaks=[2,2,2,4,5]
['AU228157', 'AU228159', 'AU228158', 'AU228112', 'AU228110', 'AU228111'] mol=GOEMGAFJFRBGGG n_peaks=[2,3,4,3,3,2]
['AU311151'] mol=YASYVMFAVPKPKE n_peaks=[2]
['AU231058', 'AU231057', 'AU231059', 'AU231060'] mol=FZEYVTFCMJSGMP n_peaks=[3,3,8,10]
['AU231357', 'AU325853', 'AU325854', 'AU325852', 'AU231358', 'AU231360', 'AU231359', 'AU325851'] mol=ZOMSMJKLGFBRBS n_peaks=[7,4,2,6,18,10,18,3]
['AU234957', 'AU234959', 'AU234960', 'AU234958'] mol=LKJPYSCBVHEWIU n_peaks=[8,13,8,9]


### Retrieve molecular candidates for the spectra

For some applications it is useful to extract a set of molecular candidates for each spectrum, that is, a set of potential molecules that could have been measured in the spectrum. There are two common approaches to get the set of molecular candidates:

1. ```return_candidates="mf"```: Using the ground-truth / predicted molecular formula of a spectrum
2. ```return_candidates="mz"```: Using a mass-window around the exact mass / precursor mass of a spectrum

In [22]:
# We connect to the MassBank DB constructed in the 'Tutorial__Build_DB.ipynb'
dbfn = "massbank_example.sqlite"

# Path of the local PubChem SQLite DB handed to MassbankDB as 'pc_dbfn'. The location
# is machine-specific, so it can be overridden with the PUBCHEM_DB_FN environment
# variable; the original path is kept as fallback to preserve the previous behavior.
pc_dbfn = os.environ.get(
    "PUBCHEM_DB_FN",
    "/run/media/bach/EVO500GB/data/pubchem_24-06-2019/db/pubchem.sqlite",
)

with MassbankDB(dbfn, pc_dbfn=pc_dbfn) as mbdb:
    # return_candidates="mf": candidates are retrieved via the molecular formula
    spectra_iter = mbdb.iter_spectra(dataset="AU_000", grouped=True, return_candidates="mf")

    for idx, (mol, specs, cands) in enumerate(spectra_iter):
        # For illustration purposes, we only load the first 10 spectrum groups
        if idx > 9:
            break

        # Print the accessions, the molecule identifier, the number of peaks per
        # spectrum, and the size of the candidate set of the group
        print([spec.get("accession") for spec in specs], "mol=%s" % mol[3], "n_peaks=[%s]" % ",".join([str(len(spec.get_mz())) for spec in specs]),
              "n_cand=%d" % len(cands))

['AU230660', 'AU230659', 'AU230662', 'AU230658', 'AU230657'] mol=WIIZWVCIJKGZOK n_peaks=[3,6,10,12,19] n_cand=2958
['AU238262'] mol=YGSDEFSMJLZEOE n_peaks=[4] n_cand=392
['AU229857', 'AU229858'] mol=BTJIUGUIPKRLHP n_peaks=[3,3] n_cand=323
['AU230258', 'AU230259', 'AU230257'] mol=UFBJCMHMOXMLKC n_peaks=[7,5,4] n_cand=135
['AU229360', 'AU229357', 'AU229358', 'AU229359', 'AU229362'] mol=UQVKZNNCIHJZLS n_peaks=[2,2,2,4,5] n_cand=9827
['AU228157', 'AU228159', 'AU228158', 'AU228112', 'AU228110', 'AU228111'] mol=GOEMGAFJFRBGGG n_peaks=[2,3,4,3,3,2] n_cand=7083
['AU311151'] mol=YASYVMFAVPKPKE n_peaks=[2] n_cand=245
['AU231058', 'AU231057', 'AU231059', 'AU231060'] mol=FZEYVTFCMJSGMP n_peaks=[3,3,8,10] n_cand=635
['AU231357', 'AU325853', 'AU325854', 'AU325852', 'AU231358', 'AU231360', 'AU231359', 'AU325851'] mol=ZOMSMJKLGFBRBS n_peaks=[7,4,2,6,18,10,18,3] n_cand=3498
['AU234957', 'AU234959', 'AU234960', 'AU234958'] mol=LKJPYSCBVHEWIU n_peaks=[8,13,8,9] n_cand=1873
