In [27]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import json
import re
from pathlib import Path
from mimas.helper.fileio import NumpyEncoder

import seaborn as sns
from matplotlib import rcParams
# Global matplotlib defaults for every figure in this notebook: Helvetica
# sans-serif text at 12 pt. The fonttype settings (42 for PDF/PS, 'none' for
# SVG) are the matplotlib options that keep text as editable font glyphs in
# exported vector files (see matplotlib's rcParams documentation).
rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Helvetica'],
    'font.size': 12,
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'svg.fonttype': 'none',
})

# Data and figures live in sibling folders of the directory the notebook is
# run from (i.e. one level above the current working directory).
project_root = Path.cwd().parent
path_data = project_root / 'data'
path_figures = project_root / 'figures'

# Only the figures folder is created here; the data sub-folders are created
# by the cells that write into them.
path_figures.mkdir(parents=True, exist_ok=True)

In [28]:
# Download MoNA data from https://mona.fiehnlab.ucdavis.edu/downloads and save it in the data folder.
# The archive is stored as data/mona/raw/mona-<today's date>.zip.
import datetime
import shutil
import urllib.request

url = r'https://mona.fiehnlab.ucdavis.edu/rest/downloads/retrieve/03d5a22c-c1e1-4101-ac70-9a4eae437ef5'

path_mona = path_data / 'mona' / 'raw'
path_mona.mkdir(parents=True, exist_ok=True)
file_mona = path_mona / f'mona-{datetime.date.today()}.zip'

# Skip the download when today's snapshot is already on disk, so that
# "Restart & Run All" does not fetch the archive again.
if not file_mona.exists():
    # Write to a temporary '.part' file and rename it only after the transfer
    # finished, so an interrupted download never leaves a truncated .zip
    # under the final name.
    file_mona_partial = file_mona.with_name(file_mona.name + '.part')
    with urllib.request.urlopen(url) as response, open(file_mona_partial, 'wb') as f:
        # Stream the response to disk in chunks instead of holding the whole
        # archive in memory.
        shutil.copyfileobj(response, f)
    file_mona_partial.replace(file_mona)


In [29]:
# Load spectra from MoNA data
# Result: all_spectra maps ion mode ('P' or 'N') to a list of
# {'peaks': <cleaned peaks>, 'precursor_mz': <float>} records.
from mimas.file_io import spec_file
from mimas.spectra.similarity.tools import clean_spectrum

all_spectra = {}
for spec in spec_file.read_one_spectrum(file_mona):
    ion_mode = spec.get("ion_mode", "")
    # Only spectra explicitly labelled as mode 'P' or 'N' are kept.
    if ion_mode not in {'P', 'N'}:
        continue

    try:
        precursor_mz = float(spec['precursormz'])
        # NOTE(review): max_mz presumably drops peaks at or above
        # precursor_mz - 1.6 -- confirm against clean_spectrum's documentation.
        peaks = clean_spectrum(spectrum=spec['peaks'], max_mz=precursor_mz-1.6,
                               noise_threshold=0.01, remove_isotope=True, ms2_da=0.05)
    except Exception:
        # Best effort: records with a missing/unparsable precursor m/z or peaks
        # that cannot be cleaned are skipped. Catching Exception (rather than a
        # bare except) keeps KeyboardInterrupt/SystemExit working, so this long
        # loop can still be interrupted.
        continue

    # Spectra left without any peak after cleaning are discarded.
    if len(peaks) > 0:
        all_spectra.setdefault(ion_mode, []).append({'peaks': peaks,
                                                     'precursor_mz': precursor_mz})

# Count number of spectra per ion mode
for ion_mode, spectra in all_spectra.items():
    print(f'Number of spectra in {ion_mode} mode: {len(spectra)}')


Number of spectra in P mode: 808784
Number of spectra in N mode: 1196680


In [30]:
# Persist the cleaned spectra of all ion modes as a single pickle file
# next to the raw MoNA download.
import pickle

file_spectra_pickle = path_data / 'mona' / 'raw' / 'mona_spectra.pickle'
with file_spectra_pickle.open('wb') as f:
    pickle.dump(all_spectra, f)

In [31]:
# Randomly select 100, 1000, 10000, 100000, 1000000 spectra from each ion mode
# and write each selection to
# data/mona/spectral_library/spectra-charge_<mode>-number_<n>.pkl
path_output_spectrum = path_data / 'mona' / 'spectral_library'
path_output_spectrum.mkdir(parents=True, exist_ok=True)

# Fixed seed so that re-running the notebook regenerates the same libraries.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

for extract_spectra_number in [100, 1000, 10000, 100000, 1000000]:
    for ion_mode in all_spectra:
        # Index the Python list directly; converting it to a NumPy object
        # array on every iteration is unnecessary work.
        current_spectra = all_spectra[ion_mode]
        total_spectra_number = len(current_spectra)

        # When more spectra are requested than exist, sample with replacement
        # so the output still has the requested size (it will contain duplicates).
        replace = extract_spectra_number > total_spectra_number
        if replace:
            print(f'Number of spectra in {ion_mode} mode is {total_spectra_number}, less than {extract_spectra_number}')

        selected = rng.choice(total_spectra_number, extract_spectra_number, replace=replace)
        selected_spectra = [current_spectra[i] for i in selected]

        with open(path_output_spectrum / f'spectra-charge_{ion_mode}-number_{extract_spectra_number}.pkl', 'wb') as f:
            pickle.dump(selected_spectra, f)


Number of spectra in P mode is 808784, less than 1000000
