# Compute spec2vec similarities on mass spectra dataset

In [1]:
import os
import sys
import gensim
import numpy as np

ROOT = os.path.dirname(os.getcwd())
#path_data = os.path.join(ROOT, 'data')
path_data = 'C:\\OneDrive - Netherlands eScience Center\\Project_Wageningen_iOMEGA\\matchms\\data\\'
sys.path.insert(0, ROOT)

### Import pre-processed dataset "Unique InchiKeys"

In [2]:
from matchms.importing import load_from_json

filename = os.path.join(path_data,'gnps_positive_ionmode_unique_inchikey_cleaned_by_matchms_and_lookups.json')
spectrums = load_from_json(filename)

print("number of spectra:", len(spectrums))

number of spectra: 13717


### Post-processing of data

In [3]:
from matchms.filtering import normalize_intensities
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses

In [4]:
def post_process(s):
    s = normalize_intensities(s)
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=10)
    s = reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5)
    if s is None:
        return None
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= 10:
        s = s_remove_low_peaks
        
    s = add_losses(s, loss_mz_from=5.0, loss_mz_to=200.0)
    return s

# apply post processing steps to the data
spectrums_postprocessed = [post_process(s) for s in spectrums]

# omit spectrums that didn't qualify for analysis
spectrums_postprocessed = [s for s in spectrums_postprocessed if s is not None]

### Load pretrained spec2vec model

In [5]:
path_models = os.path.join(path_data, "trained_models")
model_file = os.path.join(path_models, "spec2vec_UniqueInchikeys_ratio05_filtered_iter_15.model")

# Load pretrained model
model = gensim.models.Word2Vec.load(model_file)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\OneDrive - Netherlands eScience Center\\Project_Wageningen_iOMEGA\\matchms\\data\\trained_models\\spec2vec_UniqueInchikeys_ratio05_filtered_201101_iter_15.model'

In [6]:
from spec2vec import Spec2Vec
from spec2vec import SpectrumDocument 

In [11]:
documents = [SpectrumDocument(s, n_decimals=2) for s in spectrums_postprocessed]

In [12]:
documents[0].words

['peak@289.29',
 'peak@295.55',
 'peak@298.49',
 'peak@317.32',
 'peak@319.66',
 'peak@324.48',
 'peak@325.32',
 'peak@339.79',
 'peak@343.95',
 'peak@347.02',
 'peak@347.91',
 'peak@361.15',
 'peak@361.84',
 'peak@364.23',
 'peak@364.86',
 'peak@365.85',
 'peak@368.22',
 'peak@368.97',
 'peak@375.07',
 'peak@375.75',
 'peak@382.75',
 'peak@384.20',
 'peak@390.57',
 'peak@394.05',
 'peak@397.11',
 'peak@404.42',
 'peak@411.09',
 'peak@413.78',
 'peak@427.67',
 'peak@436.19',
 'peak@443.27',
 'peak@446.27',
 'peak@447.75',
 'peak@455.25',
 'peak@456.11',
 'peak@457.54',
 'peak@464.29',
 'peak@469.87',
 'peak@471.06',
 'peak@475.26',
 'peak@476.14',
 'peak@476.98',
 'peak@478.89',
 'peak@479.98',
 'peak@483.24',
 'peak@487.21',
 'peak@488.16',
 'peak@491.19',
 'peak@494.28',
 'peak@495.65',
 'peak@498.41',
 'peak@503.03',
 'peak@504.34',
 'peak@505.15',
 'peak@510.18',
 'peak@512.17',
 'peak@513.27',
 'peak@514.96',
 'peak@515.92',
 'peak@520.97',
 'peak@521.82',
 'peak@523.17',
 'peak@5

## Actual score calculation
+ Using ``Spec2Vec`` with ``intensity_weighting_power=0.5``.
+ Calculate matrix of all-vs-all similarity scores.

In [13]:
spec2vec_similarity = Spec2Vec(model, intensity_weighting_power=0.5)
similarity_matrix = spec2vec_similarity.matrix(documents, documents, is_symmetric=True)

## Store similarity matrix

In [14]:
filename = os.path.join(path_data,'similarities_spec2vec_2dec_15iter.npy')
np.save(filename, similarity_matrix)

---

# Same but now with model trained on all positive ionmode spectra
(or more preciselym all that had >= 10 peaks)

### Load pretrained spec2vec model

In [6]:
path_models = os.path.join(path_data, "trained_models")
model_file = os.path.join(path_models, "spec2vec_AllPositive_ratio05_filtered_201101_iter_15.model")

# Load pretrained model
model = gensim.models.Word2Vec.load(model_file)

In [7]:
from spec2vec import Spec2Vec
from spec2vec import SpectrumDocument 

In [8]:
documents = [SpectrumDocument(s, n_decimals=2) for s in spectrums_postprocessed]

In [9]:
documents[0].words[:10]

['peak@289.29',
 'peak@295.55',
 'peak@298.49',
 'peak@317.32',
 'peak@319.66',
 'peak@324.48',
 'peak@325.32',
 'peak@339.79',
 'peak@343.95',
 'peak@347.02']

## Actual score calculation
+ Using ``Spec2Vec`` with ``intensity_weighting_power=0.5``.
+ Calculate matrix of all-vs-all similarity scores.

In [10]:
spec2vec_similarity = Spec2Vec(model, intensity_weighting_power=0.5)

similarity_matrix = spec2vec_similarity.matrix(documents, documents, is_symmetric=True)

## Store similarity matrix

In [11]:
filename = os.path.join(path_data,'similarities_spec2vec_2dec_AllPositiveModel_15iter_201101.npy')
np.save(filename, similarity_matrix)