# Iomega workflow
## Calculate molecular fingerprint based similarities
Calculate all-vs-all similarity matrices for the data subset "Unique InchiKeys" (>12,000 spectra) using molecular fingerprints.

In [2]:
import os
import sys

# The data folder is a sibling of the current working directory:
# step up one level from the cwd, then descend into 'data'.
parent_dir = os.path.dirname(os.getcwd())
path_data = os.path.join(parent_dir, 'data')

### Import pre-processed data subset "Unique InchiKeys"

In [5]:
from matchms.importing import load_from_json

# Location of the pre-processed "Unique InchiKeys" subset inside the data folder.
filename = os.path.join(
    path_data,
    'gnps_positive_ionmode_unique_inchikey_cleaned_by_matchms_and_lookups.json',
)

# Load all spectra from the JSON file.
spectrums = load_from_json(filename)

# Sanity check: report how many spectra were loaded.
print("number of spectra:", len(spectrums))

number of spectra: 13717


### Post-process spectra
+ Normalize spectrum
+ Remove peaks outside m/z ratios between 0 and 1000.0
+ Discard spectra with fewer than 10 remaining peaks (to make it consistent with later spec2vec analysis)
+ Remove peaks with relative intensity lower than 0.01

In [6]:
from matchms.filtering import normalize_intensities
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity

In [7]:
def post_process(s):
    """Run one spectrum through the post-processing filter chain.

    The filters are applied in this order:

    1. normalize the peak intensities,
    2. keep only peaks with m/z between 0 and 1000,
    3. require at least 10 peaks,
    4. keep only peaks with relative intensity between 0.01 and 1.0.

    The output of the last filter is returned as-is. The calling code
    treats a ``None`` result as "spectrum did not qualify"; presumably
    that is what the minimum-peaks filter yields for rejected spectra
    (confirm against the matchms filter documentation).

    NOTE: the peak count is checked *before* low-intensity peaks are
    removed, so a returned spectrum may end up with fewer than 10 peaks.
    """
    normalized = normalize_intensities(s)
    within_mz_window = select_by_mz(normalized, mz_from=0, mz_to=1000)
    enough_peaks = require_minimum_number_of_peaks(within_mz_window, n_required=10)
    return select_by_relative_intensity(enough_peaks, intensity_from=0.01, intensity_to=1.0)

# apply filters to the data
# Run every spectrum through post_process and, in the same pass, drop the
# ones that came back as None (i.e. did not qualify for the analysis).
spectrums = [
    processed
    for processed in map(post_process, spectrums)
    if processed is not None
]

In [8]:
# Report how many spectra are left in `spectrums` at this point.
print("Remaining number of spectra:", len(spectrums))

Remaining number of spectra: 12797


## Derive molecular fingerprints
+ Fingerprints will be derived from inchi if possible, otherwise from smiles

# Calculate similarity score matrices
+ Similarities between all possible pairs of spectra will be calculated. This will give a similarity score matrix of size 12,797 x 12,797.
+ Careful: for the dataset used here, calculating the all-vs-all similarity score matrix will take a while (a few hours).

## Calculate cosine similarity scores
+ here using ``tolerance = 0.005``
+ ``safety_points=10`` is optional; it simply makes sure that the intermediate results are occasionally saved (10x during the process).

In [None]:
from matchms.similarity import CosineGreedyNumba
from matchms.custom_functions.extra_functions import all_vs_all_similarity_matrix

# Similarity measure: CosineGreedyNumba with a tolerance of 0.005.
similarity_measure = CosineGreedyNumba(tolerance=0.005)

# File name handed to the all-vs-all routine (presumably where the
# resulting scores are stored -- confirm against the helper's docs).
filename = os.path.join(path_data, "similarities_cosine_tol0005.npy")

# All-vs-all comparison of the post-processed spectra. This is the slow
# step; safety_points=10 asks for intermediate results to be saved 10 times
# during the run.
similarities, num_matches = all_vs_all_similarity_matrix(
    spectrums,
    similarity_measure,
    filename,
    safety_points=10,
)

## Calculate modified cosine similarity scores
+ here using ``tolerance = 0.005``

In [None]:
from matchms.similarity import ModifiedCosine
from matchms.custom_functions.extra_functions import all_vs_all_similarity_matrix

# Similarity measure: ModifiedCosine with a tolerance of 0.005.
similarity_measure = ModifiedCosine(tolerance=0.005)

# File name handed to the all-vs-all routine (presumably where the
# resulting scores are stored -- confirm against the helper's docs).
filename = os.path.join(path_data, "similarities_mod_cosine_tol0005.npy")

# All-vs-all comparison of the post-processed spectra, called with the same
# arguments as the plain cosine run apart from the measure and file name.
similarities, num_matches = all_vs_all_similarity_matrix(
    spectrums,
    similarity_measure,
    filename,
    safety_points=10,
)