# Iomega workflow
## Calculate molecular fingerprint based similarities
Calculate all-vs-all similarity matrices for the data subset "Unique InchiKeys" (>12,000 spectra) using molecular fingerprints.

In [1]:
import os
import sys

path_data = os.path.join(os.path.dirname(os.getcwd()), 'data')

### Import pre-processed data subset "Unique InchiKeys"

In [3]:
from matchms.importing import load_from_json

filename = os.path.join(path_data,'gnps_positive_ionmode_unique_inchikey_cleaned_by_matchms_and_lookups.json')
spectrums = load_from_json(filename)

print("number of spectra:", len(spectrums))

number of spectra: 13717


### Post-process spectra
+ Normalize spectrum
+ Remove peaks outside m/z ratios between 0 and 1000.0
+ Discard spectra with less then 10 remaining peaks (to make it consistent with later spec2vec analysis)
+ Remove peaks with relative intensity lower than 0.01

In [4]:
from matchms.filtering import normalize_intensities
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity

In [5]:
def post_process(s):
    s = normalize_intensities(s)
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=10)
    s = select_by_relative_intensity(s, intensity_from=0.01, intensity_to=1.0)
    return s

# apply filters to the data
spectrums = [post_process(s) for s in spectrums]

# omit spectrums that didn't qualify for analysis
spectrums = [s for s in spectrums if s is not None]

In [6]:
print("Remaining number of spectra:", len(spectrums))

Remaining number of spectra: 12797


## Derive molecular fingerprints
+ Fingerprints will be derived from smiles if possible, otherwise from inchi
+ Different fingerprint types can be selected: ``daylight``, ``morgan1``, ``morgan2``, ``morgan3`` (all using rdkit)
+ vector size is specified with ``nbits``, here set to 2048
--> will be used with ``Jaccard index``

In [7]:
from matchms.filtering.add_fingerprint import add_fingerprint

spectrums = [add_fingerprint(s, fingerprint_type="daylight", nbits=2048) for s in spectrums]

# Calculate similarity score matrices
+ Similarities between all possible pairs of spectra will be calculated. This will give a similarity score matrix of size 12,797 x 12,797.

In [8]:
from matchms.similarity import FingerprintSimilarityParallel

similarity_measure = FingerprintSimilarityParallel(similarity_measure="jaccard")
scores_mol_similarity = similarity_measure(spectrums, spectrums)
#start 21:53
import numpy as np
filename = os.path.join(path_data, "similarities_daylight2048_jaccard.npy")
np.save(filename, scores_mol_similarity)

ZeroDivisionError: division by zero

In [14]:
import numpy as np

for i, spec1 in enumerate(spectrums):
    for j, spec2 in enumerate(spectrums[i:], i):
        if spec1.get("fingerprint") is not None and spec2.get("fingerprint") is not None:
            u_or_v = np.bitwise_or(spec1.get("fingerprint") != 0, spec2.get("fingerprint") != 0)
            if u_or_v.sum() < 1:
                print(i, j, u_or_v)

KeyboardInterrupt: 

In [16]:
i, j


(2259, 8395)

In [20]:
scores_mol_similarity = similarity_measure(spectrums[11134:11136], spectrums[11144:11146])

In [17]:
for i, spec1 in enumerate(spectrums):
    if spec1.get("fingerprint") is None:
        print(i, "no fingerprint")
    elif spec1.get("fingerprint").sum() < 1:
        print(i, "weird")
        

543 no fingerprint
1246 no fingerprint
11145 weird


In [27]:
from matchms.utils import mol_converter

inchi = spectrums[11145].get("inchi")
inchikey = mol_converter(inchi, "inchi", "inchikey")[:14]
print(inchikey)

smiles = spectrums[11145].get("smiles")
inchikey = mol_converter(smiles, "smiles", "inchikey")[:14]
print(inchikey)

ZXJXZNDDNMQXFV
VEXZGXHMUGYJMC


In [31]:
for i, spec in enumerate(spectrums):
    inchikey_smiles = None
    inchikey_inchi = None
    inchi = spec.get("inchi")
    if inchi:
        inchikey_inchi = mol_converter(inchi, "inchi", "inchikey")
    smiles = spec.get("smiles")
    if smiles:
        inchikey_smiles = mol_converter(smiles, "smiles", "inchikey")
    if inchikey_inchi and inchikey_smiles:
        if not inchikey_inchi[:14] == inchikey_smiles[:14]:
            print(i, "\n")
            print(10* "--", smiles)
            print(10* "--", inchi)
            print(5* "--", inchikey_inchi[:14])
            print(5* "--", inchikey_smiles[:14])

40 

-------------------- CC[C@@]1([C@H](C(=C2C(=CC3=C2C(=C4C(=O)C=CC(=O)C4=C3O)O)[C@H]1O[C@@H]5C[C@@H]([C@H]([C@@H](O5)C)N(C)C)O)O)[C@H]6C(=O)C7=C([C@H]([C@@]6(CC)O[C@@H]8C[C@H]([C@@H]([C@H](O8)C)O)OC)O[C@@H]9C[C@@H]([C@H]([C@@H](O9)C)N(C)C)O)C(=[N+]=[N-])C1=C(C2=C(C(=O)C=CC2=O)C(=C71)O)O)O[C@@H]1C[C@H]([C@@H]([C@H](O1)C)O)OC
-------------------- InChI=1S/C68H82N4O24/c1-13-67(95-40-22-36(87-11)57(79)26(5)91-40)52(63(85)43-29(65(67)93-38-20-34(77)55(71(7)8)24(3)89-38)19-28-42(43)60(82)45-31(74)16-15-30(73)44(45)59(28)81)53-64(86)49-48-50(62(84)47-33(76)18-17-32(75)46(47)61(48)83)54(70-69)51(49)66(94-39-21-35(78)56(72(9)10)25(4)90-39)68(53,14-2)96-41-23-37(88-12)58(80)27(6)92-41/h15-19,24-27,34-41,52-53,55-58,65-66,77-85H,13-14,20-23H2,1-12H3/t24-,25-,26+,27+,34-,35-,36+,37+,38+,39+,40+,41+,52-,53-,55-,56-,57+,58+,65+,66+,67-,68-/m0/s1
---------- AVVUVGNOUBNZKQ
---------- NZZSDJHUISSTSC
294 

-------------------- CC[C@@]1([C@H](C(=C2C(=CC3=C2C(=C4C(=O)C=CC(=O)C4=C3O)O)[C@H]1O[C@@H]5C[C@

In [21]:
spectrums[11145].metadata

{'spectrum_id': 'CCMSLIB00000214104',
 'source_file': 'Massbank_ESI_positive_8_1_2014_peaks.mgf',
 'task': '2b0fcfed18d74148aa284e857bb00e4d',
 'scan': '-1',
 'ms_level': '2',
 'library_membership': 'MASSBANK',
 'spectrum_status': '1',
 'splash': 'splash10-0a4i-0009000000-0009000000',
 'submit_user': 'mwang87',
 'compound_name': 'FIO00351 Crystal violet',
 'ion_source': 'ESI',
 'compound_source': 'Isolated',
 'instrument': 'LC-ESI-QTOF',
 'pi': 'Putative Massbank Match',
 'data_collector': 'Massbank',
 'adduct': '[M+H]+',
 'precursor_mz': 408.221,
 'exactmass': '0.0',
 'charge': 1,
 'cas_number': '548-62-9',
 'pubmed_id': 'N/A',
 'smiles': '[Cl-1]',
 'inchi': 'InChI=1S/C25H30N3.ClH/c1-26(2)22-13-7-19(8-14-22)25(20-9-15-23(16-10-20)27(3)4)21-11-17-24(18-12-21)28(5)6;/h7-18H,1-6H3;1H/q+1;/p-1',
 'inchiaux': 'N/A',
 'library_class': '3',
 'spectrumid': 'CCMSLIB00000214104',
 'ionmode': 'positive',
 'create_time': '2014-08-01 16:31:38.0',
 'task_id': '2b0fcfed18d74148aa284e857bb00e4d',
 'u

## Derive different type of molecular fingerprints
+ Here: ``morgan3``
+ Then using ``Dice Similarity Coefficient``

In [7]:
spectrums = [add_fingerprint(s, fingerprint_type="daylight", nbits=2048) for s in spectrums]

### Calculate similarity score matrices

In [None]:
similarity_measure = FingerprintSimilarityParallel(similarity_measure="jaccard")
scores_mol_similarity = similarity_measure(spectrums, spectrums)

import numpy as np
filename = os.path.join(path_data, "similarities_morgan3_2048_dice.npy")
np.save(filename, scores_mol_similarity)