In [35]:
import pickle
import gensim
from spec2vec import Spec2Vec
import os
import numpy as np

### Example on training documents

In [2]:
# Load the pickled training documents. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load this file if it comes from a trusted source.
with open("./models/tbdms/preprocessed/documents.pickle", "rb") as documents_file:
    documents = pickle.load(documents_file)

In [3]:
# Load the saved gensim Word2Vec model and wrap it in Spec2Vec in one step,
# so `model` is only ever bound to the Spec2Vec wrapper.
model = Spec2Vec(
    gensim.models.Word2Vec.load("./models/tbdms/spec2vec.model")
)

In [4]:
# Embed the first training document.
# NOTE(review): _calculate_embedding is an underscore-prefixed (private)
# method of Spec2Vec; confirm there is no public equivalent to call instead.
embedding = model._calculate_embedding(documents[0])
# Preview only the first ten components rather than the whole vector.
embedding[:10]

array([15.86876743,  6.17050898, -4.49228144,  5.22535405, -0.61781243,
       14.28316867,  8.40395679, 11.2233543 ,  3.61851081, -7.97188171])

### Example on new data

In [51]:
from spec2vec_train import preprocess_file

In [68]:
# Switch to the model stored under ./models/tms: load the saved gensim
# Word2Vec model and wrap it in Spec2Vec in one step.
model = Spec2Vec(
    gensim.models.Word2Vec.load("./models/tms/spec2vec.model")
)

In [62]:
# Path of the MGF file to embed; fail early with a clear message if it is missing.
FILE = "./dataset/Test dataset_TMS_RAW.mgf"
if not os.path.isfile(FILE):
    raise AssertionError(f"File not found: {FILE}")

In [63]:
# Run the project's preprocess_file helper (imported from spec2vec_train) on FILE.
# The result is iterated below as a collection of objects exposing `.metadata`
# — presumably one spectrum document per spectrum in the file; TODO confirm.
spectra_documents = preprocess_file(FILE)

In [69]:
# Group the preprocessed spectrum documents by the "inchikey" entry of their metadata.
# NOTE(review): presumably metadata.get returns None for a missing key, in which
# case all such documents end up together under the None key — confirm.
spectra_documents_by_inchikey = {}
for document in spectra_documents:
    key = document.metadata.get("inchikey")
    spectra_documents_by_inchikey.setdefault(key, []).append(document)

In [70]:
# Compute one embedding per spectrum document, keeping the per-InChIKey grouping:
# each key maps to a list of embeddings in the same order as its documents.
spectra_embedding_by_inchikey = {
    inchikey: [model._calculate_embedding(spectrum) for spectrum in spectra]
    for inchikey, spectra in spectra_documents_by_inchikey.items()
}

In [71]:
# Show that embeddings of spectra from the same molecule are similar but not identical.
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / (||a|| * ||b||)`` using NumPy. No special
    handling is done for zero-length vectors, so a zero norm leads to a
    division by zero in NumPy arithmetic.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

# For every molecule, compute the cosine similarity of each unordered pair of
# its spectrum embeddings and report the mean and standard deviation.
for inchikey, molecule_embeddings in spectra_embedding_by_inchikey.items():
    n_embeddings = len(molecule_embeddings)
    similarities = np.array([
        cosine_similarity(molecule_embeddings[i], molecule_embeddings[j])
        for i in range(n_embeddings)
        for j in range(i + 1, n_embeddings)  # j > i: each pair once, no self-pairs
    ])

    if similarities.size == 0:
        # Fewer than two spectra means there are no pairs. Calling mean()/std()
        # on an empty array would emit RuntimeWarnings and print "nan", so
        # report the situation explicitly instead.
        print(f"Molecule {inchikey} Self-Similarity: n/a (fewer than two spectra)")
        continue

    print(f"Molecule {inchikey} Self-Similarity: Mean {similarities.mean():.3f}, Std {similarities.std():.3f}")

Molecule VFFKJOXNCSJSAQ-UHFFFAOYSA-N Self-Similarity: Mean 0.882, Std 0.056
Molecule HVEKDTUCYABGMC-UHFFFAOYSA-N Self-Similarity: Mean 0.851, Std 0.044
Molecule NPHRHHQEJSRYEL-UHFFFAOYSA-N Self-Similarity: Mean 0.867, Std 0.039
Molecule BJXLZKDWKXQNLP-UHFFFAOYSA-N Self-Similarity: Mean 0.913, Std 0.021
Molecule YCAAUQGUAZTIAV-UHFFFAOYSA-N Self-Similarity: Mean 0.909, Std 0.034
Molecule HKLDVWRZQFKYDQ-UHFFFAOYSA-N Self-Similarity: Mean 0.864, Std 0.039
Molecule PSQXOOHLZAOXOZ-UHFFFAOYSA-N Self-Similarity: Mean 0.873, Std 0.033
Molecule VBYNSEKUVOESSR-UHFFFAOYSA-N Self-Similarity: Mean 0.954, Std 0.013
Molecule UGFXQBRWNPEHCX-UHFFFAOYSA-N Self-Similarity: Mean 0.895, Std 0.028
Molecule MIGZCFOSPQOHRU-UHFFFAOYSA-N Self-Similarity: Mean 0.892, Std 0.026
Molecule KYRPPPKHGUJSMB-UHFFFAOYSA-N Self-Similarity: Mean 0.793, Std 0.069
Molecule BNAGIDDOEKUJRZ-UHFFFAOYSA-N Self-Similarity: Mean 0.872, Std 0.038
Molecule LMFAUEGOCMAFIW-KZNAEPCWSA-N Self-Similarity: Mean 0.890, Std 0.024
Molecule RGK