```python
from mstda import Model

# Build a model from the training fragments without passing any extra options.
model = Model(
    "data/Drugbank2019_MS_CFMID4.0_FRAGMENTS_TRAIN.json",
)

# Print the model's text summary, then write the model to disk.
print(model)
model.save("models/drugbank_cfm4.json")
```

In [1]:
from mstda import Model
# Build the model with an explicit persistence-diagram epsilon and three jobs.
model = Model(
    "data/Drugbank2019_MS_CFMID4.0_FRAGMENTS_TRAIN.json",
    n_jobs=3,
    diagram_epsilon=0.09,
)

# Print the model's text summary; the output file name records the epsilon used above.
print(model)
model.save("models/drugbank_cfm4_eps0.09.json")

  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)
  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)
  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)


Trained model:
Molecules:                      8001
Topological features:           (8001, 9)
Homology dimensions:            [0, 1, 2]
Energy levels:                  3
Mass transformed:               True
Logarithmic integer mass:       True
Persistence diagram epsilon:    0.09
Persistence entropy normalized: False
Physical cores (n_jobs):        3
saved model to: /home/axean/Dokument/mstda/models/drugbank_cfm4_eps0.09.json


In [2]:
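# Optional alternative to the training cell above: load a previously saved model.
# Uncomment the lines below to use it.
# NOTE(review): this loads "models/drugbank_cfm4.json", not the
# "models/drugbank_cfm4_eps0.09.json" file saved above - confirm which one is intended.
#from mstda import Model
#model = Model.load("models/drugbank_cfm4.json")
#model.n_jobs = 3
#print(str(model))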

# Verify that no test molecule is in the training set

In [3]:
from json import load
from mstda_utils import print_dict, print_dict_schema
from rdkit import Chem
from tqdm.notebook import tqdm

# Load the held-out test set; its top-level keys are passed to RDKit as SMILES below.
with open("data/Drugbank2019_MS_CFMID4.0_FRAGMENTS_TEST.json") as j:
    test_data = load(j)
print_dict_schema(test_data)

# Canonicalise both sides so that different spellings of the same molecule compare equal.
canon_smiles = list(map(Chem.CanonSmiles, model.molecules))
# Membership is tested once per test molecule; a set avoids scanning the whole
# training list on every lookup.
canon_smiles_lookup = set(canon_smiles)
for k in tqdm(list(test_data.keys())):
    assert Chem.CanonSmiles(k) not in canon_smiles_lookup, (
        "test molecule is also present in the training set: " + k
    )

C[C@@H]1CC2=CC(=O)CC[C@@H]2[C@H]3CC[C@]4(C)[C@H](CC[C@H]4[C@H]13)OC(=O)C
	energy0
		mz	: <class 'list'>
		intens	: <class 'list'>
		frag	: <class 'list'>
		score	: <class 'list'>
	energy1
		mz	: <class 'list'>
		intens	: <class 'list'>
		frag	: <class 'list'>
		score	: <class 'list'>
	energy2
		mz	: <class 'list'>
		intens	: <class 'list'>
		frag	: <class 'list'>
		score	: <class 'list'>
	frag	: <class 'list'>
etc.


  0%|          | 0/1700 [00:00<?, ?it/s]

# Remove answers from query dictionary

In [4]:
from copy import deepcopy
# Guard against running this cell twice: once the reference fields have been
# removed, copying test_data again would overwrite `answers` with entries that
# no longer contain them (and the loop below would then fail part-way through).
if any("frag" not in entry for entry in test_data.values()):
    raise RuntimeError(
        "test_data has already been stripped of its reference fields; "
        "reload it before running this cell again"
    )

# Untouched copy of the loaded data, used later to check query results.
answers = deepcopy(test_data)

for idx,k in enumerate(list(test_data.keys())):
    test_id = "test_molecule"+str(idx)
    query = test_data[k]
    # Remove the fields that the reference copy keeps: the top-level fragment
    # list and the per-energy "frag"/"score" lists.
    query.pop("frag")
    for e in ("energy0","energy1","energy2"):
        query[e].pop("frag")
        query[e].pop("score")
    # Re-key the query under an anonymous id so the original key is not part of it,
    # and record that id on the reference entry so the two can be paired again.
    test_data[test_id] = query
    test_data.pop(k)
    answers[k]["test_id"] = test_id
print("Dictionary to be used to check answers:")
print_dict_schema(answers)
print()
print("Dictionary to be used as queries:")
print_dict_schema(test_data)

Dictionary to be used to check answers:
C[C@@H]1CC2=CC(=O)CC[C@@H]2[C@H]3CC[C@]4(C)[C@H](CC[C@H]4[C@H]13)OC(=O)C
	energy0
		mz	: <class 'list'>
		intens	: <class 'list'>
		frag	: <class 'list'>
		score	: <class 'list'>
	energy1
		mz	: <class 'list'>
		intens	: <class 'list'>
		frag	: <class 'list'>
		score	: <class 'list'>
	energy2
		mz	: <class 'list'>
		intens	: <class 'list'>
		frag	: <class 'list'>
		score	: <class 'list'>
	frag	: <class 'list'>
	test_id	: <class 'str'>
etc.

Dictionary to be used as queries:
test_molecule0
	energy0
		mz	: <class 'list'>
		intens	: <class 'list'>
	energy1
		mz	: <class 'list'>
		intens	: <class 'list'>
	energy2
		mz	: <class 'list'>
		intens	: <class 'list'>
etc.


# Run a small test

In [7]:
from random import sample

from random import Random

# A fixed seed makes the same subset come out on every Restart & Run All,
# so the printed results below stay comparable between runs.
SMALL_TEST_SEED = 0
SMALL_TEST_SIZE = 15

small_test_pool = list(test_data.keys())
small_test_ids = Random(SMALL_TEST_SEED).sample(
    small_test_pool,
    # Never ask for more items than exist; sample() raises ValueError otherwise.
    min(SMALL_TEST_SIZE, len(small_test_pool)),
)
small_test = { k : test_data[k] for k in small_test_ids }

test_results = model.query(small_test, K = 600)

In [8]:
from rdkit.Chem.Descriptors import ExactMolWt
import numpy as np

def cfm_matches(test, answer: dict, rtol: float = 5e-7):
    """Count rows of ``test`` whose mass and structure agree with a reference peak.

    For every reference peak in the three energy levels of ``answer``, the rows
    of ``test`` whose mass is close to the peak's m/z are selected. Each selected
    row whose canonical SMILES is among that peak's reference fragment SMILES
    adds one to the count, so a row can be counted once per peak it matches.

    Parameters
    ----------
    test
        2-D array. Column 0 is cast to float and compared with the reference
        m/z values; column 1 is passed to ``Chem.CanonSmiles``.
    answer : dict
        Reference entry with keys ``"energy0"``, ``"energy1"``, ``"energy2"``
        (each holding index-aligned ``"mz"`` and ``"frag"`` lists) and a
        top-level ``"frag"`` list. Each per-peak ``"frag"`` item is an iterable
        of indices into the top-level list, whose items hold a SMILES string at
        position 1.
    rtol : float, optional
        Relative tolerance forwarded to ``np.isclose`` for the mass comparison.

    Returns
    -------
    int
        Number of (reference peak, test row) pairs that agree in mass and SMILES.
    """
    canon_cache = {}

    def canon(smiles):
        # The same SMILES strings recur across peaks and energy levels;
        # canonicalise each distinct string only once.
        if smiles not in canon_cache:
            canon_cache[smiles] = Chem.CanonSmiles(smiles)
        return canon_cache[smiles]

    fragments = answer["frag"]
    test_masses = test[:,0].astype(float)

    cntr = 0
    for level in ("energy0", "energy1", "energy2"):
        spectrum = answer[level]
        peak_fragments = spectrum["frag"]
        for peak_idx, x in enumerate(spectrum["mz"]):
            select = np.isclose(x, test_masses, rtol = rtol)
            if not select.any():
                # No test row at this mass: skip building the reference set.
                continue
            reference = {canon(fragments[idx][1]) for idx in peak_fragments[peak_idx]}
            for smiles in test[select, 1]:
                if canon(smiles) in reference:
                    cntr += 1
    return cntr

def peak_matches(test, answer: dict, rtol: float = 5e-7):
    """Count reference peaks whose m/z is matched by at least one mass in ``test``.

    Parameters
    ----------
    test
        2-D array; column 0 is cast to float and treated as the masses to
        compare against.
    answer : dict
        Reference entry with keys ``"energy0"``, ``"energy1"``, ``"energy2"``,
        each holding an ``"mz"`` list.
    rtol : float, optional
        Relative tolerance forwarded to ``np.isclose``.

    Returns
    -------
    int
        Number of reference peaks, summed over the three energy levels, that
        have at least one close mass in ``test``.
    """
    # Loop-invariant: convert and de-duplicate the test masses once instead of
    # once per reference peak.
    test_masses = np.unique(test[:,0].astype(float))

    cntr = 0
    for level in ("energy0", "energy1", "energy2"):
        for x in answer[level]["mz"]:
            if np.isclose(test_masses, x, rtol = rtol).any():
                cntr += 1
    return cntr
        
# Index the reference entries by their anonymous id once, instead of scanning
# every entry of `answers` for each query result. setdefault keeps the first
# entry for an id, matching a first-match scan.
answer_by_test_id = {}
for m,ans in answers.items():
    answer_by_test_id.setdefault(ans["test_id"], (m, ans))

for t,test in test_results.items():
    if t not in answer_by_test_id:
        # No reference entry carries this id: nothing to report for it.
        continue
    m, ans = answer_by_test_id[t]
    print(t, "\t", m)
    peak_count = sum(map(len, (ans["energy0"]["mz"], ans["energy1"]["mz"], ans["energy2"]["mz"])))
    pm = peak_matches(test, ans)
    print("Molecular weight:\t", ExactMolWt(Chem.MolFromSmiles(Chem.CanonSmiles(m))))
    print("Peak count:\t\t", peak_count)
    print("Peak matches:\t\t", pm)
    print("SMILES matches:\t\t", cfm_matches(test, ans))
    # Each reference peak is counted at most once, so matches cannot exceed the total.
    assert pm <= peak_count
    print()

test_molecule1009 	 Cc1onc(c1C(=O)N[C@H]2[C@H]3SC(C)(C)[C@@H](N3C2=O)C(=O)O)c4c(Cl)cccc4Cl
Molecular weight:	 469.026597004
Peak count:		 49
Peak matches:		 34
SMILES matches:		 30

test_molecule694 	 CCOC(=O)CCCCCCCCC(C)c1ccccc1I
Molecular weight:	 416.121228168
Peak count:		 90
Peak matches:		 67
SMILES matches:		 0

test_molecule413 	 C[C@H]1CNc2c(C1)cccc2S(=O)=O
Molecular weight:	 211.066699656
Peak count:		 36
Peak matches:		 36
SMILES matches:		 13

test_molecule1088 	 CO[C@H]1C[C@@]2(C)[C@@H](CC[C@@]2(O)C#C)[C@@H]3CCc4cc(O)ccc4[C@@H]13
Molecular weight:	 326.18819469199997
Peak count:		 58
Peak matches:		 58
SMILES matches:		 9

test_molecule1282 	 COc1c(N2CCNC(C)C2)c(F)cc3C(=O)C(=CN(C4CC4)c13)C(=O)O
Molecular weight:	 375.15943440399997
Peak count:		 40
Peak matches:		 36
SMILES matches:		 5

test_molecule1418 	 CC(C)(COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@H]([C@H](O)[C@@H]1OP(=O)(O)O)n2cnc3c(N)ncnc23)[C@@H](O)C(=O)NCCC(=O)NCCS
Molecular weight:	 767.1152089620001
Peak count:		 38
Peak