In [1]:
from usearch.index import Index, search, MetricKind, Matches, BatchMatches
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
import tqdm
import sys

In [2]:
sys.path.append('/auto/home/menuab/code/usearch-molecules')

In [3]:
from rdkit.Chem import MACCSkeys, AllChem
from rdkit import Chem, RDLogger
from multiprocessing import Pool
from transformers import AutoTokenizer
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [12]:
moses = pd.read_csv('../data/moses_calculated.csv', index_col=0)
moses.head()


Unnamed: 0,SMILES,SPLIT,SAS,QED,CLOGP,WEIGHT
0,CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1,train,3.812618,0.897,1.681,281.083
1,CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1,train,2.936327,0.862,3.729,292.098
2,CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1,test,4.128087,0.902,3.457,321.15
3,Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO,train,2.663795,0.701,2.297,336.088
4,Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C,train,2.989161,0.646,-2.213,254.102


In [16]:
moses = moses[moses.SPLIT == 'train']

In [17]:
tokenizer = AutoTokenizer.from_pretrained("/auto/home/menuab/code/ChemLacticaTestSuite/src/tokenizer/ChemLacticaTokenizer_50066/")
print('tokenizer size: ', len(tokenizer))

tokenizer size:  50066


In [49]:
Index.metadata("/raid/chem/usearch_index/index-maccs.usearch")

{'matrix_included': True,
 'matrix_uses_64_bit_dimensions': False,
 'version': '2.8.10',
 'kind_metric': <MetricKind.Tanimoto: 116>,
 'kind_scalar': <ScalarKind.B1: 1>,
 'kind_key': <ScalarKind.U64: 8>,
 'kind_compressed_slot': <ScalarKind.U32: 9>,
 'count_present': 115627267,
 'count_deleted': 0,
 'dimensions': 192}

In [8]:
# index = Index.restore("/dev/shm/chem/PubChem_maccs.usearch", view=False)

In [50]:
index1 = Index.restore("/raid/chem/usearch_index/index-maccs.usearch")

In [67]:
index1

usearch.Index
- config
-- data type: ScalarKind.B1
-- dimensions: 192
-- metric: MetricKind.Tanimoto
-- connectivity: 16
-- expansion on addition:128 candidates
-- expansion on search: 64 candidates
- binary
-- uses OpenMP: 0
-- uses SimSIMD: 1
-- supports half-precision: 1
-- uses hardware acceleration: serial
- state
-- size: 115,627,267 vectors
-- memory usage: 39,567,172,888 bytes
-- max level: 4
--- 0. 115,627,267 nodes
--- 1. 7,148,289 nodes
--- 2. 461,443 nodes
--- 3. 37,720 nodes
--- 4. 5,152 nodes

In [69]:
np.pad(np.unpackbits(maccs), (0,24)).shape

(192,)

In [68]:
maccs, ecfp4, fcfp4 = smiles_to_maccs_ecfp4_fcfp4('CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1')
m = index1.search(np.pad(np.unpackbits(maccs), (0,24)), 100)
m.to_list()

ValueError: The number of vector dimensions doesn't match!

In [19]:
index = Index.restore("/raid/chem/usearch_index/index-maccs-ecfp4.usearch")

In [40]:
maccs, ecfp4, fcfp4 = smiles_to_maccs_ecfp4_fcfp4('CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1')
m = index.search(np.pad(ecfp4, (0, 24)), 100)
m.to_list()

In [42]:
m.to_list()

[(85275940, 0.9266055226325989),
 (40739378, 0.9268292784690857),
 (17751615, 0.9306930899620056),
 (21128694, 0.931034505367279),
 (92558917, 0.9339622855186462),
 (27356615, 0.9339622855186462),
 (25399081, 0.934959352016449),
 (54037602, 0.9351851940155029),
 (85159272, 0.9351851940155029),
 (84357576, 0.9354838728904724),
 (101195119, 0.9354838728904724),
 (5453107, 0.9354838728904724),
 (24393644, 0.9354838728904724),
 (46758127, 0.9354838728904724),
 (59357117, 0.9359999895095825),
 (43322374, 0.936170220375061),
 (109615817, 0.9363636374473572),
 (63056219, 0.9363636374473572),
 (111420175, 0.9363636374473572),
 (45674022, 0.9363636374473572),
 (44019451, 0.9365079402923584),
 (52840043, 0.9368420839309692),
 (88019857, 0.9368420839309692),
 (87020582, 0.9369369149208069),
 (85438906, 0.9369369149208069),
 (13982052, 0.9369369149208069),
 (27682230, 0.9369369149208069),
 (67802998, 0.9369369149208069),
 (44349917, 0.9369369149208069),
 (33410137, 0.9370079040527344),
 (67246079,

In [70]:
def get_pubchem_dist(mol_string, k_nearest=100):
	molecule = Chem.MolFromSmiles(mol_string)
	fp = np.array(MACCSkeys.GenMACCSKeys(molecule))[1:]
	fp = np.pad(fp, (0, 26))
	matches = index1.search(np.packbits(fp), k_nearest)
	dist = matches.distances.mean()
	return dist

In [22]:
moses = moses[:1000]

In [23]:
moses['pubchem_dist'] = [get_pubchem_dist(mol, 1000) for mol in moses.SMILES]
moses['tokenized_len'] = [len(tokenizer.encode(mol)) for mol in moses.SMILES]

ValueError: The number of vector dimensions doesn't match!

In [11]:
moses['tokenized_len'] = [len(tokenizer.encode(mol)) for mol in moses.SMILES]

In [5]:
moses = pd.read_csv('../data/moses_dist_pubchem.csv')

In [6]:
moses.head()

Unnamed: 0,SMILES,SPLIT,SAS,QED,CLOGP,WEIGHT,pubchem_dist,tokenized_len
0,CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1,train,3.812618,0.897,1.681,281.083,0.165535,32
1,CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1,train,2.936327,0.862,3.729,292.098,0.067525,28
2,Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO,train,2.663795,0.701,2.297,336.088,0.128131,25
3,Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C,train,2.989161,0.646,-2.213,254.102,0.067316,28
4,CC1Oc2ccc(Cl)cc2N(CC(O)CO)C1=O,train,3.05575,0.853,0.807,271.061,0.12349,22


In [7]:
moses.count()

SMILES           500000
SPLIT            500000
SAS              500000
QED              500000
CLOGP            500000
WEIGHT           500000
pubchem_dist     500000
tokenized_len    500000
dtype: int64

In [10]:
moses.groupby('tokenized_len').count()

Unnamed: 0_level_0,SMILES,SPLIT,SAS,QED,CLOGP,WEIGHT,pubchem_dist
tokenized_len,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9,8,8,8,8,8,8,8
10,8,8,8,8,8,8,8
11,13,13,13,13,13,13,13
12,40,40,40,40,40,40,40
13,84,84,84,84,84,84,84
14,215,215,215,215,215,215,215
15,486,486,486,486,486,486,486
16,1124,1124,1124,1124,1124,1124,1124
17,2145,2145,2145,2145,2145,2145,2145
18,3884,3884,3884,3884,3884,3884,3884


In [8]:

def calculate_tanimoto_distance(smiles1, smiles2):
    # Convert SMILES strings to RDKit molecules
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)
    
    # Check if the molecules were successfully created
    if mol1 is None or mol2 is None:
        raise ValueError("Invalid SMILES representation")

    # Generate MACCS keys for the molecules
    keys1 = MACCSkeys.GenMACCSKeys(mol1)
    keys2 = MACCSkeys.GenMACCSKeys(mol2)

    # Calculate the Tanimoto similarity
    common_bits = sum(bit1 & bit2 for bit1, bit2 in zip(keys1, keys2))
    total_bits = sum(bit1 | bit2 for bit1, bit2 in zip(keys1, keys2))

    tanimoto_distance = 1.0 - (common_bits / total_bits)  # Tanimoto distance ranges from 0 to 1

    return tanimoto_distance

In [20]:
dist0 = moses[(moses.tokenized_len == 28) & (moses.pubchem_dist > 0.3) & (moses.SPLIT == 'train')].SMILES

In [21]:
dist0.values

array(['Cc1ccc(NC(=O)NC(C)C)cc1NC(=O)NC(C)C',
       'Cc1ccc(CC(=O)NCCS(=O)c2ccc(C)cc2)cc1',
       'Cc1cccc(CC(=O)NCCS(=O)c2ccc(Cl)cc2)c1',
       'Cc1ccccc1SCC(=O)Nc1ccc(S(C)(=O)=O)cc1',
       'O=C(CSc1ncnc2sccc12)c1c[nH]c2ccccc12',
       'CCOc1ccccc1NC(=O)c1ccc(NS(=O)(=O)CC)cc1',
       'CCc1nnsc1C(=O)Nc1sc2c(c1C(N)=O)CCC2',
       'O=C(CS(=O)(=O)c1ccccc1)Nc1ccc2sccc2c1',
       'Cc1ccc(S(=O)(=O)CCC(=O)Nc2ccccc2F)cc1',
       'Cc1cc(OS(C)(=O)=O)nc2c1cnn2C(C)C',
       'CS(=O)c1ccc(C(=O)Nc2ccc3c(c2)CCC3)cc1',
       'O=C(CCS(=O)(=O)c1ccc(Cl)cc1)Nc1cccnc1',
       'O=C(CSc1ccc(Cl)cc1)NCc1ccc2c(c1)OCCO2',
       'Cc1c(NC(=O)C2CC2)cccc1C(=O)NCC(C)C',
       'CCC(C)C(=O)Nc1cccc(C(=O)NCC(C)C)c1C',
       'O=C(Nc1cccc2cn[nH]c12)c1cc(C2CC2)on1',
       'CCn1nnc2cc(C(=O)NCc3c[nH]nc3C)ccc21'], dtype=object)

In [None]:
calculate_tanimoto_distance('COc1ccc(Oc2coc3c(C)c(OC(C)=O)ccc3c2=O)cc1', )

In [60]:
dist3 = moses[(moses.tokenized_len == 28) & (moses.pubchem_dist > 0.3) & (moses.SPLIT == 'train')].SMILES

In [70]:
molecule = Chem.MolFromSmiles()
fp = np.array(MACCSkeys.GenMACCSKeys(molecule))[1:]
fp = np.pad(fp, (0, 26))
matches = index.search(np.packbits(fp), 10, exact=True)

ArgumentError: Python argument types in
    rdkit.Chem.rdmolfiles.MolFromSmiles(str, str)
did not match C++ signature:
    MolFromSmiles(boost::python::api::object SMILES, bool sanitize=True, boost::python::dict replacements={})
    MolFromSmiles(boost::python::api::object SMILES, RDKit::SmilesParserParams params)