In [1]:
import rdkit
from rdkit.Chem import Descriptors
import h5py
import numpy as np

  from ._conv import register_converters as _register_converters


In [2]:
# Directory holding the HDF5 grammar datasets, relative to the notebook.
data_dir = "../data/"
# Open both datasets read-only: this notebook only inspects them, and passing the
# mode explicitly avoids h5py versions whose default mode may create or modify the file.
old_grammar = h5py.File(data_dir + "zinc_grammar_dataset.h5", "r")
new_grammar = h5py.File(data_dir + "zinc_grammar_dataset_new.h5", "r")

In [3]:
# Size of the old dataset, and the 'indices' array stored with the new one.
oldlen = len(old_grammar['indices'])
new_indices = np.array(new_grammar['indices'])

# Report both sizes and the new/old ratio.
newlen = len(new_indices)
print('number of molecules new vs old:', newlen, oldlen, newlen / oldlen)

number of molecules new vs old: 92636 249456 0.37135206208710153


In [4]:
print("average sequence length:")
# Lengths stored in the new dataset, and the old dataset's lengths restricted
# to the rows selected by new_indices so both arrays line up.
new_seq_len = np.array(new_grammar['seq_len'])
old_seq_len_all = np.array(old_grammar['seq_len'])
old_seq_len = old_seq_len_all[new_indices]
new_mean, old_mean = new_seq_len.mean(), old_seq_len.mean()
print(new_mean, old_mean)
print("max sequence length:")
new_max, old_max = new_seq_len.max(), old_seq_len.max()
print(new_max, old_max)

average sequence length:
62.80888639405847 120.75712465995942
max sequence length:
118 217


In [5]:
fname = '../data/250k_rndm_zinc_drugs_clean.smi'

# Read the file line by line, stripping surrounding whitespace (including the
# trailing newline) from every entry.
with open(fname) as f:
    smiles = [line.strip() for line in f]

# Pick out the entries addressed by new_indices, in that order.
new_smiles = [smiles[idx] for idx in new_indices]

In [6]:
from rdkit.Chem.rdmolfiles import MolFromSmiles
def num_atoms_from_smiles(smiles):
    """Count the atoms of each SMILES string.

    Every string is parsed with MolFromSmiles. When the parser returns None
    the count for that entry is 0; otherwise it is the number of items
    returned by the molecule's GetAtoms().

    Returns a NumPy array with one count per input string, in input order.
    """
    # Parse everything first, then count, mirroring a two-pass evaluation.
    parsed = list(map(MolFromSmiles, smiles))
    counts = []
    for mol in parsed:
        if mol is None:
            counts.append(0)
        else:
            counts.append(len(mol.GetAtoms()))
    return np.array(counts)
# Atom count per entry of new_smiles; entries are 0 where MolFromSmiles returned None.
num_atoms = num_atoms_from_smiles(new_smiles)

In [7]:
def num_rings_from_smiles(smiles):
    """Count aromatic rings for each SMILES string.

    Every string is parsed with MolFromSmiles. When the parser returns None
    the entry is set to -1; otherwise it is Descriptors.NumAromaticRings of
    the parsed molecule.

    Returns a NumPy array with one value per input string, in input order.
    """
    # Parse everything first, then evaluate the descriptor on each result.
    parsed = list(map(MolFromSmiles, smiles))
    ring_counts = []
    for mol in parsed:
        if mol is None:
            ring_counts.append(-1)
        else:
            ring_counts.append(Descriptors.NumAromaticRings(mol))
    return np.array(ring_counts)
                     
# Aromatic ring count per entry of `smiles`; entries are -1 where MolFromSmiles returned None.
# NOTE(review): this runs on the full `smiles` list, whereas num_atoms is computed from
# `new_smiles` — confirm that whole-file statistics are intended here.
num_rings = num_rings_from_smiles(smiles)

In [8]:
# num_rings_from_smiles stores -1 for SMILES it could not parse; drop that sentinel
# so it does not skew the mean and standard deviation.
valid_rings = num_rings[num_rings >= 0]
print('Aromatic rings avg:', valid_rings.mean(), 'std:', valid_rings.std(), 'max:', valid_rings.max())

Aromatic rings avg: 1.849817202232057 std: 0.9694758341086602 max: 6


In [9]:
# Select molecules with a positive atom count *before* dividing, so the ratio is
# never evaluated against a zero count (which would raise divide-by-zero warnings
# and create inf/nan intermediates that are discarded anyway).
has_atoms = num_atoms > 0
atoms_valid = num_atoms[has_atoms]
new_rules_per_atom = (new_seq_len[has_atoms] / atoms_valid).mean()
old_rules_per_atom = (old_seq_len[has_atoms] / atoms_valid).mean()
# Printed in old, new order.
print('Rules per atom:', old_rules_per_atom, new_rules_per_atom)

Rules per atom: 5.465792302346296 2.8506458596327025


In [10]:
# Raw scores and the normalisation constants stored with the new dataset.
raw_scores = np.array(new_grammar['raw_scores'])
scores_mean = np.array(new_grammar['score_mean'])
scores_std = np.array(new_grammar['score_std'])
print(scores_mean,scores_std,raw_scores.shape)

# Standardise with the stored mean/std, then sum along axis 1 to get one value per row.
centered_scores = raw_scores - scores_mean
norm_scores = centered_scores / scores_std
norm_score = norm_scores.sum(axis=1)
print(norm_score.shape, norm_score.max(), norm_score.min(), norm_score.std())

[ 2.48656181 -2.82021593 -0.02578911] [1.36771965 0.77092568 0.18803626] (92636, 3)
(92636,) 4.054522235013009 -72.4191860195049 2.069284802145453


In [11]:
# Mean and standard deviation of column 1 of raw_scores (labelled "SA" in the output).
sa_column = raw_scores[:, 1]
print('Raw SA mean, std', sa_column.mean(), sa_column.std())

Raw SA mean, std -2.8202159253872527 0.7709256786567037


In [15]:
# Scratch arithmetic check: a is five squared, b is its successor.
a = 5 ** 2
b = 1 + a
print(a, b)

25 26
