In [2]:
import re
import gzip

import numpy as np
from tqdm import tqdm
import pandas as pd

from rdkit import Chem
from rdkit.Chem import PandasTools

# Small reagent SMILES; the MolPort and ZINC cells below append the ones
# whose InChIKey is not yet in the respective stock table.
smiles_to_add = [
    # diethyl ether, methanol and methyl halides
    'CCOCC',
    'CO',
    'CI',
    'CBr',
    'CCl',
    # ethyl halides
    'CCCl',
    'CCBr',
    'CCI',
    # amines, amidines and hydrazine
    'CN',
    'CCN',
    'C(=N)N',
    'CC(=N)N',
    'NN',
    # aldehydes, acetic anhydride and oxirane
    'C=O',
    'CC=O',
    'CC(=O)OC(C)=O',
    'C1OC1',
    # sulfonylating agents
    'ClS(=O)(=O)C',
    'FC(F)(F)S(=O)(=O)OS(=O)(=O)C(F)(F)F',
]

### MolPort Stock

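Download the fast-delivery building-block archive from https://www.molport.com/shop/fast-delivery-bb and save it as `data/molport/Molport_Fast_Delivery_BB_Region_EU.zip`; the next cell extracts it into `data/molport`.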

In [3]:
import os
from zipfile import ZipFile
from rdkit import RDLogger

# Turn off the RDKit 'rdApp.*' log channels before parsing the SDF.
RDLogger.DisableLog('rdApp.*')

fname = 'data/molport/3_Days/3_Days-1.sdf.gz'

# Unpack the downloaded archive only when the SDF has not been extracted yet.
sdf_is_present = os.path.exists(fname)
if not sdf_is_present:
    with ZipFile('data/molport/Molport_Fast_Delivery_BB_Region_EU.zip', mode='r') as zip_ref:
        zip_ref.extractall(path='data/molport')

# Read the molecules into the 'mol' column, then turn the loader's index
# into an ordinary 'index' column.
loaded = PandasTools.LoadSDF(fname, molColName='mol')
df = loaded.reset_index()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3539 entries, 0 to 3538
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   index             3539 non-null   int64 
 1   Molport ID        3539 non-null   object
 2   SMILES            3539 non-null   object
 3   Supplier Country  3539 non-null   object
 4   Link to Molport   3539 non-null   object
 5   ID                3539 non-null   object
 6   mol               3539 non-null   object
dtypes: int64(1), object(6)
memory usage: 193.7+ KB


In [4]:
# Compute an InChIKey for every loaded molecule.  Conversion is best-effort:
# a molecule that raises during conversion gets None (so it can be removed
# with dropna()) and is counted instead of being silently ignored.
inchi_keys = []
n_failed = 0
for mol in tqdm(df['mol']):
    try:
        inchi_keys.append(Chem.MolToInchiKey(mol))
    except Exception:
        inchi_keys.append(None)
        n_failed += 1
df['inchi_key'] = inchi_keys
if n_failed:
    print(n_failed, 'molecules could not be converted to an InChIKey')

# Append the extra reagents that are not in the table yet.
# NOTE: `key in series` tests the Series *index labels*, not its values, so
# membership is checked against a set of the existing key values instead.
known_keys = set(df['inchi_key'].dropna())
for smi in smiles_to_add:
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # Unparsable SMILES: report it and move on.
        print(smi)
        continue
    inchi_key = Chem.MolToInchiKey(mol)
    if inchi_key in known_keys:
        continue
    idx = len(df)  # next free label of the 0..n-1 index produced by reset_index()
    df.at[idx, 'SMILES'] = smi
    df.at[idx, 'mol'] = mol
    df.at[idx, 'inchi_key'] = inchi_key
    known_keys.add(inchi_key)

100%|██████████| 3539/3539 [00:00<00:00, 3663.82it/s]


In [7]:
from src.utils import apply_mp

def mol2inchi(mol):
    """Return the InChIKey of ``mol``, or ``None`` when the conversion fails.

    Parameters
    ----------
    mol
        Object handed unchanged to ``Chem.MolToInchiKey``.

    Returns
    -------
    The value returned by ``Chem.MolToInchiKey(mol)``, or ``None`` if that
    call raised an ``Exception``.
    """
    try:
        return Chem.MolToInchiKey(mol)
    except Exception:
        # Only ordinary errors mean "no key"; KeyboardInterrupt and SystemExit
        # still propagate so a long run can be interrupted.
        return None

# Serial conversion of every molecule; the result is only displayed, not stored.
# Alternative kept for reference, using the imported apply_mp helper on the
# first 100 rows:
#   inchi_key = apply_mp(df.mol[:100], mol2inchi)
df['mol'].apply(mol2inchi)

0       CKXJVKOWWTUZFO-ZETCQYMHSA-N
1       GGUQUEHTEDXBQP-UHFFFAOYSA-N
2       LOENBOWNSQEQMP-KRWDZBQOSA-N
3       DCVRDYIASVYSKO-UHFFFAOYSA-N
4       QJMQVPOLPLYDHZ-UHFFFAOYSA-N
                   ...             
3548    PNKUSGQVOMIXLU-UHFFFAOYSA-N
3549    OAKJQQAXSVQMHS-UHFFFAOYSA-N
3550    WSFSSNUMVMOOMR-UHFFFAOYSA-N
3551    IKHGUXGNUITLKF-UHFFFAOYSA-N
3552    WFDIJRYMOXRFFG-UHFFFAOYSA-N
Name: mol, Length: 3553, dtype: object

In [17]:
# Keep only the identifying columns and discard rows that have no InChIKey.
cols = ['Molport ID', 'SMILES', 'Supplier Country', 'inchi_key']
df = df.loc[:, cols].dropna(subset=['inchi_key'])

In [19]:
# Persist the InChIKeys as a one-column frame under '/table'.
# NOTE(review): the input SDF came from the '3_Days' folder but the output
# file is named '7d' - confirm which name is intended.
stock_keys = df['inchi_key'].dropna().to_frame()
with pd.HDFStore('config/molport_7d_stock.hdf5', mode='w') as store:
    store.put('/table', stock_keys)

### Modify ZINC stock

In [4]:
# Append the InChIKeys of `smiles_to_add` that are missing from the ZINC
# stock table, and rewrite the HDF5 file only when something was added.
print('Loading zinc stock ... ', end='')
with pd.HDFStore('config/zinc_stock.hdf5', 'r') as store:
    df = store.get('/table')
print('Done')

# Collect the keys in a list and build the Series once instead of growing it
# row by row; SMILES that cannot be converted are reported and skipped.
new_keys = []
for smi in smiles_to_add:
    mol = Chem.MolFromSmiles(smi)
    try:
        Chem.SanitizeMol(mol)
        new_keys.append(Chem.MolToInchiKey(mol))
    except Exception as e:
        print(smi, 'failed:', e)
ser = pd.Series(new_keys, name='inchi_key', dtype=str)

mask = ~ser.isin(df.inchi_key)
if mask.any():
    # Build the merged table BEFORE opening the store: mode 'w' truncates the
    # file on open, so a failure during concat would otherwise destroy the
    # existing stock.
    # NOTE: the appended rows keep their own 0..k-1 labels, so the index of
    # the stored table is not unique.
    df = pd.concat([df, ser[mask]])
    with pd.HDFStore('config/zinc_stock.hdf5', 'w') as store:
        store.put('/table', df)
    print(mask.sum(), 'compounds added')

Loading zinc stock ... Done


In [5]:
# Summarize the current df: index range, columns, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17422844 entries, 0 to 9
Data columns (total 1 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   inchi_key  object
dtypes: object(1)
memory usage: 265.9+ MB


In [53]:
import pandas as pd

# Open the stock file read-only and print its summary followed by its keys.
with pd.HDFStore('config/zinc_stock.hdf5', mode='r') as store:
    summary = store.info()
    print(summary)
    node_keys = store.keys()
    print(node_keys)

<class 'pandas.io.pytables.HDFStore'>
File path: config/zinc_stock.hdf5
/                 [invalid_HDFStore node: group ``/table`` does not have a child named ``description``]
/table            frame        (shape->[1,1])                                                          
['/', '/table']


In [55]:
# Load '/table' from zinc_stock_orig.hdf5 (by its name presumably the
# pre-modification copy - confirm) and summarize it.
with pd.HDFStore('config/zinc_stock_orig.hdf5', mode='r') as store:
    df = store['/table']

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17422831 entries, 0 to 325908
Data columns (total 1 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   inchi_key  object
dtypes: object(1)
memory usage: 265.9+ MB


In [29]:
# First rows of the loaded stock table.
df.head()

Unnamed: 0,inchi_key
0,RYYVLZVUVIJVGH-UHFFFAOYSA-N
1,HZZVJAQRINQKSD-RQJHMYQMSA-N
2,RRTKVYSLIGQWCO-UHFFFAOYSA-N
3,YAPQBXQYLJRXSA-UHFFFAOYSA-N
4,SZPBAPFUXAADQV-UHFFFAOYSA-N


In [30]:
# Last rows of the loaded stock table.
df.tail()

Unnamed: 0,inchi_key
325904,OKYGVTBDEYVSBV-KPPDAEKUSA-N
325905,OKYGVTBDEYVSBV-WHXUTIOJSA-N
325906,RIHTXYUOTJTDKX-UHFFFAOYSA-N
325907,WQOVAHPYNIWPHG-CYBMUJFWSA-N
325908,WQOVAHPYNIWPHG-ZDUSSCGKSA-N


In [31]:
# Largest index label next to the row count; a plain 0..n-1 index would give
# max == len - 1, so a mismatch means labels are skipped or repeated.
df.index.max(), len(df)

(17396070, 17422831)

In [32]:
# Index labels flagged by duplicated(), i.e. repeats of a label that already
# appeared earlier in the index.
df.index[df.index.duplicated()]

Index([     0,      1,      2,      3,      4,      5,      6,      7,      8,
            9,
       ...
       325899, 325900, 325901, 325902, 325903, 325904, 325905, 325906, 325907,
       325908],
      dtype='int64', length=309919)

In [42]:
from rdkit import Chem

# Compute and print the InChIKey of each extra reagent.
smiles = ['CO', 'CI', 'CBr', 'CCBr', 'CCI', 'CN', 'CCN', 'C=O', 'CC=O', 'NN']

# Gather the keys in a list and build the Series once with an explicit dtype:
# an empty pd.Series() without dtype falls back to the deprecated float64
# default on older pandas, and growing it via .loc re-allocates on every row.
keys = []
for smi in smiles:
    mol = Chem.MolFromSmiles(smi)
    Chem.SanitizeMol(mol)
    keys.append(Chem.MolToInchiKey(mol))
    print("%5s %s" % (smi, keys[-1]))
ser = pd.Series(keys, name='inchi_key', dtype=str)

   CO OKKJLVBELUTLKV-UHFFFAOYSA-N
   CI INQOMBQAUSQDDS-UHFFFAOYSA-N
  CBr GZUXJHMPEANEGY-UHFFFAOYSA-N
 CCBr RDHPKYGYEGBMSE-UHFFFAOYSA-N
  CCI HVTICUPFWKNHNG-UHFFFAOYSA-N
   CN BAVYZALUXZFZLV-UHFFFAOYSA-N
  CCN QUSNBJAOOMFDIB-UHFFFAOYSA-N
  C=O WSFSSNUMVMOOMR-UHFFFAOYSA-N
 CC=O IKHGUXGNUITLKF-UHFFFAOYSA-N
   NN OAKJQQAXSVQMHS-UHFFFAOYSA-N


In [44]:
# True where a freshly computed key is already present in the stock's
# inchi_key column.
mask = ser.isin(df.inchi_key)

In [45]:
# Keys that are not in the stock yet.
ser[~mask]

0    OKKJLVBELUTLKV-UHFFFAOYSA-N
1    INQOMBQAUSQDDS-UHFFFAOYSA-N
2    GZUXJHMPEANEGY-UHFFFAOYSA-N
3    RDHPKYGYEGBMSE-UHFFFAOYSA-N
4    HVTICUPFWKNHNG-UHFFFAOYSA-N
5    BAVYZALUXZFZLV-UHFFFAOYSA-N
6    QUSNBJAOOMFDIB-UHFFFAOYSA-N
7    WSFSSNUMVMOOMR-UHFFFAOYSA-N
8    IKHGUXGNUITLKF-UHFFFAOYSA-N
9    OAKJQQAXSVQMHS-UHFFFAOYSA-N
Name: inchi_key, dtype: object

In [47]:
# Dry run: shape of the table with the missing keys appended; the result is
# neither assigned nor saved.
pd.concat([df, ser[~mask]], axis=0).shape

(17422841, 1)

In [48]:
# `mol` is whatever the most recent loop left behind (the last SMILES it
# processed), so this relies on kernel state from an earlier cell.
Chem.MolToInchiKey(mol)

'OAKJQQAXSVQMHS-UHFFFAOYSA-N'

In [49]:
# Overwrite the working stock file with the current df.
# NOTE(review): if df is still the frame loaded from zinc_stock_orig.hdf5,
# the extra keys are not included here - confirm the intended cell order.
with pd.HDFStore('config/zinc_stock.hdf5', mode='w') as store:
    store.put(key='/table', value=df)

In [52]:
import pandas as pd
from rdkit import Chem

# Self-contained version of the ZINC update: load the stock, compute the
# InChIKeys of the extra reagents, append the missing ones and save.
smiles = ['CO', 'CI', 'CBr', 'CCBr', 'CCI', 'CN', 'CCN', 'C=O', 'CC=O', 'NN']

with pd.HDFStore('config/zinc_stock.hdf5', 'r') as store:
    df = store.get('/table')

# Build the key Series once, with an explicit dtype; SMILES that cannot be
# converted are reported and skipped instead of aborting the cell.
new_keys = []
for smi in smiles:
    mol = Chem.MolFromSmiles(smi)
    try:
        Chem.SanitizeMol(mol)
        new_keys.append(Chem.MolToInchiKey(mol))
    except Exception as e:
        print(smi, 'failed:', e)
ser = pd.Series(new_keys, name='inchi_key', dtype=str)

mask = ~(ser.isin(df.inchi_key))
if mask.any():
    # Merge first, then open the store: mode 'w' truncates the file on open.
    # When nothing is missing, the file is left untouched.
    df = pd.concat([df, ser[mask]], axis=0)
    with pd.HDFStore('config/zinc_stock.hdf5', 'w') as store:
        store.put('/table', df)
print(mask.sum(), 'compounds added')

10 compounds added
