In [5]:
import re
import gzip

import numpy as np
from tqdm import tqdm
import pandas as pd

from rdkit import Chem
from rdkit.Chem import PandasTools

# Small, common reagents (as SMILES) that must be treated as purchasable even
# if the vendor catalogue does not list them.
smiles_to_add = [
    # ether / alcohol / methyl halides
    'CCOCC',
    'CO',
    'CI',
    'CBr',
    'CCl',
    # ethyl halides
    'CCCl',
    'CCBr',
    'CCI',
    # amines, amidines, hydrazine
    'CN',
    'CCN',
    'C(=N)N',
    'CC(=N)N',
    'NN',
    # aldehydes, acetic anhydride, oxirane
    'C=O',
    'CC=O',
    'CC(=O)OC(C)=O',
    'C1OC1',
    # sulfonyl chloride and sulfonic anhydrides
    'ClS(=O)(=O)C',
    'CS(=O)(=O)OS(C)(=O)=O',
    'FC(F)(F)S(=O)(=O)OS(=O)(=O)C(F)(F)F',
]

### Create MolPort Stock

Download the `Molport_Fast_Delivery_BB_Region_EU.zip` archive from https://www.molport.com/shop/fast-delivery-bb and place it in `data/molport/`.

In [6]:
import os
from zipfile import ZipFile
from rdkit import RDLogger

# Turn off all RDKit application loggers before parsing the SD file.
RDLogger.DisableLog('rdApp.*')

fname = 'data/molport/7_Days/7_Days-1.sdf.gz'

# Unpack the downloaded archive only when the SD file is not on disk yet.
sdf_present = os.path.exists(fname)
if not sdf_present:
    with ZipFile('data/molport/Molport_Fast_Delivery_BB_Region_EU.zip', 'r') as archive:
        archive.extractall('data/molport')

# Read the SD file into a DataFrame; parsed molecules go into the 'mol' column.
# reset_index() moves the loaded index into a regular 'index' column.
df = PandasTools.LoadSDF(fname, molColName='mol')
df = df.reset_index()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 323092 entries, 0 to 323091
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   index             323092 non-null  int64 
 1   Molport ID        323092 non-null  object
 2   SMILES            323092 non-null  object
 3   Supplier Country  323092 non-null  object
 4   Link to Molport   323092 non-null  object
 5   ID                323092 non-null  object
 6   mol               323092 non-null  object
dtypes: int64(1), object(6)
memory usage: 17.3+ MB


In [7]:
# --- Step 1: InChIKey for every catalogue molecule -------------------------
# Best-effort: a molecule that cannot be converted gets a missing key (None)
# instead of aborting the whole run; failures are counted and reported rather
# than silently discarded.
inchi_keys = []
n_failed = 0
for mol in tqdm(df['mol'], total=len(df)):
    try:
        inchi_keys.append(Chem.MolToInchiKey(mol))
    except Exception:
        inchi_keys.append(None)
        n_failed += 1
df['inchi_key'] = inchi_keys
if n_failed:
    print(n_failed, 'molecules could not be converted to an InChIKey')

# --- Step 2: add the extra reagents that are not in the catalogue ----------
# NOTE: `key in series` tests the Series *index labels*, not its values, so
# membership must be checked against the values explicitly. A set also makes
# each lookup O(1) and lets us skip duplicates inside `smiles_to_add` itself.
known_keys = set(df['inchi_key'].dropna())
extra_rows = []
for smi in smiles_to_add:
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # Unparseable SMILES: report it and move on.
        print(smi)
        continue
    inchi_key = Chem.MolToInchiKey(mol)
    if inchi_key in known_keys:
        continue
    known_keys.add(inchi_key)
    extra_rows.append({'SMILES': smi, 'mol': mol, 'inchi_key': inchi_key})

# Append all new rows in one go; columns not supplied here are left missing
# for the added rows.
if extra_rows:
    df = pd.concat([df, pd.DataFrame(extra_rows)], ignore_index=True)

100%|██████████| 323092/323092 [01:35<00:00, 3380.46it/s]


In [8]:
# Keep only the identifying columns, then discard every row that has no
# InChIKey.
cols = ['Molport ID', 'SMILES', 'Supplier Country', 'inchi_key']
df = df.loc[:, cols]
df = df.dropna(subset=['inchi_key'])

In [9]:
# Write a single-column table of InChIKeys to the stock file; opening in 'w'
# mode replaces any existing file at that path.
with pd.HDFStore('config/molport_7d_stock.hdf5', mode='w') as store:
    stock_table = df['inchi_key'].to_frame()
    store.put('/table', stock_table)

### Modify ZINC Stock

In [4]:
# NOTE(review): this entire cell is commented out, so running the notebook
# does not read or modify config/zinc_stock.hdf5. If re-enabled, the code
# below would load the existing table, compute InChIKeys for `smiles_to_add`,
# and rewrite the file with any keys not already present.
# NOTE(review): `pd.concat([df, ser[mask]])` is called without
# `ignore_index=True`, so appended rows would keep their labels from `ser` —
# confirm whether duplicate index labels are acceptable before re-enabling.
# print('Loading zinc stock ... ', end='')
# with pd.HDFStore('config/zinc_stock.hdf5', 'r') as store:
#     df = store.get('/table')
# print('Done')
#     
# ser = pd.Series(name='inchi_key', dtype=str)
# for smi in smiles_to_add:
#     mol = Chem.MolFromSmiles(smi)
#     try:
#         Chem.SanitizeMol(mol)
#         ser.loc[len(ser)] = Chem.MolToInchiKey(mol)
#     except Exception as e:
#         print(smi, 'failed:', e)
# 
# mask = ~ser.isin(df.inchi_key)
# if mask.any():    
#     with pd.HDFStore('config/zinc_stock.hdf5', 'w') as store:
#         df = pd.concat([df, ser[mask]])
#         store.put('/table', df)
#         print(mask.sum(), 'compounds added')

Loading zinc stock ... Done


In [5]:
# df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17422844 entries, 0 to 9
Data columns (total 1 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   inchi_key  object
dtypes: object(1)
memory usage: 265.9+ MB


In [49]:
# with pd.HDFStore('config/zinc_stock.hdf5', 'w') as store:
#     store.put('/table', df)