In [68]:
from rdkit.Chem.MolStandardize.rdMolStandardize import SuperParent
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [69]:
import re
import gzip
import os
from zipfile import ZipFile

import numpy as np
from tqdm import tqdm
import pandas as pd

from src.utils import smi2mol, mol2smi, apply_mp

from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

In [70]:
smiles_to_add = pd.Series([
    'CCOCC', 'CO', 'CI', 'CBr', 'CCl', 
    'CCCl', 'CCBr', 'CCI', 
    'CN', 'CCN', 'C(=N)N', 'CC(=N)N', 'NN',
    'C=O', 'CC=O', 'CC(=O)OC(C)=O', 'C1OC1',    
    'ClS(=O)(=O)C', 'CS(=O)(=O)OS(C)(=O)=O', 'FC(F)(F)S(=O)(=O)OS(=O)(=O)C(F)(F)F',
])

### Create MolPort Stock

download file from https://www.molport.com/shop/fast-delivery-bb

In [71]:
days = '7'

fname = f'data/molport/{days}_Days/{days}_Days-1.sdf.gz'

if not os.path.exists(fname):
    with ZipFile('data/molport/Molport_Fast_Delivery_BB_Region_EU.zip', 'r') as zip_ref:
        zip_ref.extractall('data/molport')

df = PandasTools.LoadSDF(fname, molColName='mol')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 323092 entries, 0 to 323283
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Molport ID        323092 non-null  object
 1   SMILES            323092 non-null  object
 2   Supplier Country  323092 non-null  object
 3   Link to Molport   323092 non-null  object
 4   ID                323092 non-null  object
 5   mol               323092 non-null  object
dtypes: object(6)
memory usage: 17.3+ MB


In [72]:
mols_to_add = smiles_to_add.apply(smi2mol)
mols = pd.concat([df.mol, mols_to_add])
smi = mols.apply(mol2smi)
smi.dropna(inplace=True)
smi.to_csv(f'data/molport/{days}_days.smi', index=False, header=False)

### Create molbloom filter

In [74]:
import math

eps = 1e-6              # false positive rate
n = len(smi)            # number of elements
LN2 = 0.69314718056
m = round(- n * math.log(eps, 2) / (LN2 * LN2))  # calc bloom filter size in bits
m

13404285

In [None]:
!smiles2stock --files data/molport/{days}_days.smi --output config/molport_{days}d_stock.bloom --target molbloom-inchi --bloom_params {m} {n}

In [75]:
!smiles2stock --files data/molport/{days}_days.smi --output config/molport_{days}d_smiles_stock.bloom --target molbloom --bloom_params {m} {n}

Processing data/molport/7_days.smi
bloom_new: myfilter size=13404285 bits, MB=1.597915, n=323112 k=28 fp=0.000000
Created bloom stock with 322722 unique compounds
