In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import mols2grid

from rdkit import Chem
from rdkit.Chem import rdMolEnumerator
from src.utils import smi2mol, mol2smi

In [2]:
# Load the raw training table, switch to the short column names used in this
# notebook, and index the rows by compound id.
df = (
    pd.read_csv('../data/raw/train_admet.csv')
    .rename(columns={'Drug_ID': 'id', 'Drug': 'smi', 'Y': 'target', 'property': 'prop'})
    .set_index('id')
)

# Parse every SMILES string into an RDKit molecule object.
df['rdmol'] = df['smi'].apply(Chem.MolFromSmiles)



In [3]:
# Number of '.'-separated parts in each SMILES string
# ('[.]' is a regex character class matching a literal dot).
df['mol_count'] = df['smi'].str.split('[.]').map(len)

In [4]:
# How many rows contain more than one '.'-separated part?
df['mol_count'].gt(1).sum()

78

# Check for extended SMILES

In [5]:
# Flag rows whose SMILES contains a '|' character, then count them.
mask = df['smi'].str.contains('[|]', regex=True)
mask.sum()

0

# Preprocessing

### Remove salts and water

In [6]:
from rdkit.Chem.SaltRemover import SaltRemover

# Custom salt definition: one atom-list pattern covering Cl, Na, K, N and O
# (N and O presumably target ammonia / water, per the section heading).
remover = SaltRemover(defnData='[Cl,Na,K,N,O]')

# Preview the multi-part entries after salt stripping.
mols2grid.display(
    df.query('mol_count > 1').rdmol.apply(remover.StripMol),
    smiles_col='smi',
)

MolGridWidget()

In [7]:
from rdkit.Chem.MolStandardize.rdMolStandardize import TautomerParent
from rdkit import RDLogger

from rdkit.Chem.MolStandardize.rdMolStandardize import Cleanup, Reionize, Uncharger, LargestFragmentChooser, SuperParent

# Shared standardizer objects used by preprocess() below.
lf = LargestFragmentChooser()
uncharger = Uncharger()

# Turn off all RDKit 'rdApp.*' log channels for the rest of the session.
RDLogger.DisableLog('rdApp.*')


def preprocess(mol):
    """Standardize a single molecule.

    Steps, in order: strip salt fragments with the notebook-level ``remover``,
    run ``uncharger.uncharge``, keep the fragment chosen by ``lf.choose`` and
    finally return RDKit's ``TautomerParent`` of the result.

    Args:
        mol: RDKit molecule, or ``None`` for a SMILES that failed to parse.

    Returns:
        The standardized molecule, or ``None`` when ``mol`` is ``None``.
    """
    # Chem.MolFromSmiles returns None for unparsable SMILES; pass that through
    # so a single bad row does not abort a whole Series.apply().
    if mol is None:
        return None

    # dontRemoveEverything=True keeps a fragment when the entire input matches
    # the salt definition, instead of handing an empty molecule downstream.
    m = remover.StripMol(mol, dontRemoveEverything=True)
    m = uncharger.uncharge(m)
    m = lf.choose(m)
    return TautomerParent(m)

In [8]:
# Preview the multi-part entries after the full preprocess() pipeline.
mols2grid.display(df.loc[df['mol_count'] > 1, 'rdmol'].apply(preprocess))

MolGridWidget()

In [9]:
# preprocess full dataset
# df.rdmol = df.rdmol.apply(SuperParent)

from src.utils import apply_mp

def super_parent(mol):
    """Return RDKit's ``SuperParent`` of ``mol``.

    A plain Python wrapper that is handed to ``apply_mp`` (presumably because
    the worker processes need a picklable function - TODO confirm).
    """
    parent = SuperParent(mol)
    return parent

# Replace each molecule by its super parent via the apply_mp helper, then
# regenerate the SMILES column from the standardized molecules.
df['rdmol'] = apply_mp(df['rdmol'], super_parent)
df['smi'] = df['rdmol'].apply(mol2smi)

  return bound(*args, **kwds)


In [10]:
# Display the training frame; `rdmol` and `smi` were overwritten in the previous cell.
df

Unnamed: 0_level_0,Unnamed: 0,smi,target,prop,rdmol,mol_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,O=[N+]([O-])c1c2c(c3ccc4cccc5ccc1c3c45)CCCC2,1,1,<rdkit.Chem.rdchem.Mol object at 0x7f8b8905f650>,1
1,1,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,0,1,<rdkit.Chem.rdchem.Mol object at 0x7f8b8905e0c0>,1
2,2,[N-]=[N+]=CC(=O)NCC(=O)NN,1,1,<rdkit.Chem.rdchem.Mol object at 0x7f8b89015170>,1
3,3,[N-]=[N+]=C1C=NC(=O)NC1=O,1,1,<rdkit.Chem.rdchem.Mol object at 0x7f8b88e84590>,1
4,4,CCCCN(CC(O)C1=CC(=[N+]=[N-])C(=O)C=C1)N=O,1,1,<rdkit.Chem.rdchem.Mol object at 0x7f8b88e87470>,1
...,...,...,...,...,...,...
7934,7934,O=c1[nH]c2cc(Cl)c(Cl)c([N+](=O)[O-])c2[nH]c1=O,1,3,<rdkit.Chem.rdchem.Mol object at 0x7f8b88f168e0>,1
7935,7935,C[S+](CCC(N)C(=O)[O-])CC1OC(n2cnc3c(N)ncnc32)C...,1,3,<rdkit.Chem.rdchem.Mol object at 0x7f8b88f16930>,1
7936,7936,CC(Cc1ccccc1)N1CC(=NC(=O)Nc2ccccc2)ON1,1,3,<rdkit.Chem.rdchem.Mol object at 0x7f8b88f16980>,1
7937,7937,CCc1c(C)[n+]([NH-])c(-c2ccc(OC)c(OC)c2)c2cc(OC...,1,3,<rdkit.Chem.rdchem.Mol object at 0x7f8b88f169d0>,1


In [11]:
# Write only the SMILES, target and property columns (the `id` index is kept).
df.to_csv('../data/processed/train.csv', columns=['smi', 'target', 'prop'])

In [12]:
import pandas as pd
from src.utils import smi2mol

# Load the raw test table with the same short column names as the training
# frame, index it by compound id, and parse each SMILES with smi2mol.
test = (
    pd.read_csv('../data/raw/test_data.csv')
    .rename(columns={'Drug_ID': 'id', 'Drug': 'smi', 'Y': 'target', 'property': 'prop'})
    .set_index('id')
    .assign(rdmol=lambda frame: frame['smi'].apply(smi2mol))
)

In [13]:
# Standardize the test molecules with SuperParent, then rebuild their SMILES.
test['rdmol'] = test['rdmol'].apply(SuperParent)
test['smi'] = test['rdmol'].apply(mol2smi)

In [14]:
# Write only the SMILES and property columns (the `id` index is kept).
test.to_csv('../data/processed/test.csv', columns=['smi', 'prop'])