# MEHC-CURATION
## 1. Import libraries

In [7]:
import numpy as np
import pandas as pd
from parallel_pandas import ParallelPandas
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover

In [8]:
from curation.utils import *
from curation.validate import *
from curation.cleaning import *

## 2. Initialize CPUs and load data

In [9]:
# Configure parallel-pandas once, before any later cell runs.
# Per the parallel-pandas README (TODO confirm against the installed version):
#   n_cpu          - number of worker processes to use;
#   split_factor   - data is split into roughly n_cpu * split_factor chunks;
#   disable_pr_bar - True hides the progress bars.
# NOTE(review): n_cpu is hard-coded to 16 — confirm the host actually has that
# many cores, or lower it before running on a smaller machine.
# Presumably the `curation` helpers used below rely on this setup — verify.
ParallelPandas.initialize(n_cpu=16, split_factor=4, disable_pr_bar=True)

In [10]:
# Read the training CSV and keep only its second column (positional index 1),
# which the recorded output shows is named `compound`. The column is turned back
# into a one-column DataFrame so downstream steps receive a frame, not a Series.
smiles = pd.read_csv('data/cyp_train.csv').iloc[:, 1].to_frame()
smiles

Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


## 3. Remove invalid SMILES strings
In this step, invalid SMILES strings are removed from the input data. These include malformed SMILES strings as well as SMILES strings that represent mixtures, inorganic compounds, or organometallic compounds.

In [11]:
# Filter the loaded SMILES with the project helper `remove_unwanted_smiles`
# (it arrives through one of the `curation` star-imports; which module defines
# it is not visible in this notebook).
# The recorded run reports 41235 inputs, 41224 kept and 11 rejected.
# NOTE(review): in the recorded output the surviving strings no longer carry
# stereo markers (compare rows 0 and 1 before and after this cell) — confirm
# that dropping stereochemistry here is intended.
validated_smiles = remove_unwanted_smiles(smiles)
validated_smiles

Number of input smiles for validating: 41235
Number of successful smiles: 41224
Number of failed smiles: 11


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


## 4. Clean SMILES strings
In this step, salts are identified, counter-ions are deleted, and remaining SMILES strings are neutralized.

In [12]:
# Pass the validated SMILES through the project helper
# `clean_salts_and_neutralize_smiles_data` (defined in the `curation` package;
# its implementation is not visible here).
# The recorded run reports 5 salts cleaned and 9 substances neutralized, with
# the row count unchanged at 41224.
# NOTE(review): the recorded result is a Series named `compound`, not a
# DataFrame like the input, and its index labels still run up to 41234 (they
# are not reset after the earlier filtering) — keep this in mind downstream.
cleaned_smiles = clean_salts_and_neutralize_smiles_data(validated_smiles)
cleaned_smiles

Pre-cleaned smiles data: 41224
Number of salts were cleaned: 5
Number of substances were neutralized: 9
Post-cleaned smiles data: 41224


0                CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1               COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2                        COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3        COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4                   O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
                               ...                        
41230                  NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231                        CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232        Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233                 O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234    CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...
Name: compound, Length: 41224, dtype: object