# MEHC-CURATION
## 1. Import libraries

In [1]:
import numpy as np
import pandas as pd
import os
from parallel_pandas import ParallelPandas
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover

In [2]:
from curation.utils import *
from curation.validate import *
from curation.cleaning import *
from curation.normalization import *

## 2. Initialize CPUs

In [3]:
# Parallel-pandas settings, named so they are easy to find and tune.
N_CPU = 16
SPLIT_FACTOR = 4

ParallelPandas.initialize(
    n_cpu=N_CPU,
    split_factor=SPLIT_FACTOR,
    disable_pr_bar=True,
)

In [4]:
# Load the training table and keep only its second column (position 1)
# as a one-column DataFrame.
smiles = (
    pd.read_csv('data/cyp_train.csv')
    .iloc[:, 1]
    .to_frame()
)
smiles

Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


In [5]:
# Path of a 'reports' folder under the current working directory; it is
# passed to the validation helpers in the cells below.
working_dir = os.getcwd()
reports_dir_path = os.path.join(working_dir, 'reports')
reports_dir_path

'/Users/stevepham/Documents/Bioinfo/NZResearchGr/mehc-curation/reports'

## 3. Remove invalid SMILES strings
In this part, invalid SMILES strings are removed from the input data. These include malformed SMILES strings as well as mixtures, inorganic compounds, and organometallic compounds.

### 3.1. Validate SMILES data

In [6]:
# Run the validation helper on the loaded frame. The two keyword flags
# presumably enable the report text file and the printed log summary.
validated_smiles = check_validate_smiles_in_dataframe(
    smiles,
    reports_dir_path,
    get_report_text_file=True,
    print_log=True,
)
validated_smiles

Number of input SMILES: 41235
Number of valid SMILES: 41235
Number of invalid SMILES: 0


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


### 3.2. Remove mixtures in SMILES data

In [6]:
# NOTE(review): this call runs on the raw `smiles` frame rather than
# `validated_smiles`, and `mixture_removed_smiles` is not referenced by any
# later cell shown in this notebook -- confirm whether it should feed the
# cleaning step.
mixture_removed_smiles = remove_mixtures(
    smiles,
    reports_dir_path,
    get_report_text_file=True,
    print_log=True,
)
mixture_removed_smiles

Number of input SMILES: 41235
Number of non-mixture SMILES: 41235
Number of mixture SMILES: 0


0        CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1             COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2                        COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3        COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4                   O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
                               ...                        
41230                  NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231                        CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232        Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233                 O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234    CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...
Name: compound, Length: 41235, dtype: object

## 4. Clean SMILES strings
In this step, salts are identified, counter-ions are deleted, and remaining SMILES strings are neutralized.

In [6]:
# Apply the salt-cleaning / neutralization helper to the validated frame.
# NOTE(review): the saved log for this cell reports 41224 input rows while the
# validation cell above reported 41235, and several cells share the same
# execution count -- the saved outputs may be stale; re-run from a fresh kernel.
cleaned_smiles = clean_salts_and_neutralize_smiles_data(validated_smiles)
cleaned_smiles

Pre-cleaned smiles data: 41224
Number of salts were cleaned: 5
Number of substances were neutralized: 9
Post-cleaned smiles data: 41224


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


## 5. Normalize tautomers in data
In this step, tautomers in the data are normalized so that all tautomeric forms of a compound are represented by one specific form.

In [7]:
# Apply the tautomer-normalization helper to the cleaned frame.
# The saved output of this cell contains many toolkit warning lines
# (e.g. "Can't kekulize mol.") ahead of the summary counts.
tautomerized_smiles = normalize_tautomer_in_dataframe(cleaned_smiles)
tautomerized_smiles

[15:59:44] Can't kekulize mol.  Unkekulized atoms: 5 17 18 19 28 29
[15:59:44] Can't kekulize mol.  Unkekulized atoms: 3 5 6 10
[15:59:44] Can't kekulize mol.  Unkekulized atoms: 2 9
[15:59:44] Can't kekulize mol.  Unkekulized atoms: 2 9
[15:59:44] Can't kekulize mol.  Unkekulized atoms: 2 9
[15:59:44] Can't kekulize mol.  Unkekulized atoms: 11 15
[15:59:44] Can't kekulize mol.  Unkekulized atoms: 10 14
[15:59:44] Can't kekulize mol.  Unkekulized atoms: 10 14
[15:59:45] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[15:59:45] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[15:59:45] Can't kekulize mol.  Unkekulized atoms: 2 6
[15:59:45] Can't kekulize mol.  Unkekulized atoms: 2 6
[15:59:45] Can't kekulize mol.  Unkekulized atoms: 2 6
[15:59:45] Can't kekulize mol.  Unkekulized atoms: 2 6
[15:59:45] Tautomer enumeration stopped at 162 tautomers: max transforms reached
[15:59:45] Can't kekulize mol.  Unkekulized atoms: 2 6
[15:59:45] Can't kekulize mol.  Unkekulized atoms: 2 6
[15

Number of SMILES strings before tautomer normalizing: 41224
Number of SMILES tautomers are normalizing: 3984
Number of SMILES strings after tautomer normalizing: 41224


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)N=CNS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


In [11]:
# De-duplicate the tautomer-normalized frame; the helper returns two objects,
# unpacked here as the duplicate rows and the frame after removal (names only --
# the helper's contract is not visible here).
# NOTE(review): in the saved notebook the displayed `duplicated_smiles` and
# `post_removed_smiles` previews look identical, and the execution counts of
# these two cells are out of order -- confirm on a fresh Restart & Run All.
duplicated_smiles, post_removed_smiles = remove_duplicates_in_dataframe(tautomerized_smiles)
duplicated_smiles

Number of input SMILES strings: 41224
Number of unique SMILES strings: 14366
Number of duplicate SMILES strings: 26858


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41154,O=C(NCC1CCCO1)c1cccc(F)c1
41195,COc1ccc(C(=O)NC2CCN(C(=S)Nc3cccc(C)c3)CC2)cc1
41197,CCNC(=O)c1ccc(-n2nc(C(F)(F)F)c3c2CCCC3)cc1
41208,O=C(c1ccccc1Nc1ccc(SC(F)F)cc1)N1CCCCC1


In [10]:
# Display the second object returned by remove_duplicates_in_dataframe.
post_removed_smiles

Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41154,O=C(NCC1CCCO1)c1cccc(F)c1
41195,COc1ccc(C(=O)NC2CCN(C(=S)Nc3cccc(C)c3)CC2)cc1
41197,CCNC(=O)c1ccc(-n2nc(C(F)(F)F)c3c2CCCC3)cc1
41208,O=C(c1ccccc1Nc1ccc(SC(F)F)cc1)N1CCCCC1
