# MEHC-CURATION
## 1. Import libraries

In [1]:
import numpy as np
import pandas as pd
import os
from parallel_pandas import ParallelPandas
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover

In [2]:
from curation.utils import *
from curation.validate import *
from curation.cleaning import *
from curation.normalization import *

## 2. Initialize CPUs and load data

In [3]:
# Turn off pandas' chained-assignment warning for the whole session
# (applies to every cell that runs after this one).
pd.options.mode.chained_assignment = None  # default='warn'

In [4]:
# Start the parallel-pandas worker pool. The worker count is capped at the
# number of cores actually available so the notebook does not oversubscribe
# machines with fewer than 16 CPUs; on larger machines it still uses 16.
# `os.cpu_count()` may return None, hence the fallback to a single worker.
ParallelPandas.initialize(n_cpu=min(16, os.cpu_count() or 1), split_factor=4, disable_pr_bar=True)

In [5]:
# Load the training set and keep only its second column (position 1).
# Selecting with a list (`[1]`) returns a one-column DataFrame directly,
# which replaces the previous DataFrame -> Series -> DataFrame round-trip.
smiles = pd.read_csv('data/cyp_train.csv').iloc[:, [1]]
smiles

Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


In [6]:
# Absolute path of a `reports` folder under the current working directory;
# it is passed as the report output directory to the curation calls below.
reports_dir_path = os.path.join(os.getcwd(), 'reports')
reports_dir_path

'/Users/stevepham/Documents/Bioinfo/NZResearchGr/mehc-curation/reports'

## 3. Remove invalid SMILES strings
In this part, invalid SMILES strings are removed from the input data. Invalid entries include strings with incorrect SMILES syntax, as well as mixtures, inorganic compounds, and organometallic compounds.

### 3.1. Validate SMILES data

In [7]:
# Run the validity check on the raw `smiles` frame, printing logs and
# requesting a report in `reports_dir_path`. With
# get_invalid_smile_indexes=True the call yields two values, unpacked below.
# NOTE(review): whether `invalid_smiles` holds row indexes or the rows
# themselves is not visible here -- confirm in curation.validate.
validating = ValidationStage(smiles)
valid_smiles, invalid_smiles = validating.check_valid_smiles(output_dir_path=reports_dir_path, print_logs=True, get_invalid_smile_indexes=True, get_report=True)
valid_smiles

Number of input SMILES: 41235
Number of valid SMILES: 41235
Number of invalid SMILES: 0



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


### 3.2. Remove mixtures in SMILES data

In [8]:
# Mixture filter run on its own against the original `smiles` frame
# (not against `valid_smiles` from the previous cell).
removing_mixtures = ValidationStage(smiles)
mixture_removed_smiles = removing_mixtures.remove_mixtures(output_dir_path=reports_dir_path, get_report=True, print_logs=True)
mixture_removed_smiles

Number of input SMILES: 41235
Number of non-mixture SMILES: 41235
Number of mixture SMILES: 0



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


### 3.3. Remove inorganic compounds in SMILES data

In [9]:
# Inorganic-compound filter run on its own against the original `smiles`
# frame; the printed log lists the indexes and SMILES that were flagged.
removing_inorganics = ValidationStage(smiles)
inorganic_compounds_removed_data = removing_inorganics.remove_inorganic_compounds(output_dir_path=reports_dir_path, get_report=True, print_logs=True)
inorganic_compounds_removed_data

Number of input SMILES: 41235
Number of organic compounds: 41224
Number of inorganic compounds: 11
List of inorganic indexes: 
[3867, 5630, 11006, 12769, 13197, 23864, 24454, 25575, 30277, 33457, 41055]
3867. O=P(O)(O)OP(=O)(O)O
5630. NO
11006. NNNNN
12769. O=P(O)(O)OP(=O)(O)O
13197. NO
23864. NO
24454. O=P(O)(O)OP(=O)(O)O
25575. NO
30277. O=P(O)(O)OP(=O)(O)O
33457. NNNNN
41055. NO


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


### 3.4. Remove organometallic compounds in SMILES data

In [10]:
# Organometallic-compound filter run on its own against the original
# `smiles` frame (not against the inorganic-filtered data above).
removing_organometallics = ValidationStage(smiles)
organometallic_compounds_removed_data = removing_organometallics.remove_organometallic_compounds(output_dir_path=reports_dir_path, get_report=True, print_logs=True)
organometallic_compounds_removed_data

Number of input SMILES: 41235
Number of organic compounds: 41235
Number of organometallic compounds: 0



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


### 3.5. Validate SMILES data completely

In [7]:
# One call covering the whole validation stage. The printed log shows four
# steps in sequence: validation, mixture removal, inorganic removal and
# organometallic removal, followed by a summary.
completely_valid_smiles = ValidationStage(smiles).wholly_validate_smiles(output_dir_path=reports_dir_path, get_report=True)
completely_valid_smiles

VALIDATION STEP:
Number of input SMILES: 41235
Number of valid SMILES: 41235
Number of invalid SMILES: 0

----------
MIXTURES REMOVING STEP:
Number of input SMILES: 41235
Number of non-mixture SMILES: 41235
Number of mixture SMILES: 0

----------
INORGANICS REMOVING STEP:
Number of input SMILES: 41235
Number of organic compounds: 41224
Number of inorganic compounds: 11
List of inorganic indexes: 
[3867, 5630, 11006, 12769, 13197, 23864, 24454, 25575, 30277, 33457, 41055]
3867. O=P(O)(O)OP(=O)(O)O
5630. NO
11006. NNNNN
12769. O=P(O)(O)OP(=O)(O)O
13197. NO
23864. NO
24454. O=P(O)(O)OP(=O)(O)O
25575. NO
30277. O=P(O)(O)OP(=O)(O)O
33457. NNNNN
41055. NO

----------
ORGANOMETALLIC REMOVING STEP:
Number of input SMILES: 41224
Number of organic compounds: 41224
Number of organometallic compounds: 0

----------
VALIDATE COMPLETE!
SUMMARY:
Number of input SMILES: 41235
Number of invalid SMILES: 11
Number of valid SMILES: 41224
List of invalid SMILES indexes: 
[3867, 5630, 11006, 12769, 13197, 2

Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


## 4. Clean SMILES strings
In this step, salts are identified, counter-ions are deleted, and remaining SMILES strings are neutralized.

In [7]:
# Salt-cleaning step run on the original `smiles` frame, with a report
# requested in `reports_dir_path`.
# NOTE(review): the input is the raw data, not `completely_valid_smiles`
# from the validation stage -- confirm this is intended.
cleaned_salts = CleaningStage(smiles).clean_salts(output_dir_path=reports_dir_path, get_report=True)
cleaned_salts

Pre-cleaned smiles data: 41235
Number of salts were cleaned: 5
Number of substance were missing after cleaning salts: 5
Post-cleaned smiles data: 41230
List of salts were cleaned: [4892, 12898, 24951, 30734, 37176]
4892. O=C(O)C(O)C(O)C(=O)O
12898. O=C(O)C(O)C(O)C(=O)O
24951. O=C(O)C(O)C(O)C(=O)O
30734. O=C(O)C(O)C(O)C(=O)O
37176. O=C(O)C(O)C(O)C(=O)O


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


In [8]:
# Neutralization step run independently on the original `smiles` frame
# (not on `cleaned_salts` from the previous cell).
neutralized = CleaningStage(smiles).neutralize(output_dir_path=reports_dir_path, get_report=True)
neutralized

Pre-cleaned smiles data: 41235
Number of salts were cleaned: 9
Post-cleaned smiles data: 41235
List of salts were cleaned:
1. O=[N+]([O-])c1cccc2c[nH]nc12
2. Cc1ccc(NC(=O)c2cc3ccccc3cc2O)c(C)c1
3. O=[N+]([O-])c1cccc2c[nH]nc12
4. CCc1cc[n+](-c2nc3ccccc3nc2C(=C=N)C#N)cc1
5. O=[N+]([O-])c1cccc2c[nH]nc12
6. Cc1ccc(NC(=O)c2cc3ccccc3cc2O)c(C)c1
7. O=[N+]([O-])c1cccc2c[nH]nc12
8. Cc1ccc(NC(=O)c2cc3ccccc3cc2O)c(C)c1
9. CC1=NN(c2ccc(S(=O)(=O)O)cc2C)C(=O)C1N=Nc1cc(S(=O)(=O)O)ccc1C(=O)O


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


In [9]:
# Combined cleaning call on the original `smiles` frame; the printed log
# shows the salt-cleaning pass followed by the neutralization pass.
cleaned_smiles = CleaningStage(smiles).clean_and_neutralize(output_dir_path=reports_dir_path, get_report=True)
cleaned_smiles

Pre-cleaned smiles data: 41235
Number of salts were cleaned: 5
Number of substance were missing after cleaning salts: 5
Post-cleaned smiles data: 41230
List of salts were cleaned:
1. O=C(O)C(O)C(O)C(=O)O
2. O=C(O)C(O)C(O)C(=O)O
3. O=C(O)C(O)C(O)C(=O)O
4. O=C(O)C(O)C(O)C(=O)O
5. O=C(O)C(O)C(O)C(=O)O
Pre-cleaned smiles data: 41230
Number of salts were cleaned: 9
Post-cleaned smiles data: 41230
List of salts were cleaned:
1. O=[N+]([O-])c1cccc2c[nH]nc12
2. Cc1ccc(NC(=O)c2cc3ccccc3cc2O)c(C)c1
3. O=[N+]([O-])c1cccc2c[nH]nc12
4. CCc1cc[n+](-c2nc3ccccc3nc2C(=C=N)C#N)cc1
5. O=[N+]([O-])c1cccc2c[nH]nc12
6. Cc1ccc(NC(=O)c2cc3ccccc3cc2O)c(C)c1
7. O=[N+]([O-])c1cccc2c[nH]nc12
8. Cc1ccc(NC(=O)c2cc3ccccc3cc2O)c(C)c1
9. CC1=NN(c2ccc(S(=O)(=O)O)cc2C)C(=O)C1N=Nc1cc(S(=O)(=O)O)ccc1C(=O)O
Pre-cleaned SMILES data: 41235
Number of salts were cleaned: 5
Number of substances were neutralized: 9
Post-cleaned SMILES data: 41230



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41225,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41226,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41227,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41228,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


## 5. Normalize tautomers in data
In this step, every compound is converted to a single standard tautomeric form, so that all tautomers of the same molecule share one representation.

In [7]:
# Tautomer normalization applied to the original `smiles` frame.
# NOTE(review): the input is the raw data rather than `cleaned_smiles` from
# the cleaning stage -- confirm this is intended.
tautomerized_smiles = normalize_tautomer_in_dataframe(smiles)
tautomerized_smiles

[17:12:55] Can't kekulize mol.  Unkekulized atoms: 5 17 18 19 28 29
[17:12:55] Can't kekulize mol.  Unkekulized atoms: 3 5 6 10
[17:12:55] Can't kekulize mol.  Unkekulized atoms: 10 14
[17:12:55] Can't kekulize mol.  Unkekulized atoms: 10 14
[17:12:55] Can't kekulize mol.  Unkekulized atoms: 11 15
[17:12:55] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[17:12:55] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[17:12:55] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:55] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:55] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:55] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:55] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:55] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:55] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:55] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:55] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:55] Tautomer enumeration stopped at 162 tautomers: max transforms reached
[17

Number of SMILES strings before tautomer normalizing: 41235
Number of SMILES tautomers are normalizing: 3984
Number of SMILES strings after tautomer normalizing: 41235



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)N=CNS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


In [10]:
# Duplicate removal on the original `smiles` frame (not on
# `tautomerized_smiles`), with a text report requested in `reports_dir_path`.
# The call returns two objects, unpacked as (duplicates, de-duplicated data);
# presumably both are DataFrames -- verify against the curation package.
# Only the column labels of the duplicates frame are displayed here.
duplicated_smiles, post_removed_smiles = remove_duplicates_in_dataframe(smiles, report_dir_path=reports_dir_path, get_report_text_file=True)
duplicated_smiles.columns

Number of input SMILES strings: 41235
Number of unique SMILES strings: 14373
Number of duplicate SMILES strings: 26862



Index(['compound', 'index'], dtype='object')

In [8]:
# Display the second value returned by remove_duplicates_in_dataframe.
post_removed_smiles

Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41154,O=C(NCC1CCCO1)c1cccc(F)c1
41195,COc1ccc(C(=O)NC2CCN(C(=S)Nc3cccc(C)c3)CC2)cc1
41197,CCNC(=O)c1ccc(-n2nc(C(F)(F)F)c3c2CCCC3)cc1
41208,O=C(c1ccccc1Nc1ccc(SC(F)F)cc1)N1CCCCC1
