# MEHC-CURATION
## 0. Install libraries

In [1]:
# Install the two third-party packages imported in section 1
# (`parallel_pandas` and `rdkit`) into the kernel's environment.
# NOTE(review): neither install is version-pinned; the recorded output below
# shows rdkit 2025.09.1 being resolved — consider pinning for reproducibility.
%pip install parallel_pandas
%conda install conda-forge::rdkit

Note: you may need to restart the kernel to use updated packages.
Retrieving notices: done
Channels:
 - conda-forge
 - bioconda
 - defaults
 - salilab
 - https://conda.graylab.jhu.edu
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/anaconda3/envs/mehc_curation

  added / updated specs:
    - conda-forge::rdkit


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    joblib-1.5.2               |     pyhd8ed1ab_0         219 KB  conda-forge
    libpq-18.0                 |       h31f7a3a_0         2.5 MB  conda-forge
    librdkit-2025.09.1         |       hab8bf10_1         6.5 MB  conda-forge
    rdkit-2025.09.1            |  py310h7b699c0_1        17.5 MB  conda-forge
    ------------------------------------------------------------
                                           Total:        26.8 MB

T

## 1. Import libraries

In [1]:
import numpy as np
import pandas as pd
import os
from parallel_pandas import ParallelPandas
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover

In [2]:
from curation.utils import deduplicate, RemoveSpecificSMILES, GetReport
from curation.validate import ValidationStage
from curation.cleaning import *
from curation.normalization import *
from curation.refinement import Refinement

## 2. Initialize CPUs, import the data set, and set up the output directory

In [3]:
# Silence pandas' chained-assignment warning for this session (default='warn').
pd.set_option('mode.chained_assignment', None)

In [4]:
# Load the raw BBBP table and reduce it to the columns used by the pipeline.
# The column order is reversed first so that, once `num` and `name` are
# dropped, the remaining columns come out as (smiles, p_np).
bbbp_raw = pd.read_csv('ml_testing/bbbp/BBBP.csv')
col = list(bbbp_raw.columns)[::-1]

# Build the working frame without in-place mutation of a derived frame.
smiles = bbbp_raw[col].drop(columns=['num', 'name'])

# Create the export directory if it is missing so the write also succeeds
# on a fresh checkout where `reports/` does not exist yet.
os.makedirs('reports', exist_ok=True)
smiles.to_csv('reports/bbbp.csv', index=False)
smiles

Unnamed: 0,smiles,p_np
0,[Cl].CC(C)NCC(O)COc1cccc2ccccc12,1
1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl,1
2,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...,1
3,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C,1
4,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...,1
...,...,...
2045,C1=C(Cl)C(=C(C2=C1NC(=O)C(N2)=O)[N+](=O)[O-])Cl,1
2046,[C@H]3([N]2C1=C(C(=NC=N1)N)N=C2)[C@@H]([C@@H](...,1
2047,[O+]1=N[N](C=C1[N-]C(NC2=CC=CC=C2)=O)C(CC3=CC=...,1
2048,C1=C(OC)C(=CC2=C1C(=[N+](C(=C2CC)C)[NH-])C3=CC...,1


In [8]:
# Absolute path of the `reports` folder under the current working directory;
# this value is handed to every stage call below as `output_dir`.
output_dir = os.path.abspath('reports')
output_dir

'/Users/stevepham/Documents/Bioinfo/NZResearchGr/mehc-curation/reports'

## 3. Remove invalid SMILES strings
In this part, invalid SMILES strings are removed from the input data. These include SMILES strings with an incorrect format, as well as mixtures, inorganic compounds, and organometallic compounds.

### 3.1. Validate SMILES data

In [7]:
# Step 3.1: run the validity check (`validate_smi`) on the loaded `smiles` frame
# and display the returned frame.
valid_smiles = ValidationStage(smiles).validate_smi(
    output_dir=output_dir,
    get_report=True,
    print_logs=True,
    n_cpu=16,
    split_factor=4,
)
valid_smiles



*****************************************************************
*								*
*			1. VALIDATION		        	*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 7831
Number of invalid SMILES: 0
Number of valid SMILES: 7831


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************




Unnamed: 0,smiles,mol_id,SR-p53,SR-MMP,SR-HSE,SR-ATAD5,SR-ARE,NR-PPAR-gamma,NR-ER-LBD,NR-ER,NR-Aromatase,NR-AhR,NR-AR-LBD,NR-AR
0,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,TOX3021,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,1.0,0.0,0.0
1,CCN1C(=O)NC(c2ccccc2)C1=O,TOX3020,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,TOX3024,,,0.0,,0.0,,,,,,,
3,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,TOX3027,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CC(O)(P(=O)(O)O)P(=O)(O)O,TOX20800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7826,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,TOX2725,,,0.0,,0.0,,,,,,,
7827,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,TOX2370,0.0,,0.0,0.0,,,0.0,1.0,0.0,0.0,1.0,1.0
7828,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,TOX2371,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
7829,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,TOX2377,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,,0.0,1.0,1.0


### 3.2. Remove mixtures in SMILES data

In [8]:
# Step 3.2: remove mixtures (`rm_mixtures`) from the output of step 3.1
# (`valid_smiles`) and display the returned frame.
mixture_removed_smiles = ValidationStage(valid_smiles).rm_mixtures(
    output_dir=output_dir,
    get_report=True,
    print_logs=True,
    n_cpu=16,
    split_factor=4,
)
mixture_removed_smiles



*****************************************************************
*								*
*			1. VALIDATION		        	*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 7831
Number of invalid SMILES: 0
Number of valid SMILES: 7831

1. MIXTURE REMOVAL
Number of input SMILES: 7831
Number of mixture SMILES: 220
Number of non-mixture SMILES: 7611


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


Unnamed: 0,smiles,mol_id,SR-p53,SR-MMP,SR-HSE,SR-ATAD5,SR-ARE,NR-PPAR-gamma,NR-ER-LBD,NR-ER,NR-Aromatase,NR-AhR,NR-AR-LBD,NR-AR
0,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,TOX3021,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,1.0,0.0,0.0
1,CCN1C(=O)NC(c2ccccc2)C1=O,TOX3020,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,TOX3024,,,0.0,,0.0,,,,,,,
3,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,TOX3027,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CC(O)(P(=O)(O)O)P(=O)(O)O,TOX20800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7826,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,TOX2725,,,0.0,,0.0,,,,,,,
7827,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,TOX2370,0.0,,0.0,0.0,,,0.0,1.0,0.0,0.0,1.0,1.0
7828,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,TOX2371,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
7829,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,TOX2377,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,,0.0,1.0,1.0


### 3.3. Remove inorganic compounds in SMILES data

In [8]:
# Step 3.3: remove inorganic compounds (`rm_inorganics`) from the output of
# step 3.2 (`mixture_removed_smiles`) and display the returned frame.
inorganic_compounds_removed_data = ValidationStage(mixture_removed_smiles).rm_inorganics(
    output_dir=output_dir,
    get_report=True,
    print_logs=True,
    n_cpu=16,
    split_factor=4,
)
inorganic_compounds_removed_data

*****************************************************************
*								*
*			1. VALIDATION		        	*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 41236
Number of invalid SMILES: 0
Number of valid SMILES: 41236

2. INORGANICS REMOVAL
Number of input SMILES: 41236
Number of inorganic compounds: 11
Number of organic compounds: 41225


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


### 3.4. Remove organometallic compounds in SMILES data

In [10]:
# Step 3.4: remove organometallic compounds (`rm_organometallics`) and display
# the returned frame.
# NOTE(review): unlike steps 3.2 and 3.3, this call starts from the raw
# `smiles` frame rather than from `inorganic_compounds_removed_data` —
# confirm whether that is intentional.
organometallic_compounds_removed_data = ValidationStage(smiles).rm_organometallics(
    output_dir=output_dir,
    get_report=True,
    print_logs=True,
    n_cpu=16,
    split_factor=4,
)
organometallic_compounds_removed_data

n_cpu: 16, split_factor: 4
                                            compound  is_valid
0  CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...      True
1       COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12      True
2                  COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl      True
3  COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...      True
4             O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12      True
*****************************************************************
*								*
*			1. VALIDATION		        	*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 41236
Number of invalid SMILES: 0
Number of valid SMILES: 41236

3. ORGANOMETALLICS REMOVAL
Number of input SMILES: 41236
Number of organometallic compounds: 0
Number of organics: 41236


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


### 3.5. Validate SMILES data completely

In [9]:
# Step 3.5: run the complete validation (`complete_validation`) in a single call
# on the raw `smiles` frame and display the returned frame.
completely_valid_smiles = ValidationStage(smiles).complete_validation(
    output_dir=output_dir,
    get_report=True,
    n_cpu=16,
    split_factor=4,
)
completely_valid_smiles

[17:46:13] Explicit valence for atom # 11 N, 4, is greater than permitted
[17:46:13] Explicit valence for atom # 12 N, 4, is greater than permitted
[17:46:13] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:46:13] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:46:13] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:46:13] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:46:13] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:46:13] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:46:13] Explicit valence for atom # 1 N, 4, is greater than permitted
[17:46:13] Explicit valence for atom # 6 N, 4, is greater than permitted
[17:46:13] Explicit valence for atom # 6 N, 4, is greater than permitted


*****************************************************************
*								*
*			1. VALIDATION		        	*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 2050
Number of invalid SMILES: 11
Number of valid SMILES: 2039

1.1. MIXTURE REMOVAL
Number of input SMILES: 2039
Number of mixture SMILES: 37
Number of non-mixture SMILES: 2002

1.2. INORGANICS REMOVAL
Number of input SMILES: 2002
Number of inorganic compounds: 1
Number of organic compounds: 2001

1.3. ORGANOMETALLICS REMOVAL
Number of input SMILES: 2001
Number of organometallic compounds: 0
Number of organics: 2001

------------------------------------------------
DUPLICATES REMOVAL
------------------------------------------------
Number of input SMILES: 2001
Number of perfect duplicated SMILES: 50
Number of partial duplicated SMILES: 0
Number of unique SMILES: 1927


*****************************************************************
*								*
*			-----END-----

Unnamed: 0,smiles,p_np
0,CC(C)NCC(O)COc1cccc2ccccc12.[Cl],1
1,CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1,1
2,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,1
3,CC(=O)NCCCOc1cccc(CN2CCCCC2)c1,1
4,Cc1onc(-c2ccccc2Cl)c1C(=O)N[C@@H]1C(=O)N2[C@@H...,1
...,...,...
1922,O=c1[nH]c2cc(Cl)c(Cl)c([N+](=O)[O-])c2[nH]c1=O,1
1923,C[S+](CC[C@H](N)C(=O)[O-])C[C@H]1O[C@@H](n2cnc...,1
1924,CC(Cc1ccccc1)n1cc([N-]C(=O)Nc2ccccc2)[o+]n1,1
1925,CCc1c(C)[n+]([NH-])c(-c2ccc(OC)c(OC)c2)c2cc(OC...,1


## 4. Clean SMILES strings
In this step, salts are identified, counter-ions are deleted, and remaining SMILES strings are neutralized.

In [10]:
# Salt cleaning only (`cl_salts`) on the raw `smiles` frame, with
# deduplication requested via `param_deduplicate=True`.
cleaned_salts = CleaningStage(smiles).cl_salts(
    output_dir=output_dir,
    get_report=True,
    param_deduplicate=True,
    n_cpu=16,
    split_factor=4,
)
cleaned_salts


*****************************************************************
*								*
*			2. CLEANING				*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 41236
Number of invalid SMILES: 0
Number of valid SMILES: 41236

2.1. SALTS CLEANING
Number of input SMILES: 41236
Number of desalted compounds: 0
Number of unprocessable compounds: 0
Number of post-desalting compounds: 41236

------------------------------------------------
DUPLICATES REMOVAL
------------------------------------------------
Number of input SMILES: 41236
Number of duplicated SMILES: 26862
Number of unique SMILES: 14374


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
14369,COc1ccc(C(=O)NC2CCN(C(=S)Nc3cccc(C)c3)CC2)cc1
14370,CCNC(=O)c1ccc(-n2nc(C(F)(F)F)c3c2CCCC3)cc1
14371,O=C(c1ccccc1Nc1ccc(SC(F)F)cc1)N1CCCCC1
14372,CC1=NN(c2ccc(S(=O)(=O)O)cc2C)C(=[OH+])[C@H]1/N...


In [11]:
# Neutralization only (`neutralize`) on the raw `smiles` frame, with
# deduplication requested via `param_deduplicate=True`.
neutralized = CleaningStage(smiles).neutralize(
    output_dir=output_dir,
    get_report=True,
    param_deduplicate=True,
    n_cpu=16,
    split_factor=4,
)
neutralized


*****************************************************************
*								*
*			2. CLEANING				*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 41236
Number of invalid SMILES: 0
Number of valid SMILES: 41236

2.2. NEUTRALIZATION
Number of input SMILES: 41236
Number of neutralized compounds: 9
Number of post-neutralization compounds: 41236

------------------------------------------------
DUPLICATES REMOVAL
------------------------------------------------
Number of input SMILES: 41236
Number of duplicated SMILES: 26862
Number of unique SMILES: 14374


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
14369,COc1ccc(C(=O)NC2CCN(C(=S)Nc3cccc(C)c3)CC2)cc1
14370,CCNC(=O)c1ccc(-n2nc(C(F)(F)F)c3c2CCCC3)cc1
14371,O=C(c1ccccc1Nc1ccc(SC(F)F)cc1)N1CCCCC1
14372,CC1=NN(c2ccc(S(=O)(=O)O)cc2C)C(=O)[C@H]1/N=N/c...


In [10]:
# Complete cleaning (`complete_cleaning`) in a single call on the raw `smiles`
# frame; the result is reused by the destereoisomerization cell in section 5.
cleaned_smiles = CleaningStage(smiles).complete_cleaning(
    output_dir=output_dir,
    get_report=True,
    param_deduplicate=True,
    n_cpu=16,
    split_factor=4,
)
cleaned_smiles

[17:46:35] Explicit valence for atom # 11 N, 4, is greater than permitted
[17:46:35] Explicit valence for atom # 12 N, 4, is greater than permitted
[17:46:35] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:46:35] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:46:35] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:46:35] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:46:35] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:46:35] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:46:35] Explicit valence for atom # 1 N, 4, is greater than permitted
[17:46:35] Explicit valence for atom # 6 N, 4, is greater than permitted
[17:46:36] Explicit valence for atom # 6 N, 4, is greater than permitted



*****************************************************************
*								*
*			2. CLEANING				*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 2050
Number of invalid SMILES: 11
Number of valid SMILES: 2039

2.1. SALTS CLEANING
Number of input SMILES: 2039
Number of desalted compounds: 98
Number of unprocessable compounds: 0
Number of post-desalting compounds: 2039

2.2. NEUTRALIZATION
Number of input SMILES: 2039
Number of neutralized compounds: 29
Number of unprocessable compound: 0
Number of post-neutralization compounds: 2039

------------------------------------------------
DUPLICATES REMOVAL
------------------------------------------------
Number of input SMILES: 2039
Number of perfect duplicated SMILES: 51
Number of partial duplicated SMILES: 0
Number of unique SMILES: 1961


*****************************************************************
*								*
*			-----END----- 				*
*								*
*******************

Unnamed: 0,smiles,p_np
0,CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1,1
1,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,1
2,CC(=O)NCCCOc1cccc(CN2CCCCC2)c1,1
3,Cc1onc(-c2ccccc2Cl)c1C(=O)N[C@@H]1C(=O)N2[C@@H...,1
4,CCN1CCN(C(=O)N[C@@H](C(=O)N[C@@H]2C(=O)N3C(C(=...,1
...,...,...
1956,O=c1[nH]c2cc(Cl)c(Cl)c([N+](=O)[O-])c2[nH]c1=O,1
1957,C[S+](CC[C@H](N)C(=O)O)C[C@H]1O[C@@H](n2cnc3c(...,1
1958,CC(Cc1ccccc1)n1cc(NC(=O)Nc2ccccc2)[o+]n1,1
1959,CCc1c(C)[n+]([NH-])c(-c2ccc(OC)c(OC)c2)c2cc(OC...,1


## 5. Normalize tautomers in data
In this step, tautomers in the data are normalized so that every tautomer of a compound is represented by one single form.

In [13]:
# Detautomerization only (`detautomerize`) on the raw `smiles` frame, with
# deduplication requested via `param_deduplicate=True`.
tautomerized_smiles = NormalizingStage(smiles).detautomerize(
    output_dir=output_dir,
    get_report=True,
    param_deduplicate=True,
    n_cpu=16,
    split_factor=4,
)
tautomerized_smiles

[17:42:22] Can't kekulize mol.  Unkekulized atoms: 11 15
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 5 17 18 19 28 29
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 10 14
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 3 5 6 10
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 10 14
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:42:22] Tautomer enumeration stopped at 162 tautomers: max transforms reached
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 9 10 11 15 24 25
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:42:22] Can't kekulize mol.  Unkekulized a


*****************************************************************
*								*
*			3. NORMALIZATION			*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 41236
Number of invalid SMILES: 0
Number of valid SMILES: 41236

3.2. DETAUTOMERIZATION
Number of input SMILES: 41236
Number of detautomerized compounds: 7313
Number of post-detautomerization compounds: 41236

------------------------------------------------
DUPLICATES REMOVAL
------------------------------------------------
Number of input SMILES: 41236
Number of duplicated SMILES: 26865
Number of unique SMILES: 14371


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
14366,COc1ccc(C(=O)NC2CCN(C(=S)Nc3cccc(C)c3)CC2)cc1
14367,CCNC(=O)c1ccc(-n2nc(C(F)(F)F)c3c2CCCC3)cc1
14368,O=C(c1ccccc1Nc1ccc(SC(F)F)cc1)N1CCCCC1
14369,Cc1cc(S(=O)(=O)O)ccc1-n1[nH]c(C)c(N=Nc2cc(S(=O...


In [14]:
# Destereoisomerization only (`destereoisomerize`), applied to `cleaned_smiles`
# from the complete-cleaning cell in section 4.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so the
# saved notebook holds no result for it — re-run before relying on it.
stereoisomerized_smiles = NormalizingStage(cleaned_smiles).destereoisomerize(
    output_dir=output_dir,
    get_report=True,
    param_deduplicate=True,
    n_cpu=16,
    split_factor=4,
)
stereoisomerized_smiles

KeyboardInterrupt: 

In [13]:
# Re-run the validity check on the destereoisomerized frame.
# NOTE(review): `stereoisomerized_smiles` comes from the preceding cell, whose
# recorded run was interrupted — confirm it is defined on a fresh kernel.
post_validate = ValidationStage(stereoisomerized_smiles).validate_smi(
    output_dir=output_dir,
    get_report=True,
    n_cpu=16,
    split_factor=4,
)

*****************************************************************
*								*
*			1. VALIDATION		        	*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 7492
Number of invalid SMILES: 375
Number of valid SMILES: 7117


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


In [11]:
# Complete normalization (`complete_normalization`) in a single call on the raw
# `smiles` frame, with deduplication requested via `param_deduplicate=True`.
post_normalized_smiles = NormalizingStage(smiles).complete_normalization(
    output_dir=output_dir,
    get_report=True,
    param_deduplicate=True,
    n_cpu=16,
    split_factor=4,
)
post_normalized_smiles

[17:47:09] Explicit valence for atom # 11 N, 4, is greater than permitted
[17:47:09] Explicit valence for atom # 12 N, 4, is greater than permitted
[17:47:09] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:47:09] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:47:09] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:47:09] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:47:09] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:47:09] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:47:09] Explicit valence for atom # 1 N, 4, is greater than permitted
[17:47:09] Explicit valence for atom # 6 N, 4, is greater than permitted
[17:47:09] Explicit valence for atom # 6 N, 4, is greater than permitted
[17:47:25] Can't kekulize mol.  Unkekulized atoms: 3 7
[17:47:25] Can't kekulize mol.  Unkekulized atoms: 3 7
[17:47:25] Can't kekulize mol.  Unkekulized atoms: 3 7
[17:47:25] Can't kekulize mol.


*****************************************************************
*								*
*			3. NORMALIZATION			*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 2050
Number of invalid SMILES: 11
Number of valid SMILES: 2039

3.1. DESTEREOISOMERIZATION
Number of input SMILES: 2039
Number of destereoisomerized compounds: 717
Number of post-destereoisomerization compounds: 2039

3.2. DETAUTOMERIZATION
Number of input SMILES: 2039
Number of detautomerized compounds: 249
Number of post-detautomerization compounds: 2039

------------------------------------------------
DUPLICATES REMOVAL
------------------------------------------------
Number of input SMILES: 2039
Number of perfect duplicated SMILES: 71
Number of partial duplicated SMILES: 0
Number of unique SMILES: 1934


*****************************************************************
*								*
*			-----END----- 				*
*								*
************************************************

Unnamed: 0,smiles,p_np
0,CC(C)NCC(O)COc1cccc2ccccc12.[Cl],1
1,CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1,1
2,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,1
3,CC(=O)NCCCOc1cccc(CN2CCCCC2)c1,1
4,Cc1onc(-c2ccccc2Cl)c1C(=O)NC1C(=O)N2C1SC(C)(C)...,1
...,...,...
1929,O=c1[nH]c2cc(Cl)c(Cl)c([N+](=O)[O-])c2[nH]c1=O,1
1930,C[S+](CCC(N)C(=O)[O-])CC1OC(n2cnc3c(N)ncnc32)C...,1
1931,CC(Cc1ccccc1)n1cc([N-]C(=O)Nc2ccccc2)[o+]n1,1
1932,CCc1c(C)[n+]([NH-])c(-c2ccc(OC)c(OC)c2)c2cc(OC...,1


In [15]:
# Stand-alone deduplication of the raw `smiles` frame. The call returns two
# objects: the deduplicated frame and the duplicate listing requested with
# `show_dup_smi_and_idx=True`; the latter is displayed here.
post_removed_smiles, duplicated_smiles = deduplicate(
    smiles,
    validate=False,
    show_dup_smi_and_idx=True,
    output_dir=output_dir,
    get_report=True,
    n_cpu=16,
    split_factor=4,
)
duplicated_smiles

------------------------------------------------
DUPLICATES REMOVAL
------------------------------------------------
Number of input SMILES: 41236
Number of duplicated SMILES: 26862
Number of unique SMILES: 14374


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


Unnamed: 0,compound,is_valid_x,is_valid_y,index
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...,True,True,"(0, 21940, 28681)"
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12,True,True,"(1, 10775, 33546)"
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl,True,True,"(2, 14278)"
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...,True,,
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12,True,True,"(4, 24990)"
...,...,...,...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3,True,True,"(3698, 17119, 19798, 25781, 41231)"
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21,True,True,"(1740, 13123, 21706, 25678, 41232)"
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1,True,True,"(7564, 12861, 29056, 41233)"
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...,True,True,"(574, 16585, 17663, 41234)"


In [7]:
# Display the deduplicated frame returned by the `deduplicate` call above.
post_removed_smiles

Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41195,COc1ccc(C(=O)NC2CCN(C(=S)Nc3cccc(C)c3)CC2)cc1
41197,CCNC(=O)c1ccc(-n2nc(C(F)(F)F)c3c2CCCC3)cc1
41208,O=C(c1ccccc1Nc1ccc(SC(F)F)cc1)N1CCCCC1
41219,CC1=NN(c2ccc(S(=O)(=O)O)cc2C)C(=[OH+])[C@H]1/N...


## 7. Refinement

In [16]:
# Full refinement (`refine_smiles`) in a single call on the raw `smiles` frame;
# the recorded log shows the validation stage followed by the cleaning stage.
post_refined_smiles = Refinement(smiles).refine_smiles(
    output_dir=output_dir,
    get_report=True,
    n_cpu=16,
    split_factor=4,
)
post_refined_smiles

[22:36:42] Explicit valence for atom # 11 N, 4, is greater than permitted
[22:36:42] Explicit valence for atom # 12 N, 4, is greater than permitted
[22:36:42] Explicit valence for atom # 5 N, 4, is greater than permitted
[22:36:42] Explicit valence for atom # 5 N, 4, is greater than permitted
[22:36:42] Explicit valence for atom # 5 N, 4, is greater than permitted
[22:36:42] Explicit valence for atom # 5 N, 4, is greater than permitted
[22:36:42] Explicit valence for atom # 5 N, 4, is greater than permitted
[22:36:42] Explicit valence for atom # 5 N, 4, is greater than permitted
[22:36:42] Explicit valence for atom # 1 N, 4, is greater than permitted
[22:36:42] Explicit valence for atom # 6 N, 4, is greater than permitted
[22:36:42] Explicit valence for atom # 6 N, 4, is greater than permitted

[22:37:40] Can't kekulize mol.  Unkekulized atoms: 3 7
[22:37:40] Can't kekulize mol.  Unkekulized atoms: 3 7
[22:37:40] Can't kekulize mol.  Unkekulized atoms: 3 7
[22:37:40] Can't kekulize mol

*****************************************************************
*								*
*			1. VALIDATION		        	*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 2050
Number of invalid SMILES: 11
Number of valid SMILES: 2039

1.1. MIXTURE REMOVAL
Number of input SMILES: 2039
Number of mixture SMILES: 37
Number of non-mixture SMILES: 2002

1.2. INORGANICS REMOVAL
Number of input SMILES: 2002
Number of inorganic compounds: 1
Number of organic compounds: 2001

1.3. ORGANOMETALLICS REMOVAL
Number of input SMILES: 2001
Number of organometallic compounds: 0
Number of organics: 2001

------------------------------------------------
DUPLICATES REMOVAL
------------------------------------------------
Number of input SMILES: 2001
Number of perfect duplicated SMILES: 50
Number of partial duplicated SMILES: 0
Number of unique SMILES: 1927


*****************************************************************
*								*
*			2. CLEANING		

Unnamed: 0,smiles,p_np
0,CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1,1
1,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,1
2,CC(=O)NCCCOc1cccc(CN2CCCCC2)c1,1
3,Cc1onc(-c2ccccc2Cl)c1C(=O)NC1C(=O)N2C1SC(C)(C)...,1
4,CCN1CCN(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(CSc...,1
...,...,...
1889,O=c1[nH]c2cc(Cl)c(Cl)c([N+](=O)[O-])c2[nH]c1=O,1
1890,C[S+](CCC(N)C(=O)O)CC1OC(n2cnc3c(N)ncnc32)C(O)C1O,1
1891,CC(Cc1ccccc1)n1cc(NC(=O)Nc2ccccc2)[o+]n1,1
1892,CCc1c(C)[n+]([NH-])c(-c2ccc(OC)c(OC)c2)c2cc(OC...,1
