# MEHC-CURATION
## 0. Install libraries

In [1]:
%pip install parallel_pandas
%conda install conda-forge::rdkit

Note: you may need to restart the kernel to use updated packages.
Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): done
Solving environment: \ 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/osx-arm64::notebook-shim==0.2.3=py311hca03da5_0
  - defaults/osx-arm64::jsonschema==4.19.2=py311hca03da5_0
  - defaults/osx-arm64::jsonschema-specifications==2023.7.1=py311hca03da5_0
  - defaults/osx-arm64::nbconvert==7.10.0=py311hca03da5_0
  - defaults/osx-arm64::jupyterlab_server==2.25.1=py311hca03da5_0
  - conda-forge/osx-arm64::rdkit==2022.09.5=py311h912b5b5_0
  - defaults/osx-arm64::pandas==2.2.1=py311h7aedaa7_0
  - defaults/osx-arm64::referencing==0.30.2=py311hca03da5_0
  - defaults/osx-arm64::distributed==2024.5.0=py311hca03da5_0
  - defaults/osx-arm64::jupyterlab==4.0.11=py311hca03da5_0
  - defaults/osx-arm64::nbclient==0.8.0=py311hca03da5_

## 1. Import libraries

In [1]:
import numpy as np
import pandas as pd
import os
from parallel_pandas import ParallelPandas
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover

In [2]:
from curation.utils import deduplicate, RemoveSpecificSMILES, GetReport
from curation.validate import ValidationStage
from curation.cleaning import *
from curation.normalization import *
from curation.refinement import Refinement

## 2. Initialize CPUs, import data set and set-up the output directory

In [3]:
pd.options.mode.chained_assignment = None  # default='warn'

In [5]:
smiles = pd.read_csv('ml_testing/tox21/tox21.csv')
col = list(smiles.columns)
col.reverse()
smiles = smiles[col]
smiles

Unnamed: 0,smiles,mol_id,SR-p53,SR-MMP,SR-HSE,SR-ATAD5,SR-ARE,NR-PPAR-gamma,NR-ER-LBD,NR-ER,NR-Aromatase,NR-AhR,NR-AR-LBD,NR-AR
0,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,TOX3021,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,1.0,0.0,0.0
1,CCN1C(=O)NC(c2ccccc2)C1=O,TOX3020,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,TOX3024,,,0.0,,0.0,,,,,,,
3,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,TOX3027,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CC(O)(P(=O)(O)O)P(=O)(O)O,TOX20800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7826,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,TOX2725,,,0.0,,0.0,,,,,,,
7827,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,TOX2370,0.0,,0.0,0.0,,,0.0,1.0,0.0,0.0,1.0,1.0
7828,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,TOX2371,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
7829,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,TOX2377,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,,0.0,1.0,1.0


In [6]:
output_dir = os.path.join(os.getcwd(), 'reports')
output_dir

'/Users/stevepham/Documents/Bioinfo/NZResearchGr/mehc-curation/reports'

## 3. Remove invalid SMILES strings
In this part, invalid SMILES strings will be deleted out of the input data. These SMILES strings include SMILES strings with incorrect format, mixture, inorganic, and organometallic compounds.

### 3.1. Validate SMILES data

In [7]:
valid_smiles = ValidationStage(smiles).validate_smi(output_dir=output_dir, get_report=True, print_logs=True, n_cpu=16, split_factor=4)
valid_smiles



*****************************************************************
*								*
*			1. VALIDATION		        	*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 7831
Number of invalid SMILES: 0
Number of valid SMILES: 7831


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************




Unnamed: 0,smiles,mol_id,SR-p53,SR-MMP,SR-HSE,SR-ATAD5,SR-ARE,NR-PPAR-gamma,NR-ER-LBD,NR-ER,NR-Aromatase,NR-AhR,NR-AR-LBD,NR-AR
0,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,TOX3021,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,1.0,0.0,0.0
1,CCN1C(=O)NC(c2ccccc2)C1=O,TOX3020,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,TOX3024,,,0.0,,0.0,,,,,,,
3,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,TOX3027,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CC(O)(P(=O)(O)O)P(=O)(O)O,TOX20800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7826,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,TOX2725,,,0.0,,0.0,,,,,,,
7827,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,TOX2370,0.0,,0.0,0.0,,,0.0,1.0,0.0,0.0,1.0,1.0
7828,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,TOX2371,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
7829,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,TOX2377,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,,0.0,1.0,1.0


### 3.2. Remove mixtures in SMILES data

In [8]:
mixture_removed_smiles = ValidationStage(valid_smiles).rm_mixtures(output_dir=output_dir, get_report=True, print_logs=True, n_cpu=16, split_factor=4)
mixture_removed_smiles



*****************************************************************
*								*
*			1. VALIDATION		        	*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 7831
Number of invalid SMILES: 0
Number of valid SMILES: 7831

1. MIXTURE REMOVAL
Number of input SMILES: 7831
Number of mixture SMILES: 220
Number of non-mixture SMILES: 7611


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


Unnamed: 0,smiles,mol_id,SR-p53,SR-MMP,SR-HSE,SR-ATAD5,SR-ARE,NR-PPAR-gamma,NR-ER-LBD,NR-ER,NR-Aromatase,NR-AhR,NR-AR-LBD,NR-AR
0,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,TOX3021,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,1.0,0.0,0.0
1,CCN1C(=O)NC(c2ccccc2)C1=O,TOX3020,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,TOX3024,,,0.0,,0.0,,,,,,,
3,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,TOX3027,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CC(O)(P(=O)(O)O)P(=O)(O)O,TOX20800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7826,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,TOX2725,,,0.0,,0.0,,,,,,,
7827,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,TOX2370,0.0,,0.0,0.0,,,0.0,1.0,0.0,0.0,1.0,1.0
7828,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,TOX2371,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
7829,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,TOX2377,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,,0.0,1.0,1.0


### 3.3. Remove inorganic compounds in SMILES data

In [8]:
inorganic_compounds_removed_data = ValidationStage(mixture_removed_smiles).rm_inorganics(output_dir=output_dir, get_report=True, print_logs=True, n_cpu=16, split_factor=4)
inorganic_compounds_removed_data

*****************************************************************
*								*
*			1. VALIDATION		        	*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 41236
Number of invalid SMILES: 0
Number of valid SMILES: 41236

2. INORGANICS REMOVAL
Number of input SMILES: 41236
Number of inorganic compounds: 11
Number of organic compounds: 41225


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


### 3.4. Remove organometallic compounds in SMILES data

In [10]:
organometallic_compounds_removed_data = ValidationStage(smiles).rm_organometallics(output_dir=output_dir, get_report=True, print_logs=True, n_cpu=16, split_factor=4)
organometallic_compounds_removed_data

n_cpu: 16, split_factor: 4
                                            compound  is_valid
0  CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...      True
1       COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12      True
2                  COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl      True
3  COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...      True
4             O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12      True
*****************************************************************
*								*
*			1. VALIDATION		        	*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 41236
Number of invalid SMILES: 0
Number of valid SMILES: 41236

3. ORGANOMETALLICS REMOVAL
Number of input SMILES: 41236
Number of organometallic compounds: 0
Number of organics: 41236


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


### 3.5. Validate SMILES data completely

In [9]:
completely_valid_smiles = ValidationStage(smiles).complete_validation(output_dir=output_dir, get_report=True, n_cpu=16, split_factor=4)
completely_valid_smiles



*****************************************************************
*								*
*			1. VALIDATION		        	*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 7831
Number of invalid SMILES: 0
Number of valid SMILES: 7831

1. MIXTURE REMOVAL
Number of input SMILES: 7831
Number of mixture SMILES: 220
Number of non-mixture SMILES: 7611

2. INORGANICS REMOVAL
Number of input SMILES: 7611
Number of inorganic compounds: 78
Number of organic compounds: 7533

3. ORGANOMETALLICS REMOVAL
Number of input SMILES: 7533
Number of organometallic compounds: 38
Number of organics: 7495

------------------------------------------------
DUPLICATES REMOVAL
------------------------------------------------
Number of input SMILES: 7495
Number of duplicated SMILES: 0
Number of unique SMILES: 7495


*****************************************************************
*								*
*			-----END----- 				*
*								*
**********************************

Unnamed: 0,smiles,mol_id,SR-p53,SR-MMP,SR-HSE,SR-ATAD5,SR-ARE,NR-PPAR-gamma,NR-ER-LBD,NR-ER,NR-Aromatase,NR-AhR,NR-AR-LBD,NR-AR
0,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,TOX3021,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,1.0,0.0,0.0
1,CCN1C(=O)NC(c2ccccc2)C1=O,TOX3020,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,TOX3024,,,0.0,,0.0,,,,,,,
3,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,TOX3027,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CC(O)(P(=O)(O)O)P(=O)(O)O,TOX20800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7490,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,TOX2725,,,0.0,,0.0,,,,,,,
7491,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,TOX2370,0.0,,0.0,0.0,,,0.0,1.0,0.0,0.0,1.0,1.0
7492,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,TOX2371,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
7493,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,TOX2377,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,,0.0,1.0,1.0


## 4. Clean SMILES strings
In this step, salts are identified, counter-ions are deleted, and remaining SMILES strings are neutralized.

In [10]:
cleaned_salts = CleaningStage(smiles).cl_salts(output_dir=output_dir, get_report=True, param_deduplicate=True, n_cpu=16, split_factor=4)
cleaned_salts


*****************************************************************
*								*
*			2. CLEANING				*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 41236
Number of invalid SMILES: 0
Number of valid SMILES: 41236

2.1. SALTS CLEANING
Number of input SMILES: 41236
Number of desalted compounds: 0
Number of unprocessable compounds: 0
Number of post-desalting compounds: 41236

------------------------------------------------
DUPLICATES REMOVAL
------------------------------------------------
Number of input SMILES: 41236
Number of duplicated SMILES: 26862
Number of unique SMILES: 14374


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
14369,COc1ccc(C(=O)NC2CCN(C(=S)Nc3cccc(C)c3)CC2)cc1
14370,CCNC(=O)c1ccc(-n2nc(C(F)(F)F)c3c2CCCC3)cc1
14371,O=C(c1ccccc1Nc1ccc(SC(F)F)cc1)N1CCCCC1
14372,CC1=NN(c2ccc(S(=O)(=O)O)cc2C)C(=[OH+])[C@H]1/N...


In [11]:
neutralized = CleaningStage(smiles).neutralize(output_dir=output_dir, get_report=True, param_deduplicate=True, n_cpu=16, split_factor=4)
neutralized


*****************************************************************
*								*
*			2. CLEANING				*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 41236
Number of invalid SMILES: 0
Number of valid SMILES: 41236

2.2. NEUTRALIZATION
Number of input SMILES: 41236
Number of neutralized compounds: 9
Number of post-neutralization compounds: 41236

------------------------------------------------
DUPLICATES REMOVAL
------------------------------------------------
Number of input SMILES: 41236
Number of duplicated SMILES: 26862
Number of unique SMILES: 14374


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
14369,COc1ccc(C(=O)NC2CCN(C(=S)Nc3cccc(C)c3)CC2)cc1
14370,CCNC(=O)c1ccc(-n2nc(C(F)(F)F)c3c2CCCC3)cc1
14371,O=C(c1ccccc1Nc1ccc(SC(F)F)cc1)N1CCCCC1
14372,CC1=NN(c2ccc(S(=O)(=O)O)cc2C)C(=O)[C@H]1/N=N/c...


In [10]:
cleaned_smiles = CleaningStage(completely_valid_smiles).cl_and_neutralize(output_dir=output_dir, get_report=True, param_deduplicate=True, n_cpu=16, split_factor=4)
cleaned_smiles


*****************************************************************
*								*
*			2. CLEANING				*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 7495
Number of invalid SMILES: 0
Number of valid SMILES: 7495

2.1. SALTS CLEANING
Number of input SMILES: 7495
Number of desalted compounds: 4
Number of unprocessable compounds: 0
Number of post-desalting compounds: 7495

2.2. NEUTRALIZATION
Number of input SMILES: 7495
Number of neutralized compounds: 360
Number of unprocessable compound: 0
Number of post-neutralization compounds: 7495

------------------------------------------------
DUPLICATES REMOVAL
------------------------------------------------
Number of input SMILES: 7495
Number of duplicated SMILES: 0
Number of unique SMILES: 7495


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


Unnamed: 0,smiles,mol_id,SR-p53,SR-MMP,SR-HSE,SR-ATAD5,SR-ARE,NR-PPAR-gamma,NR-ER-LBD,NR-ER,NR-Aromatase,NR-AhR,NR-AR-LBD,NR-AR
0,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,TOX3021,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,1.0,0.0,0.0
1,CCN1C(=O)NC(c2ccccc2)C1=O,TOX3020,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,TOX3024,,,0.0,,0.0,,,,,,,
3,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,TOX3027,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,TOX20800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7490,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,TOX2725,,,0.0,,0.0,,,,,,,
7491,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,TOX2370,0.0,,0.0,0.0,,,0.0,1.0,0.0,0.0,1.0,1.0
7492,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,TOX2371,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
7493,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,TOX2377,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,,0.0,1.0,1.0


## 5. Normalize tautomers in data
In this step, tautomers in data will be normalized to become only one specific form for all tautomers.

In [13]:
tautomerized_smiles = NormalizingStage(smiles).detautomerize(output_dir=output_dir, get_report=True, param_deduplicate=True, n_cpu=16, split_factor=4)
tautomerized_smiles

[17:42:22] Can't kekulize mol.  Unkekulized atoms: 11 15
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 5 17 18 19 28 29
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 10 14
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 3 5 6 10
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 10 14
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:42:22] Tautomer enumeration stopped at 162 tautomers: max transforms reached
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 9 10 11 15 24 25
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:42:22] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:42:22] Can't kekulize mol.  Unkekulized a


*****************************************************************
*								*
*			3. NORMALIZATION			*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 41236
Number of invalid SMILES: 0
Number of valid SMILES: 41236

3.2. DETAUTOMERIZATION
Number of input SMILES: 41236
Number of detautomerized compounds: 7313
Number of post-detautomerization compounds: 41236

------------------------------------------------
DUPLICATES REMOVAL
------------------------------------------------
Number of input SMILES: 41236
Number of duplicated SMILES: 26865
Number of unique SMILES: 14371


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
14366,COc1ccc(C(=O)NC2CCN(C(=S)Nc3cccc(C)c3)CC2)cc1
14367,CCNC(=O)c1ccc(-n2nc(C(F)(F)F)c3c2CCCC3)cc1
14368,O=C(c1ccccc1Nc1ccc(SC(F)F)cc1)N1CCCCC1
14369,Cc1cc(S(=O)(=O)O)ccc1-n1[nH]c(C)c(N=Nc2cc(S(=O...


In [14]:
stereoisomerized_smiles = NormalizingStage(cleaned_smiles).destereoisomerize(output_dir=output_dir, get_report=True, param_deduplicate=True, n_cpu=16, split_factor=4)
stereoisomerized_smiles

KeyboardInterrupt: 

In [13]:
post_validate = ValidationStage(stereoisomerized_smiles).validate_smi(output_dir=output_dir, get_report=True, n_cpu=16, split_factor=4)

*****************************************************************
*								*
*			1. VALIDATION		        	*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 7492
Number of invalid SMILES: 375
Number of valid SMILES: 7117


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


In [11]:
post_normalized_smiles = NormalizingStage(cleaned_smiles).complete_normalization(output_dir=output_dir, get_report=True, param_deduplicate=True, n_cpu=16, split_factor=4)
post_normalized_smiles

[23:18:58] Can't kekulize mol.  Unkekulized atoms: 5 10
[23:18:58] Can't kekulize mol.  Unkekulized atoms: 3 12
[23:18:58] Can't kekulize mol.  Unkekulized atoms: 2 6
[23:18:58] Can't kekulize mol.  Unkekulized atoms: 2 6
[23:18:58] Can't kekulize mol.  Unkekulized atoms: 4 9
[23:18:58] Tautomer enumeration stopped at 318 tautomers: max transforms reached
[23:18:58] Tautomer enumeration stopped at 189 tautomers: max transforms reached
[23:18:58] Can't kekulize mol.  Unkekulized atoms: 3 5 6 10
[23:18:58] Tautomer enumeration stopped at 346 tautomers: max transforms reached
[23:18:58] Tautomer enumeration stopped at 327 tautomers: max transforms reached
[23:18:58] Can't kekulize mol.  Unkekulized atoms: 3 8
[23:18:58] Tautomer enumeration stopped at 296 tautomers: max transforms reached
[23:18:58] Tautomer enumeration stopped at 164 tautomers: max transforms reached
[23:18:59] Tautomer enumeration stopped at 1000 tautomers: max tautomers reached
[23:18:59] Tautomer enumeration stopped a

TypeError: No registered converter was able to produce a C++ rvalue of type std::__1::basic_string<wchar_t, std::__1::char_traits<wchar_t>, std::__1::allocator<wchar_t>> from this Python object of type float

In [15]:
post_removed_smiles, duplicated_smiles = deduplicate(smiles, validate=False, show_dup_smi_and_idx=True, output_dir=output_dir, get_report=True, n_cpu=16, split_factor=4)
duplicated_smiles

------------------------------------------------
DUPLICATES REMOVAL
------------------------------------------------
Number of input SMILES: 41236
Number of duplicated SMILES: 26862
Number of unique SMILES: 14374


*****************************************************************
*								*
*			-----END----- 				*
*								*
*****************************************************************


Unnamed: 0,compound,is_valid_x,is_valid_y,index
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...,True,True,"(0, 21940, 28681)"
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12,True,True,"(1, 10775, 33546)"
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl,True,True,"(2, 14278)"
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...,True,,
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12,True,True,"(4, 24990)"
...,...,...,...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3,True,True,"(3698, 17119, 19798, 25781, 41231)"
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21,True,True,"(1740, 13123, 21706, 25678, 41232)"
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1,True,True,"(7564, 12861, 29056, 41233)"
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...,True,True,"(574, 16585, 17663, 41234)"


In [7]:
post_removed_smiles

Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41195,COc1ccc(C(=O)NC2CCN(C(=S)Nc3cccc(C)c3)CC2)cc1
41197,CCNC(=O)c1ccc(-n2nc(C(F)(F)F)c3c2CCCC3)cc1
41208,O=C(c1ccccc1Nc1ccc(SC(F)F)cc1)N1CCCCC1
41219,CC1=NN(c2ccc(S(=O)(=O)O)cc2C)C(=[OH+])[C@H]1/N...


# 7. Refinement

In [16]:
post_refined_smiles = Refinement(smiles).refine_smiles(output_dir=output_dir, get_report=True, n_cpu=16, split_factor=4)
post_refined_smiles

[17:49:06] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:49:06] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:49:06] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:49:06] Can't kekulize mol.  Unkekulized atoms: 2 18
[17:49:06] Can't kekulize mol.  Unkekulized atoms: 2 18
[17:49:06] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[17:49:06] Can't kekulize mol.  Unkekulized atoms: 2 18
[17:49:06] Can't kekulize mol.  Unkekulized atoms: 2 18
[17:49:06] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[17:49:06] Can't kekulize mol.  Unkekulized atoms: 9 10 11 22 29 30
[17:49:06] Can't kekulize mol.  Unkekulized atoms: 14 18
[17:49:06] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:49:06] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:49:06] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:49:06] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:49:06] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:49:06] Can't kekulize mol.  Unkekulized atoms: 3 4 5 14 15 18
[17:49:06] Can't kekuli

*****************************************************************
*								*
*			1. VALIDATION		        	*
*								*
*****************************************************************

0. VALIDITY CHECK
Number of input SMILES: 41236
Number of invalid SMILES: 0
Number of valid SMILES: 41236

1. MIXTURE REMOVAL
Number of input SMILES: 41236
Number of mixture SMILES: 0
Number of non-mixture SMILES: 41236

2. INORGANICS REMOVAL
Number of input SMILES: 41236
Number of inorganic compounds: 11
Number of organic compounds: 41225

3. ORGANOMETALLICS REMOVAL
Number of input SMILES: 41225
Number of organometallic compounds: 0
Number of organics: 41225

------------------------------------------------
DUPLICATES REMOVAL
------------------------------------------------
Number of input SMILES: 41225
Number of duplicated SMILES: 26854
Number of unique SMILES: 14371


*****************************************************************
*								*
*			2. CLEANING				*
*								*
**************************

Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
14362,COc1ccc(C(=O)NC2CCN(C(=S)Nc3cccc(C)c3)CC2)cc1
14363,CCNC(=O)c1ccc(-n2nc(C(F)(F)F)c3c2CCCC3)cc1
14364,O=C(c1ccccc1Nc1ccc(SC(F)F)cc1)N1CCCCC1
14365,Cc1cc(S(=O)(=O)O)ccc1-n1[nH]c(C)c(N=Nc2cc(S(=O...
