# MEHC-CURATION
## 1. Import libraries

In [1]:
import numpy as np
import pandas as pd
import os
from parallel_pandas import ParallelPandas
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover

In [2]:
from curation.utils import remove_duplicates, RemoveSpecificSMILES
from curation.validate import ValidationStage
from curation.cleaning import *
from curation.normalization import *
from curation.refinement import Refinement

## 2. Initialize CPUs

In [3]:
pd.options.mode.chained_assignment = None  # default='warn'

In [4]:
ParallelPandas.initialize(n_cpu=16, split_factor=4, disable_pr_bar=True)

In [5]:
smiles = pd.read_csv('data/cyp_train.csv')
smiles = pd.Series(smiles.iloc[:, 1])
smiles = pd.DataFrame(smiles)
smiles

Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


In [6]:
reports_dir_path = os.path.join(os.getcwd(), 'reports')
reports_dir_path

'/Users/stevepham/Documents/Bioinfo/NZResearchGr/mehc-curation/reports'

In [7]:
duplicated_smiles = smiles[smiles.duplicated(keep=False)]
duplicated_smiles_include_idx = duplicated_smiles.groupby(duplicated_smiles.columns.tolist(), sort=False).p_apply(lambda x: tuple(x.index)).reset_index(name='index')
post_smiles = smiles.merge(duplicated_smiles_include_idx, on='compound', how='left')
post_smiles

Process SpawnPoolWorker-10:
Process SpawnPoolWorker-17:
Process SpawnPoolWorker-8:
Process SpawnPoolWorker-4:
Process SpawnPoolWorker-11:
Process SpawnPoolWorker-6:
Process SpawnPoolWorker-3:
Process SpawnPoolWorker-9:
Process SpawnPoolWorker-13:
Process SpawnPoolWorker-2:
Process SpawnPoolWorker-12:
Process SpawnPoolWorker-14:
Process SpawnPoolWorker-5:
Process SpawnPoolWorker-15:
Process SpawnPoolWorker-7:
Process SpawnPoolWorker-16:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/testenv/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/envs/testenv/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/envs/testenv/lib/python3.11/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/opt/anaconda3/envs/testenv/lib/python3.11/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res

KeyboardInterrupt: 

In [None]:
duplicated_smiles_include_idx

## 3. Remove invalid SMILES strings
In this part, invalid SMILES strings will be deleted out of the input data. These SMILES strings include SMILES strings with incorrect format, mixture, inorganic, and organometallic compounds.

### 3.1. Validate SMILES data

In [7]:
validating = ValidationStage(smiles)
valid_smiles, invalid_smiles = validating.check_valid_smiles(output_dir_path=reports_dir_path, print_logs=True, get_invalid_smile_indexes=True, get_report=True)
valid_smiles

Number of input SMILES: 41236
Number of valid SMILES: 41236
Number of invalid SMILES: 0



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


### 3.2. Remove mixtures in SMILES data

In [8]:
removing_mixtures = ValidationStage(smiles)
mixture_removed_smiles = removing_mixtures.remove_mixtures(output_dir_path=reports_dir_path, get_report=True, print_logs=True)
mixture_removed_smiles

Number of input SMILES: 41236
Number of valid SMILES: 41236
Number of invalid SMILES: 0

Number of input SMILES: 41236
Number of non-mixture SMILES: 41236
Number of mixture SMILES: 0



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


### 3.3. Remove inorganic compounds in SMILES data

In [10]:
removing_inorganics = ValidationStage(smiles)
inorganic_compounds_removed_data = removing_inorganics.remove_inorganics(output_dir_path=reports_dir_path, get_report=True, print_logs=True)
inorganic_compounds_removed_data

Number of input SMILES: 41236
Number of valid SMILES: 41236
Number of invalid SMILES: 0

Number of input SMILES: 41236
Number of organic compounds: 41225
Number of inorganic compounds: 11
List of inorganic indexes: 
[3867, 5630, 11006, 12769, 13197, 23864, 24454, 25575, 30277, 33457, 41055]
3867. O=P(O)(O)OP(=O)(O)O
5630. NO
11006. NNNNN
12769. O=P(O)(O)OP(=O)(O)O
13197. NO
23864. NO
24454. O=P(O)(O)OP(=O)(O)O
25575. NO
30277. O=P(O)(O)OP(=O)(O)O
33457. NNNNN
41055. NO



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


### 3.4. Remove organometallic compounds in SMILES data

In [11]:
removing_organometallics = ValidationStage(smiles)
organometallic_compounds_removed_data = removing_organometallics.remove_organometallics(output_dir_path=reports_dir_path, get_report=True, print_logs=True)
organometallic_compounds_removed_data

Number of input SMILES: 41236
Number of valid SMILES: 41236
Number of invalid SMILES: 0

Number of input SMILES: 41236
Number of organic compounds: 41236
Number of organometallic compounds: 0



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


### 3.5. Validate SMILES data completely

In [12]:
completely_valid_smiles = ValidationStage(smiles).wholly_validate_smiles(output_dir_path=reports_dir_path, get_report=True)
completely_valid_smiles

VALIDATION STEP:
Number of input SMILES: 41236
Number of valid SMILES: 41236
Number of invalid SMILES: 0

----------
MIXTURES REMOVING STEP:
Number of input SMILES: 41236
Number of non-mixture SMILES: 41236
Number of mixture SMILES: 0

----------
INORGANICS REMOVING STEP:
Number of input SMILES: 41236
Number of organic compounds: 41225
Number of inorganic compounds: 11
List of inorganic indexes: 
[3867, 5630, 11006, 12769, 13197, 23864, 24454, 25575, 30277, 33457, 41055]
3867. O=P(O)(O)OP(=O)(O)O
5630. NO
11006. NNNNN
12769. O=P(O)(O)OP(=O)(O)O
13197. NO
23864. NO
24454. O=P(O)(O)OP(=O)(O)O
25575. NO
30277. O=P(O)(O)OP(=O)(O)O
33457. NNNNN
41055. NO

----------
ORGANOMETALLIC REMOVING STEP:
Number of input SMILES: 41225
Number of organic compounds: 41225
Number of organometallic compounds: 0

----------
VALIDATE COMPLETE!
SUMMARY:
Number of input SMILES: 41236
Number of invalid SMILES: 11
Number of valid SMILES: 41225
List of invalid SMILES indexes: 
[3867, 5630, 11006, 12769, 13197, 2

Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


## 4. Clean SMILES strings
In this step, salts are identified, counter-ions are deleted, and remaining SMILES strings are neutralized.

In [13]:
cleaned_salts = CleaningStage(smiles).clean_salts(output_dir_path=reports_dir_path, get_report=True)
cleaned_salts

Number of input SMILES: 41236
Number of valid SMILES: 41236
Number of invalid SMILES: 0

Pre-cleaned smiles data: 41236
Number of salts were cleaned: 0
Number of substance were missing after cleaning salts: 0
Post-cleaned smiles data: 41236



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


In [7]:
neutralized = CleaningStage(smiles).neutralize(output_dir_path=reports_dir_path, get_report=True)
neutralized

Pre-cleaned smiles data: 41236
Number of salts were cleaned: 9
Post-cleaned smiles data: 41236
List of salts were cleaned:
[3517, 4838, 10908, 20493, 24213, 24882, 27437, 28120, 41219]
3517. O=[N+]([O-])c1cccc2c[n-]nc12
4838. Cc1ccc(NC(=[OH+])c2cc3ccccc3cc2O)c(C)c1
10908. O=[N+]([O-])c1cccc2c[n-]nc12
20493. CCc1cc[n+](-c2nc3ccccc3nc2C(=C=[N-])C#N)cc1
24213. O=[N+]([O-])c1cccc2c[n-]nc12
24882. Cc1ccc(NC(=[OH+])c2cc3ccccc3cc2O)c(C)c1
27437. O=[N+]([O-])c1cccc2c[n-]nc12
28120. Cc1ccc(NC(=[OH+])c2cc3ccccc3cc2O)c(C)c1
41219. CC1=NN(c2ccc(S(=O)(=O)O)cc2C)C(=[OH+])[C@H]1/N=N/c1cc(S(=O)(=O)O)ccc1C(=O)O



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


In [9]:
cleaned_smiles = CleaningStage(smiles).clean_and_neutralize(output_dir_path=reports_dir_path, get_report=True)
cleaned_smiles

VALIDATION STEP:
Number of input SMILES: 41236
Number of valid SMILES: 41236
Number of invalid SMILES: 0

----------
SALTS CLEANING STEP:
Pre-cleaned smiles data: 41236
Number of salts were cleaned: 0
Number of substance were missing after cleaning salts: 0
Post-cleaned smiles data: 41236

----------
NEUTRALIZING STEP:
Pre-cleaned smiles data: 41236
Number of salts were cleaned: 9
Post-cleaned smiles data: 41236
List of salts were cleaned:
[3517, 4838, 10908, 20493, 24213, 24882, 27437, 28120, 41219]
3517. O=[N+]([O-])c1cccc2c[n-]nc12
4838. Cc1ccc(NC(=[OH+])c2cc3ccccc3cc2O)c(C)c1
10908. O=[N+]([O-])c1cccc2c[n-]nc12
20493. CCc1cc[n+](-c2nc3ccccc3nc2C(=C=[N-])C#N)cc1
24213. O=[N+]([O-])c1cccc2c[n-]nc12
24882. Cc1ccc(NC(=[OH+])c2cc3ccccc3cc2O)c(C)c1
27437. O=[N+]([O-])c1cccc2c[n-]nc12
28120. Cc1ccc(NC(=[OH+])c2cc3ccccc3cc2O)c(C)c1
41219. CC1=NN(c2ccc(S(=O)(=O)O)cc2C)C(=[OH+])[C@H]1/N=N/c1cc(S(=O)(=O)O)ccc1C(=O)O

----------
CLEANING COMPLETE!
SUMMARY:
Pre-cleaned SMILES data: 41236
Number

Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


## 5. Normalize tautomers in data
In this step, tautomers in data will be normalized to become only one specific form for all tautomers.

In [7]:
tautomerized_smiles = NormalizingStage(smiles).normalize_tautomer(output_dir_path=reports_dir_path, get_report=True)
tautomerized_smiles

[17:10:39] Can't kekulize mol.  Unkekulized atoms: 10 14
[17:10:39] Can't kekulize mol.  Unkekulized atoms: 10 14
[17:10:39] Can't kekulize mol.  Unkekulized atoms: 5 17 18 19 28 29
[17:10:39] Can't kekulize mol.  Unkekulized atoms: 3 5 6 10
[17:10:39] Can't kekulize mol.  Unkekulized atoms: 11 15
[17:10:39] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[17:10:39] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[17:10:39] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:10:39] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:10:39] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:10:39] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:10:39] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:10:39] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:10:39] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:10:39] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:10:39] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:10:39] Tautomer enumeration stopped at 326 tautomers: max transforms reached
[17

Number of SMILES strings before tautomer normalizing: 41236
Number of SMILES tautomers normalized: 7313
Number of SMILES strings after tautomer normalizing: 41236



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,C=Cc1c(C(C)=O)c(O)c2c(O)c3c(C(=O)O)c(C(=O)O)c(...


In [8]:
stereoisomerized_smiles = NormalizingStage(smiles).normalize_stereoisomer(output_dir_path=reports_dir_path, get_report=True)
stereoisomerized_smiles

Number of SMILES strings before stereoisomer normalizing: 41236
Number of SMILES stereoisomers normalized: 13028
Number of SMILES strings after stereoisomer normalizing: 41236



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


In [9]:
post_normalized_smiles = NormalizingStage(smiles).normalize_tautomer_and_stereoisomer(output_dir_path=reports_dir_path, get_report=True)
post_normalized_smiles

[17:12:14] Can't kekulize mol.  Unkekulized atoms: 5 17 18 19 28 29
[17:12:14] Can't kekulize mol.  Unkekulized atoms: 3 5 6 10
[17:12:14] Can't kekulize mol.  Unkekulized atoms: 10 14
[17:12:14] Can't kekulize mol.  Unkekulized atoms: 10 14
[17:12:14] Can't kekulize mol.  Unkekulized atoms: 11 15
[17:12:14] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[17:12:14] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[17:12:14] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:14] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:14] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:14] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:14] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:14] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:14] Can't kekulize mol.  Unkekulized atoms: 9 10 11 15 24 25
[17:12:14] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:14] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:14] Can't kekulize mol.  Unkekulized atoms: 2 6
[17:12:14] Tauto

VALIDATION STEP:
Number of input SMILES: 41236
Number of valid SMILES: 41236
Number of invalid SMILES: 0

----------
STEREOISOMER NORMALIZATION STEP:
Number of SMILES strings before stereoisomer normalizing: 41236
Number of SMILES stereoisomers normalized: 13028
Number of SMILES strings after stereoisomer normalizing: 41236

----------
TAUTOMER NORMALIZATION STEP:
Number of SMILES strings before tautomer normalizing: 41236
Number of SMILES tautomers normalized: 3984
Number of SMILES strings after tautomer normalizing: 41236

----------
NORMALIZATION COMPLETE!
SUMMARY:
Pre-normalized SMILES data: 41236
Number of stereoisomers normalized: 13028
Number of tautomers normalized: 3984
Post-normalized SMILES data: 41236



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


In [None]:
post_removed_smiles, duplicated_smiles = remove_duplicates(smiles, output_dir_path=reports_dir_path, get_report=True)
duplicated_smiles

In [None]:
post_removed_smiles

# 7. Refinement

In [7]:
post_refined_smiles = Refinement(smiles).refine_smiles()
post_refined_smiles

VALIDATION STEP:
Number of input SMILES: 41236
Number of valid SMILES: 41236
Number of invalid SMILES: 0

----------
MIXTURES REMOVING STEP:
Number of input SMILES: 41236
Number of non-mixture SMILES: 41236
Number of mixture SMILES: 0

----------
INORGANICS REMOVING STEP:
Number of input SMILES: 41236
Number of organic compounds: 41225
Number of inorganic compounds: 11
List of inorganic indexes: 
[3867, 5630, 11006, 12769, 13197, 23864, 24454, 25575, 30277, 33457, 41055]
3867. O=P(O)(O)OP(=O)(O)O
5630. NO
11006. NNNNN
12769. O=P(O)(O)OP(=O)(O)O
13197. NO
23864. NO
24454. O=P(O)(O)OP(=O)(O)O
25575. NO
30277. O=P(O)(O)OP(=O)(O)O
33457. NNNNN
41055. NO

----------
ORGANOMETALLICS REMOVING STEP:
Number of input SMILES: 41225
Number of organic compounds: 41225
Number of organometallic compounds: 0

----------
DUPLICATES REMOVING 1ST TIME (after validation stage):
Number of input SMILES strings: 41225
Number of unique SMILES strings: 14371
Number of duplicate SMILES strings: 26854

---------

[22:40:04] Can't kekulize mol.  Unkekulized atoms: 2 6
[22:40:04] Can't kekulize mol.  Unkekulized atoms: 2 6
[22:40:04] Can't kekulize mol.  Unkekulized atoms: 2 18
[22:40:04] Can't kekulize mol.  Unkekulized atoms: 2 18
[22:40:04] Can't kekulize mol.  Unkekulized atoms: 2 6
[22:40:04] Can't kekulize mol.  Unkekulized atoms: 2 18
[22:40:04] Can't kekulize mol.  Unkekulized atoms: 2 18
[22:40:04] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[22:40:04] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[22:40:04] Can't kekulize mol.  Unkekulized atoms: 9 10 11 22 29 30
[22:40:04] Can't kekulize mol.  Unkekulized atoms: 2 6
[22:40:04] Can't kekulize mol.  Unkekulized atoms: 2 6
[22:40:04] Can't kekulize mol.  Unkekulized atoms: 14 18
[22:40:04] Can't kekulize mol.  Unkekulized atoms: 3 4 5 14 15 18
[22:40:04] Can't kekulize mol.  Unkekulized atoms: 2 6
[22:40:04] Can't kekulize mol.  Unkekulized atoms: 2 6
[22:40:04] Can't kekulize mol.  Unkekulized atoms: 2 6
[22:40:04] Can't kekuli

NORMALIZE TAUTOMERS STEP:
Number of SMILES strings before tautomer normalizing: 14371
Number of SMILES tautomers normalized: 1353
Number of SMILES strings after tautomer normalizing: 14371

----------
DUPLICATES REMOVING 3RD TIME (after normalizing stage):
Number of input SMILES strings: 14371
Number of unique SMILES strings: 14367
Number of duplicate SMILES strings: 4

----------


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41195,COc1ccc(C(=O)NC2CCN(C(=S)Nc3cccc(C)c3)CC2)cc1
41197,CCNC(=O)c1ccc(-n2nc(C(F)(F)F)c3c2CCCC3)cc1
41208,O=C(c1ccccc1Nc1ccc(SC(F)F)cc1)N1CCCCC1
41219,Cc1cc(S(=O)(=O)O)ccc1-n1[nH]c(C)c(N=Nc2cc(S(=O...
