# MEHC-CURATION
## 1. Import libraries

In [10]:
import numpy as np
import pandas as pd
import os
from parallel_pandas import ParallelPandas
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover

In [11]:
# from curation.utils import remove_duplicates
from curation.validate import ValidationStage
from curation.cleaning import *
from curation.normalization import *

## 2. Initialize CPUs

In [12]:
# Turn off pandas' chained-assignment check for the whole session (the pandas default is 'warn').
pd.set_option('mode.chained_assignment', None)

In [13]:
# Start the parallel-pandas workers. The requested worker count (16) is capped at the
# number of cores the machine reports so the notebook does not oversubscribe smaller
# hosts; os.cpu_count() can return None, hence the fallback to a single worker.
n_workers = min(16, os.cpu_count() or 1)
ParallelPandas.initialize(n_cpu=n_workers, split_factor=4, disable_pr_bar=True)

In [14]:
# Load the training table and keep only its second column (position 1) as a
# one-column DataFrame; the curation stages below all take this `smiles` frame as input.
smiles = (
    pd.read_csv('data/cyp_train.csv')
    .iloc[:, 1]
    .to_frame()
)
smiles

Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


In [15]:
# Absolute path of a `reports` folder under the current working directory; it is
# passed as `output_dir_path` to every curation call below.
reports_dir_path = os.path.join(
    os.getcwd(),
    'reports',
)
reports_dir_path

'/Users/stevepham/Documents/Bioinfo/NZResearchGr/mehc-curation/reports'

In [21]:
# Rows whose content occurs more than once (keep=False flags every occurrence, not just the repeats).
repeat_mask = smiles.duplicated(keep=False)
duplicated_smiles = smiles[repeat_mask]

# One row per repeated SMILES, carrying the tuple of original row labels that share it.
group_keys = duplicated_smiles.columns.tolist()
grouped_duplicates = duplicated_smiles.groupby(group_keys, sort=False)
duplicated_smiles_include_idx = (
    grouped_duplicates
    .p_apply(lambda group: tuple(group.index))
    .reset_index(name='index')
)

# Left-join back onto the full table; SMILES that occur only once get no 'index' value.
post_smiles = smiles.merge(duplicated_smiles_include_idx, on='compound', how='left')
post_smiles

Unnamed: 0,compound,index
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...,"(0, 21940, 28681)"
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12,"(1, 10775, 33546)"
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl,"(2, 14278)"
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...,
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12,"(4, 24990)"
...,...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O,"(5254, 14561, 26426, 41230)"
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3,"(3698, 17119, 19798, 25781, 41231)"
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21,"(1740, 13123, 21706, 25678, 41232)"
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1,"(7564, 12861, 29056, 41233)"


In [22]:
# Display the per-SMILES duplicate table (one row per repeated SMILES, with the tuple of row labels).
duplicated_smiles_include_idx

Unnamed: 0,compound,index
0,CN(C(=O)Cc1ccccc1)[C@@H]1CC[C@@]2(CCCO2)C[C@H]...,"(0, 21940, 28681)"
1,COc1ccc(/C=N/NC(N)=S)cc1OC(=O)c1cccc2ccccc12,"(1, 10775, 33546)"
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl,"(2, 14278)"
3,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12,"(4, 24990)"
4,Cc1nc2c(O)cccc2c(=O)[nH]1,"(5, 40785)"
...,...,...
12714,CN1CCCN=C1/C=C\c1cccs1,"(33356, 39432)"
12715,CN1[C@H](CC(=O)c2ccccc2)CCC[C@H]1C[C@H](O)c1cc...,"(33357, 36774)"
12716,CCC(C(=O)NC(C)(C)C)N(C(=O)Cn1nnc(-c2ccccc2F)n1...,"(33382, 40792)"
12717,CN(C)c1ncc2nc(-c3cccc(C#N)c3)c(=O)n(C)c2n1,"(33402, 35428)"


## 3. Remove invalid SMILES strings
In this part, invalid SMILES strings are removed from the input data. These include SMILES strings with an incorrect format, as well as mixtures, inorganic compounds, and organometallic compounds.

### 3.1. Validate SMILES data

In [7]:
# Run the SMILES validity check on the loaded frame; the call returns a pair that is
# unpacked into the valid and the invalid results.
validating = ValidationStage(smiles)
valid_smiles, invalid_smiles = validating.check_valid_smiles(
    output_dir_path=reports_dir_path,
    get_report=True,
    print_logs=True,
    get_invalid_smile_indexes=True,
)
valid_smiles

Number of input SMILES: 41235
Number of valid SMILES: 41235
Number of invalid SMILES: 0



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


### 3.2. Remove mixtures in SMILES data

In [10]:
# Mixture removal, run directly on the loaded `smiles` frame (not on the output of the previous cell).
removing_mixtures = ValidationStage(smiles)
mixture_removed_smiles = removing_mixtures.remove_mixtures(
    output_dir_path=reports_dir_path,
    print_logs=True,
    get_report=True,
)
mixture_removed_smiles

Number of input SMILES: 41235
Number of non-mixture SMILES: 41235
Number of mixture SMILES: 0



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


### 3.3. Remove inorganic compounds in SMILES data

In [16]:
# Inorganic-compound removal, run directly on the loaded `smiles` frame.
removing_inorganics = ValidationStage(smiles)
inorganic_compounds_removed_data = removing_inorganics.remove_inorganic_compounds(
    output_dir_path=reports_dir_path,
    print_logs=True,
    get_report=True,
)
inorganic_compounds_removed_data

Number of input SMILES: 41236
Number of organic compounds: 41225
Number of inorganic compounds: 11
List of inorganic indexes: 
[3867, 5630, 11006, 12769, 13197, 23864, 24454, 25575, 30277, 33457, 41055]
3867. O=P(O)(O)OP(=O)(O)O
5630. NO
11006. NNNNN
12769. O=P(O)(O)OP(=O)(O)O
13197. NO
23864. NO
24454. O=P(O)(O)OP(=O)(O)O
25575. NO
30277. O=P(O)(O)OP(=O)(O)O
33457. NNNNN
41055. NO



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1
41234,CCc1c(O)c2c(c(O)c1C(C)=O)C(=O)c1c(cc(O)c(C(=O)...


### 3.4. Remove organometallic compounds in SMILES data

In [12]:
# Organometallic-compound removal, run directly on the loaded `smiles` frame.
removing_organometallics = ValidationStage(smiles)
organometallic_compounds_removed_data = removing_organometallics.remove_organometallic_compounds(
    output_dir_path=reports_dir_path,
    print_logs=True,
    get_report=True,
)
organometallic_compounds_removed_data

Number of input SMILES: 41235
Number of organic compounds: 41235
Number of organometallic compounds: 0



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


### 3.5. Validate SMILES data completely

In [9]:
# Single combined validation call. The log saved with this cell lists four steps in
# order: validation, mixture removal, inorganic removal, organometallic removal.
completely_valid_smiles = (
    ValidationStage(smiles)
    .wholly_validate_smiles(
        output_dir_path=reports_dir_path,
        get_report=True,
    )
)
completely_valid_smiles

VALIDATION STEP:
Number of input SMILES: 41235
Number of valid SMILES: 41235
Number of invalid SMILES: 0

----------
MIXTURES REMOVING STEP:
Number of input SMILES: 41235
Number of non-mixture SMILES: 41235
Number of mixture SMILES: 0

----------
INORGANICS REMOVING STEP:
Number of input SMILES: 41235
Number of organic compounds: 41224
Number of inorganic compounds: 11
List of inorganic indexes: 
[3867, 5630, 11006, 12769, 13197, 23864, 24454, 25575, 30277, 33457, 41055]
3867. O=P(O)(O)OP(=O)(O)O
5630. NO
11006. NNNNN
12769. O=P(O)(O)OP(=O)(O)O
13197. NO
23864. NO
24454. O=P(O)(O)OP(=O)(O)O
25575. NO
30277. O=P(O)(O)OP(=O)(O)O
33457. NNNNN
41055. NO

----------
ORGANOMETALLIC REMOVING STEP:
Number of input SMILES: 41224
Number of organic compounds: 41224
Number of organometallic compounds: 0

----------
VALIDATE COMPLETE!
SUMMARY:
Number of input SMILES: 41235
Number of invalid SMILES: 11
Number of valid SMILES: 41224
List of invalid SMILES indexes: 
[3867, 5630, 11006, 12769, 13197, 2

Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


## 4. Clean SMILES strings
In this step, salts are identified, counter-ions are deleted, and remaining SMILES strings are neutralized.

In [7]:
# Salt cleaning on the loaded `smiles` frame.
cleaned_salts = (
    CleaningStage(smiles)
    .clean_salts(
        output_dir_path=reports_dir_path,
        get_report=True,
    )
)
cleaned_salts

Pre-cleaned smiles data: 41235
Number of salts were cleaned: 5
Number of substance were missing after cleaning salts: 5
Post-cleaned smiles data: 41230
List of salts were cleaned: 
[4892, 12898, 24951, 30734, 37176]
4892. O=C(O)C(O)C(O)C(=O)O
12898. O=C(O)C(O)C(O)C(=O)O
24951. O=C(O)C(O)C(O)C(=O)O
30734. O=C(O)C(O)C(O)C(=O)O
37176. O=C(O)C(O)C(O)C(=O)O


Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


In [7]:
# Charge neutralization on the loaded `smiles` frame (independent of the salt-cleaning cell).
neutralized = (
    CleaningStage(smiles)
    .neutralize(
        output_dir_path=reports_dir_path,
        get_report=True,
    )
)
neutralized

Pre-cleaned smiles data: 41235
Number of salts were cleaned: 9
Post-cleaned smiles data: 41235
List of salts were cleaned:
[3517, 4838, 10908, 20493, 24213, 24882, 27437, 28120, 41219]
3517. O=[N+]([O-])c1cccc2c[n-]nc12
4838. Cc1ccc(NC(=[OH+])c2cc3ccccc3cc2O)c(C)c1
10908. O=[N+]([O-])c1cccc2c[n-]nc12
20493. CCc1cc[n+](-c2nc3ccccc3nc2C(=C=[N-])C#N)cc1
24213. O=[N+]([O-])c1cccc2c[n-]nc12
24882. Cc1ccc(NC(=[OH+])c2cc3ccccc3cc2O)c(C)c1
27437. O=[N+]([O-])c1cccc2c[n-]nc12
28120. Cc1ccc(NC(=[OH+])c2cc3ccccc3cc2O)c(C)c1
41219. CC1=NN(c2ccc(S(=O)(=O)O)cc2C)C(=[OH+])C1N=Nc1cc(S(=O)(=O)O)ccc1C(=O)O



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


In [7]:
# Single combined cleaning call. The log saved with this cell lists three steps in
# order: validation, salt cleaning, neutralizing.
cleaned_smiles = (
    CleaningStage(smiles)
    .clean_and_neutralize(
        output_dir_path=reports_dir_path,
        get_report=True,
    )
)
cleaned_smiles

VALIDATION STEP:
Number of input SMILES: 41235
Number of valid SMILES: 41235
Number of invalid SMILES: 0

----------
SALTS CLEANING STEP:
Pre-cleaned smiles data: 41235
Number of salts were cleaned: 5
Number of substance were missing after cleaning salts: 5
Post-cleaned smiles data: 41230
List of salts were cleaned: 
[4892, 12898, 24951, 30734, 37176]
4892. O=C(O)C(O)C(O)C(=O)O
12898. O=C(O)C(O)C(O)C(=O)O
24951. O=C(O)C(O)C(O)C(=O)O
30734. O=C(O)C(O)C(O)C(=O)O
37176. O=C(O)C(O)C(O)C(=O)O

----------
NEUTRALIZING STEP:
Pre-cleaned smiles data: 41230
Number of salts were cleaned: 9
Post-cleaned smiles data: 41230
List of salts were cleaned:
[3517, 4838, 10908, 20493, 24213, 24882, 27437, 28120, 41219]
3517. O=[N+]([O-])c1cccc2c[n-]nc12
4838. Cc1ccc(NC(=[OH+])c2cc3ccccc3cc2O)c(C)c1
10908. Cc1noc(C)c1C(=O)N1CCC2(CC1)CN(c1ccccc1)C2
20493. c1csc(CNc2ccnc(-c3ccoc3)n2)c1
24213. C(#CCN1CCCC1)CN1CCCC1
24882. O=C(O)C1CCCCC1C(=O)Nc1ccccc1Cl
27437. O=S(=O)(c1cccc2c(I)cccc12)N1CCCNCC1
28120. O=C(CSc

Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


## 5. Normalize tautomers in data
In this step, tautomers in the data are normalized so that all tautomeric forms of a compound are represented by one specific form.

In [7]:
# Tautomer normalization on the loaded `smiles` frame.
# NOTE(review): the output saved with this cell ends in an RDKit
# `RuntimeError: Invariant Violation` (Canon.cpp), so `tautomerized_smiles` may not
# have been assigned in that run - confirm the cell completes on the current data.
tautomerized_smiles = (
    NormalizingStage(smiles)
    .normalize_tautomer(
        output_dir_path=reports_dir_path,
        get_report=True,
    )
)
tautomerized_smiles

[16:02:12] Can't kekulize mol.  Unkekulized atoms: 5 17 18 19 28 29
[16:02:12] Can't kekulize mol.  Unkekulized atoms: 3 5 6 10
[16:02:12] Can't kekulize mol.  Unkekulized atoms: 10 14
[16:02:12] Can't kekulize mol.  Unkekulized atoms: 10 14
[16:02:12] Can't kekulize mol.  Unkekulized atoms: 11 15
[16:02:12] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[16:02:12] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[16:02:12] Can't kekulize mol.  Unkekulized atoms: 2 6
[16:02:12] Can't kekulize mol.  Unkekulized atoms: 2 6
[16:02:12] Can't kekulize mol.  Unkekulized atoms: 2 6
[16:02:12] Can't kekulize mol.  Unkekulized atoms: 2 6
[16:02:12] Can't kekulize mol.  Unkekulized atoms: 2 6
[16:02:12] Can't kekulize mol.  Unkekulized atoms: 2 6
[16:02:12] Can't kekulize mol.  Unkekulized atoms: 2 6
[16:02:12] Can't kekulize mol.  Unkekulized atoms: 2 6
[16:02:12] Can't kekulize mol.  Unkekulized atoms: 2 6
[16:02:12] Can't kekulize mol.  Unkekulized atoms: 9 10 11 15 24 25
[16:02:12] Tauto

RuntimeError: Invariant Violation
	could not find atom2
	Violation occurred on line 228 in file Code/GraphMol/Canon.cpp
	Failed Expression: firstFromAtom2
	RDKIT: 2023.09.6
	BOOST: 1_78


In [9]:
# Stereoisomer normalization on the loaded `smiles` frame.
stereoisomerized_smiles = (
    NormalizingStage(smiles)
    .normalize_stereoisomer(
        output_dir_path=reports_dir_path,
        get_report=True,
    )
)
stereoisomerized_smiles

Number of SMILES strings before stereoisomer normalizing: 41235
Number of SMILES stereoisomers normalized: 13028
Number of SMILES strings after stereoisomer normalizing: 41235



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


In [7]:
# Single combined normalization call. The log saved with this cell lists three steps in
# order: validation, stereoisomer normalization, tautomer normalization.
post_normalized_smiles = (
    NormalizingStage(smiles)
    .normalize_tautomer_and_stereoisomer(
        output_dir_path=reports_dir_path,
        get_report=True,
    )
)
post_normalized_smiles

[16:05:01] Can't kekulize mol.  Unkekulized atoms: 10 14
[16:05:01] Can't kekulize mol.  Unkekulized atoms: 5 17 18 19 28 29
[16:05:01] Can't kekulize mol.  Unkekulized atoms: 10 14
[16:05:01] Can't kekulize mol.  Unkekulized atoms: 3 5 6 10
[16:05:01] Can't kekulize mol.  Unkekulized atoms: 11 15
[16:05:01] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[16:05:01] Can't kekulize mol.  Unkekulized atoms: 6 9 13 16
[16:05:01] Can't kekulize mol.  Unkekulized atoms: 2 6
[16:05:01] Can't kekulize mol.  Unkekulized atoms: 2 6
[16:05:01] Can't kekulize mol.  Unkekulized atoms: 2 6
[16:05:01] Can't kekulize mol.  Unkekulized atoms: 2 6
[16:05:01] Can't kekulize mol.  Unkekulized atoms: 2 6
[16:05:01] Can't kekulize mol.  Unkekulized atoms: 2 6
[16:05:01] Can't kekulize mol.  Unkekulized atoms: 2 6
[16:05:01] Tautomer enumeration stopped at 162 tautomers: max transforms reached
[16:05:01] Tautomer enumeration stopped at 355 tautomers: max transforms reached
[16:05:01] Can't kekulize mol.  

VALIDATION STEP:
Number of input SMILES: 41235
Number of valid SMILES: 41235
Number of invalid SMILES: 0

----------
STEREOISOMER NORMALIZATION STEP:
Number of SMILES strings before stereoisomer normalizing: 41235
Number of SMILES stereoisomers normalized: 13028
Number of SMILES strings after stereoisomer normalizing: 41235

----------
TAUTOMER NORMALIZATION STEP:
Number of SMILES strings before tautomer normalizing: 41235
Number of SMILES tautomers normalized: 3984
Number of SMILES strings after tautomer normalizing: 41235

----------
NORMALIZATION COMPLETE!
SUMMARY:
Pre-normalized SMILES data: 41235
Number of stereoisomers normalized: 13028
Number of tautomers normalized: 3984
Post-normalized SMILES data: 41235



Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1


In [7]:
# Duplicate removal: the call returns a pair, unpacked into the de-duplicated frame and
# the duplicate table. This rebinds `duplicated_smiles`, which an earlier cell also assigns.
# NOTE(review): `remove_duplicates_in_dataframe` is not imported by name in this notebook;
# presumably it arrives through one of the `curation` star imports - confirm.
post_removed_smiles, duplicated_smiles = remove_duplicates_in_dataframe(
    smiles,
    output_dir_path=reports_dir_path,
    get_report=True,
)
duplicated_smiles

Number of input SMILES strings: 41235
Number of unique SMILES strings: 14373
Number of duplicate SMILES strings: 26862



Unnamed: 0,compound,index
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1,"(0, 21940, 28681)"
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12,"(1, 10775, 33546)"
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl,"(2, 14278)"
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...,
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12,"(4, 24990)"
...,...,...
41230,NS(=O)(=O)c1cc2c(cc1Cl)NC=NS2(=O)=O,"(5254, 14561, 26426, 41230)"
41231,CN1CCCC1CCNC1C2CC3CC(C2)CC1C3,"(3698, 17119, 19798, 25781, 41231)"
41232,Cn1c(=O)c(-c2ccc(F)cc2)nc2cnc(OCc3ccccc3)nc21,"(1740, 13123, 21706, 25678, 41232)"
41233,O=C(CNS(=O)(=O)c1ccc(Br)cc1)N1CCOCC1,"(7564, 12861, 29056, 41233)"


In [8]:
# Display the frame left after duplicate removal (first element returned by remove_duplicates_in_dataframe).
post_removed_smiles

Unnamed: 0,compound
0,CN(C(=O)Cc1ccccc1)C1CCC2(CCCO2)CC1N1CCCC1
1,COc1ccc(C=NNC(N)=S)cc1OC(=O)c1cccc2ccccc12
2,COc1ccccc1COC(=O)Nc1c(C)nn(C)c1Cl
3,COc1ccc(Cl)cc1S(=O)(=O)N1CCN(C(=O)c2cc(OC)c(OC...
4,O=C(c1ccccc1F)c1cn(Cc2ccccc2)c2ccccc12
...,...
41154,O=C(NCC1CCCO1)c1cccc(F)c1
41195,COc1ccc(C(=O)NC2CCN(C(=S)Nc3cccc(C)c3)CC2)cc1
41197,CCNC(=O)c1ccc(-n2nc(C(F)(F)F)c3c2CCCC3)cc1
41208,O=C(c1ccccc1Nc1ccc(SC(F)F)cc1)N1CCCCC1
