# Evaluate Pre-training Results
This notebook evaluates the SMILES generated by the models in the pre-training stage.

In [None]:
import sys
sys.path.append('../')

from src.paths import ensure_dirs, PRETRAIN_DATA, PRETRAIN_RESULTS, PRETRAIN_OUT
# Run the project's directory setup before any cell reads or writes data
# (presumably creates missing folders such as PRETRAIN_OUT — confirm in src.paths).
ensure_dirs()

# Names of the pre-training datasets evaluated in this notebook. Each name is
# interpolated into the per-dataset input paths used by the cells below.
dataset_list = [
    'pubchem_filtered_ac',
    'pubchem_unfiltered_ac',
    'pubchem_inac',
    'chembl_filtered',
    'chembl_unfiltered',
    'zinc',
]

### 1. Metrics 

For each dataset, we evaluate the generated molecules in terms of:
- **Validity**: fraction of chemically valid SMILES
- **Uniqueness**: fraction of unique valid SMILES
- **Novelty**: fraction of unique valid SMILES not present in the training set
- **Average length**: average number of characters in canonical SMILES

The results are summarized across datasets and saved as `pretrain_metrics.tsv`.

In [None]:
from src.model.generative_models.generative_models.utils import Smiles2RDKitCanonicalSmiles
from src.metrics import gen_smiles_metrics
import pandas as pd

# Evaluate the generated SMILES of every dataset against that dataset's own
# training set. `gen_smiles_metrics` is a project helper; the saved table shows
# it reporting validity, uniqueness, novelty and avg_length.
metrics_dict = []  # one metrics record per dataset, in `dataset_list` order
for dataset in dataset_list:
    train_smiles_df = pd.read_table(f'{PRETRAIN_DATA}/sampled_datasets/{dataset}.tsv', index_col=0)
    gen_smiles_df   = pd.read_table(f'{PRETRAIN_RESULTS}/{dataset}_results/sampling_10000/gen_smiles.tsv', index_col=0)

    # Bound to `dataset_metrics` rather than `dict`: rebinding the builtin name
    # would persist in the kernel and break any later `dict(...)` call.
    dataset_metrics = gen_smiles_metrics(train_smiles=train_smiles_df, gen_smiles=gen_smiles_df,
                                         trainsmi_col='rdkit_smiles', gensmi_col='smiles')
    metrics_dict.append(dataset_metrics)

# `metrics_df` has one row per dataset; the transposed view (one row per
# metric) is what gets written to disk and displayed as the cell output.
metrics_df = pd.DataFrame(metrics_dict, index=dataset_list).round(3)
metrics_df.T.to_csv(f'{PRETRAIN_OUT}/pretrain_metrics.tsv', sep='\t')
metrics_df.T

[23:59:34] Can't kekulize mol.  Unkekulized atoms: 2 3 12 13 15 16 17
[23:59:34] SMILES Parse Error: syntax error while parsing: N#Cc1ccccc1NC(=O)CSc1nnc(-c2ccccc2O)n1-
[23:59:34] SMILES Parse Error: Failed parsing SMILES 'N#Cc1ccccc1NC(=O)CSc1nnc(-c2ccccc2O)n1-' for input: 'N#Cc1ccccc1NC(=O)CSc1nnc(-c2ccccc2O)n1-'
[23:59:34] Can't kekulize mol.  Unkekulized atoms: 2 3 4 20 21
[23:59:34] Explicit valence for atom # 8 O, 3, is greater than permitted
[23:59:34] SMILES Parse Error: unclosed ring for input: 'CC(C)c1cc(C(=O)NC1CCCCC1)n1cncc1'
[23:59:34] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15 16 17 18 19 20 21
[23:59:34] Can't kekulize mol.  Unkekulized atoms: 6 7 10
[23:59:34] SMILES Parse Error: ring closure 1 duplicates bond between atom 14 and atom 15 for input: 'Cn1nc(C(F)(F)F)c(C)c1NC(=O)C1C1C2CCC1C2'
[23:59:34] SMILES Parse Error: unclosed ring for input: 'CC(C)n1nnnc1C(c1ccccc1F)N1CCN(CCSCC2)CC1'
[23:59:34] SMILES Parse Error: unclosed ring for input: 'COc1ccc(-c2cc(C

[23:59:35] SMILES Parse Error: extra close parentheses while parsing: CCCC1C(=O)N=C2S(=O)(=O)c3ccccc32)c1OC
[23:59:35] SMILES Parse Error: Failed parsing SMILES 'CCCC1C(=O)N=C2S(=O)(=O)c3ccccc32)c1OC' for input: 'CCCC1C(=O)N=C2S(=O)(=O)c3ccccc32)c1OC'
[23:59:35] SMILES Parse Error: unclosed ring for input: 'CCN(CC)c1nc2c3c(nc4c(c3nc1n1)CCCC3)CCC3'
[23:59:35] Can't kekulize mol.  Unkekulized atoms: 13 14 15 16 26
[23:59:35] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 7 10 13 14
[23:59:35] SMILES Parse Error: unclosed ring for input: 'COc1ccccc1-c1ccc2c(c1)C(C)(Cc1c(cn(C)n3Cc2cccc(F)c2)c1=O)N(C)C2'
[23:59:35] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 9 10 17
[23:59:35] SMILES Parse Error: unclosed ring for input: 'O=C(c1cnc(-c2ccccc2Cl)o2)N1CCCC1'
[23:59:35] Can't kekulize mol.  Unkekulized atoms: 8
[23:59:35] SMILES Parse Error: unclosed ring for input: 'Cc1ccc(-c2nc3ccccc3c(C(=O)NCC(c4ccco4)N2C)CC2)cc1'
[23:59:35] SMILES Parse Error: unclosed ring for input: 'CCCCn1c2sc3c(=

Unnamed: 0,pubchem_filtered_ac,pubchem_unfiltered_ac,pubchem_inac,chembl_filtered,chembl_unfiltered,zinc
validity,0.973,0.959,0.96,0.955,0.933,0.961
uniqueness,0.894,0.92,0.927,0.954,0.961,0.948
novelty,0.333,0.489,0.523,0.635,0.677,0.615
avg_length,39.661,42.85,45.985,42.667,49.241,45.804


### 2. Cohen's *d*
Compute Cohen's *d* as a measure of the standardized difference in means between the generated and training molecules for MW, LogP and TPSA.
Values close to 0 indicate strong similarity, whereas values around 0.2, 0.5, and 0.8 
are conventionally interpreted as small, medium, and large differences, respectively.

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from tqdm import tqdm
from src.metrics import cohens_d
from src.chem_utils import calc_properties

# Datasets and molecular properties compared between training and generated molecules.
dataset_list = ['pubchem_filtered_ac', 'pubchem_unfiltered_ac', 'pubchem_inac', 'chembl_filtered', 'chembl_unfiltered', 'zinc']
properties  = ['mw', 'logP', 'tpsa']

# property name -> list of Cohen's d values, one per dataset in `dataset_list` order
cohens_data = {prop: [] for prop in properties}

for dataset in dataset_list:
    train_smiles_df = pd.read_table(f'{PRETRAIN_DATA}/sampled_datasets/{dataset}.tsv', index_col=0)
    train_prop_df   = calc_properties(train_smiles_df, smiles_col='rdkit_smiles')

    # Drop missing and duplicated generated SMILES with a method chain instead
    # of `inplace=True`, so the frame is built in one expression and re-running
    # the cell never operates on a half-mutated object.
    gen_smiles_df = (
        pd.read_table(f'{PRETRAIN_RESULTS}/{dataset}_results/sampling_10000/gen_smiles.tsv', index_col=0)
        .dropna(subset=['smiles'])
        .drop_duplicates(subset=['smiles'])
    )
    # `calc_properties` is a project helper; its log output reports how many
    # invalid SMILES it removed before computing the properties.
    gen_prop_df = calc_properties(gen_smiles_df, smiles_col='smiles')

    for prop in properties:
        cohens_data[prop].append(cohens_d(train_prop_df[prop], gen_prop_df[prop]))

# Rows = properties, columns = datasets; the property name becomes a regular
# column so the TSV is written without a separate index column.
cohens_df = pd.DataFrame(cohens_data, index=dataset_list).T.round(3)
cohens_table = cohens_df.reset_index(names='property')
cohens_table.to_csv(f"{PRETRAIN_OUT}/pretrain_cohen's_d.tsv", sep='\t', index=False)
cohens_table

[18:01:45] Can't kekulize mol.  Unkekulized atoms: 2 3 12 13 15 16 17
[18:01:45] SMILES Parse Error: syntax error while parsing: N#Cc1ccccc1NC(=O)CSc1nnc(-c2ccccc2O)n1-
[18:01:45] SMILES Parse Error: Failed parsing SMILES 'N#Cc1ccccc1NC(=O)CSc1nnc(-c2ccccc2O)n1-' for input: 'N#Cc1ccccc1NC(=O)CSc1nnc(-c2ccccc2O)n1-'
[18:01:45] Can't kekulize mol.  Unkekulized atoms: 2 3 4 20 21
[18:01:45] Explicit valence for atom # 8 O, 3, is greater than permitted
[18:01:45] SMILES Parse Error: unclosed ring for input: 'CC(C)c1cc(C(=O)NC1CCCCC1)n1cncc1'
[18:01:45] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15 16 17 18 19 20 21
[18:01:45] Can't kekulize mol.  Unkekulized atoms: 6 7 10
[18:01:45] SMILES Parse Error: ring closure 1 duplicates bond between atom 14 and atom 15 for input: 'Cn1nc(C(F)(F)F)c(C)c1NC(=O)C1C1C2CCC1C2'
[18:01:45] SMILES Parse Error: unclosed ring for input: 'CC(C)n1nnnc1C(c1ccccc1F)N1CCN(CCSCC2)CC1'
[18:01:45] SMILES Parse Error: unclosed ring for input: 'COc1ccc(-c2cc(C

[calc_properties] 267 invalid SMILES removed


[18:01:56] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 27
[18:01:56] Can't kekulize mol.  Unkekulized atoms: 19 20 21 23 25 26 27
[18:01:56] non-ring atom 15 marked aromatic
[18:01:56] SMILES Parse Error: unclosed ring for input: 'O=C(Nc1ccc(Cl)c(-c2nn3c(C(F)(F)F)nc3ccccc23)c1)c1ccc2c(c1)OCCO2'
[18:01:56] SMILES Parse Error: unclosed ring for input: 'C=CCNc1nc2c(c(=O)n(C)c(=O)n2C)n1CCCCC1'
[18:01:56] SMILES Parse Error: unclosed ring for input: 'CCN1CCc2c(sc(NC(=O)C(F)(F)F)C2C(=O)NC2=O)C1'
[18:01:56] SMILES Parse Error: syntax error while parsing: CC1CC[C@H]2c3sc4ccc(=O)c(CCC(=O)NCCCO)c(4)c3cc2C1
[18:01:56] SMILES Parse Error: Failed parsing SMILES 'CC1CC[C@H]2c3sc4ccc(=O)c(CCC(=O)NCCCO)c(4)c3cc2C1' for input: 'CC1CC[C@H]2c3sc4ccc(=O)c(CCC(=O)NCCCO)c(4)c3cc2C1'
[18:01:56] Explicit valence for atom # 4 C, 5, is greater than permitted
[18:01:56] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 8 21 22 23 24
[18:01:56] SMILES Parse Error: unclosed ring for input: 'COc1cc(C(=S)N2CCOCC2)

[calc_properties] 407 invalid SMILES removed


[18:02:06] Explicit valence for atom # 15 N, 4, is greater than permitted
[18:02:06] Can't kekulize mol.  Unkekulized atoms: 15 16 17
[18:02:06] Can't kekulize mol.  Unkekulized atoms: 2 3 5 6 7 8 12 14 15
[18:02:06] SMILES Parse Error: unclosed ring for input: 'O=C(Nc1ccc(I)cc1Cl)N1CCN(c2ncnc3sc(C(=O)N4CCc5ccccc54)CC2)cc21'
[18:02:06] SMILES Parse Error: extra close parentheses while parsing: COc1cc(/C=C2/C(=N)N3N=C(CSc4ccc(Cl)cc4)CC3=O)ccc2O)cc(OC)c1OC
[18:02:06] SMILES Parse Error: Failed parsing SMILES 'COc1cc(/C=C2/C(=N)N3N=C(CSc4ccc(Cl)cc4)CC3=O)ccc2O)cc(OC)c1OC' for input: 'COc1cc(/C=C2/C(=N)N3N=C(CSc4ccc(Cl)cc4)CC3=O)ccc2O)cc(OC)c1OC'
[18:02:06] Explicit valence for atom # 2 F, 2, is greater than permitted
[18:02:06] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 10 25 26
[18:02:06] SMILES Parse Error: unclosed ring for input: 'CC(C)c1n[nH]c2c1CN(C(=O)Cc1nn3c(-c4cccs4)nn3C3CCCCC2)CC1'
[18:02:06] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 21
[18:02:06] SMILES Parse Error: 

[calc_properties] 404 invalid SMILES removed


[18:02:16] Can't kekulize mol.  Unkekulized atoms: 9 10 12 13 14 15 23
[18:02:16] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9 22 23
[18:02:17] SMILES Parse Error: unclosed ring for input: 'COc1ccc2cc([C@@H](C)CN3CCN(CCC(C)C)C(=O)C3CCCO3)ccc2c1'
[18:02:17] SMILES Parse Error: unclosed ring for input: 'COc1cccc(C2(N3CCc4c(CC5CC5CC4)[C@H](C)CC(C)C4)CCC2)c1'
[18:02:17] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14
[18:02:17] Can't kekulize mol.  Unkekulized atoms: 9 10 17 18 19
[18:02:17] SMILES Parse Error: unclosed ring for input: 'CC1=N[C@@]2(c3ccc(F)cc3CCc3ccn(c4)Cc3ccc(OC(F)(F)F)cc32)CC1'
[18:02:17] SMILES Parse Error: syntax error while parsing: C/=c1/n(CC2CCCO2)c2ccc(F)cc2nc1-c1cnn(C)c1
[18:02:17] SMILES Parse Error: Failed parsing SMILES 'C/=c1/n(CC2CCCO2)c2ccc(F)cc2nc1-c1cnn(C)c1' for input: 'C/=c1/n(CC2CCCO2)c2ccc(F)cc2nc1-c1cnn(C)c1'
[18:02:17] Can't kekulize mol.  Unkekulized atoms: 15 16 17 18 27
[18:02:17] SMILES Parse Error: syntax error while parsing: COc1ccc

[calc_properties] 451 invalid SMILES removed


[18:02:28] Can't kekulize mol.  Unkekulized atoms: 20 21 22 23 29
[18:02:28] Can't kekulize mol.  Unkekulized atoms: 31
[18:02:28] SMILES Parse Error: extra open parentheses for input: 'Cc1ccc(-c2noc(-c3c(O)c4ccccc4n(C)c2=O)nn1'
[18:02:28] Can't kekulize mol.  Unkekulized atoms: 4 5 18 19 20
[18:02:28] SMILES Parse Error: unclosed ring for input: 'CCC1([C@@H]2CC[C@H]3[C@@H](C)C[C@H](O)CC3(C)C)[C@@H](C)CC[C@H]1/C(=C/C#N)Cc1ccccc1'
[18:02:28] Can't kekulize mol.  Unkekulized atoms: 12 13 14 25 26 27 28 30 31
[18:02:28] SMILES Parse Error: unclosed ring for input: 'O=C1CN2CCN(CCCCC(c3ccccc3)N2CCN(c3ccccc3)CC2)CC1'
[18:02:28] SMILES Parse Error: extra close parentheses while parsing: C=C1C(=O)[C@@]23CC[C@H](OC(=O)/C=C/c4ccccc4)C[C@H]4[C@@H](CCC(C)(C)O)CC[C@H]4[C@@H]2C)[C@@H]13
[18:02:28] SMILES Parse Error: Failed parsing SMILES 'C=C1C(=O)[C@@]23CC[C@H](OC(=O)/C=C/c4ccccc4)C[C@H]4[C@@H](CCC(C)(C)O)CC[C@H]4[C@@H]2C)[C@@H]13' for input: 'C=C1C(=O)[C@@]23CC[C@H](OC(=O)/C=C/c4ccccc4)C[C@H]4[C@

[calc_properties] 673 invalid SMILES removed


[18:02:39] Can't kekulize mol.  Unkekulized atoms: 14 15 16 17 18 19 20 21 22
[18:02:39] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 9 15 16 17 18 19 20 25
[18:02:39] Can't kekulize mol.  Unkekulized atoms: 17 18 19 20 21
[18:02:39] Can't kekulize mol.  Unkekulized atoms: 4 5 6 14 15
[18:02:39] SMILES Parse Error: unclosed ring for input: 'COc1ccc(C)cc1N1C(=O)[C@H](c2ccccc2)C(c2n(c3ccc(Cl)cc2)c2ccccc2Cl)=NN1c1ccccc1'
[18:02:39] Can't kekulize mol.  Unkekulized atoms: 5 6 7 9 20
[18:02:39] SMILES Parse Error: unclosed ring for input: 'c1cc2occ(CCSc3ncnc4sccc35)n2c2c1'
[18:02:39] SMILES Parse Error: syntax error while parsing: CC[C@@H](CCc1ccccc1)NC(=O)NC[C@]1(c2cccc()C2)CC(C)C)c1
[18:02:39] SMILES Parse Error: Failed parsing SMILES 'CC[C@@H](CCc1ccccc1)NC(=O)NC[C@]1(c2cccc()C2)CC(C)C)c1' for input: 'CC[C@@H](CCc1ccccc1)NC(=O)NC[C@]1(c2cccc()C2)CC(C)C)c1'
[18:02:39] SMILES Parse Error: unclosed ring for input: 'COc1cccc([C@@H]2CCCN2c2nc3ccc4c(c3)OCCO4)cc1'
[18:02:39] Can't kekulize

[calc_properties] 388 invalid SMILES removed


Unnamed: 0,property,pubchem_filtered_ac,pubchem_unfiltered_ac,pubchem_inac,chembl_filtered,chembl_unfiltered,zinc
0,mw,0.086143,0.101731,0.07639,0.155674,0.173244,0.15018
1,logP,0.025642,0.041238,0.070195,0.06125,0.059177,0.085721
2,tpsa,0.060987,0.055557,0.026917,0.118672,0.161365,0.09405


### 3. MOSES
Clone the [MOSES benchmark](https://github.com/molecularsets/moses) repository and place it into the `tools` directory:

In [9]:
# Fetch the MOSES sources into ../tools/moses-master/, the directory that the
# "Run MOSES" cell appends to sys.path. Only needs to be run once.
!git clone https://github.com/molecularsets/moses.git ../tools/moses-master/

Cloning into '../tools/moses-master'...


remote: Enumerating objects: 1957, done.[K
remote: Counting objects: 100% (2/2), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 1957 (delta 0), reused 0 (delta 0), pack-reused 1955 (from 2)[K
remote: Total 1957 (delta 0), reused 0 (delta 0), pack-reused 1955 (from 2)[K
Receiving objects: 100% (1957/1957), 164.05 MiB | 5.17 MiB/s, done.
Resolving deltas: 100% (1068/1068), done.
Receiving objects: 100% (1957/1957), 164.05 MiB | 5.17 MiB/s, done.
Resolving deltas: 100% (1068/1068), done.


Clone the [fcd_torch](https://github.com/insilicomedicine/fcd_torch) repository and place it into the `tools` directory:

In [14]:
# Fetch the fcd_torch sources into ../tools/fcd_torch, the directory that the
# "Run MOSES" cell appends to sys.path. Only needs to be run once.
!git clone https://github.com/insilicomedicine/fcd_torch.git ../tools/fcd_torch

Cloning into '../tools/fcd_torch'...
remote: Enumerating objects: 47, done.[K
remote: Total 47 (delta 0), reused 0 (delta 0), pack-reused 47 (from 1)[K
Receiving objects: 100% (47/47), 5.00 MiB | 3.07 MiB/s, done.
Resolving deltas: 100% (20/20), done.


#### **Pandas & RDKit Compatibility Fix**

Some code in MOSES doesn't work in the current environment. The following fixes are needed:

1. **Pandas compatibility issue in `moses/metrics/utils.py`**: 
   `DataFrame.append` was removed in pandas 2.0. Please manually replace the following lines in the file:
   
   ```python
   # Replace this (line 23-24):
   _filters = [Chem.MolFromSmarts(x) for x in
               _mcf.append(_pains, sort=True)['smarts'].values]
   
   # With this:
   _mcf_pains = pd.concat([_mcf,_pains], axis=0, sort=True)
   _filters = [Chem.MolFromSmarts(x) for x in _mcf_pains['smarts'].values]
   ```

2. **RDKit compatibility issue in `moses/metrics/SA_Score/sascorer.py`**:
   The `rdkit.six` module was removed in RDKit 2024.03.1. Please manually replace the following line in the file:
   
   ```python
   # Comment out this (line 3):
   from rdkit.six import iteritems
   
   # Replace this (line 63)
   for bitId, v in iteritems(fps):

   # With this:
   for bitId, v in fps.items():
   ```

#### Run MOSES

In [None]:
import sys 
app_path  = '../tools/moses-master'
app_path2 = '../tools/fcd_torch'
sys.path.append(app_path)
sys.path.append(app_path2)
import warnings
from multiprocessing import Pool
import numpy as np
from scipy.spatial.distance import cosine as cos_distance
from fcd_torch import FCD as FCDMetric
from scipy.stats import wasserstein_distance
from moses.dataset import get_dataset, get_statistics
from moses.utils import mapper
from moses.utils import disable_rdkit_log, enable_rdkit_log
from moses.metrics.utils import compute_fragments, average_agg_tanimoto, \
    compute_scaffold, compute_scaffolds, fingerprints, \
    get_mol, canonic_smiles, mol_passes_filters, \
    logP, QED, SA, weight
from moses.metrics.metrics import get_all_metrics
import pandas as pd
from rdkit import Chem
import math
from tqdm import tqdm
from src.chem_utils import smiles2scaffold

# Run the MOSES benchmark for every dataset at each generated-sample size and
# save one metrics table per sample size.
dataset_list = ['pubchem_filtered_ac', 'pubchem_unfiltered_ac', 'pubchem_inac', 'chembl_filtered', 'chembl_unfiltered', 'zinc']
# Each size selects the matching `sampling_<n>` results folder.
sample_num_list = [10000, 41743]

for sample_num in sample_num_list:
    metrics_dict_list = []  # one MOSES metrics dict per dataset, in `dataset_list` order
    print(f'=== Sample size: {sample_num} ===')
    for dataset in dataset_list:
        train_smiles_df = pd.read_table(f'{PRETRAIN_DATA}/sampled_datasets/{dataset}.tsv', index_col=0)
        gen_smiles_df   = pd.read_csv(f'{PRETRAIN_RESULTS}/{dataset}_results/sampling_{sample_num}/gen_smiles.tsv', index_col=0, sep ='\t')

        train_smiles = train_smiles_df['rdkit_smiles'].to_list()
        gen_smiles   = gen_smiles_df['smiles'].to_list()

        # Scaffolds of the training molecules; entries for which
        # `compute_scaffold` returns None are discarded.
        train_scaffolds = [compute_scaffold(smi, min_rings=2) for smi in train_smiles]
        train_scaffolds = [scaf for scaf in train_scaffolds if scaf is not None]

        # The training set is passed as both `test` and `train`, so the
        # reference statistics are computed on the training data itself.
        # NOTE(review): `test_scaffolds` receives scaffold SMILES derived from the
        # training molecules — confirm this matches what MOSES expects there.
        metrics = get_all_metrics(gen_smiles, n_jobs=16, device='cpu', batch_size=512,
                                  test=train_smiles, test_scaffolds=train_scaffolds, train=train_smiles)
        metrics_dict_list.append(metrics)
        print(metrics)

    # Build the table once from the list of records: rows = metrics, columns = datasets.
    metrics_df = pd.DataFrame(metrics_dict_list, index=dataset_list).T
    metrics_df.to_csv(f'{PRETRAIN_OUT}/pretrain_gen{sample_num}_moses_metrics.tsv', sep='\t')
    # A bare `metrics_df` expression inside a loop renders nothing; `display`
    # (the IPython built-in available in notebook kernels) shows each table.
    display(metrics_df)