# Evaluate Pre-training Results
This notebook evaluates the SMILES generated in the pre-training stage.

In [None]:
import sys
# Add the parent directory to the import path so the `src` package imported below can be found
# when the notebook is run from its own directory.
sys.path.append('../')

# Path constants used by the later cells (input data, sampling results, output tables).
from src.paths import ensure_dirs, PRETRAIN_DATA, PRETRAIN_RESULTS, PRETRAIN_OUT
# NOTE(review): presumably creates the project's data/output directories if they are missing —
# confirm against src/paths.py.
ensure_dirs()

# Identifiers of the pre-training datasets evaluated in this notebook. Each name is
# substituted into the input and result file paths read by the cells below.
dataset_list = [
    'pubchem_filtered_ac',
    'pubchem_unfiltered_ac',
    'pubchem_inac',
    'chembl_filtered',
    'chembl_unfiltered',
    'zinc',
]

### 1. Metrics 

For each dataset, we evaluate the generated molecules in terms of:
- **Validity**: fraction of chemically valid SMILES
- **Uniqueness**: fraction of unique valid SMILES
- **Novelty**: fraction of unique valid SMILES not present in the training set
- **Average length**: average number of characters in canonical SMILES

The results are summarized across datasets and saved as `pretrain_metrics.tsv`.

In [None]:
from src.model.generative_models.generative_models.utils import Smiles2RDKitCanonicalSmiles
from src.metrics import gen_smiles_metrics
import pandas as pd

# One metrics record per dataset, appended in the same order as `dataset_list`
# so the records line up with the index assigned below.
metrics_dict = []
for dataset in dataset_list:
    # Training molecules and the generated molecules from the `sampling_10000` run.
    train_smiles_df = pd.read_table(f'{PRETRAIN_DATA}/sampled_datasets/{dataset}.tsv', index_col=0)
    gen_smiles_df   = pd.read_table(f'{PRETRAIN_RESULTS}/{dataset}_results/sampling_10000/gen_smiles.tsv', index_col=0)

    # Deliberately not named `dict`: in a notebook every cell shares one namespace, so
    # rebinding the built-in would break any later `dict(...)` call in the kernel session.
    # NOTE(review): assumed to be a mapping of metric name -> value, since the records
    # are passed straight to pd.DataFrame below.
    dataset_metrics = gen_smiles_metrics(train_smiles=train_smiles_df, gen_smiles=gen_smiles_df,
                                         trainsmi_col='rdkit_smiles', gensmi_col='smiles')
    metrics_dict.append(dataset_metrics)

# Rows are datasets here; the table is transposed (metrics as rows) for both the
# saved TSV and the displayed output.
metrics_df = pd.DataFrame(metrics_dict, index=dataset_list).round(3)
metrics_df.T.to_csv(f'{PRETRAIN_OUT}/pretrain_metrics.tsv', sep='\t')
metrics_df.T

### 2. Cohen's *d*
Compute Cohen's *d* as a measure of the standardized difference in means between the generated and training molecules for MW, LogP and TPSA.
Values close to 0 indicate strong similarity, whereas absolute values around 0.2, 0.5, and 0.8 
are conventionally interpreted as small, medium, and large differences, respectively.

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from tqdm import tqdm
from src.metrics import cohens_d
from src.chem_utils import calc_properties

dataset_list = ['pubchem_filtered_ac', 'pubchem_unfiltered_ac', 'pubchem_inac', 'chembl_filtered', 'chembl_unfiltered', 'zinc']
properties  = ['mw', 'logP', 'tpsa']

# property name -> list of Cohen's d values, one per dataset (in `dataset_list` order)
cohens_data = {prop: [] for prop in properties}

for dataset in dataset_list:
    train_path = f'{PRETRAIN_DATA}/sampled_datasets/{dataset}.tsv'
    gen_path   = f'{PRETRAIN_RESULTS}/{dataset}_results/sampling_10000/gen_smiles.tsv'

    # Reference distribution: properties of the training molecules.
    train_smiles_df = pd.read_table(train_path, index_col=0)
    train_prop_df   = calc_properties(train_smiles_df, smiles_col='rdkit_smiles')

    # Generated distribution: rows without a SMILES and repeated SMILES are removed
    # before the properties are computed.
    gen_smiles_df = pd.read_table(gen_path, index_col=0)
    gen_smiles_df.dropna(subset=['smiles'], inplace=True)
    gen_smiles_df.drop_duplicates(subset=['smiles'], inplace=True)
    gen_prop_df   = calc_properties(gen_smiles_df, smiles_col='smiles')

    # Append this dataset's effect size to each property's list.
    for prop, d_values in cohens_data.items():
        d_values.append(cohens_d(train_prop_df[prop], gen_prop_df[prop]))

# Built with datasets as rows, then transposed so each row is a property.
per_dataset_df = pd.DataFrame(cohens_data, index=dataset_list)
cohens_df = per_dataset_df.T.round(3)

# Move the property names out of the index into a regular column for the saved table.
cohens_table = cohens_df.reset_index(names='property')
cohens_table.to_csv(f"{PRETRAIN_OUT}/pretrain_cohen's_d.tsv", sep='\t', index=False)
cohens_table

### 3. MOSES
Clone the [MOSES benchmark](https://github.com/molecularsets/moses) repository and place it into the `tools` directory:

In [None]:
# Clones MOSES into ../tools/moses-master/, the directory that the "Run MOSES" cell adds to sys.path.
# git refuses to clone into an existing non-empty directory, so skip this cell if the checkout already exists.
!git clone https://github.com/molecularsets/moses.git ../tools/moses-master/

Clone the [fcd_torch](https://github.com/insilicomedicine/fcd_torch) repository and place it into the `tools` directory:

In [None]:
# Clones fcd_torch into ../tools/fcd_torch, the directory that the "Run MOSES" cell adds to sys.path.
# git refuses to clone into an existing non-empty directory, so skip this cell if the checkout already exists.
!git clone https://github.com/insilicomedicine/fcd_torch.git ../tools/fcd_torch

#### **Pandas & RDKit Compatibility Fix**

Some code in MOSES doesn't work in the current environment. The following fixes are needed:

1. **Pandas compatibility issue in `moses/metrics/utils.py`**: 
   `DataFrame.append` was removed in pandas 2.0. Please manually replace the following lines in the file:
   
   ```python
   # Replace this (line 23-24):
   _filters = [Chem.MolFromSmarts(x) for x in
               _mcf.append(_pains, sort=True)['smarts'].values]
   
   # With this:
   _mcf_pains = pd.concat([_mcf,_pains], axis=0, sort=True)
   _filters = [Chem.MolFromSmarts(x) for x in _mcf_pains['smarts'].values]
   ```

2. **RDKit compatibility issue in `moses/metrics/SA_Score/sascorer.py`**:
   The `rdkit.six` module was removed in RDKit 2024.03.1. Please manually make the following changes in the file:
   
   ```python
   # Comment out this (line 27):
   from rdkit.six import iteritems
   
   # Replace this (line 63)
   for bitId, v in iteritems(fps):

   # With this:
   for bitId, v in fps.items():
   ```

#### Run MOSES

In [None]:
import sys 
# Local checkouts of MOSES and fcd_torch; these match the target directories of the
# `git clone` commands earlier in this section.
app_path  = '../tools/moses-master'
app_path2 = '../tools/fcd_torch'
# Extend the import path so the `moses` and `fcd_torch` imports below can find these checkouts.
sys.path.append(app_path)
sys.path.append(app_path2)
import warnings
from multiprocessing import Pool
import numpy as np
from scipy.spatial.distance import cosine as cos_distance
from fcd_torch import FCD as FCDMetric
from scipy.stats import wasserstein_distance
from moses.dataset import get_dataset, get_statistics
from moses.utils import mapper
from moses.utils import disable_rdkit_log, enable_rdkit_log
from moses.metrics.utils import compute_fragments, average_agg_tanimoto, \
    compute_scaffold, compute_scaffolds, fingerprints, \
    get_mol, canonic_smiles, mol_passes_filters, \
    logP, QED, SA, weight
from moses.metrics.metrics import get_all_metrics
import pandas as pd
from rdkit import Chem
import math
from tqdm import tqdm
from src.chem_utils import smiles2scaffold

dataset_list = ['pubchem_filtered_ac', 'pubchem_unfiltered_ac', 'pubchem_inac', 'chembl_filtered', 'chembl_unfiltered', 'zinc']
# Sampling runs to evaluate; each value selects a `sampling_{sample_num}` results directory.
sample_num_list = [10000, 41743]

for sample_num in sample_num_list:
    # One MOSES metrics record per dataset, in `dataset_list` order.
    metrics_dict_list = []
    print(f'=== Sample size: {sample_num} ===')
    for dataset in dataset_list:
        train_smiles_df = pd.read_table(f'{PRETRAIN_DATA}/sampled_datasets/{dataset}.tsv', index_col=0)
        gen_smiles_df   = pd.read_table(f'{PRETRAIN_RESULTS}/{dataset}_results/sampling_{sample_num}/gen_smiles.tsv', index_col=0)

        train_smiles = train_smiles_df['rdkit_smiles'].to_list()
        # NOTE(review): other cells drop rows with a missing 'smiles' value before use;
        # here the raw column is passed through unchanged — confirm that get_all_metrics
        # treats missing entries the way the validity metric is meant to count them.
        gen_smiles   = gen_smiles_df['smiles'].to_list()

        # Scaffolds of the training molecules; entries for which compute_scaffold
        # returns None are discarded.
        train_scaffolds = [compute_scaffold(smi, min_rings=2) for smi in train_smiles]
        train_scaffolds = [scaf for scaf in train_scaffolds if scaf is not None]

        # The training set serves as both the reference ("test") set and the novelty ("train") set.
        metrics = get_all_metrics(gen_smiles, n_jobs=16, device='cpu', batch_size=512, test=train_smiles, test_scaffolds=train_scaffolds, train=train_smiles)
        metrics_dict_list.append(metrics)
        print(metrics)

    # Build the table in one step (datasets as rows), then transpose so that
    # metrics are rows and datasets are columns.
    metrics_df = pd.DataFrame(metrics_dict_list, index=dataset_list).T
    metrics_df.to_csv(f'{PRETRAIN_OUT}/pretrain_gen{sample_num}_moses_metrics.tsv', sep='\t')
    # A bare expression inside a loop body is never rendered by the notebook, so the
    # table is shown explicitly. `display` is provided by the IPython kernel.
    display(metrics_df)

### 4. Filter-passing Rate

In [None]:
import os
import pandas as pd
from rdkit import RDLogger

from src.dataset_curation import filter_structural_alerts
from tqdm import tqdm
# Turn off RDKit log output for the 'rdApp.*' loggers while the filters run.
RDLogger.DisableLog('rdApp.*')

# Per-dataset flag tables and passing molecules are written under this directory.
save_dir = f'{PRETRAIN_OUT}/filtering/'
os.makedirs(save_dir, exist_ok=True)

# One summary record per dataset; turned into a table after the loop.
stats = []
for dataset in tqdm(dataset_list):
    # NOTE(review): unlike the earlier cells, this read does not pass index_col=0 —
    # confirm whether the file's leading column is meant to be kept as data here.
    gen_smi_df = pd.read_table(f'{PRETRAIN_RESULTS}/{dataset}_results/sampling_10000/gen_smiles.tsv')

    # Keep only rows that have a SMILES, then one row per distinct SMILES.
    has_smiles_df = gen_smi_df.dropna(subset=['smiles'])
    uniq_smi_df = has_smiles_df.drop_duplicates(subset=['smiles'])

    passed_df, pass_rate, filter_flags = filter_structural_alerts(uniq_smi_df, 'smiles')

    # Counts reported in the summary table.
    n_uniq   = filter_flags.shape[0]
    n_passed  = len(passed_df)

    filter_flags.to_csv(f'{save_dir}/{dataset}_flags.tsv', sep='\t')
    passed_df.to_csv(f'{save_dir}/{dataset}_all_passed_smiles.tsv', sep='\t')

    stats.append({
        'dataset': dataset,
        '#unique': n_uniq,
        '#passed': n_passed,
        'pass_rate': pass_rate,
    })

stats_df = pd.DataFrame(stats)
stats_df.to_csv(f'{PRETRAIN_OUT}/pretrain_filter-passing_rate.tsv', sep='\t')

stats_df