In [1]:
import os
from rdkit import RDLogger                                                                                                                                                               
import numpy as np
import pandas as pd
from fcd import get_fcd, load_ref_model,canonical_smiles, get_predictions, calculate_frechet_distance

# Restrict CUDA to device 0. This is set before any model is loaded below.
# NOTE(review): assumes the backend has not initialised CUDA at import time - confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# Seed NumPy's global RNG so the random sub-sampling further down is repeatable.
np.random.seed(0)

# Turn off RDKit logging for every channel matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

### Load and prepare data
Take care when preparing data.
- Different canonicalization schemes yield different FCD values, so always canonicalize SMILES with RDKit.
- In case the generated "molecules" contain invalid SMILES, decide if you want to include them in the FCD.
- Keep sample sizes consistent and sufficiently large (this notebook draws 10,000 molecules per set), because the FCD varies with sample size.

In [2]:
# Reference ChemNet model used to compute the activations for the FCD.
model = load_ref_model()

# Generated molecules: the input file holds one generated SMILES per line
# and has no header row, so the SMILES end up in column 0.
gen_mol_file = "generated_smiles/LSTM_Segler.smi"
gen_mol = pd.read_csv(gen_mol_file, header=None)[0]

# IMPORTANT: take at least 10000 molecules, as the FCD can vary with sample size.
n_samples = 10000

# Two independent draws (each without replacement) from the same pool.
# Keep this order: both draws consume the seeded global NumPy RNG.
sample1 = np.random.choice(gen_mol, n_samples, replace=False)
sample2 = np.random.choice(gen_mol, n_samples, replace=False)

# Canonicalize the SMILES and drop entries that come back as None (invalid SMILES).
can_sample1 = [smi for smi in canonical_smiles(sample1) if smi is not None]
can_sample2 = [smi for smi in canonical_smiles(sample2) if smi is not None]

## Calculation of FCD

In [3]:
# Get the ChemNet activations of the generated molecules, one set per sample.
act1, act2 = (
    get_predictions(model, smiles_set)
    for smiles_set in (can_sample1, can_sample2)
)

# Mean vector and covariance matrix of each activation set.
# assumes each row of act* is one molecule's activation vector - TODO confirm
mu1, sigma1 = np.mean(act1, axis=0), np.cov(act1.T)
mu2, sigma2 = np.mean(act2, axis=0), np.cov(act2.T)

# Frechet distance between the two sets, computed from their means and covariances.
fcd_score = calculate_frechet_distance(
    mu1=mu1,
    sigma1=sigma1,
    mu2=mu2,
    sigma2=sigma2,
)

print('FCD: ', fcd_score)

FCD:  0.333862289051325


In [4]:
"""if you don't need to store the activations you can also take a shortcut."""
# One-call alternative to the explicit computation in the previous cell:
# pass both canonicalized SMILES lists and the loaded model to get_fcd.
# In the recorded run it prints the same value as the explicit computation.
fcd_score = get_fcd(can_sample1, can_sample2, model)

print('FCD: ',fcd_score)

FCD:  0.333862289051325


In [5]:
"""This is what happens if you do not canonicalize the smiles"""
# Deliberate counter-example: can_sample1 is canonicalized and filtered, while
# sample2 is the raw draw that was never passed through canonical_smiles.
# In the recorded run the score is far larger than for the canonicalized pair.
fcd_score = get_fcd(can_sample1, sample2, model)
print('FCD: ',fcd_score)

FCD:  25.635578193222216
