In [1]:
import os
# NOTE(review): absolute, machine-specific working directory. Every relative
# './experiments/...' path used later in this notebook is resolved against it.
os.chdir('/home/yuke/PythonProject/DrugEmbedding/')
import warnings
# Ignore all warnings for the rest of the session (this also hides any
# pandas/numpy warnings raised by later cells).
warnings.simplefilter(action='ignore')

In [2]:
from tqdm import tnrange
import json
import numpy as np
import pandas as pd
import random
from decode import *

In [3]:
# Seed Python's `random` module so the `random.sample` draws of test SMILES
# made by later cells start from a fixed state.
# NOTE(review): only the stdlib RNG is seeded here; whether the decoding done
# in `latent2smiles` relies on another RNG is not visible from this notebook
# -- confirm.
random.seed(1)

In [4]:
def recon_acc_score(configs, model, smiles_sample_lst, n_decode=200):
    """Fraction of SMILES strings recovered by an encode -> decode round trip.

    Each input SMILES is encoded with `smiles2mean`; the returned mean is
    repeated `n_decode` times and passed to `latent2smiles` with
    ``sampling_mode='random'``. The input counts as a match when it appears
    verbatim (exact string comparison) among the decoded SMILES.

    Args:
        configs: configuration object, forwarded unchanged to `smiles2mean`
            and `latent2smiles`.
        model: model object, forwarded unchanged to the same two helpers.
        smiles_sample_lst: sequence of SMILES strings to evaluate.
        n_decode: number of copies of the latent mean decoded per molecule.
            Defaults to 200, the value that used to be hard-coded.

    Returns:
        Mean of the per-molecule 0/1 match indicators, or NaN when
        `smiles_sample_lst` is empty.
    """
    # The mean of an empty sample is undefined; return NaN explicitly instead
    # of relying on numpy's empty-array behaviour.
    if len(smiles_sample_lst) == 0:
        return np.nan

    match_lst = []
    for i in tnrange(len(smiles_sample_lst)):
        smiles_x = smiles_sample_lst[i]
        # Only the first returned value (the mean) is used for decoding.
        mean, _ = smiles2mean(configs, smiles_x, model)
        _, _, smiles_lst = latent2smiles(configs, model, z=mean.repeat(n_decode, 1),
                                         nsamples=1, sampling_mode='random')
        match_lst.append(1 if smiles_x in smiles_lst else 0)
    return np.array(match_lst).mean()

# Prepare Model Reference Table

In [16]:
# Latent size 64: three Euclidean runs followed by three Lorentz runs.
# The four lists are index-aligned (entry i of each describes the same model).
latent_size_lst_64 = [64] * 6
manifold_lst_64 = ['Euclidean'] * 3 + ['Lorentz'] * 3
exp_dir_lst_64 = [
    './experiments/KDD/kdd_009',
    './experiments/KDD_SEED/kdd_e64_s1',
    './experiments/KDD_SEED/kdd_e64_s2',
    './experiments/KDD/kdd_010',
    './experiments/KDD_SEED/kdd_l64_s1',
    './experiments/KDD_SEED/kdd_l64_s2',
]
checkpoint_lst_64 = [
    'checkpoint_epoch110.model',
    'checkpoint_epoch120.model',
    'checkpoint_epoch120.model',
    'checkpoint_epoch110.model',
    'checkpoint_epoch120.model',
    'checkpoint_epoch120.model',
]

In [17]:
# Latent size 32: three Euclidean runs followed by three Lorentz runs.
# The four lists are index-aligned (entry i of each describes the same model).
latent_size_lst_32 = [32] * 6
manifold_lst_32 = ['Euclidean'] * 3 + ['Lorentz'] * 3
exp_dir_lst_32 = [
    './experiments/KDD/kdd_015',
    './experiments/KDD_SEED/kdd_e32_s1',
    './experiments/KDD_SEED/kdd_e32_s2',
    './experiments/KDD/kdd_016',
    './experiments/KDD_SEED/kdd_l32_s1',
    './experiments/KDD_SEED/kdd_l32_s2',
]
checkpoint_lst_32 = [
    'checkpoint_epoch110.model',
    'checkpoint_epoch130.model',
    'checkpoint_epoch110.model',
    'checkpoint_epoch110.model',
    'checkpoint_epoch130.model',
    'checkpoint_epoch110.model',
]

In [18]:
# Latent size 8: three Euclidean runs followed by three Lorentz runs.
# The four lists are index-aligned (entry i of each describes the same model).
latent_size_lst_8 = [8] * 6
manifold_lst_8 = ['Euclidean'] * 3 + ['Lorentz'] * 3
exp_dir_lst_8 = [
    './experiments/KDD/kdd_017',
    './experiments/KDD_SEED/kdd_e8_s1',
    './experiments/KDD_SEED/kdd_e8_s2',
    './experiments/KDD/kdd_018',
    './experiments/KDD_SEED/kdd_l8_s1',
    './experiments/KDD_SEED/kdd_l8_s2',
]
checkpoint_lst_8 = [
    'checkpoint_epoch110.model',
    'checkpoint_epoch120.model',
    'checkpoint_epoch110.model',
    'checkpoint_epoch120.model',
    'checkpoint_epoch120.model',
    'checkpoint_epoch110.model',
]

In [19]:
# Latent size 4: three Euclidean runs followed by three Lorentz runs.
# The four lists are index-aligned (entry i of each describes the same model).
latent_size_lst_4 = [4] * 6
manifold_lst_4 = ['Euclidean'] * 3 + ['Lorentz'] * 3
exp_dir_lst_4 = [
    './experiments/KDD/kdd_019',
    './experiments/KDD_SEED/kdd_e4_s1',
    './experiments/KDD_SEED/kdd_e4_s2',
    './experiments/KDD/kdd_020',
    './experiments/KDD_SEED/kdd_l4_s1',
    './experiments/KDD_SEED/kdd_l4_s2',
]
checkpoint_lst_4 = [
    'checkpoint_epoch100.model',
    'checkpoint_epoch110.model',
    'checkpoint_epoch110.model',
    'checkpoint_epoch100.model',
    'checkpoint_epoch100.model',
    'checkpoint_epoch090.model',
]

In [20]:
# Latent size 2: three Euclidean runs followed by three Lorentz runs.
# The four lists are index-aligned (entry i of each describes the same model).
latent_size_lst_2 = [2] * 6
manifold_lst_2 = ['Euclidean'] * 3 + ['Lorentz'] * 3
exp_dir_lst_2 = [
    './experiments/KDD/kdd_021',
    './experiments/KDD_SEED/kdd_e2_s1',
    './experiments/KDD_SEED/kdd_e2_s2',
    './experiments/KDD/kdd_022',
    './experiments/KDD_SEED/kdd_l2_s1',
    './experiments/KDD_SEED/kdd_l2_s2',
]
checkpoint_lst_2 = [
    'checkpoint_epoch110.model',
    'checkpoint_epoch100.model',
    'checkpoint_epoch090.model',
    'checkpoint_epoch110.model',
    'checkpoint_epoch100.model',
    'checkpoint_epoch090.model',
]

In [21]:
# Stack the per-dimension lists in the order 64, 32, 8, 4, 2.
latent_size_lst = [
    *latent_size_lst_64,
    *latent_size_lst_32,
    *latent_size_lst_8,
    *latent_size_lst_4,
    *latent_size_lst_2,
]

In [22]:
# Stack the per-dimension lists in the order 64, 32, 8, 4, 2.
manifold_lst = [
    *manifold_lst_64,
    *manifold_lst_32,
    *manifold_lst_8,
    *manifold_lst_4,
    *manifold_lst_2,
]

In [23]:
# Stack the per-dimension lists in the order 64, 32, 8, 4, 2.
exp_dir_lst = [
    *exp_dir_lst_64,
    *exp_dir_lst_32,
    *exp_dir_lst_8,
    *exp_dir_lst_4,
    *exp_dir_lst_2,
]

In [24]:
# Stack the per-dimension lists in the order 64, 32, 8, 4, 2.
checkpoint_lst = [
    *checkpoint_lst_64,
    *checkpoint_lst_32,
    *checkpoint_lst_8,
    *checkpoint_lst_4,
    *checkpoint_lst_2,
]

In [25]:
# Model reference table: one row per trained model, one column per list.
df = pd.DataFrame({
    'latent_size': latent_size_lst,
    'manifold': manifold_lst,
    'exp_dir': exp_dir_lst,
    'checkpoint': checkpoint_lst,
})

In [26]:
# Persist the model reference table so the evaluation cells can reload it.
# Create the target directory first: `to_csv` fails when the directory is
# missing (e.g. on a fresh checkout).
os.makedirs('./experiments/RECON', exist_ok=True)
df.to_csv('./experiments/RECON/model_dir.csv', index=False)

# Load SMILES Test Set

In [28]:
# Reload the model reference table and append an empty `recon_acc` column
# that the evaluation loops fill in.
mdl_dir_df = pd.read_csv('./experiments/RECON/model_dir.csv').assign(recon_acc=None)

In [29]:
# load SMILES test set
# Load the SMILES test set stored in the first model's experiment directory.
exp_dir = mdl_dir_df['exp_dir'].iloc[0]
smiles_test_file = os.path.join(exp_dir, 'smiles_test.smi')
with open(smiles_test_file) as file:
    # Each line is either a bare SMILES string (ZINC 250k) or
    # "<SMILES> <drug name>" (FDA drug). In both cases the SMILES is the first
    # space-separated field, so a single expression covers both formats.
    # The per-line instance IDs the old loop built were never used and
    # shadowed the builtin `id`, so they are no longer computed.
    smiles_test_lst = [line.split(" ")[0] for line in file.read().splitlines()]

# Evaluate Molecule Reconstruction Accuracy

## Latent Size of 64

In [34]:
# Boolean mask for the latent-size-64 models, then the matching rows.
idx = mdl_dir_df['latent_size'].eq(64)
sub_df = mdl_dir_df.loc[idx]

In [35]:
# Evaluate reconstruction accuracy for every model listed in `sub_df` and
# record the score in the matching row of `mdl_dir_df`.
for idx, row in sub_df.iterrows():
    print(row)
    exp_dir = row['exp_dir']
    checkpoint = row['checkpoint']
    config_path = os.path.join(exp_dir, 'configs.json')
    # The `with` block closes the file on exit; no explicit close() is needed.
    with open(config_path, 'r') as fp:
        configs = json.load(fp)
    configs['checkpoint'] = checkpoint
    model = load_model(configs)
    # The RNG is not reseeded between iterations, so each model is scored on
    # its own 1000-molecule draw from the test set.
    smiles_sample_lst = random.sample(smiles_test_lst, 1000)
    recon_score = recon_acc_score(configs, model, smiles_sample_lst)
    # `idx` is the row's index label (shared with `mdl_dir_df`). A single
    # `.loc` call writes into the frame itself; the previous chained
    # `['recon_acc'].iloc[idx] = ...` could assign to a temporary copy and
    # treated the label as a position.
    mdl_dir_df.loc[idx, 'recon_acc'] = recon_score
    print('Recon. accuracy: ' + str(recon_score))
    print('------------------------------------')

latent_size                           64
manifold                       Euclidean
exp_dir        ./experiments/KDD/kdd_009
checkpoint     checkpoint_epoch110.model
recon_acc                           None
Name: 0, dtype: object


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.943
------------------------------------
latent_size                                   64
manifold                               Euclidean
exp_dir        ./experiments/KDD_SEED/kdd_e64_s1
checkpoint             checkpoint_epoch120.model
recon_acc                                   None
Name: 1, dtype: object


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.97
------------------------------------
latent_size                                   64
manifold                               Euclidean
exp_dir        ./experiments/KDD_SEED/kdd_e64_s2
checkpoint             checkpoint_epoch120.model
recon_acc                                   None
Name: 2, dtype: object


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.941
------------------------------------
latent_size                           64
manifold                         Lorentz
exp_dir        ./experiments/KDD/kdd_010
checkpoint     checkpoint_epoch110.model
recon_acc                           None
Name: 3, dtype: object


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.93
------------------------------------
latent_size                                   64
manifold                                 Lorentz
exp_dir        ./experiments/KDD_SEED/kdd_l64_s1
checkpoint             checkpoint_epoch120.model
recon_acc                                   None
Name: 4, dtype: object


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.954
------------------------------------
latent_size                                   64
manifold                                 Lorentz
exp_dir        ./experiments/KDD_SEED/kdd_l64_s2
checkpoint             checkpoint_epoch120.model
recon_acc                                   None
Name: 5, dtype: object


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Recon. accuracy: 0.97
------------------------------------


## Latent Size of 32

In [60]:
# Boolean mask for the latent-size-32 models, then the matching rows.
idx = mdl_dir_df['latent_size'].eq(32)
sub_df = mdl_dir_df.loc[idx]

In [61]:
# Evaluate reconstruction accuracy for every model listed in `sub_df` and
# record the score in the matching row of `mdl_dir_df`.
for idx, row in sub_df.iterrows():
    print(row)
    exp_dir = row['exp_dir']
    checkpoint = row['checkpoint']
    config_path = os.path.join(exp_dir, 'configs.json')
    # The `with` block closes the file on exit; no explicit close() is needed.
    with open(config_path, 'r') as fp:
        configs = json.load(fp)
    configs['checkpoint'] = checkpoint
    model = load_model(configs)
    # The RNG is not reseeded between iterations, so each model is scored on
    # its own 1000-molecule draw from the test set.
    smiles_sample_lst = random.sample(smiles_test_lst, 1000)
    recon_score = recon_acc_score(configs, model, smiles_sample_lst)
    # `idx` is the row's index label (shared with `mdl_dir_df`). A single
    # `.loc` call writes into the frame itself; the previous chained
    # `['recon_acc'].iloc[idx] = ...` could assign to a temporary copy and
    # treated the label as a position.
    mdl_dir_df.loc[idx, 'recon_acc'] = recon_score
    print('Recon. accuracy: ' + str(recon_score))
    print('------------------------------------')

latent_size                           32
manifold                       Euclidean
exp_dir        ./experiments/KDD/kdd_015
checkpoint     checkpoint_epoch110.model
recon_acc                           None
Name: 6, dtype: object


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.865
------------------------------------
latent_size                                   32
manifold                               Euclidean
exp_dir        ./experiments/KDD_SEED/kdd_e32_s1
checkpoint             checkpoint_epoch130.model
recon_acc                                   None
Name: 7, dtype: object


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.911
------------------------------------
latent_size                                   32
manifold                               Euclidean
exp_dir        ./experiments/KDD_SEED/kdd_e32_s2
checkpoint             checkpoint_epoch110.model
recon_acc                                   None
Name: 8, dtype: object


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.937
------------------------------------
latent_size                           32
manifold                         Lorentz
exp_dir        ./experiments/KDD/kdd_016
checkpoint     checkpoint_epoch110.model
recon_acc                           None
Name: 9, dtype: object


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.881
------------------------------------
latent_size                                   32
manifold                                 Lorentz
exp_dir        ./experiments/KDD_SEED/kdd_l32_s1
checkpoint             checkpoint_epoch130.model
recon_acc                                   None
Name: 10, dtype: object


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.948
------------------------------------
latent_size                                   32
manifold                                 Lorentz
exp_dir        ./experiments/KDD_SEED/kdd_l32_s2
checkpoint             checkpoint_epoch110.model
recon_acc                                   None
Name: 11, dtype: object


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.89
------------------------------------


## Structure Only Models

### Euclidean Space

In [58]:
# Structure-only Euclidean model: score one checkpoint on three different
# random 1000-molecule draws from the test set (one draw per seed).
seed_lst = [0, 1, 2]
exp_dir = './experiments/EXP_TASK/exp_task_009'
checkpoint = 'checkpoint_epoch100.model'
config_path = os.path.join(exp_dir, 'configs.json')
# The `with` block closes the file on exit; no explicit close() is needed.
with open(config_path, 'r') as fp:
    configs = json.load(fp)
configs['checkpoint'] = checkpoint
model = load_model(configs)
# This experiment directory is not one of the rows of `mdl_dir_df`, so the
# scores are collected here. Previously each score was written into
# `mdl_dir_df` at whatever `idx` an earlier cell had left behind, overwriting
# an unrelated model's result.
euclidean_recon_scores = []
for s in seed_lst:
    random.seed(s)
    smiles_sample_lst = random.sample(smiles_test_lst, 1000)
    recon_score = recon_acc_score(configs, model, smiles_sample_lst)
    euclidean_recon_scores.append(recon_score)
    print('Recon. accuracy: ' + str(recon_score))
    print('------------------------------------')

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.905
------------------------------------


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.899
------------------------------------


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.894
------------------------------------


### Lorentz Space

In [59]:
# Structure-only Lorentz model: score one checkpoint on three different
# random 1000-molecule draws from the test set (one draw per seed).
seed_lst = [0, 1, 2]
exp_dir = './experiments/EXP_TASK/exp_task_010'
checkpoint = 'checkpoint_epoch075.model'
config_path = os.path.join(exp_dir, 'configs.json')
# The `with` block closes the file on exit; no explicit close() is needed.
with open(config_path, 'r') as fp:
    configs = json.load(fp)
configs['checkpoint'] = checkpoint
model = load_model(configs)
# This experiment directory is not one of the rows of `mdl_dir_df`, so the
# scores are collected here. Previously each score was written into
# `mdl_dir_df` at whatever `idx` an earlier cell had left behind, overwriting
# an unrelated model's result.
lorentz_recon_scores = []
for s in seed_lst:
    random.seed(s)
    smiles_sample_lst = random.sample(smiles_test_lst, 1000)
    recon_score = recon_acc_score(configs, model, smiles_sample_lst)
    lorentz_recon_scores.append(recon_score)
    print('Recon. accuracy: ' + str(recon_score))
    print('------------------------------------')

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.885
------------------------------------


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.854
------------------------------------


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


Recon. accuracy: 0.873
------------------------------------


In [5]:
# Mean of the three structure-only Lorentz accuracies (values copied by hand
# from the printed output of the Lorentz Space cell).
np.mean([0.885, 0.854, 0.873])

0.8706666666666667

In [6]:
# Standard deviation of the same three Lorentz accuracies, using NumPy's
# default ddof=0 (population form).
# NOTE(review): with only three runs, confirm whether the sample form
# (ddof=1) is what should be reported.
np.std([0.885, 0.854, 0.873])

0.01276279314605111