In [1]:

import os
from os.path import join, exists, basename
import argparse
import numpy as np
import paddle.fluid as fluid
import paddle
paddle.seed(0)
import random
np.random.seed(42) 
random.seed(42)
fluid.default_startup_program().random_seed = 42
fluid.default_main_program().random_seed = 42
import paddle.nn as nn
import pgl
import pandas as pd
from pahelix.model_zoo.gem_model import GeoGNNModel
from pahelix.utils import load_json_config
from pahelix.datasets.inmemory_dataset import InMemoryDataset
from rdkit.Chem import AllChem

from src.model import DownstreamModel
from src.featurizer import DownstreamTransformFn, DownstreamCollateFn
from src.utils import get_dataset, create_splitter, get_downstream_task_names, get_dataset_stat, \
        calc_rocauc_score, calc_rmse, calc_mae, exempt_parameters

In [34]:
cached_data_path=f"./cached_data/bbbp/rdkit"
splitter = create_splitter("scaffold")
dataset_new = InMemoryDataset(npz_data_path=cached_data_path)
train_dataset, valid_dataset, test_dataset = splitter.split(dataset_new,0.8,0.1,0.1)
test_dataset[0]["label"]

array([1], dtype=int64)

In [None]:
dataset_new[0]

In [10]:
stri="COC(=O)[C@@H]1[C@H]2C[C@H]3c4[nH]c5ccccc5c4CCN3C[C@@H]2CC[C@@H]1O"

mol=AllChem.MolFromSmiles(stri)

In [14]:
AllChem.MolToSmiles(mol)

'COC(=O)[C@H]1[C@@H](O)CC[C@H]2CN3CCc4c([nH]c5ccccc45)[C@@H]3C[C@@H]21'

In [None]:
stri="Cc1c[nH+][o+]c(C([NH])CC(C)C(C)(C)N(C(C)(C)C)C(C)(N)N)c1[O-]"
dataset_new = InMemoryDataset(npz_data_path="./cached_data/bbbp/rdkit")
for item in dataset_new:
        print(item["smiles"])

In [23]:
def std_smiles(item):
    mol=AllChem.MolFromSmiles(item)
    if mol==None:
        return "asd"
    smi=AllChem.MolToSmiles(mol)
    mol=AllChem.MolFromSmiles(item)
    smi=AllChem.MolToSmiles(mol)
    return smi
def save_test_data(dataset,targets,smiles_col):
    cached_data_path=f"./cached_data/{dataset}/rdkit"
    data_path=f"./chemrl_downstream_datasets/{dataset}/raw/{dataset}.csv"
    ref_df=pd.read_csv(data_path)
    if dataset=="bace":
        smiles_col="mol"
    ref_df[smiles_col]=ref_df[smiles_col].apply(std_smiles)
    ref_df=ref_df.set_index(smiles_col)
    print(dataset)
    splitter = create_splitter("scaffold")
    dataset_new = InMemoryDataset(npz_data_path=cached_data_path)
    train_dataset, valid_dataset, test_dataset = splitter.split(dataset_new,0.8,0.1,0.1)
    smiles_list_test={"smiles":[]}
    smiles_list_val={"smiles":[]}
    smiles_list_train={"smiles":[]}
    for target in targets:
        smiles_list_test[target]=[]
        smiles_list_val[target]=[]
        smiles_list_train[target]=[]
    for item in test_dataset:
        smiles_list_test["smiles"].append(item["smiles"])
        for i in range(len(targets)):
            mol=AllChem.MolFromSmiles(item["smiles"])
            smi=AllChem.MolToSmiles(mol)
            smiles_list_test[targets[i]].append(ref_df.loc[smi][targets[i]])
    for item in train_dataset:
        smiles_list_train["smiles"].append(item["smiles"])
        for i in range(len(targets)):
            mol=AllChem.MolFromSmiles(item["smiles"])
            smi=AllChem.MolToSmiles(mol)
            smiles_list_train[targets[i]].append(ref_df.loc[smi][targets[i]])
    for item in valid_dataset:
        smiles_list_val["smiles"].append(item["smiles"])
        for i in range(len(targets)):
            mol=AllChem.MolFromSmiles(item["smiles"])
            smi=AllChem.MolToSmiles(mol)
            smiles_list_val[targets[i]].append(ref_df.loc[smi][targets[i]])
    df_test=pd.DataFrame.from_dict(smiles_list_test)
    df_train=pd.DataFrame.from_dict(smiles_list_train)
    df_val=pd.DataFrame.from_dict(smiles_list_val)
    
    df_test.to_csv(f"test_data/"+dataset+"-gem-test.csv",index=False)
    df_train.to_csv(f"test_data/"+dataset+"-gem-train.csv",index=False)
    df_val.to_csv(f"test_data/"+dataset+"-gem-val.csv",index=False)


In [24]:
datasets=["esol","freesolv","lipo","qm7","bbbp","bace"]
#datasets=["bbbp"]
targets={"esol":['measured log solubility in mols per litre'],"freesolv":['expt'],"lipo":['exp'],"qm7":['u0_atom'],"bbbp":['p_np'],"bace":['Class']}
for dataset in datasets:
    smiles_col="smiles"
    
    save_test_data(dataset,targets[dataset],smiles_col)

esol
freesolv
lipo
qm7
bbbp
bace


In [12]:
def test_set_check(dataset,mode):
    cached_data_path=f"./cached_data/{dataset}/{mode}"
    splitter = create_splitter("scaffold")
    dataset = InMemoryDataset(npz_data_path=cached_data_path)
    train_dataset, valid_dataset, test_dataset = splitter.split(dataset,0.8,0.1,0.1)
    smiles_list=[]
    for item in test_dataset:
        smiles_list.append(item["smiles"])
    return smiles_list

In [16]:
lis1=["asd","a","b","c"]
list2=["a","b"]
set(lis1)-set(list2)

{'asd', 'c'}

In [23]:
datasets=["esol","freesolv","lipo","qm7","bbbp","bace"]
modes=["rdkit","graph","mmffless","geomol"]
for dataset in datasets:
    smiles_list=[]
    for mode in modes:
        smiles=test_set_check(dataset,mode)
        smiles_list.append(smiles)
    for i in range(len(modes)-1):
        for j in range(i+1,len(modes)):
            set1=set(smiles_list[i])
            set2=set(smiles_list[j])
            diff=0
            for item in set1:
                if item not in set2:
                    diff+=1
            for item in set2:
                if item not in set1:
                    diff+=1
            print(f"test_data/{dataset}:mode={modes[i]}-{modes[j]}:{diff}")


test_data/esol:mode=rdkit-graph:0
test_data/esol:mode=rdkit-mmffless:0
test_data/esol:mode=rdkit-geomol:0
test_data/esol:mode=graph-mmffless:0
test_data/esol:mode=graph-geomol:0
test_data/esol:mode=mmffless-geomol:0
test_data/freesolv:mode=rdkit-graph:0
test_data/freesolv:mode=rdkit-mmffless:0
test_data/freesolv:mode=rdkit-geomol:0
test_data/freesolv:mode=graph-mmffless:0
test_data/freesolv:mode=graph-geomol:0
test_data/freesolv:mode=mmffless-geomol:0
test_data/lipo:mode=rdkit-graph:0
test_data/lipo:mode=rdkit-mmffless:0
test_data/lipo:mode=rdkit-geomol:0
test_data/lipo:mode=graph-mmffless:0
test_data/lipo:mode=graph-geomol:0
test_data/lipo:mode=mmffless-geomol:0
test_data/qm7:mode=rdkit-graph:0
test_data/qm7:mode=rdkit-mmffless:0
test_data/qm7:mode=rdkit-geomol:0
test_data/qm7:mode=graph-mmffless:0
test_data/qm7:mode=graph-geomol:0
test_data/qm7:mode=mmffless-geomol:0
test_data/bbbp:mode=rdkit-graph:28
test_data/bbbp:mode=rdkit-mmffless:0
test_data/bbbp:mode=rdkit-geomol:28
test_data/

In [20]:
def model_datasets_check(path1,path2):
    df1=pd.read_csv(path1)
    df2=pd.read_csv(path2)
    smiles1=set(df1["smiles"])
    smiles2=set(df2["smiles"])
    for item in smiles1:
        if item not in smiles2:
            print(item)
    diff=list(smiles1-smiles2).extend((smiles2-smiles1))
    return diff

In [30]:
print(model_datasets_check("test_data/test.csv","test_data/bbbp-gem-test.csv"))

None


In [56]:
test_dataset="test_data/bace-gem-test.csv"
ref_dataset="chemrl_downstream_datasets/bace/raw/bace.csv"
ref_df=pd.read_csv(ref_dataset)
test_df=pd.read_csv(test_dataset)
ref_df=ref_df.set_index("mol")
for item in test_df.iterrows():
    smi=item[1]["smiles"]
    ref_mol=ref_df.loc[smi]
    if ref_mol["Class"]!=item[1]["Class"]:
        print(ref_mol["Class"],item[1]["Class"])


0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1
0 -1


In [46]:
ref_df

Unnamed: 0,mol,CID,Class,Model,pIC50,MW,AlogP,HBA,HBD,RB,...,PEOE6 (PEOE6),PEOE7 (PEOE7),PEOE8 (PEOE8),PEOE9 (PEOE9),PEOE10 (PEOE10),PEOE11 (PEOE11),PEOE12 (PEOE12),PEOE13 (PEOE13),PEOE14 (PEOE14),canvasUID
0,O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2c...,BACE_1,1,Train,9.154901,431.56979,4.4014,3,2,5,...,53.205711,78.640335,226.855410,107.434910,37.133846,0.000000,7.980170,0.000000,0.000000,1
1,Fc1cc(cc(F)c1)C[C@H](NC(=O)[C@@H](N1CC[C@](NC(...,BACE_2,1,Train,8.853872,657.81073,2.6412,5,4,16,...,73.817162,47.171600,365.676940,174.076750,34.923889,7.980170,24.148668,0.000000,24.663788,2
2,S1(=O)(=O)N(c2cc(cc3c2n(cc3CC)CC1)C(=O)N[C@H](...,BACE_3,1,Train,8.698970,591.74091,2.5499,4,3,11,...,70.365707,47.941147,192.406520,255.752550,23.654478,0.230159,15.879790,0.000000,24.663788,3
3,S1(=O)(=O)C[C@@H](Cc2cc(O[C@H](COCC)C(F)(F)F)c...,BACE_4,1,Train,8.698970,591.67828,3.1680,4,3,12,...,56.657166,37.954151,194.353040,202.763350,36.498634,0.980913,8.188327,0.000000,26.385181,4
4,S1(=O)(=O)N(c2cc(cc3c2n(cc3CC)CC1)C(=O)N[C@H](...,BACE_5,1,Train,8.698970,629.71283,3.5086,3,3,11,...,78.945702,39.361153,179.712880,220.461300,23.654478,0.230159,15.879790,0.000000,26.100143,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1508,Clc1cc2nc(n(c2cc1)C(CC(=O)NCC1CCOCC1)CC)N,BACE_1543,0,Test,3.000000,364.86969,2.5942,3,2,6,...,37.212799,37.681076,180.226410,95.670128,30.107586,9.368159,7.980170,0.000000,0.000000,1543
1509,Clc1cc2nc(n(c2cc1)C(CC(=O)NCc1ncccc1)CC)N,BACE_1544,0,Test,3.000000,357.83731,2.8229,3,2,6,...,45.792797,47.349350,122.401500,99.877144,30.107586,9.368159,7.980170,0.000000,0.000000,1544
1510,Brc1cc(ccc1)C1CC1C=1N=C(N)N(C)C(=O)C=1,BACE_1545,0,Test,2.953115,320.18451,3.0895,2,1,2,...,47.790600,22.563574,96.290794,58.798935,20.071724,9.368159,0.000000,6.904104,0.000000,1545
1511,O=C1N(C)C(=NC(=C1)C1CC1c1cc(ccc1)-c1ccccc1)N,BACE_1546,0,Test,2.733298,317.38440,3.8595,2,1,3,...,77.219978,9.316234,95.907784,112.609720,20.071724,9.368159,0.000000,6.904104,0.000000,1546
