In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os.path
# Global seaborn styling applied to every figure drawn after this cell.
sns.set_context("poster")
sns.set_style("whitegrid")
sns.set_palette("Set2")
# Capture the currently active palette (not referenced again in the cells shown here).
pal = sns.color_palette()

import pandas as pd
import sys
import umap

from rdkit import Chem
from atomsci.ddm.pipeline import model_pipeline as mp
from atomsci.ddm.pipeline import parameter_parser as parse
import atomsci.ddm.utils.struct_utils as struct_utils
import atomsci.ddm.pipeline.model_tracker as mt
import atomsci.ddm.pipeline.chem_diversity as cd
import atomsci.ddm.pipeline.predict_from_model as pfm
import atomsci.ddm.pipeline.featurization as feat
import atomsci.ddm.pipeline.diversity_plots as dp

import warnings
warnings.filterwarnings(action='once')
from sklearn.metrics import roc_curve, auc, roc_auc_score, r2_score, precision_recall_curve, average_precision_score, confusion_matrix

# Cap how many rows / columns pandas renders when a DataFrame is displayed.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 90)

DEBUG:ATOM:Model tracker client not supported in your environment; will save models in filesystem only.


In [2]:
p1_training_data_path = 'PARP1_gostar_2022-06-20_chembl_30_gostar_custom_2022-06-23_UCSF_pIC50_agg.csv'

In [3]:
# Saved model archive (.tar.gz) handed to pfm.predict_from_model_file further down.
p1_path = "20230904_20220620_updated_PARP1_chembl_gostar_ucsf_pIC50_agg_model_d0e0add7-f76f-43df-880f-3c280af2501c.tar.gz"
# paths= [p1_path]
# NOTE(review): `models` and `feats` are not read by any later cell shown here.
models= ['PARP1_pXC50']
feats = ['graphconv']

In [4]:
df=pd.read_csv(p1_training_data_path)

In [5]:
df

Unnamed: 0,compound_id,base_rdkit_smiles,relation,activity_value
0,gvk_121938715,Cc1cc2c(=O)[nH]c(-c3cnn(C(C)(C)C)c3)cn2c1,,6.522879
1,gvk_122322701,CC(C)(O)c1ccc(-c2nn3cc(Cl)cc3c(=O)[nH]2)cc1,,5.522879
2,Otava_1084099,Cc1oc2ccc(O)cc2c1C(=O)Nc1cc(Cl)ccc1Cl,,4.444000
3,gvk_125398920,COc1cc(C=C2C(=O)N(C)C(=O)N(C)C2=O)cc(CN2CCOCC2...,,7.381638
4,gvk_125378925,O=c1cc(-c2ccc(O)cc2)oc2c(CN3CCN(c4ccc(O)cc4)CC...,,7.401209
...,...,...,...,...
9406,gvk_3823972,O=c1[nH]nc(CCCCN2CC=C(c3ccc(F)cc3)CC2)c2ccccc12,,7.309804
9407,gvk_15323173,O=C(c1cccc(CCl)c1)N1CCc2c([nH]c(=O)c3ccccc23)C1,,6.097453
9408,gvk_15327451,CCc1cc(-c2ccc(CNCc3cnn(C)c3)o2)c(C)[nH]c1=O,,6.522879
9409,gvk_125440805,O=C1N(c2cncc(Cc3n[nH]c(=O)c4ccccc34)c2)c2ccccc...,>,8.000000


In [6]:
smiles = df['base_rdkit_smiles'][0]

In [7]:
gen_smiles_df = pd.DataFrame(data={'smiles': smiles}, index=[0])

In [8]:
gen_smiles_df

Unnamed: 0,smiles
0,Cc1cc2c(=O)[nH]c(-c3cnn(C(C)(C)C)c3)cn2c1


In [9]:
input_df = gen_smiles_df

In [10]:
smiles_col = 'smiles'

In [11]:
dont_standardize = True # smiles have already been standardized
AD_method = 'z_score'

In [12]:
is_featurized=False

In [13]:
# Score the single-compound frame with the saved PARP1 model.
# The call reads the settings defined in the preceding cells (`input_df`,
# `smiles_col`, `dont_standardize`, `is_featurized`) instead of repeating
# literal values, so changing a setting actually changes the prediction run.
pred_df = pfm.predict_from_model_file(
    model_path=p1_path,
    id_col=None,
    input_df=input_df,
    smiles_col=smiles_col,
    dont_standardize=dont_standardize,
    is_featurized=is_featurized,
    # AD_method=AD_method,  # left disabled, as in the original call
)

INFO:atomsci.ddm.utils.model_version_utils:20230904_20220620_updated_PARP1_chembl_gostar_ucsf_pIC50_agg_model_d0e0add7-f76f-43df-880f-3c280af2501c.tar.gz, 1.6.0
INFO:atomsci.ddm.utils.model_version_utils:Version compatible check: 20230904_20220620_updated_PARP1_chembl_gostar_ucsf_pIC50_agg_model_d0e0add7-f76f-43df-880f-3c280af2501c.tar.gz version = "1.6.0", AMPL version = "1.6.0"


num_model_tasks is deprecated and its value is ignored.
Featurization = DynamicFeaturization with graphconv features
number of features: 75


In [14]:
pred_df

Unnamed: 0,smiles,compound_id,activity_value_pred
0,Cc1cc2c(=O)[nH]c(-c3cnn(C(C)(C)C)c3)cn2c1,compound_000000,5.217463


# Predict quicker

In [15]:
import pickle
import numpy as np

In [16]:
from atomsci.ddm.pipeline.predict_from_model import _prepare_input_data

In [None]:
# Result archives to harvest sampled SMILES from, one per `k_*` directory.
# NOTE(review): the last two entries are the same k_1e-5 file. The list is kept
# as written in case a different path was intended, but the repeated path is
# skipped when loading so the same molecules are not collected twice.
res_files = [
    "/home/alif/JTVAE/logs/bo/chem_pXC50/rank/k_1e-2/r_50/seed730007773/results.npz",
    "/home/alif/JTVAE/logs/bo/chem_pXC50/rank/k_1e-3/r_50/seed730007773/results.npz",
    "/home/alif/JTVAE/logs/bo/chem_pXC50/rank/k_1e-4/r_50/seed730007773/results.npz",
    "/home/alif/JTVAE/logs/bo/chem_pXC50/rank/k_1e-5/r_50/seed730007773/results.npz",
    "/home/alif/JTVAE/logs/bo/chem_pXC50/rank/k_1e-5/r_50/seed730007773/results.npz",
]

all_smiles = []

# dict.fromkeys removes repeated paths while keeping the listed order.
for res_file in dict.fromkeys(res_files):
    # allow_pickle=True unpickles stored objects: only load archives you trust.
    # The context manager closes the underlying .npz file handle.
    with np.load(res_file, allow_pickle=True) as results:
        sample_points = results['sample_points'].reshape((-1, ))

    # Unique, non-None SMILES in first-seen order. Unlike set(), this ordering
    # is the same on every run, so later slicing of all_smiles is reproducible.
    smiles = list(dict.fromkeys(s for s in sample_points if s is not None))
    print(len(smiles))
    all_smiles.extend(smiles)

In [None]:
import bionetgen
import contextlib
import os
import pandas as pd

import numpy as np

import pickle

from atomsci.ddm.pipeline import model_pipeline as mp
from atomsci.ddm.pipeline import parameter_parser as parse

# Saved model archive used to build the reusable prediction pipeline.
xc_path = "/home/alif/JTVAE/updated_pXC50_predictor/PARP1_CGUAgg_2022-06_fingerprint_graphconv_model_4f296899-1e4f-4d08-a7c5-47ef64d7fec3.tar.gz"

model_path = xc_path
smiles_col = 'smiles'
response_col = 'pXC50'
# NOTE(review): dont_standardize and is_featurized are assigned here but are
# not passed to anything in this cell.
dont_standardize = True
is_featurized = False

# Raw settings handed to the AMPL parameter parser.
# NOTE(review): 'featurizer' is 'computed_descriptors' while the model file name
# mentions graphconv - confirm which one the loaded model actually uses.
pred_params = {'featurizer': 'computed_descriptors', 
               'result_dir': None,
               'id_col': 'compound_id', 
               'smiles_col': smiles_col,
               'response_cols': response_col}

# Rebinds pred_params from the plain dict to whatever parse.wrapper returns.
pred_params = parse.wrapper(pred_params)
          
def get_pipeline(pred_params, model_path, reload_dir=None, verbose=False):
    """Build a prediction pipeline from a saved model archive.

    Args:
        pred_params: Parameters object to pass through (the notebook passes
            the value returned by ``parse.wrapper``).
        model_path: Path of the saved model ``.tar.gz`` file.
        reload_dir: Forwarded unchanged to
            ``mp.create_prediction_pipeline_from_file``. Defaults to None.
        verbose: Forwarded unchanged to
            ``mp.create_prediction_pipeline_from_file``. Defaults to False.

    Returns:
        Whatever ``mp.create_prediction_pipeline_from_file`` returns.
    """
    # Forward reload_dir and verbose rather than hard-coding None / False,
    # so the arguments a caller supplies actually take effect.
    return mp.create_prediction_pipeline_from_file(pred_params,
                                                   reload_dir=reload_dir,
                                                   model_path=model_path,
                                                   verbose=verbose)
     
pipe = get_pipeline(pred_params=pred_params,model_path=model_path)

In [None]:
def pXC50(smiles):
    """Predict a value for one SMILES string using the notebook-level ``pipe``.

    The SMILES is wrapped in a one-element list and scored with
    ``pipe.predict_on_smiles(..., AD_method='z_score')``; entry 0 of the
    'pred' column of the result is returned. Anything written to stdout
    while predicting is discarded.
    """
    with contextlib.redirect_stdout(None):
        prediction_frame = pipe.predict_on_smiles([smiles], AD_method='z_score')
        return prediction_frame['pred'][0]

In [None]:
from tqdm.notebook import tqdm
pXC50_dict = {}

# all_smiles is concatenated across result files and may repeat a molecule.
# dict.fromkeys keeps first-seen order while letting each distinct SMILES be
# scored once. The resulting dict has the same keys as before (a repeat would
# only overwrite its own entry), provided pXC50() gives the same value for the
# same SMILES.
# Wrapping the iterable lets tqdm advance the progress bar by itself.
for smiles in tqdm(dict.fromkeys(all_smiles)):
    pXC50_dict[smiles] = pXC50(smiles)

In [None]:
# Persist the SMILES -> predicted value mapping so it can be reloaded later.
with open('pXC50_new_PARP_gen_with_pXC50_opt.pkl', 'wb') as pkl_out:
    pickle.dump(pXC50_dict, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Read the mapping back from the pickle written by the previous cell.
# pickle.load can run arbitrary code, so only open files you created yourself.
with open('pXC50_new_PARP_gen_with_pXC50_opt.pkl', "rb") as pkl_in:
    loaded_pXC50_dict = pickle.load(pkl_in)

In [None]:
len(loaded_pXC50_dict)

In [None]:
# Print the reloaded value for every entry of all_smiles, in list order.
# A SMILES missing from the reloaded dict raises KeyError here.
for smiles in all_smiles:
    print(loaded_pXC50_dict[smiles])

# Prediction

In [None]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import warnings
# simplefilter("ignore") is installed after (and ahead of) the 'once' filter,
# so in practice all warnings are suppressed from here on.
warnings.filterwarnings(action='once')
warnings.simplefilter("ignore")

import timeit

# Start of a wall-clock measurement; no later cell shown here reads `start`.
start = timeit.default_timer()

#Your statements here

import bionetgen
import contextlib
import os
import pandas as pd

import numpy as np

import pickle

from atomsci.ddm.pipeline import predict_from_model as pfm

from multiprocessing import Pool

In [None]:
def calc_therapeutic_score(pXC50):
    """Run the BioNetGen toy model for one pXC50 value and return its score.

    The .bngl model file is loaded on every call, its ``IC50`` parameter is
    set to ``10 ** (6 - pXC50)``, and the model is run with stdout silenced.
    The returned score is the entry at index 6 of the last row of the
    'Apopt Repair Toy Model 102521 v1.0' result table.
    """
    bngl_file = "/home/alif/BioNetGen/Apopt Repair Toy Model 102521 v1.0.bngl"
    result_key = 'Apopt Repair Toy Model 102521 v1.0'

    with contextlib.redirect_stdout(None):
        bng_model = bionetgen.bngmodel(bngl_file, 'model')
        bng_model.parameters.IC50 = 10 ** (6 - pXC50)
        simulation = bionetgen.run(bng_model)
        final_row = simulation[result_key][-1]

    return final_row[6]

In [None]:
xc_path = "/home/alif/JTVAE/updated_pXC50_predictor/PARP1_CGUAgg_2022-06_fingerprint_graphconv_model_4f296899-1e4f-4d08-a7c5-47ef64d7fec3.tar.gz"

In [None]:
# Settings read by the pfm.predict_from_model_file call a few cells below.
model_path = xc_path
smiles_col = 'smiles'
response_col = 'pXC50'
dont_standardize = True
is_featurized = False

In [None]:
# Load a pickled dict; its keys are used as the SMILES list in the next cell.
# pickle.load can run arbitrary code, so only open files you trust.
with open('/media/alif/Alif/Research(Yoon)/weighted-retraining/data/chem/zinc/orig_model/pen_logP_all.pkl', "rb") as f:
    property_dict = pickle.load(f)

In [None]:
# The dictionary keys are the molecules to score.
all_smiles = list(property_dict.keys())

# Keep only the first 1000 entries.
all_smiles = all_smiles[:1000]

In [None]:
input_df = pd.DataFrame(all_smiles, columns = ['smiles'])

In [None]:
# Batch-score every row of input_df with the saved model; all settings come
# from the variables assigned in the preceding cells.
pred_df = pfm.predict_from_model_file(
    model_path=model_path,
    input_df=input_df,
    smiles_col=smiles_col,
    response_col=response_col,
    dont_standardize=dont_standardize,
    is_featurized=is_featurized,
    AD_method='z_score',
)

In [None]:
# NOTE(review): this rebinds `pXC50` to a list. An earlier cell defines a
# function with the same name, which is no longer reachable after this line.
pXC50 = list(pred_df['activity_value_pred'])

In [None]:
# One calc_therapeutic_score call per predicted value, in prediction order.
therapeutic_score = [calc_therapeutic_score(pIC50) for pIC50 in pXC50]

In [None]:
# model = bionetgen.bngmodel("/home/alif/BioNetGen/Apopt Repair Toy Model 102521 v1.0.bngl", 'model')

In [None]:
# model.parameters.IC50 = 10 ** (6 - pXC50[0])

In [None]:
# result = bionetgen.run(model)

In [None]:
pred_df

In [None]:
# Start from a clean slate: a later cell appends to test.txt, so delete any
# copy left over from a previous run.
if os.path.exists('test.txt'):
    os.remove('test.txt')

In [None]:
# Build the SMILES -> predicted value mapping and append one "smiles: value"
# line per prediction row to test.txt.
pXC50_dict = {}
# The file is opened once in append mode rather than once per row.
with open('test.txt', 'a') as f:
    # Distinct loop names keep this cell from overwriting the notebook-level
    # `smiles` and `pXC50` variables used by other cells.
    for smi, pred_value in zip(pred_df['smiles'], pred_df['activity_value_pred']):
        pXC50_dict[smi] = pred_value
        f.write(smi + ': ' + str(pred_value) + '\n')

In [None]:
# Save the SMILES -> predicted value mapping as a pickle.
with open('pXC50.pkl', 'wb') as pkl_out:
    pickle.dump(pXC50_dict, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Rebinds property_dict to the contents of a different pickle file; the later
# calc_therapeutic_score(smiles) looks values up in this dict.
# pickle.load can run arbitrary code, so only open files you trust.
with open('/media/alif/Alif/Research(Yoon)/PARP Inhibition/pXC50_new_PARP.pkl', "rb") as f:
    property_dict = pickle.load(f)

In [None]:
max(property_dict.values())

In [None]:
def calc_therapeutic_score(smiles):
    """Run the BioNetGen toy model for one molecule and return its score.

    The molecule's value is looked up in the notebook-level ``property_dict``
    and the model's ``IC50`` parameter is set to ``10 ** (6 - value)``. The
    model is run with stdout silenced, and the entry at index 6 of the last
    row of the 'Apopt Repair Toy Model 102521 v1.0' result table is returned.

    Note: this definition replaces the earlier ``calc_therapeutic_score``,
    which takes the pXC50 value directly instead of a SMILES string.

    Args:
        smiles: Key into ``property_dict``.

    Raises:
        KeyError: if ``smiles`` is not present in ``property_dict``.
    """
    with contextlib.redirect_stdout(None):
        pXC50 = property_dict[smiles]
        model = bionetgen.bngmodel("/media/alif/Alif/Research(Yoon)/BioNetGen/Apopt Repair Toy Model 102521 v1.0.bngl", 'model')
        # Use this molecule's own value. The whole
        # pred_df['activity_value_pred'] column was used here before, which
        # ignored the lookup above and is not a scalar.
        model.parameters.IC50 = 10 ** (6 - pXC50)
        result = bionetgen.run(model)
        therapeutic_score = result['Apopt Repair Toy Model 102521 v1.0'][-1][6]

    return therapeutic_score

In [None]:
model = bionetgen.bngmodel("/media/alif/Alif/Research(Yoon)/BioNetGen/Apopt Repair Toy Model 102521 v1.0.bngl", 'model')

In [None]:
# Same 10 ** (6 - value) conversion as in calc_therapeutic_score, here with a
# hard-coded value of 1.47.
# NOTE(review): the source of 1.47 is not shown - confirm.
model.parameters.IC50 = 10 ** (6 - 1.47)

In [None]:
result = bionetgen.run(model)

In [None]:
therapeutic_score = result['Apopt Repair Toy Model 102521 v1.0'][-1][6]

In [None]:
therapeutic_score

# Predict Quicker

In [None]:
import glob
import numpy as np

In [None]:
# Directory holding the sampled-molecule archives (*.npz).
result_dir = "/home/alif/JTVAE/sample-results"
# glob returns files in arbitrary order. Sorting makes the order of
# all_smiles, and therefore the later all_smiles[:1000] subset, reproducible.
res_files = sorted(glob.glob(result_dir+'/*.npz'))

In [None]:
# Collect the distinct, non-None SMILES stored under 'sample_points' in each
# result archive.
all_smiles = []

for res_file in res_files:
    # allow_pickle=True unpickles stored objects: only load archives you trust.
    # The context manager closes the underlying .npz file handle.
    with np.load(res_file, allow_pickle=True) as results:
        sample_points = results['sample_points'].reshape((-1, ))

    # First-seen order instead of set() order, so the later all_smiles[:1000]
    # slice picks the same molecules on every run.
    smiles = list(dict.fromkeys(s for s in sample_points if s is not None))
    print(res_file, len(smiles))
    all_smiles.extend(smiles)

In [None]:
import bionetgen
import contextlib
import os
import pandas as pd

import numpy as np

import pickle

from atomsci.ddm.pipeline import predict_from_model as pfm

In [None]:
xc_path = "/home/alif/JTVAE/updated_pXC50_predictor/PARP1_CGUAgg_2022-06_fingerprint_graphconv_model_4f296899-1e4f-4d08-a7c5-47ef64d7fec3.tar.gz"

In [None]:
# Settings read by the pfm.predict_from_model_file call below.
model_path = xc_path
smiles_col = 'smiles'
response_col = 'pXC50'
dont_standardize = True
is_featurized = False

In [None]:
# Keep only the first 1000 SMILES, then wrap them in a one-column frame
# ('smiles') for prediction.
all_smiles = all_smiles[:1000]
input_df = pd.DataFrame(all_smiles, columns = ['smiles'])

In [None]:
# Batch-score every row of input_df with the saved model; all settings come
# from the variables assigned in the preceding cells.
pred_df = pfm.predict_from_model_file(
    model_path=model_path,
    input_df=input_df,
    smiles_col=smiles_col,
    response_col=response_col,
    dont_standardize=dont_standardize,
    is_featurized=is_featurized,
    AD_method='z_score',
)

In [None]:
# Map each SMILES to its predicted value. Building the dict directly avoids
# loop variables that would overwrite the notebook-level `smiles` and `pXC50`.
# If a SMILES occurs more than once, its last value wins.
pXC50_dict = dict(zip(pred_df['smiles'], pred_df['activity_value_pred']))

In [None]:
pXC50_dict

In [None]:
# Save the predictions for the generated molecules as a pickle.
with open('gen_pXC50.pkl', 'wb') as pkl_out:
    pickle.dump(pXC50_dict, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)