In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Imports
import os
import json
import csv
import pandas as pd
from utils.download import download_3d_similar_molecules
from utils.chem import compute_3d_similarity, extract_features

In [6]:
# Paths
notebook = os.path.join(".")
temp = os.path.join(notebook, ".temp") # use to download temporary files (temporary downloads).
if not os.path.exists(temp):
    os.makedirs(temp)

molecule_name = "remdesivir"
temp = os.path.join(temp, molecule_name)

if not os.path.exists(temp):
    os.makedirs(temp)

reinvent_prior_path = os.path.join(notebook, '..', 'models', 'reinvent.prior')

In [7]:
# Download Smiles
input_smiles = "CC1(OC2C(OC(C2O1)(C#N)C3=CC=C4N3N=CN=C4N)CO)C"
similar_str_smiles_path = os.path.join(temp, "similar.json")

download_3d_similar_molecules(input_smiles, similar_str_smiles_path)

True

In [8]:
# Reading the downloaded 3d similar structures.

similar_str_smiles = {}
with open(similar_str_smiles_path) as reader:
    similar_str_smiles = json.load(reader)

df = pd.DataFrame(data=similar_str_smiles)
df

Unnamed: 0,smiles,identifier,similarity
0,CC1(C)O[C@H]2[C@H](n3c(Br)nc4c3ncnc4N)O[C@H](C...,ZINC15 : ZINC000017381098,0.369863
1,CC1(C)O[C@H]2[C@@H](CO)O[C@@H](n3c(Br)nc4c3ncn...,ZINC15 : ZINC000095949869,0.369863
2,CC1(C)O[C@@H]2[C@H](CO)O[C@@H](n3cnc4c3ncnc4N)...,ZINC15 : ZINC000100807906,0.366197
3,CC1(C)O[C@H]2[C@H](n3cnc4c3ncnc4N)O[C@@H](CO)[...,ZINC15 : ZINC000012958516,0.366197
4,CC1(C)O[C@H]2[C@H](n3cnc4c3ncnc4N)O[C@H](CO)[C...,ZINC15 : ZINC000004347645,0.366197
...,...,...,...
395,CC1(C)O[C@@H]2[C@@H](O1)[C@@H](CO)O[C@@H]2n1cn...,ZINC15 : ZINC000008955192,0.285714
396,CC1(C)O[C@@H]2[C@@H](CO)O[C@@H](n3cnc4c3nc[nH]...,ZINC15 : ZINC000101133086,0.285714
397,CC1(C)O[C@@H]2[C@@H](CO)O[C@H](n3cnc4c3nc[nH]c...,ZINC15 : ZINC000004538849,0.285714
398,CC1(C)O[C@@H]2[C@@H](O1)[C@H](CO)O[C@@H]2n1cnc...,ZINC15 : ZINC000004538848,0.285714


In [9]:
# Training (80%), Validation(10%), Test(10%)
df = df.sample(frac=1)

# Define your split sizes
train_size = int(0.8 * len(df))

# Split your DataFrame
train_df = df[:train_size]
valid_df = df[train_size:]

train_set_file = os.path.join(temp, 'training.smi')
valid_set_file = os.path.join(temp, 'validation.smi')


train_df.to_csv(train_set_file, sep="\t", index=False, header=False)
valid_df.to_csv(valid_set_file, sep="\t", index=False, header=False)

In [10]:
# Transfer learning config. (Ref: https://github.com/MolecularAI/REINVENT4/blob/main/notebooks/Reinvent_TLRL.py)

config_filename = os.path.join(temp, 'config.json')
temp_models = os.path.join(temp, 'checkpoints')

if not os.path.exists(temp_models):
    os.mkdir(temp_models)

new_model_path = os.path.join(temp_models, 'temp.model')

reinvet_transfer_learning_parameter = {
    "run_type": "transfer_learning",
    "device": "cpu",
    "tb_logdir": os.path.join(temp, 'tb_TL'),
    "parameters": {
        "num_epochs": 100,
        "save_every_n_epochs": 2,
        "batch_size": 50,
        "sample_batch_size": 100,
        "input_model_file": reinvent_prior_path,
        "output_model_file": new_model_path,
        "smiles_file": train_set_file,
        "validation_smiles_file": valid_set_file,
        "standardize_smiles": True,
        "randomize_smiles": False,
        "randomize_all_smiles": False,
        "internal_diversity": True,
    },
}

with open(config_filename, "w") as writer:
    json.dump(reinvet_transfer_learning_parameter, writer, indent=2)

In [11]:
# Transfer Learning.

!reinvent $config_filename -f json

19:38:37 <INFO> Started REINVENT 4.4.22 (C) AstraZeneca 2017, 2023 on 2024-07-22
19:38:37 <INFO> Command line: /root/miniconda3/envs/reinvent-transfer-learning/bin/reinvent ./.temp/remdesivir/config.json -f json
19:38:37 <INFO> User root on host Ank
19:38:37 <INFO> Python version 3.11.9
19:38:37 <INFO> PyTorch version 2.3.1+cu121, git d44533f9d073df13895333e70b66f81c513c1889
19:38:37 <INFO> PyTorch compiled with CUDA version 12.1
19:38:37 <INFO> RDKit version 2023.09.5
19:38:37 <INFO> Platform Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
19:38:37 <INFO> Number of PyTorch CUDA devices 1
19:38:37 <INFO> Using CPU x86_64
19:38:37 <INFO> Writing TensorBoard summary to /mnt/d/projects/github/reinvent-transfer-learning/notebooks/.temp/remdesivir/tb_TL
19:38:37 <INFO> Starting Transfer Learning
19:38:37 <INFO> /mnt/d/projects/github/reinvent-transfer-learning/models/reinvent.prior has valid hash:
{ 'comments': [],
  'creation_date': 0,
  'date_format': 'UNIX epoch',
  'hash_

In [12]:
# Running new model
new_model_config_path = os.path.join(temp, '_config.json')
output_smiles = os.path.join(temp, 'output.csv')
config = {
    "run_type": "sampling",
    "device": "cpu",
    "parameters": {
        "model_file": new_model_path,
        "output_file": output_smiles,
        "num_smiles": 500,
        "unique_molecules": True,
        "randomize_smiles": True,
    }
}

with open(new_model_config_path, "w") as writer:
    json.dump(config, writer, indent=2)

!reinvent $new_model_config_path -f json

19:41:31 <INFO> Started REINVENT 4.4.22 (C) AstraZeneca 2017, 2023 on 2024-07-22
19:41:31 <INFO> Command line: /root/miniconda3/envs/reinvent-transfer-learning/bin/reinvent ./.temp/remdesivir/_config.json -f json
19:41:31 <INFO> User root on host Ank
19:41:31 <INFO> Python version 3.11.9
19:41:31 <INFO> PyTorch version 2.3.1+cu121, git d44533f9d073df13895333e70b66f81c513c1889
19:41:31 <INFO> PyTorch compiled with CUDA version 12.1
19:41:31 <INFO> RDKit version 2023.09.5
19:41:31 <INFO> Platform Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
19:41:31 <INFO> Number of PyTorch CUDA devices 1
19:41:31 <INFO> Using CPU x86_64
19:41:31 <INFO> Starting Sampling
19:41:31 <INFO> /mnt/d/projects/github/reinvent-transfer-learning/notebooks/.temp/remdesivir/checkpoints/temp.model has valid hash:
{ 'comments': ['TL'],
  'creation_date': 0,
  'date_format': 'UNIX epoch',
  'hash_id': 'c5b16ad84d26ec1cad7daa01a42e793b',
  'hash_id_format': 'xxhash.xxh3_128_hex 3.4.1',
  'model_id': '5

In [26]:
%%time
# Attempting to filter out non-similar molecule without using ML
# In this apprach we will try to calculate RMSD of all the molecules generated
# by our new model. If RMSD is heigher than 2 then we will ignore the molecules.

entries = []

with open(output_smiles) as reader:
    rows = csv.reader(reader)
    next(rows, None) # Skipping header
    for row in rows:
        entries.append(row)

scores = []

for entry in entries:
    score = compute_3d_similarity(input_smiles, entry[0])
    if score[0] is False or score[1] > 2:
        continue
    scores.append({"smile": entry[0], "score": score[1]})

without_ml = os.path.join(temp, 'without_ml.csv')
df = pd.DataFrame(data=scores)

df.to_csv(without_ml, index=False)
df

CPU times: user 7.03 s, sys: 0 ns, total: 7.03 s
Wall time: 7.04 s


Unnamed: 0,smile,score
0,CC1(C)CN(c2cc(N)nc3ccnn23)CC(CO)O1,1.506041
1,CC1(C)OC2C(CO)OC(n3cnc(C(N)=O)c3N)C2O1,1.623593
2,CC1(C)CC(CN)C2CC(C1)C2(C)C,1.797067
3,CC1(C)CN(c2cc(N)n3ncnc3n2)CC(CO)O1,1.406245
4,CC(=O)OCC1OC(n2c(Br)nc3c(N)ncnc32)C2OC(C)(C)OC12,1.626120
...,...,...
129,COCC(O)Cn1cc(CN2CC(C)OC(C)C2)nn1,1.853355
130,CCOC1OC(COc2ncnc3ccnn23)C2OC(C)(C)OC12,1.461097
131,CC1(C)C(C(=O)Nc2ccccc2)N2C(=O)C(=Cc3ccccn3)C2S...,1.836153
132,CC1(C)CC(CC(N)C(=O)O)OC(C)(C)O1,1.370143


# Traning a new classifier model

In [159]:
# Generating Random molecules
# We will use reinent to generate random molecules.

# reinvent_path = os.path.join(notebook, '..', 'models', 'reinvent.prior')
# random_config_path = os.path.join(temp, 'random_config.json')
# out = os.path.join(temp, 'random.csv')
# config = {
#     "run_type": "sampling",
#     "device": "cpu",
#     "parameters": {
#         "model_file": reinvent_path,
#         "output_file": out,
#         "num_smiles": 5000,
#         "unique_molecules": True,
#         "randomize_smiles": True,
#     }
# }

# with open(random_config_path, "w") as writer:
#     json.dump(config, writer, indent=2)


# !reinvent $random_config_path -f json

20:07:22 <INFO> Started REINVENT 4.4.22 (C) AstraZeneca 2017, 2023 on 2024-07-15
20:07:22 <INFO> Command line: /root/miniconda3/envs/reinvent-transfer-learning/bin/reinvent ./.temp/cephalotaxin/random_config.json -f json
20:07:22 <INFO> User root on host Ank
20:07:22 <INFO> Python version 3.11.9
20:07:22 <INFO> PyTorch version 2.3.1+cu121, git d44533f9d073df13895333e70b66f81c513c1889
20:07:22 <INFO> PyTorch compiled with CUDA version 12.1
20:07:22 <INFO> RDKit version 2023.09.5
20:07:22 <INFO> Platform Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
20:07:22 <INFO> Number of PyTorch CUDA devices 1
20:07:22 <INFO> Using CPU x86_64
20:07:22 <INFO> Starting Sampling
20:07:22 <INFO> /mnt/d/projects/github/reinvent-transfer-learning/models/reinvent.prior has valid hash:
{ 'comments': [],
  'creation_date': 0,
  'date_format': 'UNIX epoch',
  'hash_id': '173568c36e1fc3d95cab289c7d31ce0b',
  'hash_id_format': 'xxhash.xxh3_128_hex 3.4.1',
  'model_id': '55d68f8a81c04f5a86304ebe1

In [45]:
%%time
negative_entries = []
# Make sure to download and keep the reference library in .temp folder.
# Download it from here: https://github.com/ersilia-os/groverfeat/blob/main/data/reference_library.csv
reference_smiles_path = os.path.join(notebook, '.temp', 'reference_library.csv')

with open(reference_smiles_path) as reader:
    rows = csv.reader(reader)
    for row in rows:
        negative_entries.append(row)

filtered = []


for idx, entry in enumerate(negative_entries):
    if len(filtered) > 1000:
        break
    score = compute_3d_similarity(input_smiles, entry[0])
    if score[0] is False or score[1] < 2:
        continue
    filtered.append({"smile": entry[0], "score": score[1]})

negative_output = os.path.join(temp, 'negative.csv')
df = pd.DataFrame(data=filtered)
df.to_csv(negative_output, index=False)
df

[20:31:04] UFFTYPER: Unrecognized charge state for atom: 25
[20:31:25] UFFTYPER: Unrecognized charge state for atom: 8
[20:31:27] UFFTYPER: Unrecognized charge state for atom: 5
[20:31:27] UFFTYPER: Unrecognized charge state for atom: 20
[20:31:33] UFFTYPER: Unrecognized charge state for atom: 1
[20:31:51] UFFTYPER: Unrecognized atom type: Se2+2 (5)
[20:31:51] UFFTYPER: Unrecognized atom type: Se2+2 (31)
[20:32:22] UFFTYPER: Unrecognized charge state for atom: 12
[20:32:27] UFFTYPER: Unrecognized atom type: Se2+2 (26)
[20:33:03] UFFTYPER: Unrecognized charge state for atom: 3
[20:33:06] UFFTYPER: Unrecognized charge state for atom: 2


CPU times: user 3min 1s, sys: 56.7 ms, total: 3min 1s
Wall time: 3min 3s


Unnamed: 0,smile,score
0,CC1=C(S(=O)(=O)N2CCCCC2)C2=C(S1)N=CN(CC(=O)N1C...,2.709123
1,CN(C)CCOC1=CC=C(C(=O)/C=C/C2=CC=C(OC3=CC=CC=C3...,3.172030
2,O=C(CCC1=COC2=CC=CC(OCC3CCCCC3)=C2C1=O)C1=CC=C...,2.130951
3,CC1=CC=C(COC2=NN(CN3CCOCC3)C(=S)N2/N=C/C2=CNN=...,2.610649
4,CC(C)(C)C(=O)NC1=NC=NC2=C1C=NN2CCCCN1CCCCCC1,2.562486
...,...,...
996,COCCC(=O)N1[C@@H](C)C2=CC=CC=C2NC[C@H]1C,5.866997
997,CC(C)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@@H]1CCCN...,4.098622
998,COC(=O)[C@H]1CCCN1C(=O)[C@@H](C)[C@H](/C=C/C(C...,2.541832
999,CC(C)CC1=CSC(NC2=NC=C(Br)C=C2OCC2=CC=CC=C2)=N1,2.133948


In [46]:
# Prepare data for classifier model
negatives = []
positives = []

# Positives are all the smiles downloaded using cheese api
with open(similar_str_smiles_path) as reader:
    data = json.load(reader)
    
    for entry in data:
        positives.append(entry['smiles'])

# Negatives are all the smiles generated using reinvent
with open(negative_output) as reader:
    data = csv.reader(reader)
    next(data, None) # skipping header
    for row in data:
        negatives.append(row[0])

labels = [1] * len(positives)
labels += [0] * len(negatives)

total = positives + negatives
features = extract_features(total)

In [47]:
# Training
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = pd.DataFrame(features)
y = pd.Series(labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

automl = AutoML()
automl_settings = {
    "time_budget": 180,  # time budget in seconds
    "metric": 'roc_auc',  # metric to optimize
    "task": 'classification',
    "estimator_list": ['rf'] # random forest
}

automl.fit(X_train, y_train, **automl_settings)

# Checking accuracy
y_pred = automl.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

[flaml.automl.logger: 07-22 20:34:46] {1680} INFO - task = classification
[flaml.automl.logger: 07-22 20:34:46] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 07-22 20:34:46] {1789} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 07-22 20:34:46] {1901} INFO - List of ML learners in AutoML Run: ['rf']
[flaml.automl.logger: 07-22 20:34:46] {2219} INFO - iteration 0, current learner rf
[flaml.automl.logger: 07-22 20:34:46] {2345} INFO - Estimated sufficient time budget=1407s. Estimated necessary time budget=1s.
[flaml.automl.logger: 07-22 20:34:46] {2392} INFO -  at 1.0s,	estimator rf's best error=0.0129,	best estimator rf's best error=0.0129
[flaml.automl.logger: 07-22 20:34:46] {2219} INFO - iteration 1, current learner rf
[flaml.automl.logger: 07-22 20:34:46] {2392} INFO -  at 1.1s,	estimator rf's best error=0.0037,	best estimator rf's best error=0.0037
[flaml.automl.logger: 07-22 20:34:46] {2219} INFO - iteration 2, current learner rf
[flaml.automl.logg

In [48]:
# Testing generated output (the output generated by the new model.)

gen_out = []

with open(output_smiles) as reader:
    data = csv.reader(reader)
    next(data, None) # skipping header
    for row in data:
        gen_out.append(row[0])

gen_out_feature = extract_features(gen_out)
gen_out_feature = pd.DataFrame(gen_out_feature)

prediction = automl.predict_proba(gen_out_feature) # predict_proba
prediction

array([[1.79658161e-01, 8.20341839e-01],
       [2.73033126e-02, 9.72696687e-01],
       [5.17598344e-04, 9.99482402e-01],
       [2.73033126e-02, 9.72696687e-01],
       [9.64191303e-01, 3.58086974e-02],
       [9.79534660e-01, 2.04653396e-02],
       [3.73656709e-02, 9.62634329e-01],
       [5.17598344e-04, 9.99482402e-01],
       [6.62704974e-01, 3.37295026e-01],
       [3.91295795e-01, 6.08704205e-01],
       [9.96257924e-01, 3.74207590e-03],
       [4.40924337e-02, 9.55907566e-01],
       [9.79746301e-01, 2.02536994e-02],
       [9.97593108e-01, 2.40689223e-03],
       [2.55175983e-02, 9.74482402e-01],
       [5.17598344e-04, 9.99482402e-01],
       [5.17598344e-04, 9.99482402e-01],
       [6.85069867e-01, 3.14930133e-01],
       [1.70160455e-01, 8.29839545e-01],
       [2.55175983e-02, 9.74482402e-01],
       [8.64019521e-02, 9.13598048e-01],
       [5.17598344e-04, 9.99482402e-01],
       [8.45818116e-01, 1.54181884e-01],
       [2.59382929e-02, 9.74061707e-01],
       [7.465211

In [49]:
ml_prediction_output = os.path.join(temp, 'ml_output.csv')

output = []
for idx, mol in enumerate(gen_out):
    if prediction[idx][1] > 0.5:
        continue
    score = compute_3d_similarity(input_smiles, mol)
    if score[0] is False or score[1] > 2.5:
        continue
    output.append({"smile": gen_out[idx], "score":  score[1]})


with_ml_df = pd.DataFrame(data=output)
with_ml_df.to_csv(ml_prediction_output, index=False, header=False)
with_ml_df

Unnamed: 0,smile,score
0,CC1(C)CC(CN)C2CC(C1)C2(C)C,1.797067
1,COCC12CCN(Cc3ccoc3)CC1CN(Cc1ccncc1)C2,2.238896
2,CC1(C)CC(CC(N)=O)N(c2cncc3ncnn23)C1,1.745158
3,CC1(C)OC(=O)C(CO)O1,1.832699
4,C=CC1OC(Oc2[nH]cnc(=NC)c2C#N)CC1O,1.574039
...,...,...
64,CCc1cc2c(cc1C(=O)O)C(C)(C)C(C)(C)O2,1.559810
65,COCC(O)Cn1cc(CN2CC(C)OC(C)C2)nn1,1.853355
66,CC1(C)C(C(=O)Nc2ccccc2)N2C(=O)C(=Cc3ccccn3)C2S...,1.836153
67,CC1(C)CC(CC(N)C(=O)O)OC(C)(C)O1,1.370143
