In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `utils` package) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [20]:
# Imports
import os
import json
import pandas as pd
from utils.download import download_3d_similar_molecules

In [25]:
# Paths
# Every location below is built relative to the notebook's working directory.
notebook = os.curdir

# Scratch directory for downloaded and generated files.
temp = os.path.join(notebook, ".temp")
# exist_ok=True keeps this cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(temp, exist_ok=True)

# Prior model file; used as "input_model_file" by the transfer-learning config.
model_path = os.path.join(notebook, '..', 'models', 'reinvent.prior')

# NOTE(review): the smiles-to-3d and vsflow paths are only defined here in the
# visible part of the notebook; presumably later cells use them - confirm.
st3d_path = os.path.join(notebook, '..', 'smiles-to-3d', 'smiles3d', 'smiles3d.py')
vsflow_path = os.path.join(notebook, '..', 'vsflow')
vsflow = os.path.join(vsflow_path, 'vsflow')
vsflow_database_path = os.path.join(temp, 'vsflow_db.vsdb')

In [26]:
# Download molecules similar to the query structure.
# `input_smiles` is the query molecule written as a SMILES string.
input_smiles = "CC1(OC2C(OC(C2O1)(C#N)C3=CC=C4N3N=CN=C4N)CO)C"
# The result is stored as a JSON file inside the scratch directory; the next
# cell reads it back from `similar_str_smiles_path` with json.load.
filename = "remdesivir.json"
similar_str_smiles_path = os.path.join(temp, filename)

# NOTE(review): download_3d_similar_molecules is imported from utils.download
# and is opaque from here. Presumably it fetches structures that are 3D-similar
# to the query and writes them to the given path, and the displayed `True`
# looks like a success flag - confirm against the helper.
download_3d_similar_molecules(input_smiles, similar_str_smiles_path)

True

In [27]:
# Read the downloaded 3D-similar structures.
# The file is iterated as a sequence of records, each carrying a
# 'Morgan Tanimoto' score that the filter below relies on.
with open(similar_str_smiles_path, encoding="utf-8") as reader:
    similar_str_smiles = json.load(reader)

# Keep only records whose Morgan Tanimoto score is strictly below this cutoff.
MAX_MORGAN_TANIMOTO = 0.70
filtered_smiles = [
    record
    for record in similar_str_smiles
    if record['Morgan Tanimoto'] < MAX_MORGAN_TANIMOTO
]

# One row per retained record; shown as the cell output.
df = pd.DataFrame(data=filtered_smiles)
df

Unnamed: 0,smiles,zinc_id,properties,Morgan Tanimoto
0,CC1(C)O[C@H]2[C@H](n3c(Br)nc4c3ncnc4N)O[C@H](C...,ZINC15 : ZINC000017381098,"{'absorption': {'caco2_wang': -5.272, 'lipophi...",0.369863
1,CC1(C)O[C@H]2[C@@H](CO)O[C@@H](n3c(Br)nc4c3ncn...,ZINC15 : ZINC000095949869,"{'absorption': {'caco2_wang': -5.272, 'lipophi...",0.369863
2,CC1(C)O[C@@H]2[C@H](CO)O[C@@H](n3cnc4c3ncnc4N)...,ZINC15 : ZINC000100807906,"{'absorption': {'caco2_wang': -5.34, 'lipophil...",0.366197
3,CC1(C)O[C@H]2[C@H](n3cnc4c3ncnc4N)O[C@@H](CO)[...,ZINC15 : ZINC000012958516,"{'absorption': {'caco2_wang': -5.34, 'lipophil...",0.366197
4,CC1(C)O[C@H]2[C@H](n3cnc4c3ncnc4N)O[C@H](CO)[C...,ZINC15 : ZINC000004347645,"{'absorption': {'caco2_wang': -5.34, 'lipophil...",0.366197
...,...,...,...,...
395,CC1(C)O[C@@H]2[C@@H](O1)[C@@H](CO)O[C@@H]2n1cn...,ZINC15 : ZINC000008955192,"{'absorption': {'caco2_wang': -5.333, 'lipophi...",0.285714
396,CC1(C)O[C@@H]2[C@@H](CO)O[C@@H](n3cnc4c3nc[nH]...,ZINC15 : ZINC000101133086,"{'absorption': {'caco2_wang': -5.202, 'lipophi...",0.285714
397,CC1(C)O[C@@H]2[C@@H](CO)O[C@H](n3cnc4c3nc[nH]c...,ZINC15 : ZINC000004538849,"{'absorption': {'caco2_wang': -5.202, 'lipophi...",0.285714
398,CC1(C)O[C@@H]2[C@@H](O1)[C@H](CO)O[C@@H]2n1cnc...,ZINC15 : ZINC000004538848,"{'absorption': {'caco2_wang': -5.202, 'lipophi...",0.285714


In [28]:
# Shuffle the filtered records, then split them into training (80%) and
# validation (20%) sets. No separate test set is produced.
SPLIT_SEED = 42        # fixed seed so the shuffle (and the split) is reproducible
TRAIN_FRACTION = 0.8   # share of rows that goes to the training set

# Shuffle into a new frame instead of overwriting `df`, so re-running this
# cell always starts from the same input and yields the same split.
shuffled_df = df.sample(frac=1, random_state=SPLIT_SEED)

# Number of rows in the training set; the remainder is used for validation.
train_size = int(TRAIN_FRACTION * len(shuffled_df))

# Positional split of the shuffled rows.
train_df = shuffled_df.iloc[:train_size]
valid_df = shuffled_df.iloc[train_size:]

train_set_file = os.path.join(temp, 'training.smi')
valid_set_file = os.path.join(temp, 'validation.smi')

# Tab-separated, without index or header row; these two files are passed to
# the transfer-learning config as "smiles_file" / "validation_smiles_file".
train_df.to_csv(train_set_file, sep="\t", index=False, header=False)
valid_df.to_csv(valid_set_file, sep="\t", index=False, header=False)

In [29]:
# Build the REINVENT transfer-learning configuration and save it as JSON.
# (Ref: https://github.com/MolecularAI/REINVENT4/blob/main/notebooks/Reinvent_TLRL.py)

config_filename = os.path.join(temp, 'config.json')
new_model_path = os.path.join(temp, 'temp.model')

# Settings stored under "parameters" -> "pairs".
pair_settings = {
    "type": "tanimoto",
    "upper_threshold": 0.6,
    "lower_threshold": 0.0,
    "min_cardinality": 1,
    "max_cardinality": 199
}

# Training settings: epochs, batch sizes, the input/output model files and the
# training/validation SMILES files written by the split cell.
tl_parameters = {
    "num_epochs": 20,
    "save_every_n_epochs": 2,
    "batch_size": 50,
    "sample_batch_size": 500,
    "input_model_file": model_path,
    "output_model_file": new_model_path,
    "smiles_file": train_set_file,
    "validation_smiles_file": valid_set_file,
    "standardize_smiles": True,
    "randomize_smiles": False,
    "randomize_all_smiles": False,
    "internal_diversity": True,
    "pairs": pair_settings,
}

# Top-level run description; key order is kept so the written file is unchanged.
reinvet_transfer_learning_parameter = {
    "run_type": "transfer_learning",
    "device": "cpu",
    "tb_logdir": os.path.join(temp, 'tb_TL'),
    "parameters": tl_parameters,
}

# Serialise with a two-space indent and write the config to disk.
with open(config_filename, "w") as config_file:
    config_file.write(json.dumps(reinvet_transfer_learning_parameter, indent=2))

In [30]:
# Transfer Learning.
# Runs the `reinvent` command-line tool on the config file written to
# `config_filename`. IPython substitutes $config_filename with the Python
# variable's value before the line is handed to the shell, and `-f json`
# selects the JSON config format.

!reinvent $config_filename -f json

23:12:07 <INFO> Started REINVENT 4.4.22 (C) AstraZeneca 2017, 2023 on 2024-07-14
23:12:07 <INFO> Command line: /root/miniconda3/envs/reinvent-transfer-learning/bin/reinvent ./.temp/config.json -f json
23:12:07 <INFO> User root on host Ank
23:12:07 <INFO> Python version 3.11.9
23:12:07 <INFO> PyTorch version 2.3.1+cu121, git d44533f9d073df13895333e70b66f81c513c1889
23:12:07 <INFO> PyTorch compiled with CUDA version 12.1
23:12:07 <INFO> RDKit version 2023.09.5
23:12:07 <INFO> Platform Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
23:12:07 <INFO> Number of PyTorch CUDA devices 1
23:12:07 <INFO> Using CPU x86_64
23:12:07 <INFO> Writing TensorBoard summary to /mnt/d/projects/github/reinvent-transfer-learning/notebooks/.temp/tb_TL
23:12:07 <INFO> Starting Transfer Learning
23:12:07 <INFO> /mnt/d/projects/github/reinvent-transfer-learning/models/reinvent.prior has valid hash:
{ 'comments': [],
  'creation_date': 0,
  'date_format': 'UNIX epoch',
  'hash_id': '173568c36e1fc3d9

![A Mean Loss](./a_mean_loss.png)
![B Fraction Valid](b_fraction_valid.png)

In [31]:
# Running new model
new_model_config_path = os.path.join(temp, '_config.json')
output_smiles = os.path.join(temp, 'output.csv')
config = {
    "run_type": "sampling",
    "device": "cpu",
    "parameters": {
        "model_file": new_model_path,
        "output_file": output_smiles,
        "num_smiles": 100,
        "unique_molecules": True,
        "randomize_smiles": True,
    }
}

with open(new_model_config_path, "w") as writer:
    json.dump(config, writer, indent=2)


!reinvent $new_model_config_path -f json

23:19:15 <INFO> Started REINVENT 4.4.22 (C) AstraZeneca 2017, 2023 on 2024-07-14
23:19:15 <INFO> Command line: /root/miniconda3/envs/reinvent-transfer-learning/bin/reinvent ./.temp/_config.json -f json
23:19:15 <INFO> User root on host Ank
23:19:15 <INFO> Python version 3.11.9
23:19:15 <INFO> PyTorch version 2.3.1+cu121, git d44533f9d073df13895333e70b66f81c513c1889
23:19:15 <INFO> PyTorch compiled with CUDA version 12.1
23:19:15 <INFO> RDKit version 2023.09.5
23:19:15 <INFO> Platform Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
23:19:15 <INFO> Number of PyTorch CUDA devices 1
23:19:15 <INFO> Using CPU x86_64
23:19:15 <INFO> Starting Sampling
23:19:15 <INFO> /mnt/d/projects/github/reinvent-transfer-learning/notebooks/.temp/temp.model has valid hash:
{ 'comments': ['TL'],
  'creation_date': 0,
  'date_format': 'UNIX epoch',
  'hash_id': '8e1ba1f960b7f0665892c3c839f01d03',
  'hash_id_format': 'xxhash.xxh3_128_hex 3.4.1',
  'model_id': '55d68f8a81c04f5a86304ebe1723a0ea',
