In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before code is executed, so edits to local modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Imports
import os
import json
import pandas as pd
from utils.download import download_3d_similar_molecules

In [4]:
# Paths
# All locations are relative to the directory the notebook is run from.
notebook = os.curdir

# Scratch directory for downloaded and generated files (datasets, configs, models).
temp = os.path.join(notebook, ".temp")
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
# of testing os.path.exists() first.
os.makedirs(temp, exist_ok=True)

# Model file handed to REINVENT as the starting point for transfer learning.
model_path = os.path.join(notebook, '..', 'models', 'reinvent.prior')

# VSFlow locations: checkout directory, the `vsflow` entry inside it, and the
# database file kept in the scratch directory.
vsflow_path = os.path.join(notebook, '..', 'vsflow')
vsflow = os.path.join(vsflow_path, 'vsflow')
vsflow_database_path = os.path.join(temp, 'vsflow_db.vsdb')

In [5]:
# Download the molecules that are 3D-similar to the query SMILES.
# The result is stored as a JSON file inside the scratch directory.
filename = "remdesivir.json"
similar_str_smiles_path = os.path.join(temp, filename)

# Query structure passed to the download helper.
input_smiles = "CC1(OC2C(OC(C2O1)(C#N)C3=CC=C4N3N=CN=C4N)CO)C"

# Left as the last expression so the helper's return value is shown as the cell output.
download_3d_similar_molecules(input_smiles, similar_str_smiles_path)

True

In [6]:
# Read the downloaded 3D-similar structures and keep only the dissimilar ones.

# Neighbors whose 'Morgan Tanimoto' score is at or above this cutoff are discarded.
MAX_MORGAN_TANIMOTO = 0.70

# Explicit encoding so the read does not depend on the platform default
# (which is not UTF-8 on Windows).
# NOTE(review): assumes the download helper writes ASCII/UTF-8 JSON -- confirm.
with open(similar_str_smiles_path, encoding="utf-8") as reader:
    similar_str_smiles = json.load(reader)

# Each entry of 'neighbors' is expected to carry a 'smiles' string and a
# 'Morgan Tanimoto' score; only the SMILES of the entries below the cutoff are kept.
filtered_smiles = [
    neighbor['smiles']
    for neighbor in similar_str_smiles['neighbors']
    if neighbor['Morgan Tanimoto'] < MAX_MORGAN_TANIMOTO
]

# Single-column frame (default column label 0), one SMILES per row.
df = pd.DataFrame(data=filtered_smiles)
df

Unnamed: 0,0
0,CC1(C)CN(C2=CC=NC(N)=C2C#N)C[C@H](CO)O1
1,CC1(C)CN(C2=CC=NC(N)=C2C#N)CC(CO)O1
2,CC1(C)CN(C2=CC=NC(N)=C2C#N)C[C@@H](CO)O1
3,CC1(C)O[C@H]2[C@@H](O1)[C@@H](n1cnc3c1ncnc3N)O...
4,CC1(C)CN(C2=CC=NC3=C(C#N)C=NN23)C[C@H](CO)O1
...,...
495,CCC1(C)CN(c2ncnc3c2ncn3[C@@H]2O[C@H](CO)[C@@H]...
496,CCC1(C)CN(c2ncnc3c2ncn3[C@@H]2O[C@H](CO)[C@@H]...
497,CCC1(C)CN(c2ncnc3c2ncn3[C@@H]2O[C@H](CO)[C@H](...
498,Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1...


In [7]:
# Shuffle the SMILES, then split them: Training (80%), Validation (10%), Test (10%).

# Fixed seed so that, on a fresh kernel, the shuffle -- and therefore the three
# files written below -- is the same on every run.
SPLIT_SEED = 42
df = df.sample(frac=1, random_state=SPLIT_SEED)

# Split sizes are rounded down; the test set receives whatever rows remain.
train_size = int(0.8 * len(df))
valid_size = int(0.1 * len(df))

# Positional slicing: after shuffling, the index labels are no longer in order.
train_df = df.iloc[:train_size]
valid_df = df.iloc[train_size:train_size + valid_size]
test_df = df.iloc[train_size + valid_size:]

# Every row must end up in exactly one of the three splits.
assert len(train_df) + len(valid_df) + len(test_df) == len(df)

train_set_file = os.path.join(temp, 'training.smi')
valid_set_file = os.path.join(temp, 'validation.smi')
test_set_file = os.path.join(temp, 'test.smi')

# One SMILES per line, without index or header row.
for split_df, split_file in (
    (train_df, train_set_file),
    (valid_df, valid_set_file),
    (test_df, test_set_file),
):
    split_df.to_csv(split_file, sep="\t", index=False, header=False)

In [8]:
# Build the REINVENT transfer-learning configuration and save it as JSON.
# (Ref: https://github.com/MolecularAI/REINVENT4/blob/main/notebooks/Reinvent_TLRL.py)

# Where the config is written, and where the retrained model is to be saved.
config_filename = os.path.join(temp, 'config.json')
new_model_path = os.path.join(temp, 'temp.model')

# Settings stored under parameters -> "pairs".
_tl_pairs = {
    "type": "tanimoto",
    "upper_threshold": 0.6,
    "lower_threshold": 0.0,
    "min_cardinality": 1,
    "max_cardinality": 199
}

# Settings stored under "parameters": epochs, batch sizes, model files, data files.
_tl_parameters = {
    "num_epochs": 20,
    "save_every_n_epochs": 2,
    "batch_size": 50,
    "sample_batch_size": 500,
    "input_model_file": model_path,
    "output_model_file": new_model_path,
    "smiles_file": train_set_file,
    "validation_smiles_file": valid_set_file,
    "standardize_smiles": True,
    "randomize_smiles": False,
    "randomize_all_smiles": False,
    "internal_diversity": True,
    "pairs": _tl_pairs,
}

# Top-level document: run type, device, TensorBoard log directory, parameters.
reinvet_transfer_learning_parameter = {
    "run_type": "transfer_learning",
    "device": "cpu",
    "tb_logdir": os.path.join(temp, 'tb_TL'),
    "parameters": _tl_parameters,
}

with open(config_filename, "w") as writer:
    json.dump(reinvet_transfer_learning_parameter, writer, indent=2)

In [9]:
# Transfer learning: run the REINVENT command-line tool on the config file whose
# path is held in `config_filename`. IPython substitutes `$config_filename` with
# the value of that Python variable; `-f json` passes the config as JSON.

!reinvent $config_filename -f json

22:12:22 <INFO> Started REINVENT 4.3.5 (C) AstraZeneca 2017, 2023 on 2024-07-10
22:12:22 <INFO> Command line: C:\Users\ankit\.conda\envs\reinvent-transfer-learning\Scripts\reinvent .\.temp\config.json -f json
22:12:22 <INFO> User ankit on host Ank
22:12:22 <INFO> Python version 3.11.9
22:12:22 <INFO> PyTorch version 2.2.1+cu121, git 6c8c5ad5eaf47a62fafbb4a2747198cbffbf1ff0
22:12:22 <INFO> PyTorch compiled with CUDA version 12.1
22:12:22 <INFO> RDKit version 2023.09.5
22:12:22 <INFO> Platform Windows-10-10.0.26244-SP0
22:12:22 <INFO> Number of PyTorch CUDA devices 1
22:12:22 <INFO> Using CPU AMD64 Family 25 Model 80 Stepping 0, AuthenticAMD
22:12:22 <INFO> Writing TensorBoard summary to D:\projects\github\reinvent-transfer-learning\notebooks\.temp\tb_TL
22:12:22 <INFO> Starting Transfer Learning
22:12:22 <INFO> D:\projects\github\reinvent-transfer-learning\models\reinvent.prior has valid hash:
{ 'comments': [],
  'creation_date': 0,
  'date_format': 'UNIX epoch',
  'hash_id': '173568c36

![A Mean Loss](./a_mean_loss.png)
![B Fraction Valid](b_fraction_valid.png)

In [10]:
# Build the sampling configuration for the model stored at `new_model_path`
# and save it as JSON.
output_smiles = os.path.join(temp, 'output.csv')
new_model_config_path = os.path.join(temp, '_config.json')

# Settings stored under "parameters": model to load, output file, sampling options.
_sampling_parameters = {
    "model_file": new_model_path,
    "output_file": output_smiles,
    "num_smiles": 100,
    "unique_molecules": True,
    "randomize_smiles": True,
}

# Top-level document: run type, device, parameters.
config = dict(
    run_type="sampling",
    device="cpu",
    parameters=_sampling_parameters,
)

with open(new_model_config_path, "w") as writer:
    json.dump(config, writer, indent=2)


# Run the REINVENT command-line tool on the sampling config; IPython substitutes
# `$new_model_config_path` with that variable's value, and `-f json` passes the
# config as JSON.
!reinvent $new_model_config_path -f json

22:14:46 <INFO> Started REINVENT 4.3.5 (C) AstraZeneca 2017, 2023 on 2024-07-10
22:14:46 <INFO> Command line: C:\Users\ankit\.conda\envs\reinvent-transfer-learning\Scripts\reinvent .\.temp\_config.json -f json
22:14:46 <INFO> User ankit on host Ank
22:14:46 <INFO> Python version 3.11.9
22:14:46 <INFO> PyTorch version 2.2.1+cu121, git 6c8c5ad5eaf47a62fafbb4a2747198cbffbf1ff0
22:14:46 <INFO> PyTorch compiled with CUDA version 12.1
22:14:46 <INFO> RDKit version 2023.09.5
22:14:46 <INFO> Platform Windows-10-10.0.26244-SP0
22:14:46 <INFO> Number of PyTorch CUDA devices 1
22:14:46 <INFO> Using CPU AMD64 Family 25 Model 80 Stepping 0, AuthenticAMD
22:14:46 <INFO> Starting Sampling
22:14:46 <ERRO> D:\projects\github\reinvent-transfer-learning\notebooks\.temp\temp.model has invalid hash:
{ 'comments': [],
  'creation_date': 0,
  'date_format': 'UNIX epoch',
  'hash_id': '173568c36e1fc3d95cab289c7d31ce0b',
  'hash_id_format': 'xxhash.xxh3_128_hex 3.4.1',
  'model_id': '55d68f8a81c04f5a86304ebe17