In [59]:
# Enable IPython's autoreload extension in mode 2, which re-imports all modules
# before each cell executes, so edits to imported project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [60]:
# Imports
import csv
import json
import os

import pandas as pd
from flaml import AutoML
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from utils.chem import compute_3d_similarity, extract_features
from utils.download import download_3d_similar_molecules

In [61]:
# Paths
notebook = os.path.join(".")

# Scratch area for temporary downloads and intermediate files. Everything this
# run produces is kept in a sub-folder named after the molecule being studied.
molecule_name = "cephalotaxin"
temp = os.path.join(notebook, ".temp", molecule_name)

# makedirs also creates the intermediate ".temp" folder, and exist_ok=True makes
# the cell safe to re-run without a separate (racy) existence check.
os.makedirs(temp, exist_ok=True)

# Location of the REINVENT prior model, relative to the notebook folder.
reinvent_prior_path = os.path.join(notebook, '..', 'models', 'reinvent.prior')

In [62]:
# Download SMILES
# Query molecule, written as a SMILES string.
input_smiles = "COC1=CC23CCCN2CCC4=CC5=C(C=C4C3C1O)OCO5"
# JSON file, inside the run's temp folder, that receives the similar structures.
similar_str_smiles_path = os.path.join(temp, "similar.json")

# Fetches molecules that are 3D-similar to the query; later cells read the
# result back from the path above. The recorded run displayed True here.
# NOTE(review): presumably True signals success — confirm in utils.download.
download_3d_similar_molecules(input_smiles, similar_str_smiles_path)

True

In [63]:
# Load the downloaded 3D-similar structures from disk and show them as a table.

with open(similar_str_smiles_path) as json_file:
    similar_str_smiles = json.load(json_file)

df = pd.DataFrame(similar_str_smiles)
df

Unnamed: 0,smiles,zinc_id,Morgan Tanimoto
0,COC1=C[C@@]23CCC[N@@H+]2CCc2cc4c(cc2[C@H]3[C@H...,ZINC15 : ZINC000017027409,0.566667
1,COC1=C[C@]23CCC[N@@H+]2CCc2cc4c(cc2[C@@H]3C1O)...,ZINC15 : ZINC001560408829,0.566667
2,COC1=C[C@]23CCC[N@H+]2CCc2cc4c(cc2[C@@H]3[C@@H...,ZINC15 : ZINC000019795979,0.566667
3,COC1=C[C@@]23CCC[N@H+]2CCc2cc4c(cc2[C@@H]3[C@H...,ZINC15 : ZINC000071789804,0.566667
4,COC1=C[C@]23CCC[N@H+]2CCc2cc4c(cc2[C@@H]3C1O)OCO4,ZINC15 : ZINC001560408829,0.566667
...,...,...,...
395,O[C@@H]1C=C2CCN3Cc4cc5c(cc4[C@H]([C@H]23)[C@@H...,ZINC15 : ZINC000003873173,0.313433
396,O[C@@H]1C=C2CCN3Cc4cc5c(cc4[C@@H]([C@H]23)[C@H...,ZINC15 : ZINC000000000024,0.313433
397,O[C@@H]1C=C2CCN3Cc4cc5c(cc4[C@@H]([C@@H]23)[C@...,ZINC15 : ZINC000000586000,0.313433
398,O[C@@H]1C=C2CCN3Cc4cc5c(cc4[C@H]([C@H]23)[C@H]...,ZINC15 : ZINC000003873175,0.313433


In [64]:
# Training (80%) / Validation (20%) split.
# No separate test set is produced: every row after the training portion goes
# to the validation file.

# Shuffle with a fixed seed so that the two files are reproducible across
# kernel restarts. The shuffled frame gets its own name instead of overwriting
# `df`, so re-running this cell always starts from the same input.
shuffled_df = df.sample(frac=1, random_state=42)

# Number of rows that go to the training portion.
train_size = int(0.8 * len(shuffled_df))

# Positional split of the shuffled rows.
train_df = shuffled_df.iloc[:train_size]
valid_df = shuffled_df.iloc[train_size:]

train_set_file = os.path.join(temp, 'training.smi')
valid_set_file = os.path.join(temp, 'validation.smi')

# Written tab-separated, without the index and without a header row.
train_df.to_csv(train_set_file, sep="\t", index=False, header=False)
valid_df.to_csv(valid_set_file, sep="\t", index=False, header=False)

In [65]:
# Transfer learning config. (Ref: https://github.com/MolecularAI/REINVENT4/blob/main/notebooks/Reinvent_TLRL.py)

config_filename = os.path.join(temp, 'config.json')
temp_models = os.path.join(temp, 'checkpoints')

# exist_ok=True makes the cell re-runnable and avoids the check-then-create
# race; makedirs also copes with a missing parent folder.
os.makedirs(temp_models, exist_ok=True)

# File the fine-tuned model is written to (passed as "output_model_file" below).
new_model_path = os.path.join(temp_models, 'temp.model')

# Settings for a REINVENT "transfer_learning" run: start from the prior model
# and train on the training/validation SMILES files written earlier.
reinvet_transfer_learning_parameter = {
    "run_type": "transfer_learning",
    "device": "cpu",
    "tb_logdir": os.path.join(temp, 'tb_TL'),
    "parameters": {
        "num_epochs": 100,
        "save_every_n_epochs": 2,
        "batch_size": 50,
        "sample_batch_size": 100,
        "input_model_file": reinvent_prior_path,
        "output_model_file": new_model_path,
        "smiles_file": train_set_file,
        "validation_smiles_file": valid_set_file,
        "standardize_smiles": True,
        "randomize_smiles": False,
        "randomize_all_smiles": False,
        "internal_diversity": True,
    },
}

# Persist the settings as JSON so the command-line tool can read them.
with open(config_filename, "w") as writer:
    json.dump(reinvet_transfer_learning_parameter, writer, indent=2)

In [66]:
# Transfer Learning.
# Runs the `reinvent` command-line tool on the JSON configuration file;
# IPython substitutes $config_filename with the value of the Python variable.

!reinvent $config_filename -f json

19:07:44 <INFO> Started REINVENT 4.4.22 (C) AstraZeneca 2017, 2023 on 2024-07-15
19:07:44 <INFO> Command line: /root/miniconda3/envs/reinvent-transfer-learning/bin/reinvent ./.temp/cephalotaxin/config.json -f json
19:07:44 <INFO> User root on host Ank
19:07:44 <INFO> Python version 3.11.9
19:07:44 <INFO> PyTorch version 2.3.1+cu121, git d44533f9d073df13895333e70b66f81c513c1889
19:07:44 <INFO> PyTorch compiled with CUDA version 12.1
19:07:44 <INFO> RDKit version 2023.09.5
19:07:44 <INFO> Platform Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
19:07:44 <INFO> Number of PyTorch CUDA devices 1
19:07:44 <INFO> Using CPU x86_64
19:07:44 <INFO> Writing TensorBoard summary to /mnt/d/projects/github/reinvent-transfer-learning/notebooks/.temp/cephalotaxin/tb_TL
19:07:44 <INFO> Starting Transfer Learning
19:07:44 <INFO> /mnt/d/projects/github/reinvent-transfer-learning/models/reinvent.prior has valid hash:
{ 'comments': [],
  'creation_date': 0,
  'date_format': 'UNIX epoch',
  'h

In [157]:
# Running new model
new_model_config_path = os.path.join(temp, '_config.json')
output_smiles = os.path.join(temp, 'output.csv')
config = {
    "run_type": "sampling",
    "device": "cpu",
    "parameters": {
        "model_file": new_model_path,
        "output_file": output_smiles,
        "num_smiles": 2000,
        "unique_molecules": True,
        "randomize_smiles": True,
    }
}

with open(new_model_config_path, "w") as writer:
    json.dump(config, writer, indent=2)

!reinvent $new_model_config_path -f json

20:05:25 <INFO> Started REINVENT 4.4.22 (C) AstraZeneca 2017, 2023 on 2024-07-15
20:05:25 <INFO> Command line: /root/miniconda3/envs/reinvent-transfer-learning/bin/reinvent ./.temp/cephalotaxin/_config.json -f json
20:05:25 <INFO> User root on host Ank
20:05:25 <INFO> Python version 3.11.9
20:05:25 <INFO> PyTorch version 2.3.1+cu121, git d44533f9d073df13895333e70b66f81c513c1889
20:05:25 <INFO> PyTorch compiled with CUDA version 12.1
20:05:25 <INFO> RDKit version 2023.09.5
20:05:25 <INFO> Platform Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
20:05:25 <INFO> Number of PyTorch CUDA devices 1
20:05:25 <INFO> Using CPU x86_64
20:05:25 <INFO> Starting Sampling
20:05:25 <INFO> /mnt/d/projects/github/reinvent-transfer-learning/notebooks/.temp/cephalotaxin/checkpoints/temp.model has valid hash:
{ 'comments': ['TL'],
  'creation_date': 0,
  'date_format': 'UNIX epoch',
  'hash_id': '44a2714653c2fc9a7b266382188964a4',
  'hash_id_format': 'xxhash.xxh3_128_hex 3.4.1',
  'model_id'

In [158]:
# Attempt to filter out dissimilar molecules without using ML.
# In this approach we compute the RMSD between the input molecule and every
# molecule generated by the new model. Molecules whose RMSD is higher than
# MAX_RMSD, or for which the computation reports failure, are ignored.

MAX_RMSD = 2  # generated molecules scoring above this value are dropped

entries = []

# newline="" is the csv module's recommended way to open files for csv.reader.
with open(output_smiles, newline="") as reader:
    rows = csv.reader(reader)
    next(rows, None)  # Skipping header
    for row in rows:
        # csv.reader yields an empty list for a blank line; keeping it would
        # make the entry[0] lookup below raise IndexError.
        if row:
            entries.append(row)

scores = []

for entry in entries:
    # entry[0] is the first CSV column, used here as the generated SMILES.
    score = compute_3d_similarity(input_smiles, entry[0])
    # score[0] is treated as a success flag and score[1] as the RMSD value.
    # NOTE(review): confirm the return contract in utils.chem.
    if score[0] is False or score[1] > MAX_RMSD:
        continue
    scores.append({"smile": entry[0], "score": score[1]})

without_ml = os.path.join(temp, 'without_ml.csv')
# Explicit columns keep the CSV header present even when no molecule passes the
# filter, so the file can still be read back with pd.read_csv.
df = pd.DataFrame(data=scores, columns=["smile", "score"])

df.to_csv(without_ml, index=False)
df

[20:07:15] UFFTYPER: Unrecognized charge state for atom: 9


Unnamed: 0,smile,score
0,COC1OC2c3cc4c(cc3CCN(C)C2c2ccc3c(c21)OCO3)OCO4,1.970421
1,COc1cc2c(cc1CNC1Cc3ccccc3C1O)OCO2,1.334700
2,COc1cc2c(cc1C(=O)N1CCC3CCC(C1)N3C)OCO2,1.615380
3,COc1cc2c(cc1CN1CCCc3ccccc3CC1)OCO2,1.691411
4,COc1c2c(cc3c1OCO3)C(O)CNC2,1.671142
...,...,...
436,COCC(C)N(Cc1cc2c(cc1O)OCO2)C(CN1CCCC1)C(C)C,1.649425
437,COC1CCC2CN3CCc4cc5c(cc4C3CC2C1)OCO5,1.620182
438,COC(=O)C1c2[nH]c3ccccc3c2CCN2CCCC12,1.539911
439,COc1cc2c(cc1CN1CCC3C(O)C(C)(C)CN31)OCO2,1.787751


# Training a new classifier model

In [159]:
# Generating Random molecules
# We will use reinent to generate random molecules.

reinvent_path = os.path.join(notebook, '..', 'models', 'reinvent.prior')
random_config_path = os.path.join(temp, 'random_config.json')
out = os.path.join(temp, 'random.csv')
config = {
    "run_type": "sampling",
    "device": "cpu",
    "parameters": {
        "model_file": reinvent_path,
        "output_file": out,
        "num_smiles": 5000,
        "unique_molecules": True,
        "randomize_smiles": True,
    }
}

with open(random_config_path, "w") as writer:
    json.dump(config, writer, indent=2)


!reinvent $random_config_path -f json

20:07:22 <INFO> Started REINVENT 4.4.22 (C) AstraZeneca 2017, 2023 on 2024-07-15
20:07:22 <INFO> Command line: /root/miniconda3/envs/reinvent-transfer-learning/bin/reinvent ./.temp/cephalotaxin/random_config.json -f json
20:07:22 <INFO> User root on host Ank
20:07:22 <INFO> Python version 3.11.9
20:07:22 <INFO> PyTorch version 2.3.1+cu121, git d44533f9d073df13895333e70b66f81c513c1889
20:07:22 <INFO> PyTorch compiled with CUDA version 12.1
20:07:22 <INFO> RDKit version 2023.09.5
20:07:22 <INFO> Platform Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
20:07:22 <INFO> Number of PyTorch CUDA devices 1
20:07:22 <INFO> Using CPU x86_64
20:07:22 <INFO> Starting Sampling
20:07:22 <INFO> /mnt/d/projects/github/reinvent-transfer-learning/models/reinvent.prior has valid hash:
{ 'comments': [],
  'creation_date': 0,
  'date_format': 'UNIX epoch',
  'hash_id': '173568c36e1fc3d95cab289c7d31ce0b',
  'hash_id_format': 'xxhash.xxh3_128_hex 3.4.1',
  'model_id': '55d68f8a81c04f5a86304ebe1

In [160]:
# Build the negative examples: randomly sampled molecules whose RMSD against
# the input molecule is at least MIN_NEGATIVE_RMSD.

MAX_NEGATIVES = 500       # stop once this many negatives have been collected
MIN_NEGATIVE_RMSD = 2.5   # molecules scoring below this value are skipped

negative_entries = []

# newline="" is the csv module's recommended way to open files for csv.reader.
with open(out, newline="") as reader:
    rows = csv.reader(reader)
    next(rows, None)  # Skipping header
    for row in rows:
        # csv.reader yields an empty list for a blank line; keeping it would
        # make the entry[0] lookup below raise IndexError.
        if row:
            negative_entries.append(row)

scores = []

for entry in negative_entries:
    # ">=" caps the set at exactly MAX_NEGATIVES; the previous "> 500" check
    # let one extra row through (501 in total).
    if len(scores) >= MAX_NEGATIVES:
        break
    score = compute_3d_similarity(input_smiles, entry[0])
    # score[0] is treated as a success flag and score[1] as the RMSD value.
    # NOTE(review): confirm the return contract in utils.chem.
    if score[0] is False or score[1] < MIN_NEGATIVE_RMSD:
        continue
    scores.append({"smile": entry[0], "score": score[1]})

negative_output = os.path.join(temp, 'negative.csv')
# Explicit columns keep the CSV header present even when no molecule qualifies.
df = pd.DataFrame(data=scores, columns=["smile", "score"])
df.to_csv(negative_output, index=False)

[20:08:16] UFFTYPER: Unrecognized charge state for atom: 12
[20:08:52] UFFTYPER: Unrecognized charge state for atom: 5
[20:09:52] UFFTYPER: Unrecognized charge state for atom: 5
[20:10:09] UFFTYPER: Unrecognized charge state for atom: 1
[20:10:13] UFFTYPER: Unrecognized charge state for atom: 19
[20:10:16] UFFTYPER: Unrecognized charge state for atom: 8


In [161]:
# Assemble the labelled dataset for the classifier model.

# Positives: the "smiles" field of every record downloaded using the cheese api.
with open(similar_str_smiles_path) as json_file:
    positives = [record['smiles'] for record in json.load(json_file)]

# Negatives: first column of the negatives CSV built from the reinvent samples.
with open(negative_output) as csv_file:
    negative_rows = csv.reader(csv_file)
    next(negative_rows, None)  # skipping header
    negatives = [row[0] for row in negative_rows]

# 1 marks a positive and 0 a negative, in the same order as `total`.
labels = [1] * len(positives) + [0] * len(negatives)

total = positives + negatives
features = extract_features(total)

In [162]:
# Training
# AutoML, train_test_split and accuracy_score are imported in the notebook's
# import cell, together with every other dependency.

X = pd.DataFrame(features)
y = pd.Series(labels)

# Hold out 20% of the samples for the accuracy check; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

automl = AutoML()
automl_settings = {
    "time_budget": 60,  # time budget in seconds
    "metric": 'accuracy',  # metric to optimize
    "task": 'classification',
    "estimator_list": ['rf'] # random forest
}

automl.fit(X_train, y_train, **automl_settings)

# Checking accuracy on the held-out samples
y_pred = automl.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

[flaml.automl.logger: 07-15 20:10:55] {1680} INFO - task = classification
[flaml.automl.logger: 07-15 20:10:55] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 07-15 20:10:55] {1789} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 07-15 20:10:55] {1901} INFO - List of ML learners in AutoML Run: ['rf']
[flaml.automl.logger: 07-15 20:10:55] {2219} INFO - iteration 0, current learner rf
[flaml.automl.logger: 07-15 20:10:55] {2345} INFO - Estimated sufficient time budget=1506s. Estimated necessary time budget=2s.
[flaml.automl.logger: 07-15 20:10:55] {2392} INFO -  at 2.5s,	estimator rf's best error=0.0405,	best estimator rf's best error=0.0405
[flaml.automl.logger: 07-15 20:10:55] {2219} INFO - iteration 1, current learner rf
[flaml.automl.logger: 07-15 20:10:56] {2392} INFO -  at 2.6s,	estimator rf's best error=0.0000,	best estimator rf's best error=0.0000
[flaml.automl.logger: 07-15 20:10:56] {2219} INFO - iteration 2, current learner rf
[flaml.automl.log

In [163]:
# Run the trained classifier on the output generated by the new model.

# The first column of every data row is taken as the generated SMILES.
with open(output_smiles) as csv_file:
    sampled_rows = csv.reader(csv_file)
    next(sampled_rows, None)  # skipping header
    gen_out = [row[0] for row in sampled_rows]

# Same feature extraction as for the training data, wrapped in a DataFrame.
gen_out_feature = pd.DataFrame(extract_features(gen_out))

prediction = automl.predict(gen_out_feature)

In [164]:
ml_prediction_output = os.path.join(temp, 'ml_output.csv')

# Keep only the molecules whose predicted label is not 0 (0 is the label given
# to the negatives when the classifier data was prepared). "score" is a
# constant placeholder of 0 for every kept row.
output = [
    {"smile": smiles, "score": 0}
    for idx, smiles in enumerate(gen_out)
    if prediction[idx] != 0
]

with_ml_df = pd.DataFrame(output)
# NOTE(review): this file is written without a header row, unlike
# without_ml.csv — confirm that is intended before reading it back.
with_ml_df.to_csv(ml_prediction_output, index=False, header=False)
with_ml_df

Unnamed: 0,smile,score
0,COC1OC2c3cc4c(cc3CCN(C)C2c2ccc3c(c21)OCO3)OCO4,0
1,COc1cc2c(cc1CNC1Cc3ccccc3C1O)OCO2,0
2,COc1cc2c(cc1C(=O)N1CCC3CCC(C1)N3C)OCO2,0
3,COc1cc2c(cc1CN1CCCc3ccccc3CC1)OCO2,0
4,COc1c2c(cc3c1OCO3)C(O)CNC2,0
...,...,...
362,COCC(C)N(Cc1cc2c(cc1O)OCO2)C(CN1CCCC1)C(C)C,0
363,COC1CCC2CN3CCc4cc5c(cc4C3CC2C1)OCO5,0
364,CN1CCc2cc3c(cc2C1C1OC(=O)c2cc4c(cc21)OCO4)OCO3,0
365,COc1cc2c(cc1CN1CCC3C(O)C(C)(C)CN31)OCO2,0


In [183]:
# Overlap: molecules that appear in both the classifier-filtered set and the
# RMSD-filtered set.
without_ml_df = pd.read_csv(without_ml)

# Tag every row with the filter it came from.
with_ml_df['type'] = 'with_ml'
without_ml_df['type'] = 'without_ml'

# Classifier rows are stacked first, so a SMILES that occurs again later is
# flagged on that later occurrence (keep='first' leaves the first one unmarked).
merged = pd.concat([with_ml_df, without_ml_df], axis=0)
overlapped = merged.duplicated(subset=['smile'], keep='first')

merged[overlapped]

Unnamed: 0,smile,score,type
0,COC1OC2c3cc4c(cc3CCN(C)C2c2ccc3c(c21)OCO3)OCO4,1.970421,without_ml
1,COc1cc2c(cc1CNC1Cc3ccccc3C1O)OCO2,1.334700,without_ml
2,COc1cc2c(cc1C(=O)N1CCC3CCC(C1)N3C)OCO2,1.615380,without_ml
3,COc1cc2c(cc1CN1CCCc3ccccc3CC1)OCO2,1.691411,without_ml
4,COc1c2c(cc3c1OCO3)C(O)CNC2,1.671142,without_ml
...,...,...,...
435,CC1C2Cc3c(ccc4c3OCO4)C1(C)CCN2CC1CC1,1.203207,without_ml
436,COCC(C)N(Cc1cc2c(cc1O)OCO2)C(CN1CCCC1)C(C)C,1.649425,without_ml
437,COC1CCC2CN3CCc4cc5c(cc4C3CC2C1)OCO5,1.620182,without_ml
439,COc1cc2c(cc1CN1CCC3C(O)C(C)(C)CN31)OCO2,1.787751,without_ml
