In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Imports
import os
import json
import csv
import random
import pandas as pd
from utils.download import download_3d_similar_molecules
from utils.chem import extract_features

In [3]:
# Paths
notebook = os.path.join(".")
temp = os.path.join(notebook, ".temp") # use to download temporary files (temporary downloads).
if not os.path.exists(temp):
    os.makedirs(temp)

molecule_name = "isoniazid"
temp = os.path.join(temp, molecule_name)

if not os.path.exists(temp):
    os.makedirs(temp)

reinvent_prior_path = os.path.join(notebook, '..', 'models', 'reinvent.prior')

In [4]:
# Download Smiles
input_smiles = "C1=CN=CC=C1C(=O)NN"
similar_str_smiles_path = os.path.join(temp, "similar.json")

download_3d_similar_molecules(input_smiles, similar_str_smiles_path)

True

In [5]:
# Reading the downloaded 3d similar structures.

similar_str_smiles = {}
with open(similar_str_smiles_path) as reader:
    similar_str_smiles = json.load(reader)

df = pd.DataFrame(data=similar_str_smiles)
df

Unnamed: 0,smiles,identifier,similarity
0,NNC(=O)C1=CC=NC=C1,ENAMINE-REAL : PV-005809739863,1.000000
1,NNC(=O)C1=CC=C(C2=CC=NC=C2)C=C1,ENAMINE-REAL : Z3244894387,0.769231
2,NNC(=O)C1=CC=C(NC(=O)C2=CC=NC=C2)C=C1,ENAMINE-REAL : Z363387156,0.750000
3,O=C(NCCNC(=O)c1ccncc1)c1ccncc1,ZINC15 : ZINC000001684294,0.653846
4,O=C(NO)c1ccncc1,ZINC15 : 4362829,0.640000
...,...,...,...
395,CCCCCNC(=O)c1ccncc1,ZINC15 : 3164712,0.515152
396,CC(C)C[C@H](C)NC(=O)c1ccncc1,ZINC15 : ZINC000002069508,0.515152
397,C[C@@H]([NH3+])CNC(=O)c1ccncc1,ZINC15 : 74532791,0.515152
398,CC(C)C[C@H](C)NC(=O)C1=CC=NC=C1,ENAMINE-REAL : Z3609346532,0.515152


In [6]:
# Training (80%), Validation(10%), Test(10%)
df = df.sample(frac=1)

# Define your split sizes
train_size = int(0.8 * len(df))

# Split your DataFrame
train_df = df[:train_size]
valid_df = df[train_size:]

train_set_file = os.path.join(temp, 'training.smi')
valid_set_file = os.path.join(temp, 'validation.smi')


train_df.to_csv(train_set_file, sep="\t", index=False, header=False)
valid_df.to_csv(valid_set_file, sep="\t", index=False, header=False)

In [7]:
# Transfer learning config. (Ref: https://github.com/MolecularAI/REINVENT4/blob/main/notebooks/Reinvent_TLRL.py)

config_filename = os.path.join(temp, 'config.json')
temp_models = os.path.join(temp, 'checkpoints')

if not os.path.exists(temp_models):
    os.mkdir(temp_models)

new_model_path = os.path.join(temp_models, 'temp.model')

reinvet_transfer_learning_parameter = {
    "run_type": "transfer_learning",
    "device": "cpu",
    "tb_logdir": os.path.join(temp, 'tb_TL'),
    "parameters": {
        "num_epochs": 100,
        "save_every_n_epochs": 2,
        "batch_size": 50,
        "sample_batch_size": 100,
        "input_model_file": reinvent_prior_path,
        "output_model_file": new_model_path,
        "smiles_file": train_set_file,
        "validation_smiles_file": valid_set_file,
        "standardize_smiles": True,
        "randomize_smiles": False,
        "randomize_all_smiles": False,
        "internal_diversity": True,
    },
}

with open(config_filename, "w") as writer:
    json.dump(reinvet_transfer_learning_parameter, writer, indent=2)

In [8]:
# Transfer Learning.

!reinvent $config_filename -f json

12:58:24 <INFO> Started REINVENT 4.4.22 (C) AstraZeneca 2017, 2023 on 2024-07-25
12:58:24 <INFO> Command line: /root/miniconda3/envs/reinvent-transfer-learning/bin/reinvent ./.temp/isoniazid/config.json -f json
12:58:24 <INFO> User root on host Ank
12:58:24 <INFO> Python version 3.11.9
12:58:24 <INFO> PyTorch version 2.3.1+cu121, git d44533f9d073df13895333e70b66f81c513c1889
12:58:24 <INFO> PyTorch compiled with CUDA version 12.1
12:58:24 <INFO> RDKit version 2023.09.5
12:58:24 <INFO> Platform Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
12:58:24 <INFO> Number of PyTorch CUDA devices 1
12:58:24 <INFO> Using CPU x86_64
12:58:24 <INFO> Writing TensorBoard summary to /mnt/d/projects/github/reinvent-transfer-learning/notebooks/.temp/isoniazid/tb_TL
12:58:24 <INFO> Starting Transfer Learning
12:58:24 <INFO> /mnt/d/projects/github/reinvent-transfer-learning/models/reinvent.prior has valid hash:
{ 'comments': [],
  'creation_date': 0,
  'date_format': 'UNIX epoch',
  'hash_id

In [9]:
# Running new model
new_model_config_path = os.path.join(temp, '_config.json')
output_smiles = os.path.join(temp, 'output.csv')
config = {
    "run_type": "sampling",
    "device": "cpu",
    "parameters": {
        "model_file": new_model_path,
        "output_file": output_smiles,
        "num_smiles": 10_000,
        "unique_molecules": True,
        "randomize_smiles": True,
    }
}

with open(new_model_config_path, "w") as writer:
    json.dump(config, writer, indent=2)

!reinvent $new_model_config_path -f json

13:01:54 <INFO> Started REINVENT 4.4.22 (C) AstraZeneca 2017, 2023 on 2024-07-25
13:01:54 <INFO> Command line: /root/miniconda3/envs/reinvent-transfer-learning/bin/reinvent ./.temp/isoniazid/_config.json -f json
13:01:54 <INFO> User root on host Ank
13:01:54 <INFO> Python version 3.11.9
13:01:54 <INFO> PyTorch version 2.3.1+cu121, git d44533f9d073df13895333e70b66f81c513c1889
13:01:54 <INFO> PyTorch compiled with CUDA version 12.1
13:01:54 <INFO> RDKit version 2023.09.5
13:01:54 <INFO> Platform Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
13:01:54 <INFO> Number of PyTorch CUDA devices 1
13:01:54 <INFO> Using CPU x86_64
13:01:54 <INFO> Starting Sampling
13:01:54 <INFO> /mnt/d/projects/github/reinvent-transfer-learning/notebooks/.temp/isoniazid/checkpoints/temp.model has valid hash:
{ 'comments': ['TL'],
  'creation_date': 0,
  'date_format': 'UNIX epoch',
  'hash_id': 'bc0a90b7daa0c5a6f586565f3486e6b4',
  'hash_id_format': 'xxhash.xxh3_128_hex 3.4.1',
  'model_id': '55d

# Traning a new classifier model

In [10]:
%%time
negative_entries = []
# Make sure to download and keep the reference library in .temp folder.
# Download it from here: https://github.com/ersilia-os/groverfeat/blob/main/data/reference_library.csv
reference_smiles_path = os.path.join(notebook, '.temp', 'reference_library.csv')

with open(reference_smiles_path) as reader:
    rows = csv.reader(reader)
    negative_entries = random.sample(list(rows), 500)

negative_output = os.path.join(temp, 'negative.csv')
df = pd.DataFrame(data=negative_entries)
df.to_csv(negative_output, index=False)
df

CPU times: user 2.19 s, sys: 443 ms, total: 2.63 s
Wall time: 4.31 s


Unnamed: 0,0
0,CCC(C1=NC2=C(SC=C2)C(=O)N1CC1CC1)N(CCCN)C(=O)C...
1,CN1C[C@@H]2[C@@H]3C(=O)N(C)C(=O)[C@@H]3[C@](C)...
2,CC1=[N+](C)N(C2=CC=CC=C2)C(=O)C1/N=C/C=C/C1=CC...
3,O=C(NC1=NC(C2=CC=CC=C2)=CS1)C1=CC=NC=C1NS(=O)(...
4,NC1=NC=C(C2=CCCCC2)N=C1C(=O)NC1=CC=CC=C1
...,...
495,CO[C@@H]1CN(C)C(=O)C2=CC(NS(=O)(=O)CC(F)(F)F)=...
496,C[C@H](CCC(=O)C1=NN=NN1)[C@H]1CC[C@H]2[C@@H]3[...
497,CC(C)C1=CC=C(CN2CCC(CNC(=O)C3=CC(C4=CC=C(N(C)C...
498,CCN1C(=O)N/C(=C/C2=CC=C(SC3=CC=C(C)C=C3)O2)C1=O


In [11]:
# Prepare data for classifier model
negatives = []
positives = []

# Positives are all the smiles downloaded using cheese api
with open(similar_str_smiles_path) as reader:
    data = json.load(reader)
    
    for entry in data:
        positives.append(entry['smiles'])

# Negatives are all the smiles generated using reinvent
with open(negative_output) as reader:
    data = csv.reader(reader)
    next(data, None) # skipping header
    for row in data:
        negatives.append(row[0])

labels = [1] * len(positives)
labels += [0] * len(negatives)

total = positives + negatives
features = extract_features(total)

In [12]:
# Training
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = pd.DataFrame(features)
y = pd.Series(labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

automl = AutoML()
automl_settings = {
    "time_budget": 120,  # time budget in seconds
    "metric": 'roc_auc',  # metric to optimize
    "task": 'classification',
    "estimator_list": ['rf'] # random forest
}

automl.fit(X_train, y_train, **automl_settings)

# Checking accuracy
y_pred = automl.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

[flaml.automl.logger: 07-25 13:10:36] {1680} INFO - task = classification
[flaml.automl.logger: 07-25 13:10:36] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 07-25 13:10:36] {1789} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 07-25 13:10:36] {1901} INFO - List of ML learners in AutoML Run: ['rf']
[flaml.automl.logger: 07-25 13:10:36] {2219} INFO - iteration 0, current learner rf
[flaml.automl.logger: 07-25 13:10:36] {2345} INFO - Estimated sufficient time budget=2226s. Estimated necessary time budget=2s.
[flaml.automl.logger: 07-25 13:10:36] {2392} INFO -  at 2.3s,	estimator rf's best error=0.0115,	best estimator rf's best error=0.0115
[flaml.automl.logger: 07-25 13:10:36] {2219} INFO - iteration 1, current learner rf
[flaml.automl.logger: 07-25 13:10:36] {2392} INFO -  at 2.4s,	estimator rf's best error=0.0044,	best estimator rf's best error=0.0044
[flaml.automl.logger: 07-25 13:10:36] {2219} INFO - iteration 2, current learner rf
[flaml.automl.logg

In [13]:
# Testing generated output (the output generated by the new model.)

gen_out = []

with open(output_smiles) as reader:
    data = csv.reader(reader)
    next(data, None) # skipping header
    for row in data:
        gen_out.append(row[0])

gen_out_feature = extract_features(gen_out)
gen_out_feature = pd.DataFrame(gen_out_feature)

prediction = automl.predict_proba(gen_out_feature) # predict_proba

In [14]:
ml_prediction_output = os.path.join(temp, 'ml_output.csv')

output = []
for idx, mol in enumerate(gen_out):
    if prediction[idx][1] < 0.6:
        continue
    output.append({"smile": gen_out[idx], "score": prediction[idx][1]})


with_ml_df = pd.DataFrame(data=output)
with_ml_df = with_ml_df.sort_values(['score'], ascending=False)
with_ml_df = with_ml_df[:1000] # Picking top 1,000
with_ml_df.to_csv(ml_prediction_output, index=False, header=False)
with_ml_df

Unnamed: 0,smile,score
0,CCCCCNC(=O)c1ccncc1,0.778641
357,O=C(c1ccncc1)C(F)(F)F,0.778641
372,CCCCCCCCC(=O)c1ccncc1,0.778641
371,CC(O)CNC(=O)c1ccncc1,0.778641
370,C#CCNC(=O)c1ccncc1,0.778641
...,...,...
199,CC(C)C(NC(=O)c1ccncc1)C(=O)NCC#N,0.630077
89,CC1(Cl)CC1CNC(=O)c1ccncc1,0.630077
194,CNC(C)C(=O)NC(NC(=O)c1ccncc1)C(C)(C)C,0.630077
86,CC(C)C(=O)NCCNC(=O)c1ccncc1,0.630077
