In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `utils` package imported in the next cell) are picked
# up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Imports
import os
import json
import csv
import random
import pandas as pd
from utils.download import download_3d_similar_molecules
from utils.chem import extract_features

In [3]:
# Paths
# `notebook` is the notebook's working directory; everything this notebook
# writes goes under `<notebook>/.temp/<molecule_name>`.
notebook = "."
temp_root = os.path.join(notebook, ".temp")  # use to download temporary files (temporary downloads).

molecule_name = "isoniazid"

# Per-molecule working directory. Later cells build their file paths from `temp`.
temp = os.path.join(temp_root, molecule_name)

# A single call creates both `.temp` and the per-molecule folder; `exist_ok`
# makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(temp, exist_ok=True)

# Prior model file, expected one level above the notebook folder.
reinvent_prior_path = os.path.join(notebook, '..', 'models', 'reinvent.prior')

In [4]:
# Download Smiles
# `input_smiles` is the query structure; results are stored at
# `similar_str_smiles_path` inside the per-molecule temp folder.
input_smiles = "C1=CN=CC=C1C(=O)NN"
similar_str_smiles_path = os.path.join(temp, "similar.json")

# NOTE(review): `download_3d_similar_molecules` is a project helper whose body
# is not visible here. Presumably it writes the similar molecules to the given
# path (the next cell reads that file as JSON) - confirm in utils/download.
download_3d_similar_molecules(input_smiles, similar_str_smiles_path)

True

In [5]:
# Load the downloaded 3d similar structures from disk and show them as a table.

with open(similar_str_smiles_path) as similar_file:
    similar_str_smiles = json.load(similar_file)

# One row per downloaded entry; the bare `df` on the last line renders the frame.
df = pd.DataFrame(similar_str_smiles)
df

Unnamed: 0,smiles,identifier,similarity
0,NNC(=O)C1=CC=NC=C1,ENAMINE-REAL : PV-005809739863,1.000000
1,NNC(=O)C1=CC=C(C2=CC=NC=C2)C=C1,ENAMINE-REAL : Z3244894387,0.769231
2,NNC(=O)C1=CC=C(NC(=O)C2=CC=NC=C2)C=C1,ENAMINE-REAL : Z363387156,0.750000
3,O=C(NCCNC(=O)c1ccncc1)c1ccncc1,ZINC15 : ZINC000001684294,0.653846
4,O=C(NO)c1ccncc1,ZINC15 : 4362829,0.640000
...,...,...,...
395,CCCCCNC(=O)c1ccncc1,ZINC15 : 3164712,0.515152
396,CC(C)C[C@H](C)NC(=O)c1ccncc1,ZINC15 : ZINC000002069508,0.515152
397,C[C@@H]([NH3+])CNC(=O)c1ccncc1,ZINC15 : 74532791,0.515152
398,CC(C)C[C@H](C)NC(=O)C1=CC=NC=C1,ENAMINE-REAL : Z3609346532,0.515152


In [6]:
# Training (80%) / Validation (20%) split.
# The original header mentioned a 10% test set, but no test split is produced
# here; only a training file and a validation file are written.

# Fixed seed so the shuffle (and therefore both files) is reproducible across
# kernel restarts. The shuffled frame gets its own name so that `df` is left
# untouched and re-running this cell yields the same split.
SPLIT_SEED = 42
shuffled_df = df.sample(frac=1, random_state=SPLIT_SEED)

# Define your split sizes
train_size = int(0.8 * len(shuffled_df))

# Split your DataFrame (positional slices of the shuffled rows)
train_df = shuffled_df.iloc[:train_size]
valid_df = shuffled_df.iloc[train_size:]

train_set_file = os.path.join(temp, 'training.smi')
valid_set_file = os.path.join(temp, 'validation.smi')

# Tab-separated, no header and no index: all columns of the frame are written,
# in the frame's column order.
train_df.to_csv(train_set_file, sep="\t", index=False, header=False)
valid_df.to_csv(valid_set_file, sep="\t", index=False, header=False)

In [7]:
# Transfer learning config. (Ref: https://github.com/MolecularAI/REINVENT4/blob/main/notebooks/Reinvent_TLRL.py)
# Builds the transfer-learning run configuration and writes it as JSON to
# `config_filename`, which the next cell passes to the `reinvent` command.

config_filename = os.path.join(temp, 'config.json')
temp_models = os.path.join(temp, 'checkpoints')

# `exist_ok=True` replaces the exists-then-mkdir pair: no race between the
# check and the creation, and missing parent folders are created as well.
os.makedirs(temp_models, exist_ok=True)

new_model_path = os.path.join(temp_models, 'temp.model')

# The variable name (including its spelling) is kept as-is in case other cells
# reference it.
reinvet_transfer_learning_parameter = {
    "run_type": "transfer_learning",
    "device": "cpu",
    "tb_logdir": os.path.join(temp, 'tb_TL'),
    "parameters": {
        "num_epochs": 100,
        "save_every_n_epochs": 2,
        "batch_size": 50,
        "sample_batch_size": 100,
        "input_model_file": reinvent_prior_path,   # prior model to start from
        "output_model_file": new_model_path,       # where the tuned model is saved
        "smiles_file": train_set_file,             # training split written earlier
        "validation_smiles_file": valid_set_file,  # validation split written earlier
        "standardize_smiles": True,
        "randomize_smiles": False,
        "randomize_all_smiles": False,
        "internal_diversity": True,
    },
}

with open(config_filename, "w") as writer:
    json.dump(reinvet_transfer_learning_parameter, writer, indent=2)

In [8]:
# Transfer Learning.
# Runs the `reinvent` command-line tool on the config file written to
# `config_filename`; IPython expands `$config_filename` to the Python value.
# NOTE(review): `-f json` presumably selects the JSON config format - confirm
# against the REINVENT CLI help. The path is unquoted, so it must not contain
# spaces.

!reinvent $config_filename -f json

15:31:06 <INFO> Started REINVENT 4.4.22 (C) AstraZeneca 2017, 2023 on 2024-07-28
15:31:06 <INFO> Command line: /root/miniconda3/envs/reinvent-transfer-learning/bin/reinvent ./.temp/isoniazid/config.json -f json
15:31:06 <INFO> User root on host Ank
15:31:06 <INFO> Python version 3.11.9
15:31:06 <INFO> PyTorch version 2.3.1+cu121, git d44533f9d073df13895333e70b66f81c513c1889
15:31:06 <INFO> PyTorch compiled with CUDA version 12.1
15:31:06 <INFO> RDKit version 2023.09.5
15:31:06 <INFO> Platform Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
15:31:06 <INFO> Number of PyTorch CUDA devices 1
15:31:06 <INFO> Using CPU x86_64
15:31:06 <INFO> Writing TensorBoard summary to /mnt/d/projects/github/reinvent-transfer-learning/notebooks/.temp/isoniazid/tb_TL
15:31:06 <INFO> Starting Transfer Learning
15:31:06 <INFO> /mnt/d/projects/github/reinvent-transfer-learning/models/reinvent.prior has valid hash:
{ 'comments': [],
  'creation_date': 0,
  'date_format': 'UNIX epoch',
  'hash_id

In [9]:
# Running new model
new_model_config_path = os.path.join(temp, '_config.json')
output_smiles = os.path.join(temp, 'output.csv')
config = {
    "run_type": "sampling",
    "device": "cpu",
    "parameters": {
        "model_file": new_model_path,
        "output_file": output_smiles,
        "num_smiles": 15_000,
        "unique_molecules": True,
        "randomize_smiles": True,
    }
}

with open(new_model_config_path, "w") as writer:
    json.dump(config, writer, indent=2)

!reinvent $new_model_config_path -f json

15:34:16 <INFO> Started REINVENT 4.4.22 (C) AstraZeneca 2017, 2023 on 2024-07-28
15:34:16 <INFO> Command line: /root/miniconda3/envs/reinvent-transfer-learning/bin/reinvent ./.temp/isoniazid/_config.json -f json
15:34:16 <INFO> User root on host Ank
15:34:16 <INFO> Python version 3.11.9
15:34:16 <INFO> PyTorch version 2.3.1+cu121, git d44533f9d073df13895333e70b66f81c513c1889
15:34:16 <INFO> PyTorch compiled with CUDA version 12.1
15:34:16 <INFO> RDKit version 2023.09.5
15:34:16 <INFO> Platform Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
15:34:16 <INFO> Number of PyTorch CUDA devices 1
15:34:16 <INFO> Using CPU x86_64
15:34:16 <INFO> Starting Sampling
15:34:16 <INFO> /mnt/d/projects/github/reinvent-transfer-learning/notebooks/.temp/isoniazid/checkpoints/temp.model has valid hash:
{ 'comments': ['TL'],
  'creation_date': 0,
  'date_format': 'UNIX epoch',
  'hash_id': '070d833eb6f8bfd7d556a5c298994293',
  'hash_id_format': 'xxhash.xxh3_128_hex 3.4.1',
  'model_id': '55d

# Training a new classifier model

In [12]:
%%time
# Draw a random sample of rows from the reference library, save it as
# `negative.csv` in the per-molecule temp folder, and display it.
#
# Make sure to download and keep the reference library in .temp folder.
# Download it from here: https://github.com/ersilia-os/groverfeat/blob/main/data/reference_library.csv
reference_smiles_path = os.path.join(notebook, '.temp', 'reference_library.csv')

NEGATIVE_SAMPLE_SIZE = 5000
NEGATIVE_SAMPLE_SEED = 42  # fixed seed so the same rows are drawn on every run

# `newline=""` is the csv-module recommended way to open files for csv.reader.
with open(reference_smiles_path, newline="") as reader:
    reference_rows = list(csv.reader(reader))

# NOTE(review): every row is eligible, including a header row if the file has
# one - confirm the file layout.
# A dedicated Random instance keeps the draw reproducible without touching the
# global random state; the size is capped so a library with fewer rows than
# requested no longer raises ValueError.
negative_entries = random.Random(NEGATIVE_SAMPLE_SEED).sample(
    reference_rows, min(NEGATIVE_SAMPLE_SIZE, len(reference_rows))
)

negative_output = os.path.join(temp, 'negative.csv')
# Each sampled row is a list of fields, so columns are numbered 0, 1, ... and
# the CSV is written with that numeric header (the reader cell skips it).
df = pd.DataFrame(data=negative_entries)
df.to_csv(negative_output, index=False)
df

CPU times: user 1.79 s, sys: 152 ms, total: 1.94 s
Wall time: 3.48 s


Unnamed: 0,0
0,O=C1NC(=O)C(CC2=CC=C(N3CCC(NC[C@H](O)C4=CC=CC(...
1,COC(=O)/C(=C/C1=CC=C(C)C=C1)CP(=O)(O)O
2,CCOC1=CC=C(CCNC(=O)C2=CC=CS2)C=C1OCC
3,CCCCOC1=CC=C(C#CC2=CC=C(S(=O)(=O)N[C@H](CC3=CN...
4,CCCCN1C(N)=C(N(CC(C)C)C(=O)C2=CC(C3=CC=C(OC)C=...
...,...
4995,CCN1C(=O)/C(=C/C=C2\SC3=CC(C(F)(F)F)=CC=C3N2C)...
4996,COC(=O)C1=C(F)C=CC=C1C1=CC=C(CNC(=O)[C@@](C)(O...
4997,CCOC(=O)C1=C(NC(=O)CSC2=NN=C(CC3=CC=CC=C3)O2)S...
4998,C/C=C(/C)C(=O)OC[C@H]1C2=C(C[C@H]3[C@H]4C5=C(C...


In [13]:
# Prepare data for classifier model: collect SMILES, label them, featurize.

# Positives (label 1) are all the smiles downloaded using cheese api, i.e.
# the 'smiles' field of every entry in the similar-structures JSON file.
with open(similar_str_smiles_path) as similar_file:
    positives = [entry['smiles'] for entry in json.load(similar_file)]

# Negatives (label 0) are the first column of every data row in
# `negative_output`, the CSV holding the random sample of the reference library.
with open(negative_output) as negative_file:
    negative_rows = csv.reader(negative_file)
    next(negative_rows, None)  # skipping header
    negatives = [row[0] for row in negative_rows]

# Labels follow the same order as `total`: positives first, then negatives.
labels = [1] * len(positives) + [0] * len(negatives)

total = positives + negatives
# NOTE(review): `extract_features` is a project helper; presumably it returns
# one feature row per input SMILES, in order - confirm in utils/chem.
features = extract_features(total)

In [14]:
# Training
# Fit a FLAML AutoML random-forest classifier on the extracted features and
# report hold-out metrics.
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

X = pd.DataFrame(features)
y = pd.Series(labels)

# The classes are heavily imbalanced (a few hundred positives against
# thousands of negatives), so stratify the split to keep the same class ratio
# in both the training and the test portion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

automl = AutoML()
automl_settings = {
    "time_budget": 120,  # time budget in seconds
    "metric": 'f1',  # metric to optimize
    "task": 'classification',
    "estimator_list": ['rf'] # random forest
}

automl.fit(X_train, y_train, **automl_settings)

# Checking accuracy
y_pred = automl.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
# Accuracy alone is misleading on imbalanced data (always predicting the
# majority class already scores high), so also report F1 - the metric the
# model was actually optimized for.
print(f"F1: {f1_score(y_test, y_pred)}")

[flaml.automl.logger: 07-28 15:35:35] {1680} INFO - task = classification
[flaml.automl.logger: 07-28 15:35:35] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 07-28 15:35:35] {1789} INFO - Minimizing error metric: 1-f1
[flaml.automl.logger: 07-28 15:35:35] {1901} INFO - List of ML learners in AutoML Run: ['rf']
[flaml.automl.logger: 07-28 15:35:35] {2219} INFO - iteration 0, current learner rf
[flaml.automl.logger: 07-28 15:35:35] {2345} INFO - Estimated sufficient time budget=2053s. Estimated necessary time budget=2s.
[flaml.automl.logger: 07-28 15:35:35] {2392} INFO -  at 2.1s,	estimator rf's best error=0.7778,	best estimator rf's best error=0.7778
[flaml.automl.logger: 07-28 15:35:35] {2219} INFO - iteration 1, current learner rf
[flaml.automl.logger: 07-28 15:35:35] {2392} INFO -  at 2.3s,	estimator rf's best error=0.6410,	best estimator rf's best error=0.6410
[flaml.automl.logger: 07-28 15:35:35] {2219} INFO - iteration 2, current learner rf
[flaml.automl.logger: 0

In [19]:
# Testing generated output (the output generated by the new model.)
# Read the first column of every data row of the sampling output, featurize
# those SMILES and ask the trained classifier for class probabilities.

with open(output_smiles) as sampled_file:
    sampled_rows = csv.reader(sampled_file)
    next(sampled_rows, None)  # skipping header
    gen_out = [row[0] for row in sampled_rows]

gen_out_feature = pd.DataFrame(extract_features(gen_out))

# predict_proba: one row of per-class probabilities per generated molecule.
prediction = automl.predict_proba(gen_out_feature)

In [20]:
# Keep the generated molecules the classifier scores highly, rank them, and
# save the best ones to `ml_output.csv` (no header, no index).
ml_prediction_output = os.path.join(temp, 'ml_output.csv')

MIN_SCORE = 0.6   # molecules scoring below this are dropped
TOP_N = 1000      # maximum number of molecules written out

# NOTE(review): column 1 of `prediction` is taken as the probability of the
# positive class (label 1) - confirm against `automl.classes_`.
output = [
    {"smile": smile, "score": proba[1]}
    for smile, proba in zip(gen_out, prediction)
    if proba[1] >= MIN_SCORE
]

# Naming the columns explicitly keeps the frame well-formed when nothing
# passes the threshold; without it an empty frame has no 'score' column and
# the sort below raises KeyError.
with_ml_df = pd.DataFrame(data=output, columns=["smile", "score"])
with_ml_df = with_ml_df.sort_values(['score'], ascending=False)
with_ml_df = with_ml_df.head(TOP_N)  # Picking top 1,000
with_ml_df.to_csv(ml_prediction_output, index=False, header=False)
with_ml_df

Unnamed: 0,smile,score
0,CC1(NC(=O)c2ccncc2)CC1,0.778641
753,Cn1ccc(C(=O)c2ccncc2)c1,0.778641
760,O=C(CCCNC(=O)c1ccncc1)NC1CCC1,0.778641
759,Nc1ccc(Cl)cc1NC(=O)c1ccncc1,0.778641
758,CCCC(C)(O)CNC(=O)c1ccncc1,0.778641
...,...,...
353,NNC(=O)CCNC(=O)c1ccncc1,0.778641
334,CC(C)(C)C(=O)NCCCCCCNC(=O)c1ccncc1,0.778641
354,CC(C)(C)C(O)CNC(=O)c1ccncc1,0.778641
333,CC(C)(C)C(NC(=O)CNC(=O)c1ccncc1)c1ccncc1,0.778641
