In [None]:
# @title Install packages
!pip install wget
!pip install rdkit-pypi
!pip install selfies --upgrade
!pip install -q --upgrade git+https://github.com/ziatdinovmax/gpax
!pip install -q atomai  # we will use the AtomAI VAE

In [1]:
# @title Import libraries
import glob
import os
import warnings
from io import BytesIO

import wget
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image

import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Draw

import torch
import torch.nn as nn
tt = torch.tensor
from math import log
import matplotlib.pyplot as plt
from pandas import json_normalize
from scipy.spatial import distance

import selfies as sf
import gpax
import atomai as aoi

import jax.numpy as jnp
# Called once here, in the import cell, before any gpax model is created.
# NOTE(review): presumably this switches JAX to 64-bit floats — confirm against the gpax docs.
gpax.utils.enable_x64()

In [2]:
# @title Utility functions
def find_nearest_neighbors(reference_idx, embedding, num_neighbors=9):
    """Find the indices of the nearest neighbors to a reference point.

    Args:
        reference_idx: Row index of the reference point in ``embedding``
            (negative indices count from the end).
        embedding: 2D array-like with one point per row.
        num_neighbors: Maximum number of neighbors to return.

    Returns:
        1D integer array of row indices ordered from nearest to farthest
        (Euclidean distance). The reference row itself is never included.
        Fewer than ``num_neighbors`` indices are returned when ``embedding``
        does not contain enough other points.
    """
    embedding = np.asarray(embedding)
    # Resolve negative indices so the comparison against argsort output works.
    reference_idx = np.arange(len(embedding))[reference_idx]
    reference_point = embedding[reference_idx]
    distances = distance.cdist([reference_point], embedding, 'euclidean').flatten()
    # A stable sort keeps ties (e.g. duplicated points) in index order.
    order = np.argsort(distances, kind='stable')
    # Exclude the reference by index, not by position: with duplicate points
    # the reference is not guaranteed to be the first entry of the sort.
    nearest_indices = order[order != reference_idx][:num_neighbors]
    # Kept from the original implementation: calling this function silences
    # UserWarning for the rest of the session (process-wide side effect).
    warnings.filterwarnings("ignore", category=UserWarning)  # Ignore userwarnings
    return nearest_indices

def find_indices(original, search):
    """Locate each row of ``search`` inside ``original``.

    For every row of ``search`` the result holds the index of the first row
    of ``original`` that is element-wise equal to it, or -1 when no row
    matches. The result is returned as a NumPy array, one entry per search row.
    """
    def _first_match(candidate):
        # Positions of all rows in `original` equal to `candidate`.
        hits = np.flatnonzero((original == candidate).all(axis=1))
        return hits[0] if hits.size > 0 else -1  # -1 indicates not found

    return np.array([_first_match(candidate) for candidate in search])

def are_points_separated(point, other_points, min_distance):
    """Return True when every row of 'other_points' lies at a Euclidean
    distance of 'min_distance' or more from 'point'."""
    offsets = other_points - point
    gaps = np.linalg.norm(offsets, axis=1)
    far_enough = gaps >= min_distance
    return np.all(far_enough)

Read in the dataset (one of the QM9 subsets, containing 5,000 randomly selected molecules).

In [3]:
# URL of the raw dataset CSV in the DKLActiveLearnMol GitHub repository.
git_link = "https://github.com/aghosh92/DKLActiveLearnMol/blob/main/datasets/dataset0_5k.csv?raw=true"
# The first CSV column becomes the DataFrame index.
df = pd.read_csv(git_link, index_col=0)
# Bare last expression: the notebook renders the frame below the cell.
df

Unnamed: 0,smiles,mole_logp,tpsa,mol_wt,hbd,hba,valencee,max_partialcharge,min_partialcharge,rotatablebd,...,stereocent,dipole_moment,enthalpy,internal_energy,internal_energy_zero,free_energy,homo,lumo,gap,zero_point_vib_energy
80704,OCC(C#C)C#CC#C,-0.13530,20.23,118.041865,1,1,"(44,)",0.104838,-0.394070,1,...,1,1.0110,-383.379422,-383.380366,-383.390139,-383.425247,-0.2487,-0.0227,0.2260,0.109023
65243,OC1C2NC1(C#C)C2O,-1.93440,52.49,125.047678,3,3,"(48,)",0.135711,-0.388302,0,...,2,2.9389,-437.764349,-437.765293,-437.773455,-437.805879,-0.2457,0.0183,0.2640,0.123328
127044,C1C2C3CC(CCO3)N12,0.23180,12.24,125.084064,0,2,"(50,)",0.075724,-0.376416,0,...,4,1.2046,-403.111686,-403.112630,-403.119162,-403.149575,-0.2233,0.0819,0.3052,0.175386
78132,CC1=CC2CC(O2)C1O,0.46470,29.46,126.068080,1,2,"(50,)",0.101096,-0.386045,0,...,3,2.0914,-423.006427,-423.007371,-423.015176,-423.047111,-0.2388,0.0018,0.2405,0.160473
90425,CC1CC1(O)CCC=O,0.73640,37.30,128.083730,1,2,"(52,)",0.119600,-0.389588,3,...,2,2.1074,-424.213274,-424.214219,-424.224729,-424.261135,-0.2481,-0.0208,0.2273,0.178994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117107,OC1C2C3NC3(C#N)C12,-1.15892,65.96,122.048013,2,3,"(46,)",0.128183,-0.392345,0,...,5,6.3772,-416.738039,-416.738983,-416.746403,-416.778259,-0.2729,0.0142,0.2871,0.113488
8240,NC(=O)C(=O)NCC=O,-2.21320,89.26,130.037842,2,3,"(50,)",0.308947,-0.361393,2,...,0,2.7234,-491.145917,-491.146862,-491.156301,-491.192146,-0.2543,-0.0462,0.2081,0.111181
82724,CC1C(C)N1CC1CO1,0.47780,15.54,127.099714,0,2,"(52,)",0.093621,-0.371734,2,...,3,2.2996,-404.281755,-404.282699,-404.292728,-404.328359,-0.2208,0.0761,0.2969,0.192277
33180,NC(=O)C1CC1CCO,-0.50980,63.32,129.078979,2,2,"(52,)",0.220321,-0.396365,3,...,2,3.7023,-440.294320,-440.295265,-440.305434,-440.341331,-0.2486,0.0246,0.2732,0.170268


CMF initialization

In [4]:
!docker stop "vaedkl"

!docker run --name vaedkl --rm -p7687:7687 -p7474:7474 -d -v $HOME/neo4j/data:/data -v $HOME/neo4j/logs:/logs -v $HOME/neo4j/import:/var/lib/neo4j/import -v $HOME/neo4j/plugins:/plugins --env NEO4J_AUTH=neo4j/test1234 neo4j:latest
                            
'''
!docker run --name vaedkl --rm -p7687:7687 -p7474:7474 \
-d -v $HOME/neo4j/data:/data -v $HOME/neo4j/logs:/logs \
-v $HOME/neo4j/import:/var/lib/neo4j/import -v $HOME/neo4j/plugins:/plugins \
--env NEO4J_AUTH=neo4j/test1234 neo4j:latest '''

  pid, fd = os.forkpty()


vaedkl
9b9513a27fa021b99eb5b5f777baaeabd570849aa2fa90ec626ee3a9d45a500b


'\n!docker run --name vaedkl --rm -p7687:7687 -p7474:7474 -d -v $HOME/neo4j/data:/data -v $HOME/neo4j/logs:/logs -v $HOME/neo4j/import:/var/lib/neo4j/import -v $HOME/neo4j/plugins:/plugins --env NEO4J_AUTH=neo4j/test1234 neo4j:latest '

In [None]:
# Start a locally installed Neo4j via its CLI (needs sudo).
# NOTE(review): the preceding cell already launches Neo4j in Docker on ports
# 7474/7687 — presumably only one of the two is needed; confirm.
!sudo neo4j start

In [None]:
# Print every git configuration value together with the file it comes from.
!git config --list --show-origin

In [5]:
!cmf init local --path /VAEDKL/ --git-remote-url https://github.com/gayathri-saranathan/VAEDKL/ --cmf-server-url http://127.0.0.1:80 --neo4j-user neo4j --neo4j-password test1234 --neo4j-uri bolt://localhost:7687

'''
!cmf init local --path /VAEDKL/ --git-remote-url https://github.com/atripathy86/AE-DKL/ \
--cmf-server-url http://127.0.0.1:80 --neo4j-user neo4j --neo4j-password test1234 \
--neo4j-uri bolt://localhost:7687
'''

git_dir /lustre/saranath/Techcon24/AE-DKL/.git
Starting cmf init.
Setting 'local-storage' as a default remote.
cmf init complete.
[0m

'\n!cmf init local --path /VAEDKL/ --git-remote-url https://github.com/atripathy86/AE-DKL/ --cmf-server-url http://127.0.0.1:80 --neo4j-user neo4j --neo4j-password test1234 --neo4j-uri bolt://localhost:7687\n'

In [6]:
from cmflib.cmf import Cmf
# NOTE(review): `mlpb` is not referenced in the cells visible here.
from ml_metadata.proto import metadata_store_pb2 as mlpb 

# Forwarded to Cmf(graph=...) below.
graph = True 
#metawriter = Cmf(filename="mlmd", pipeline_name="aifcmf-env")
# Remove any existing "aldkl_vae" file or directory before creating the logger,
# presumably so the metadata store starts from scratch on each run.
!rm -rf aldkl_vae
# Metadata logger used by every later CMF cell (contexts, executions, artifacts).
cmf = Cmf(
    filename="aldkl_vae",
    pipeline_name="VAEDKL",
    graph = graph
)

*** Note: CMF will check out a new branch in git to commit the metadata files ***
*** The checked out branch is aldkl_vae. ***


In [7]:
# --- SMILES -> SELFIES ------------------------------------------------------
selfies_dataset = []
error_smiles = []

# Assuming 'smiles' is the column in df containing SMILES strings
# Replace 'smiles' with the actual column name if it's different
for i, smiles in df['smiles'].items():
    try:
        selfies_dataset.append(sf.encoder(smiles))
    except Exception as e:
        print(f"Error encoding SMILES at index {i}: {e}")
        error_smiles.append(smiles)  # Store the SMILES string that caused the error

if error_smiles:
    # Skipped molecules make the encoded matrix shorter than df, so row k of X
    # no longer corresponds to row k of df (and of any target column taken from it).
    print(f"WARNING: {len(error_smiles)} SMILES could not be encoded; "
          "rows of X are no longer aligned with rows of df.")

# --- Vocabulary for the one-hot vectors -------------------------------------
alphabet = sf.get_alphabet_from_selfies(selfies_dataset)
alphabet.add("[nop]")  # [nop] is a special padding symbol
alphabet = sorted(alphabet)

# Every string is padded to the length of the longest SELFIES in the dataset.
length_list = sorted(sf.len_selfies(s) for s in selfies_dataset)
pad_to_len = max(length_list)

symbol_to_idx = {s: i for i, s in enumerate(alphabet)}

# --- Label and one-hot encodings --------------------------------------------
labels = []
one_hot_vectors = []
for selfies_str in selfies_dataset:
    label, one_hot = sf.selfies_to_encoding(selfies=selfies_str, vocab_stoi=symbol_to_idx,
                                            pad_to_len=pad_to_len, enc_type="both")
    labels.append(label)
    one_hot_vectors.append(one_hot)

# Flatten each (pad_to_len, alphabet size) one-hot matrix into a single row.
molecules = np.array(one_hot_vectors)
s1, s2, s3 = molecules.shape
X = molecules.reshape([-1, s2*s3])

# All later cells write into VAEDKL_logs/; create it so a fresh checkout runs.
os.makedirs("VAEDKL_logs", exist_ok=True)
np.save("VAEDKL_logs/endoded_data.npy", X)
print(X.shape)

(5000, 567)


In [8]:
# Open the "Prepare" pipeline stage in CMF, recording the symbol vocabulary and
# the shape of the encoded data as custom properties.
# NOTE(review): the values are a dict and a tuple — confirm that CMF accepts
# non-scalar custom properties.
context_stage1 = cmf.create_context(pipeline_stage="Prepare",
                                          custom_properties={"Symbols":symbol_to_idx,
                                                            "Data_size":X.shape})

In [9]:
#Different targets
# Candidate regression targets, each read from the matching column of df.
# Only targets1 is used by the training cells visible in this notebook.
targets1 = df['ringct'].values
targets2 = df['mole_logp'].values
targets3 = df['mol_wt'].values
targets4 = df['dipole_moment'].values

For illustration purposes, we train on only 1,000 molecules, since training on more molecules takes longer and requires more computing resources.

In [11]:
# Restrict inputs and targets to the first 1,000 molecules.
X1 = np.copy(X)[:1000]
targets1 = targets1[:1000]

# Persist the training subset so it can be logged as an artifact.
np.save("VAEDKL_logs/train_data.npy", np.asarray(X1))
np.save("VAEDKL_logs/target_data.npy", np.asarray(targets1))

# Fit a viDKL model (flattened one-hot input, 2 latent dimensions, RBF kernel)
# and keep its loss history.
rng_key_1, rng_key_predict_1 = gpax.utils.get_keys()
dkl_1 = gpax.viDKL(s2 * s3, 2, 'RBF')
dkl_1.fit(rng_key_1, X1, targets1)
loss = dkl_1.loss
np.save("VAEDKL_logs/loss_dkl_1.npy", np.asarray(loss))


Inferred GP kernel parameters
k_length         [1.3033 1.0485]
k_scale          4.1401
noise            0.0165


In [12]:
# Parameters stored on the fitted model under `nn_params`.
weights_and_biases = dkl_1.nn_params
# NOTE(review): if nn_params is a dict/pytree rather than a plain array, np.save
# stores it as a pickled object array and np.load needs allow_pickle=True — confirm.
np.save("VAEDKL_logs/dkl_1_params.npy",weights_and_biases)

In [None]:
#Saving the model - TODO: verify this step

In [13]:
# Write nn_params to the .pt file that the CMF cells log as the model artifact.
# NOTE(review): the model comes from gpax, so torch.save is presumably used only
# as a generic serializer here — confirm the file loads back as expected.
torch.save(dkl_1.nn_params,"VAEDKL_logs/dkl_1_model.pt")

In [14]:
# Register the "Train viDKL" execution in CMF, attaching the fitted model's
# kernel parameters, kernel name, kernel dimension and guide type.
# NOTE(review): some of these values may not be plain scalars/strings — confirm
# CMF can store them as custom properties.
execution_stage1 = cmf.create_execution(execution_type="Train viDKL",
                                              custom_properties={"Kernel_Parameters":dkl_1.kernel_params,
                                                                "Kernel_Name":dkl_1.kernel_name,
                                                                "Kernel_Dimension":dkl_1.kernel_dim,
                                                                "Guide_type":dkl_1.guide_type})


In [15]:
# Log the training inputs/targets as input artifacts of the current execution.
_ = cmf.log_dataset("VAEDKL_logs/train_data.npy","input")

_ = cmf.log_dataset("VAEDKL_logs/target_data.npy","input")

# Log the saved model parameters as the output model artifact.
_ = cmf.log_model("VAEDKL_logs/dkl_1_model.pt","output",model_framework="pytorch")

# Log one "training_loss" entry per element of the loss history, then commit
# them together under the "training_metrics" name.
for ls in loss:
    cmf.log_metric("training_metrics",{"training_loss":float(ls)})
_ = cmf.commit_metrics("training_metrics")


In [16]:
# Number of rows passed to the model per prediction batch.
batch_size_reconstruct = 250
# Embedding of the training inputs produced by the fitted model.
embeded_1 = dkl_1.embed(X1)
# The result is unpacked as (mean, variance); the standard deviation is derived below.
pred_mean1, pred_var1 = dkl_1.predict_in_batches(rng_key_predict_1, X1, batch_size=batch_size_reconstruct)
pred_std1 = jnp.sqrt(pred_var1)

In [17]:
# Register the "Predict viDKL" execution in CMF. This rebinds `execution_stage1`,
# which previously referred to the "Train viDKL" execution.
execution_stage1 = cmf.create_execution(execution_type="Predict viDKL",
                                              custom_properties={"Kernel_Parameters":dkl_1.kernel_params,
                                                                "Kernel_Name":dkl_1.kernel_name,
                                                                "Kernel_Dimension":dkl_1.kernel_dim,
                                                                "Guide_type":dkl_1.guide_type,
                                                                "Batch_Size":batch_size_reconstruct})

In [18]:
# Persist the embedding and the predictions of dkl_1 for CMF logging.
np.save("VAEDKL_logs/embedding.npy", np.array(embeded_1))
np.save("VAEDKL_logs/predicted_mean.npy",np.array(pred_mean1))
# "predicted_unc.npy" holds the predictive variance (pred_var1), not the std.
np.save("VAEDKL_logs/predicted_unc.npy",np.array(pred_var1))
# The file name is spelled "prediccted_std.npy"; the CMF logging cell uses the
# same spelling, so rename both together if this is ever corrected.
np.save("VAEDKL_logs/prediccted_std.npy",np.array(pred_std1))

In [None]:
# Output artifacts of the prediction execution.
_ = cmf.log_dataset("VAEDKL_logs/embedding.npy","output")

_ = cmf.log_dataset("VAEDKL_logs/prediccted_std.npy","output")

_ = cmf.log_dataset("VAEDKL_logs/predicted_mean.npy","output")
# NOTE(review): indices_train_al.npy and indices_unmeasured_al.npy are written by
# a later cell in the active-learning section, so on a fresh top-to-bottom run
# they do not exist yet when this cell executes — confirm whether these two
# log calls belong to the active-learning stage instead.
_ = cmf.log_dataset("VAEDKL_logs/indices_train_al.npy","output")

_ = cmf.log_dataset("VAEDKL_logs/predicted_unc.npy","output")
_ = cmf.log_dataset("VAEDKL_logs/indices_unmeasured_al.npy","output")


# Input artifacts: the data that was predicted on and the model that was used.
_ = cmf.log_dataset("VAEDKL_logs/train_data.npy","input")

_ = cmf.log_model("VAEDKL_logs/dkl_1_model.pt","input",model_framework="pytorch")


#### Active Learning Part

In [None]:
# Active-learning pool: the first 1,000 encoded molecules.
X1 = np.copy(X)
X1 = X1[:1000]
#target
# The target is the negated enthalpy column. This rebinds `targets1`, which the
# earlier training cells set to the 'ringct' column.
targets1 = - df['enthalpy'].values
targets1 = targets1[:1000]

# Persist the pool and its targets for CMF logging.
np.save("VAEDKL_logs/X_al.npy",np.array(X1))
np.save("VAEDKL_logs/trgt_enthalpy_al.npy",np.array(targets1))

For illustration purposes, we train on 1,000 molecules with 30 exploration steps, since training on more molecules takes longer and requires more computing resources.

In [None]:
#Active learning setup parameters
# Number of randomly drawn seed points that form the initial training set.
init_num = 50
# Number of iterations of the active-learning loop.
exp_step = 30
# Batch size used when predicting on the growing training set.
batch_size_learn = 50 #make it 100 if start with 100 seed
# Batch size used when predicting on the larger arrays (the full pool / unmeasured set).
batch_size_reconstruct = 250

In [17]:
# Open the "Active Learning" pipeline stage in CMF and record the setup parameters.
# NOTE(review): "Total_Data_size" is a tuple (X1.shape) — confirm CMF accepts
# non-scalar custom properties.
context_stage2 = cmf.create_context(pipeline_stage="Active Learning",
                                          custom_properties={"Number_of_training_points":init_num,
                                                            "Total_Data_size":X1.shape,
                                                            "Exploration_Steps":exp_step,
                                                            "Batch_size":batch_size_learn,
                                                            "Batch_size_reconstruct":batch_size_reconstruct})



In [11]:
# Draw the seed training set reproducibly from the active-learning pool X1.
np.random.seed(0)
idx1 = np.random.choice(np.arange(len(X1)), size = init_num, replace = False)
X1_train = X1[idx1]
X1_unmeasured = np.delete(X1, idx1, axis=0)

# Index bookkeeping must span the same pool that idx1 was drawn from (X1, not
# the full X); otherwise indices_unmeasured_1 carries entries for rows that are
# not in X1_unmeasured and the two arrays have different lengths.
indices_total_1 = np.arange(len(X1))
indices_train_1 = indices_total_1[idx1]

y1_train = targets1[idx1]
indices_unmeasured_1 = np.delete(indices_total_1, idx1)

# Persist the split for CMF logging.
np.save("VAEDKL_logs/X_train_al.npy",np.array(X1_train))
np.save("VAEDKL_logs/X_unmeasured_al.npy",np.array(X1_unmeasured))

np.save("VAEDKL_logs/indices_train_al.npy",np.array(indices_train_1))
np.save("VAEDKL_logs/indices_unmeasured_al.npy",np.array(indices_unmeasured_1))

np.save("VAEDKL_logs/trgt_enthalpy_train_al.npy",np.array(y1_train))

In [24]:
# Register the "Active Learning" execution in CMF with the split sizes and the
# indices of the seed points.
# NOTE(review): "indices_selected" is a NumPy array and the two sizes are tuples —
# confirm CMF can store these as custom properties.
execution_stage2 = cmf.create_execution(execution_type="Active Learning",
                                        custom_properties={"Number_of_training_points":init_num,
                                                           "Exploration_Steps":exp_step,
                                                           "Train_size":X1_train.shape,
                                                           "Unmeasured_size":X1_unmeasured.shape,
                                                           "indices_selected":idx1})

In [12]:
# Per-step records:
#   exp_*      - prediction of the step's model at the point it selected
#   dkl_*      - prediction of the earlier dkl_1 model at that same point
#   embed_traj1, traj_* - embedding / predictions on the training set after each step
exp_mean1, exp_std1 = [], []
dkl_mean1, dkl_std1 = [], []
embed_traj1 = []
traj_mean1, traj_std1 = [], []

exploration_steps = exp_step

# Snapshot of the seed training set as "step 0". The CMF logging cell logs
# active_train_0.npy / active_trgt_0.npy, and nothing else writes them.
np.save("VAEDKL_logs/active_train_0.npy", np.array(X1_train))
np.save("VAEDKL_logs/active_trgt_0.npy", np.array(y1_train))

for e in range(exploration_steps):
    print("\nStep {}".format(e+1))
    # Obtain/update DKL posterior: a fresh model is fitted from scratch each step.
    rng_key11, rng_key_predict11 = gpax.utils.get_keys()
    dkl_BO_1 = gpax.viDKL(s2*s3, 2, 'RBF')
    if e == 0:
        # Saved before fitting, as the "input model" of the first step.
        # NOTE(review): nn_params of an unfitted model is presumably None — confirm.
        torch.save(dkl_BO_1.nn_params,f"VAEDKL_logs/active_learning_model_{e}.pth")
    dkl_BO_1.fit(rng_key11, X1_train, y1_train)
    torch.save(dkl_BO_1.nn_params,f"VAEDKL_logs/active_learning_model_{e+1}.pth")

    # Compute acquisition function: mean + 10 * std over the unmeasured pool.
    y_mean, y_var = dkl_BO_1.predict_in_batches(  # make prediction batch-by-batch to avoid memory overflow
        rng_key_predict11, X1_unmeasured, batch_size=batch_size_reconstruct)
    y_std = jnp.sqrt(y_var)
    obj = y_mean + 10 * y_std
    # Get the next point to evaluate (position within the unmeasured arrays)
    id_next = obj.argmax()
    np.save(f"VAEDKL_logs/next_pt_{e}.npy",np.array(id_next))

    # let's get the mean and std of the next point predicted by dkl_BO
    exp_mean1.append(y_mean[id_next])
    exp_std1.append(y_std[id_next])

    x_next = X1_unmeasured[id_next]
    # Translate the position in the unmeasured arrays back to a row index of X1.
    ind_next = indices_unmeasured_1[id_next]

    # Getting the mean and std of the next point as predicted by the dkl trained on the full dataset
    # NOTE(review): pred_mean1/pred_std1 come from dkl_1, which was fitted on the
    # 'ringct' target, while targets1 here is the negated enthalpy — confirm that
    # this comparison is intended.
    dkl_mean1.append(pred_mean1[ind_next])
    dkl_std1.append(pred_std1[ind_next])

    # Perform evaluation
    y_measured = targets1[ind_next]

    # Update training arrays: move the selected point from unmeasured to train.
    X1_train = np.append(X1_train, x_next[None], axis=0)
    y1_train = np.append(y1_train, y_measured)
    X1_unmeasured = np.delete(X1_unmeasured, id_next, axis=0)
    indices_unmeasured_1 = np.delete(indices_unmeasured_1, id_next)

    np.save(f"VAEDKL_logs/active_train_{e+1}.npy", np.array(X1_train))
    np.save(f"VAEDKL_logs/active_trgt_{e+1}.npy", np.array(y1_train))

    # Embedding and predictions of this step's model on the updated training set.
    embed_traj = dkl_BO_1.embed(X1_train)
    traj_m, traj_v = dkl_BO_1.predict_in_batches(  # make prediction batch-by-batch to avoid memory overflow
        rng_key_predict11, X1_train, batch_size=batch_size_learn)
    # predict_in_batches is unpacked as (mean, variance) everywhere else in this
    # notebook, so convert to a standard deviation before storing it in traj_std1.
    traj_s = jnp.sqrt(traj_v)

    np.save(f"VAEDKL_logs/embed_traj_{e+1}.npy",np.array(embed_traj))

    embed_traj1.append(embed_traj)
    traj_mean1.append(traj_m)
    traj_std1.append(traj_s)




Step 1


100%|█| 1000/1000 [00:02<00:00, 431.16it/s, init loss: 1217497.7502, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4005 0.9728]
k_scale          6.2304
noise            2.3708

Step 2


100%|█| 1000/1000 [00:02<00:00, 428.86it/s, init loss: 1219386.7387, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4061 0.9697]
k_scale          6.236400000000001
noise            2.3471

Step 3


100%|█| 1000/1000 [00:02<00:00, 428.30it/s, init loss: 1220754.4629, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4018 0.9692]
k_scale          6.2302
noise            2.3539000000000003

Step 4


100%|█| 1000/1000 [00:02<00:00, 431.81it/s, init loss: 1223580.4929, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4208 0.9787]
k_scale          6.201300000000001
noise            2.3718

Step 5


100%|█| 1000/1000 [00:02<00:00, 407.97it/s, init loss: 1229329.7420, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4208 0.9574]
k_scale          6.1771
noise            2.3669000000000002

Step 6


100%|█| 1000/1000 [00:02<00:00, 404.57it/s, init loss: 1234676.2206, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4111 0.971 ]
k_scale          6.1713000000000005
noise            2.3664

Step 7


100%|█| 1000/1000 [00:02<00:00, 403.39it/s, init loss: 1239720.7219, avg. loss [95



Inferred GP kernel parameters
k_length         [1.3957 0.9572]
k_scale          6.1526000000000005
noise            2.3785000000000003

Step 8


100%|█| 1000/1000 [00:02<00:00, 381.01it/s, init loss: 1251305.6825, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4155 0.957 ]
k_scale          6.1809
noise            2.3755

Step 9


100%|█| 1000/1000 [00:02<00:00, 393.56it/s, init loss: 1260401.1145, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4279 0.957 ]
k_scale          6.1379
noise            2.4075

Step 10


100%|█| 1000/1000 [00:02<00:00, 358.04it/s, init loss: 1261969.0644, avg. loss [95



Inferred GP kernel parameters
k_length         [1.44   0.9735]
k_scale          6.119800000000001
noise            2.4104

Step 11


100%|█| 1000/1000 [00:02<00:00, 341.24it/s, init loss: 1286281.3130, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4018 0.9716]
k_scale          6.0961
noise            2.3592

Step 12


100%|█| 1000/1000 [00:03<00:00, 298.40it/s, init loss: 1293325.8219, avg. loss [95



Inferred GP kernel parameters
k_length         [1.3926 0.9613]
k_scale          6.043200000000001
noise            2.3555

Step 13


100%|█| 1000/1000 [00:03<00:00, 288.01it/s, init loss: 1295336.8520, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4145 0.9606]
k_scale          6.0552
noise            2.3759

Step 14


100%|█| 1000/1000 [00:03<00:00, 294.29it/s, init loss: 1297509.9440, avg. loss [95



Inferred GP kernel parameters
k_length         [1.3952 0.9615]
k_scale          6.0679
noise            2.3754

Step 15


100%|█| 1000/1000 [00:03<00:00, 286.47it/s, init loss: 1299079.1582, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4263 0.9711]
k_scale          6.071400000000001
noise            2.4045

Step 16


100%|█| 1000/1000 [00:04<00:00, 249.80it/s, init loss: 1315826.8396, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4095 0.9743]
k_scale          6.0574
noise            2.3645

Step 17


100%|█| 1000/1000 [00:03<00:00, 264.85it/s, init loss: 1321211.2235, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4005 0.9474]
k_scale          6.0225
noise            2.3809

Step 18


100%|█| 1000/1000 [00:04<00:00, 246.67it/s, init loss: 1326293.4468, avg. loss [95



Inferred GP kernel parameters
k_length         [1.3957 0.9706]
k_scale          6.0099
noise            2.4143000000000003

Step 19


100%|█| 1000/1000 [00:03<00:00, 263.99it/s, init loss: 1334831.3427, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4005 0.9547]
k_scale          6.0364
noise            2.4162

Step 20


100%|█| 1000/1000 [00:04<00:00, 232.69it/s, init loss: 1335079.8503, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4232 0.9691]
k_scale          6.053100000000001
noise            2.4132000000000002

Step 21


100%|█| 1000/1000 [00:03<00:00, 252.85it/s, init loss: 1339245.0285, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4073 0.9674]
k_scale          6.0364
noise            2.4051

Step 22


100%|█| 1000/1000 [00:04<00:00, 220.63it/s, init loss: 1340120.9270, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4188 0.9699]
k_scale          6.029800000000001
noise            2.4256

Step 23


100%|█| 1000/1000 [00:04<00:00, 226.00it/s, init loss: 1343662.5860, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4039 0.9626]
k_scale          6.009600000000001
noise            2.3797

Step 24


100%|█| 1000/1000 [00:04<00:00, 224.77it/s, init loss: 1345476.5368, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4249 0.9689]
k_scale          6.0120000000000005
noise            2.3821000000000003

Step 25


100%|█| 1000/1000 [00:04<00:00, 244.36it/s, init loss: 1348647.5247, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4211 0.9599]
k_scale          5.973400000000001
noise            2.3943000000000003

Step 26


100%|█| 1000/1000 [00:04<00:00, 215.37it/s, init loss: 1349017.3632, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4313 0.945 ]
k_scale          5.9791
noise            2.407

Step 27


100%|█| 1000/1000 [00:04<00:00, 238.99it/s, init loss: 1351048.4285, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4216 0.9583]
k_scale          5.968100000000001
noise            2.4358

Step 28


100%|█| 1000/1000 [00:05<00:00, 173.26it/s, init loss: 1352475.9561, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4168 0.9533]
k_scale          5.9707
noise            2.4198

Step 29


100%|█| 1000/1000 [00:04<00:00, 221.84it/s, init loss: 1357381.1225, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4273 0.9615]
k_scale          5.978400000000001
noise            2.4330000000000003

Step 30


100%|█| 1000/1000 [00:04<00:00, 237.04it/s, init loss: 1389132.2803, avg. loss [95



Inferred GP kernel parameters
k_length         [1.4198 0.9679]
k_scale          5.9295
noise            2.4008000000000003


NameError: name 'cmf' is not defined

In [18]:
# Inputs of the active-learning stage: the pool and its targets.
_ = cmf.log_dataset("VAEDKL_logs/X_al.npy","input")
_ = cmf.log_dataset("VAEDKL_logs/trgt_enthalpy_al.npy","input")

# Step-0 training snapshot; these two files must exist in VAEDKL_logs/.
_ = cmf.log_dataset("VAEDKL_logs/active_train_0.npy","output")
_ = cmf.log_dataset("VAEDKL_logs/active_trgt_0.npy","output")


# The active-learning loop writes its models with a ".pth" extension
# (active_learning_model_{e}.pth), so the step-0 model must be logged under that name.
_ = cmf.log_model("VAEDKL_logs/active_learning_model_0.pth","input",model_framework="pytorch")


# One CMF execution per exploration step: step e consumes the model and training
# snapshot numbered e and produces those numbered e+1.
for e in range(exploration_steps):
    _ = cmf.create_execution(execution_type=f"ActiveLearningTrainingExp{e}", custom_properties={"Execution_Step":e})

    _ = cmf.log_model(f"VAEDKL_logs/active_learning_model_{e+1}.pth","output")
    _ = cmf.log_dataset(f"VAEDKL_logs/active_train_{e+1}.npy","output")
    _ = cmf.log_dataset(f"VAEDKL_logs/active_trgt_{e+1}.npy","output")
    _ = cmf.log_dataset(f"VAEDKL_logs/embed_traj_{e+1}.npy","output")

    _ = cmf.log_model(f"VAEDKL_logs/active_learning_model_{e}.pth","input")
    _ = cmf.log_dataset(f"VAEDKL_logs/active_train_{e}.npy","input")
    _ = cmf.log_dataset(f"VAEDKL_logs/active_trgt_{e}.npy","input")
        

AttributeError: 'NoneType' object has no attribute 'id'

In [None]:
#prediction
# Fit a final model on the training set collected by the active-learning loop.
rng_key11, rng_key_predict11 = gpax.utils.get_keys()
dkl_BO_1 = gpax.viDKL(s2*s3, 2, 'RBF')
dkl_BO_1.fit(rng_key11, X1_train, y1_train)

# predict_in_batches is unpacked as (mean, variance) everywhere else in this
# notebook; convert to standard deviations so the predstd_* names are accurate.
predmean_train1, predvar_train1 = dkl_BO_1.predict_in_batches(rng_key_predict11,
                                                              X1_train, batch_size=batch_size_learn)
predmean_unmeas1, predvar_unmeas1 = dkl_BO_1.predict_in_batches(rng_key_predict11,
                                                                X1_unmeasured, batch_size=batch_size_reconstruct)
predmean11, predvar11 = dkl_BO_1.predict_in_batches(rng_key_predict11, X1,
                                                    batch_size=batch_size_reconstruct)
predstd_train1 = jnp.sqrt(predvar_train1)
predstd_unmeas1 = jnp.sqrt(predvar_unmeas1)
predstd11 = jnp.sqrt(predvar11)


embeded_train1 = dkl_BO_1.embed(X1_train)  # Measured points
embeded_unmeasured1 = dkl_BO_1.embed(X1_unmeasured)  # Unmeasured points
embeded11 = dkl_BO_1.embed(X1)   # All points


def _scatter_panel(fig, axis, embedding, values, title, cbar_label=None):
    """Scatter one latent-space panel and attach its colorbar to the same axis.

    The second embedding column is drawn on x and the first on y. Returns the
    (scatter, colorbar) pair.
    """
    im = axis.scatter(embedding[:, 1], embedding[:, 0], s=10, c=values, cmap='jet')
    axis.set_title(title)
    # Passing ax=axis keeps the colorbar next to the panel it describes.
    cbar = fig.colorbar(im, ax=axis, shrink=.8)
    if cbar_label is not None:
        cbar.set_label(cbar_label, fontsize=14)
    cbar.ax.tick_params(labelsize=10)
    return im, cbar


#visualize the latent space
fig, ax = plt.subplots(nrows = 2, ncols = 3, figsize = (18,12))

_scatter_panel(fig, ax[0,0], embeded_train1, predmean_train1, 'Measured points', "Predicted_Target")
_scatter_panel(fig, ax[1,0], embeded_unmeasured1, predmean_unmeas1, 'UnMeasured points', "Predicted_Target")
_scatter_panel(fig, ax[0,1], embeded11, predmean11, 'All points', "Predicted_Target")
# NOTE(review): the colorbar label says "Class_Labels" although targets1 is a
# continuous target in this section — confirm the intended label.
_scatter_panel(fig, ax[1,1], embeded11, targets1, 'All points (Ground truth)', "Class_Labels")
_scatter_panel(fig, ax[0,2], embeded11, predmean11 - targets1, 'Error')
im, cbar = _scatter_panel(fig, ax[1,2], embeded11, predstd11, 'Uncertainty')