In [None]:
# @title Install packages
!pip install wget
!pip install rdkit-pypi
!pip install selfies --upgrade
!pip install -q --upgrade git+https://github.com/ziatdinovmax/gpax
!pip install -q atomai  # we will use the AtomAI VAE

In [2]:
# @title Import libraries
import wget
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
import warnings

import glob

import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Draw

import torch
import torch.nn as nn
tt = torch.tensor
from math import log
import matplotlib.pyplot as plt
from pandas import json_normalize
from scipy.spatial import distance

import selfies as sf
import gpax
import atomai as aoi

import jax.numpy as jnp
gpax.utils.enable_x64()

In [3]:
# @title Utility functions
def find_nearest_neighbors(reference_idx, embedding, num_neighbors=9):
    """Find the indices of the nearest neighbors to a reference point."""
    reference_point = embedding[reference_idx]
    distances = distance.cdist([reference_point], embedding, 'euclidean').flatten()
    nearest_indices = np.argsort(distances)[1:num_neighbors+1]  # Exclude the reference point itself
    warnings.filterwarnings("ignore", category=UserWarning)  # Ignore userwarnings
    return nearest_indices

def find_indices(original, search):
    indices = []
    for row in search:
        # Find the index of the row in the original array
        index = np.where((original == row).all(axis=1))[0]
        if index.size > 0:
            indices.append(index[0])
        else:
            indices.append(-1)  # -1 indicates not found
    return np.array(indices)

def are_points_separated(point, other_points, min_distance):
    """Check if 'point' is at least 'min_distance' away from all points in 'other_points'."""
    return np.all(np.linalg.norm(other_points - point, axis=1) >= min_distance)

Read in dataset (One of the subsets from QM9 containing 5000 randomly selected molecules)

In [4]:
git_link = "https://github.com/aghosh92/DKLActiveLearnMol/blob/main/datasets/dataset0_5k.csv?raw=true"
df = pd.read_csv(git_link, index_col=0)
df

Unnamed: 0,smiles,mole_logp,tpsa,mol_wt,hbd,hba,valencee,max_partialcharge,min_partialcharge,rotatablebd,...,stereocent,dipole_moment,enthalpy,internal_energy,internal_energy_zero,free_energy,homo,lumo,gap,zero_point_vib_energy
80704,OCC(C#C)C#CC#C,-0.13530,20.23,118.041865,1,1,"(44,)",0.104838,-0.394070,1,...,1,1.0110,-383.379422,-383.380366,-383.390139,-383.425247,-0.2487,-0.0227,0.2260,0.109023
65243,OC1C2NC1(C#C)C2O,-1.93440,52.49,125.047678,3,3,"(48,)",0.135711,-0.388302,0,...,2,2.9389,-437.764349,-437.765293,-437.773455,-437.805879,-0.2457,0.0183,0.2640,0.123328
127044,C1C2C3CC(CCO3)N12,0.23180,12.24,125.084064,0,2,"(50,)",0.075724,-0.376416,0,...,4,1.2046,-403.111686,-403.112630,-403.119162,-403.149575,-0.2233,0.0819,0.3052,0.175386
78132,CC1=CC2CC(O2)C1O,0.46470,29.46,126.068080,1,2,"(50,)",0.101096,-0.386045,0,...,3,2.0914,-423.006427,-423.007371,-423.015176,-423.047111,-0.2388,0.0018,0.2405,0.160473
90425,CC1CC1(O)CCC=O,0.73640,37.30,128.083730,1,2,"(52,)",0.119600,-0.389588,3,...,2,2.1074,-424.213274,-424.214219,-424.224729,-424.261135,-0.2481,-0.0208,0.2273,0.178994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117107,OC1C2C3NC3(C#N)C12,-1.15892,65.96,122.048013,2,3,"(46,)",0.128183,-0.392345,0,...,5,6.3772,-416.738039,-416.738983,-416.746403,-416.778259,-0.2729,0.0142,0.2871,0.113488
8240,NC(=O)C(=O)NCC=O,-2.21320,89.26,130.037842,2,3,"(50,)",0.308947,-0.361393,2,...,0,2.7234,-491.145917,-491.146862,-491.156301,-491.192146,-0.2543,-0.0462,0.2081,0.111181
82724,CC1C(C)N1CC1CO1,0.47780,15.54,127.099714,0,2,"(52,)",0.093621,-0.371734,2,...,3,2.2996,-404.281755,-404.282699,-404.292728,-404.328359,-0.2208,0.0761,0.2969,0.192277
33180,NC(=O)C1CC1CCO,-0.50980,63.32,129.078979,2,2,"(52,)",0.220321,-0.396365,3,...,2,3.7023,-440.294320,-440.295265,-440.305434,-440.341331,-0.2486,0.0246,0.2732,0.170268


CMF initialization

In [5]:
!docker stop "vaedkl"

!docker run --name vaedkl --rm -p7687:7687 -p7474:7474 -d -v $HOME/neo4j/data:/data -v $HOME/neo4j/logs:/logs -v $HOME/neo4j/import:/var/lib/neo4j/import -v $HOME/neo4j/plugins:/plugins --env NEO4J_AUTH=neo4j/test1234 neo4j:latest
                            
'''
!docker run --name vaedkl --rm -p7687:7687 -p7474:7474 \
-d -v $HOME/neo4j/data:/data -v $HOME/neo4j/logs:/logs \
-v $HOME/neo4j/import:/var/lib/neo4j/import -v $HOME/neo4j/plugins:/plugins \
--env NEO4J_AUTH=neo4j/test1234 neo4j:latest '''

  pid, fd = os.forkpty()


vaedkl
f8df77aa0985255c83c9a1d778ac6efd6aed8e22a8330221b3b2429565d6eb18


'\n!docker run --name vaedkl --rm -p7687:7687 -p7474:7474 -d -v $HOME/neo4j/data:/data -v $HOME/neo4j/logs:/logs -v $HOME/neo4j/import:/var/lib/neo4j/import -v $HOME/neo4j/plugins:/plugins --env NEO4J_AUTH=neo4j/test1234 neo4j:latest '

!sudo neo4j start

In [6]:
!git config --list --show-origin

file:/home/saranath/.gitconfig	user.email=gayathri.saranathan@hpe.com
file:/home/saranath/.gitconfig	user.name=gayathri-saranathan
file:.git/config	core.repositoryformatversion=0
file:.git/config	core.filemode=true
file:.git/config	core.bare=false
file:.git/config	core.logallrefupdates=true
file:.git/config	remote.origin.url=git@github.com:atripathy86/AE-DKL.git
file:.git/config	remote.origin.fetch=+refs/heads/*:refs/remotes/origin/*
file:.git/config	branch.main.remote=origin
file:.git/config	branch.main.merge=refs/heads/main
file:.git/config	user.email=gayathrisaranath1998@gmail.com
file:.git/config	user.name=g3saranath


In [7]:
!cmf init local --path /VAEDKL/ --git-remote-url https://github.com/gayathri-saranathan/VAEDKL/ --cmf-server-url http://127.0.0.1:80 --neo4j-user neo4j --neo4j-password test1234 --neo4j-uri bolt://localhost:7687

git_dir /lustre/saranath/Techcon24/AE-DKL/.git
Starting cmf init.
Setting 'local-storage' as a default remote.
cmf init complete.
[0m

In [8]:
from cmflib.cmf import Cmf
from ml_metadata.proto import metadata_store_pb2 as mlpb 

graph = True 
#metawriter = Cmf(filename="mlmd", pipeline_name="aifcmf-env")
#provide different obj name - metawriter, metalogger, ..
!rm -rf aldkl_vae
metawriter = Cmf(
    filename="aldkl_vae",
    pipeline_name="VAEDKL",
    graph = graph
)

*** Note: CMF will check out a new branch in git to commit the metadata files ***
*** The checked out branch is aldkl_vae. ***


In [9]:
selfies_dataset = []
error_smiles = []

# Assuming 'smiles' is the column in df containing SMILES strings
# Replace 'smiles' with the actual column name if it's different
for i, row in df.iterrows():
    try:
        local_sf = sf.encoder(row['smiles'])
        selfies_dataset.append(local_sf)
    except Exception as e:
        print(f"Error encoding SMILES at index {i}: {e}")
        error_smiles.append(row['smiles'])  # Store the SMILES string that caused the error
        continue  # Skip to the next iteration

#for producing corresponding one-hot vectors
alphabet = sf.get_alphabet_from_selfies(selfies_dataset)
alphabet.add("[nop]")  # [nop] is a special padding symbol
alphabet = list(sorted(alphabet))

length_list = []
for s in selfies_dataset:
  length = sf.len_selfies(s)
  length_list.append(length)
length_list.sort()
pad_to_len = length_list[-1]

symbol_to_idx = {s: i for i, s in enumerate(alphabet)}

labels = []
one_hot_vectors = []
for i in range(len(selfies_dataset)):

  label, one_hot = sf.selfies_to_encoding(selfies=selfies_dataset[i], vocab_stoi=symbol_to_idx,
                                          pad_to_len=pad_to_len, enc_type="both")
  labels.append(label)
  one_hot_vectors.append(one_hot)

molecules = np.array(one_hot_vectors)
s1, s2, s3 = molecules.shape
X = np.asarray(molecules).reshape([-1,s2*s3])
np.save("VAEDKL_logs/endoded_data.npy",X)
print(X.shape)

(5000, 567)


In [10]:
context_stage1 = metawriter.create_context(pipeline_stage="Prepare",
                                          custom_properties={"Symbols":symbol_to_idx,
                                                            "Data_size":X.shape})

In [11]:
#Different targets
targets1 = df['ringct'].values
targets2 = df['mole_logp'].values
targets3 = df['mol_wt'].values
targets4 = df['dipole_moment'].values

For illustration purpose, we show the training on 1,000 molecules since training on a higher number of molecules may take longer & more computing resources.

In [14]:
X1 = np.copy(X)
X1 = X1[:1000]
targets1 = targets1[:1000]

np.save("VAEDKL_logs/train_data.npy",np.array(X1))
np.save("VAEDKL_logs/target_data.npy",np.array(targets1))

#Train the DKL model
rng_key_1, rng_key_predict_1 = gpax.utils.get_keys()

dkl_1 = gpax.viDKL(s2*s3, 2, 'RBF')
dkl_1.fit(rng_key_1, X1, targets1)
loss = dkl_1.loss
np.save("VAEDKL_logs/loss_dkl_1.npy",np.array(loss))


Inferred GP kernel parameters
k_length         [1.3033 1.0485]
k_scale          4.1401
noise            0.0165


In [15]:
nn, kernel = dkl_1.get_samples() # Returns the kernel and nn parameters
kernel 

{'k_length': Array([1.3032896 , 1.04847207], dtype=float64),
 'k_scale': Array(4.14014159, dtype=float64),
 'noise': Array(0.01645812, dtype=float64)}

Saving the Jax Model - NN parameters and Kernel Info as .npy

In [16]:
pred, cov = dkl_1.get_mvn_posterior(X1, nn, kernel)

In [17]:
import jax.numpy as jnp

import jax

# Assuming params_dict is a dictionary with keys of type str and values of type jnp.ndarray
def save_params_dict(params_dict, filename):
    with open(filename, 'wb') as f:
        jnp.save(filename, params_dict)

# # Example usage:
# params_dict = {'key1': jnp.array([1, 2, 3]), 'key2': jnp.array([4, 5, 6])}
save_params_dict(kernel,"VAEDKL_logs/dkl1_kernel_params.npy")
save_params_dict(nn, "VAEDKL_logs/dkl1_nn_params_dict.npy")


Loading the saved params and comparing with the original - to verify if the saving works fine

In [18]:
import pickle
import pandas as pd
import numpy


def load_params(filename):
    with open(filename,'rb') as f:
        val = jnp.load(filename,allow_pickle=True)
    f.close()
    return val.item()

nn_read = load_params("VAEDKL_logs/dkl1_nn_params_dict.npy")
kernel_read = load_params("VAEDKL_logs/dkl1_kernel_params.npy")
print(kernel_read)
pred_check, cov_check = dkl_1.get_mvn_posterior(X1,nn_read,kernel_read)

{'k_length': Array([1.3032896 , 1.04847207], dtype=float64), 'k_scale': Array(4.14014159, dtype=float64), 'noise': Array(0.01645812, dtype=float64)}


In [19]:
(pred_check != pred).sum() ##Same prediction before and after loading the saved JAX numpy

Array(0, dtype=int64)

In [20]:
weights_and_biases = dkl_1.nn_params
np.save("VAEDKL_logs/dkl_1_params.npy",weights_and_biases)

In [21]:
#Saving the model - %%%% Verify

NN params alone = torch.save(dkl_1.nn_params,"VAEDKL_logs/dkl_1_model.pt")

In [22]:
execution_stage1 = metawriter.create_execution(execution_type="Train viDKL",
                                              custom_properties={"Kernel_Parameters":dkl_1.kernel_params,
                                                                "Kernel_Name":dkl_1.kernel_name,
                                                                "Kernel_Dimension":dkl_1.kernel_dim,
                                                                "Guide_type":dkl_1.guide_type})


In [23]:
_ = metawriter.log_dataset("VAEDKL_logs/train_data.npy","input")

_ = metawriter.log_dataset("VAEDKL_logs/target_data.npy","input")

_ = metawriter.log_model("VAEDKL_logs/dkl1_kernel_params.npy","output",model_framework="haiku_and_jax",model_name="kernel_params")
_ = metawriter.log_model("VAEDKL_logs/dkl1_nn_params_dict.npy","output",model_framework="haiku_and_jax",model_name="nn_params")



for ls in loss:
    metawriter.log_metric("training_metrics",{"training_loss":float(ls)}) #Loss is originally jnp array, but has to be converted to float for logging in CMF, as it throws an error when logged as jnp.ndarray
_ = metawriter.commit_metrics("training_metrics")


In [24]:
batch_size_reconstruct = 250
embeded_1 = dkl_1.embed(X1)
pred_mean1, pred_var1 = dkl_1.predict_in_batches(rng_key_predict_1, X1, batch_size=batch_size_reconstruct)
pred_std1 = jnp.sqrt(pred_var1)

In [25]:
execution_stage1 = metawriter.create_execution(execution_type="Predict viDKL",
                                              custom_properties={"Kernel_Parameters":dkl_1.kernel_params,
                                                                "Kernel_Name":dkl_1.kernel_name,
                                                                "Kernel_Dimension":dkl_1.kernel_dim,
                                                                "Guide_type":dkl_1.guide_type,
                                                                "Batch_Size":batch_size_reconstruct})

In [26]:
np.save("VAEDKL_logs/embedding.npy", np.array(embeded_1))
np.save("VAEDKL_logs/predicted_mean.npy",np.array(pred_mean1))
np.save("VAEDKL_logs/predicted_unc.npy",np.array(pred_var1))
np.save("VAEDKL_logs/prediccted_std.npy",np.array(pred_std1))

In [27]:
_ = metawriter.log_dataset("VAEDKL_logs/embedding.npy","output")

_ = metawriter.log_dataset("VAEDKL_logs/prediccted_std.npy","output")

_ = metawriter.log_dataset("VAEDKL_logs/predicted_mean.npy","output")
_ = metawriter.log_dataset("VAEDKL_logs/indices_train_al.npy","output")

_ = metawriter.log_dataset("VAEDKL_logs/predicted_unc.npy","output")
_ = metawriter.log_dataset("VAEDKL_logs/indices_unmeasured_al.npy","output")


"""Please log the above as metrics, Currently logged as Datasets"""

_ = metawriter.log_dataset("VAEDKL_logs/train_data.npy","input")


_ = metawriter.log_model("VAEDKL_logs/dkl1_nn_params_dict.npy","input",model_framework="haiku_and_jax",model_name="nn_params")
_ = metawriter.log_model("VAEDKL_logs/dkl1_kernel_params.npy","input",model_framework="haiku_and_jax",model_name="kernel_params")



#### Active Learning Part

In [28]:
X1 = np.copy(X)
X1 = X1[:1000]
#target
targets1 = - df['enthalpy'].values
targets1 = targets1[:1000]

np.save("VAEDKL_logs/X_al.npy",np.array(X1))
np.save("VAEDKL_logs/trgt_enthalpy_al.npy",np.array(targets1))

For illustration purpose, we show the training on 1,000 molecules with 30 exploration steps since training on a higher number of molecules may take longer & more computing resources.

In [29]:
#Active learning setup parameters
init_num = 50
exp_step = 30
batch_size_learn = 50 #make it 100 if start with 100 seed
batch_size_reconstruct = 250

In [30]:
context_stage2 = metawriter.create_context(pipeline_stage="Active Learning",
                                          custom_properties={"Number_of_training_points":init_num,
                                                            "Total_Data_size":X1.shape,
                                                            "Exploration_Steps":exp_step,
                                                            "Batch_size":batch_size_learn,
                                                            "Batch_size_reconstruct":batch_size_reconstruct})

In [31]:
np.random.seed(0)
idx1 = np.random.choice(np.arange(len(X1)), size = init_num, replace = False)
X1_train = X1[idx1]
X1_unmeasured = np.delete(X1, idx1, axis=0)

indices_total_1 = np.arange(len(X))
indices_train_1 = indices_total_1[idx1]

y1_train = targets1[idx1]
indices_unmeasured_1 = np.delete(indices_total_1, idx1)

np.save("VAEDKL_logs/X_train_al.npy",np.array(X1_train))
np.save("VAEDKL_logs/X_unmeasured_al.npy",np.array(X1_unmeasured))

np.save("VAEDKL_logs/indices_train_al.npy",np.array(indices_train_1))
np.save("VAEDKL_logs/indices_unmeasured_al.npy",np.array(indices_unmeasured_1))

np.save("VAEDKL_logs/trgt_enthalpy_train_al.npy",np.array(y1_train))

In [32]:
execution_stage2 = metawriter.create_execution(execution_type="Active Learning",
                                        custom_properties={"Number_of_training_points":init_num,
                                                           "Exploration_Steps":exp_step,
                                                           "Train_size":X1_train.shape,
                                                           "Unmeasured_size":X1_unmeasured.shape,
                                                           "indices_selected":idx1})

In [33]:
exp_mean1, exp_std1 = [], []
dkl_mean1, dkl_std1 = [], []
embed_traj1 = []
traj_mean1, traj_std1 = [], []
objective_func = []

exploration_steps = exp_step


for e in range(exploration_steps):
    print("\nStep {}".format(e+1))
    # Obtain/update DKL posterior
    rng_key11, rng_key_predict11 = gpax.utils.get_keys()
    dkl_BO_1 = gpax.viDKL(s2*s3, 2, 'RBF')
    
    if e == 0:
        nn,kernel = dkl_BO_1.get_samples()
        save_params_dict(nn,f"VAEDKL_logs/active_learning_nn_model_{e}.npy")
        save_params_dict(kernel,f"VAEDKL_logs/active_learning_kernel_model_{e}.npy")
        #torch.save(dkl_BO_1.nn_params,f"VAEDKL_logs/active_learning_model_{e}.pth")
    
    dkl_BO_1.fit(rng_key11, X1_train, y1_train)
    #torch.save(dkl_BO_1.nn_params,f"VAEDKL_logs/active_learning_model_{e+1}.pth")
    nn,kernel = dkl_BO_1.get_samples()
    save_params_dict(nn,f"VAEDKL_logs/active_learning_nn_model_{e+1}.npy")
    save_params_dict(kernel,f"VAEDKL_logs/active_learning_kernel_model_{e+1}.npy")
    
    # Compute acqusition function
    y_mean, y_var = dkl_BO_1.predict_in_batches(#make prediction batch-by-batch to avoid memory overflow
        rng_key_predict11, X1_unmeasured, batch_size=250)
    y_std = jnp.sqrt(y_var)
    obj = y_mean + 10 * y_std
    objective_func.append(obj)
    #np.save(f"VAEDKL_logs/object_{e+1}.npy",np.array(obj))
    # Get the next point to evaluate
    id_next = obj.argmax()
    np.save(f"VAEDKL_logs/next_pt_{e+1}.npy",np.array(id_next))

    # let's get the mean and std of the next point predicted by dkl_BO
    exp_mean1.append(y_mean[id_next])
    exp_std1.append(y_std[id_next])

    x_next = X1_unmeasured[id_next]
    ind_next = indices_unmeasured_1[id_next]

    # Getting the mean and std of the next point as predicted by the dkl trained on the full dataset
    dkl_mean1.append(pred_mean1[ind_next])
    dkl_std1.append(pred_std1[ind_next])

    # Perform evaluation
    y_measured = targets1[ind_next]

    # Update training arrays
    X1_train = np.append(X1_train, x_next[None], axis=0)
    y1_train = np.append(y1_train, y_measured)
    X1_unmeasured = np.delete(X1_unmeasured, id_next, axis=0)
    indices_unmeasured_1 = np.delete(indices_unmeasured_1, id_next)

    np.save(f"VAEDKL_logs/active_train_{e+1}.npy", np.array(X1_train))
    np.save(f"VAEDKL_logs/active_trgt_{e+1}.npy", np.array(y1_train))
    
    embed_traj = dkl_BO_1.embed(X1_train)
    traj_m, traj_s = dkl_BO_1.predict_in_batches(  # make prediction batch-by-batch to avoid memory overflow
        rng_key_predict11, X1_train, batch_size=batch_size_learn)

    np.save(f"VAEDKL_logs/embed_traj_{e+1}.npy",np.array(embed_traj))
    
    
    embed_traj1.append(embed_traj)
    traj_mean1.append(traj_m)
    traj_std1.append(traj_s)


Step 1

Inferred GP kernel parameters
k_length         [1.4005 0.9728]
k_scale          6.2304
noise            2.3708

Step 2

Inferred GP kernel parameters
k_length         [1.4061 0.9697]
k_scale          6.236400000000001
noise            2.3471

Step 3

Inferred GP kernel parameters
k_length         [1.4018 0.9692]
k_scale          6.2302
noise            2.3539000000000003

Step 4

Inferred GP kernel parameters
k_length         [1.4208 0.9787]
k_scale          6.201300000000001
noise            2.3718

Step 5

Inferred GP kernel parameters
k_length         [1.4208 0.9574]
k_scale          6.1771
noise            2.3669000000000002

Step 6

Inferred GP kernel parameters
k_length         [1.4111 0.971 ]
k_scale          6.1713000000000005
noise            2.3664

Step 7

Inferred GP kernel parameters
k_length         [1.3957 0.9572]
k_scale          6.1526000000000005
noise            2.3785000000000003

Step 8

Inferred GP kernel parameters
k_length         [1.4155 0.957 ]
k_scal

In [34]:
_ = metawriter.log_dataset("VAEDKL_logs/X_al.npy","input")
_ = metawriter.log_dataset("VAEDKL_logs/trgt_enthalpy_al.npy","input")

_ = metawriter.log_dataset("VAEDKL_logs/active_train_0.npy","output")
_ = metawriter.log_dataset("VAEDKL_logs/active_trgt_0.npy","output")


# _ = cmf.log_model("VAEDKL_logs/active_learning_model_0.pt","input",model_framework="pytorch")


for e in range(exploration_steps):
    _ = metawriter.create_execution(execution_type=f"ActiveLearningTrainingExp{e}", custom_properties={"Execution_Step":e})

    _ = metawriter.log_model(f"VAEDKL_logs/active_learning_nn_model_{e+1}.npy","output",model_framework="haiku_and_jax",model_name='nn_params')
    _ = metawriter.log_model(f"VAEDKL_logs/active_learning_kernel_model_{e+1}.npy","input",model_framework="haiku_and_jax",model_name='kernel_params')
    
    _ = metawriter.log_dataset(f"VAEDKL_logs/active_train_{e+1}.npy","output")
    _ = metawriter.log_dataset(f"VAEDKL_logs/active_trgt_{e+1}.npy","output")
    _ = metawriter.log_dataset(f"VAEDKL_logs/embed_traj_{e+1}.npy","output")

    for obj in objective_func[e]:
        metawriter.log_metric(f"obj_funct{e}_val",{"obj_val":float(obj)})
    _ = metawriter.commit_metrics(f"obj_funct{e}_val")
    
    
    _ = metawriter.log_model(f"VAEDKL_logs/active_learning_nn_model_{e}.npy","input",model_framework="haiku",model_name='nn_params')
    _ = metawriter.log_model(f"VAEDKL_logs/active_learning_kernel_model_{e}.npy","input",model_framework="haiku",model_name='kernel_params')
    
    _ = metawriter.log_dataset(f"VAEDKL_logs/active_train_{e}.npy","input")
    _ = metawriter.log_dataset(f"VAEDKL_logs/active_trgt_{e}.npy","input")
        

Retrying with full path
dvc.exceptions.PathMissingError Caught  Unexpected The path '/lustre/saranath/Techcon24/AE-DKL/VAEDKL_logs/active_train_0.npy' does not exist in the target repository 'None' neither as a DVC output nor as a Git-tracked file., <class 'dvc.exceptions.PathMissingError'>
Retrying with full path
dvc.exceptions.PathMissingError Caught  Unexpected The path '/lustre/saranath/Techcon24/AE-DKL/VAEDKL_logs/active_train_0.npy' does not exist in the target repository 'None' neither as a DVC output nor as a Git-tracked file., <class 'dvc.exceptions.PathMissingError'>
Retrying with full path
dvc.exceptions.PathMissingError Caught  Unexpected The path '/lustre/saranath/Techcon24/AE-DKL/VAEDKL_logs/active_trgt_0.npy' does not exist in the target repository 'None' neither as a DVC output nor as a Git-tracked file., <class 'dvc.exceptions.PathMissingError'>
Retrying with full path
dvc.exceptions.PathMissingError Caught  Unexpected The path '/lustre/saranath/Techcon24/AE-DKL/VAEDKL

AlreadyExistsError: Given node already exists: type_id: 13
uri: "e5f59d0e-e83b-11ee-8732-b47af137252e"
properties {
  key: "Commit"
  value {
    string_value: ""
  }
}
properties {
  key: "git_repo"
  value {
    string_value: "git@github.com:atripathy86/AE-DKL.git"
  }
}
properties {
  key: "url"
  value {
    string_value: "VAEDKL:"
  }
}
name: "VAEDKL_logs/active_train_0.npy:"
INTERNAL: Cannot create node for type_id: 13 uri: "e5f59d0e-e83b-11ee-8732-b47af137252e" properties { key: "Commit" value { string_value: "" } } properties { key: "git_repo" value { string_value: "git@github.com:atripathy86/AE-DKL.git" } } properties { key: "url" value { string_value: "VAEDKL:" } } name: "VAEDKL_logs/active_train_0.npy:"Error when executing query: UNIQUE constraint failed: Artifact.type_id, Artifact.name query:  INSERT INTO `Artifact`(    `type_id`, `uri`, `state`, `name`, `external_id`,    `create_time_since_epoch`, `last_update_time_since_epoch` ) VALUES(13, 'e5f59d0e-e83b-11ee-8732-b47af137252e', NULL, 'VAEDKL_logs/active_train_0.npy:', NULL, 1711105445387, 1711105445387);

In [None]:
#prediction
rng_key11, rng_key_predict11 = gpax.utils.get_keys()
dkl_BO_1 = gpax.viDKL(s2*s3, 2, 'RBF')
dkl_BO_1.fit(rng_key11, X1_train, y1_train)

predmean_train1, predstd_train1 = dkl_BO_1.predict_in_batches(rng_key_predict11,
                                                              X1_train, batch_size=batch_size_learn)
predmean_unmeas1, predstd_unmeas1 = dkl_BO_1.predict_in_batches(rng_key_predict11,
                                                                X1_unmeasured, batch_size=batch_size_reconstruct)
predmean11, predstd11 = dkl_BO_1.predict_in_batches(rng_key_predict11, X1,
                                                    batch_size=batch_size_reconstruct)


embeded_train1 = dkl_BO_1.embed(X1_train)  # Measured points
embeded_unmeasured1 = dkl_BO_1.embed(X1_unmeasured)  # Unmeasured points
embeded11 = dkl_BO_1.embed(X1)   # All points

#visualize the latent space
fig, ax = plt.subplots(nrows = 2, ncols = 3, figsize = (18,12))

im = ax[0,0].scatter(embeded_train1[:, 1], embeded_train1[:, 0], s=10, c = predmean_train1, cmap = 'jet')
ax[0,0].set_title('Measured points')
cbar = fig.colorbar(im, ax=ax[0,0], shrink=.8)
cbar.set_label("Predicted_Target", fontsize=14)
cbar.ax.tick_params(labelsize=10)

im = ax[1,0].scatter(embeded_unmeasured1[:, 1], embeded_unmeasured1[:, 0], s=10,
                c = predmean_unmeas1, cmap = 'jet')
ax[1,0].set_title('UnMeasured points')
cbar = fig.colorbar(im, ax=ax[0,1], shrink=.8)
cbar.set_label("Predicted_Target", fontsize=14)
cbar.ax.tick_params(labelsize=10)

im = ax[0,1].scatter(embeded11[:, 1], embeded11[:, 0], s=10, c = predmean11, cmap = 'jet')
ax[0,1].set_title('All points')
cbar = fig.colorbar(im, ax=ax[0,2], shrink=.8)
cbar.set_label("Predicted_Target", fontsize=14)
cbar.ax.tick_params(labelsize=10)

im = ax[1,1].scatter(embeded11[:, 1], embeded11[:, 0], s=10, c = targets1, cmap = 'jet')
ax[1,1].set_title('All points (Groung truth)')
cbar = fig.colorbar(im, ax=ax[1,0], shrink=.8)
cbar.set_label("Class_Labels", fontsize=14)
cbar.ax.tick_params(labelsize=10)

im = ax[0,2].scatter(embeded11[:, 1], embeded11[:, 0], s=10, c = predmean11 - targets1, cmap = 'jet')
ax[0,2].set_title('Error')
cbar = fig.colorbar(im, ax=ax[1,1], shrink=.8)
#cbar.set_label("Log(P)", fontsize=14)
cbar.ax.tick_params(labelsize=10)

im = ax[1,2].scatter(embeded11[:, 1], embeded11[:, 0], s=10, c = predstd11, cmap = 'jet')
ax[1,2].set_title('Uncertainty')
cbar = fig.colorbar(im, ax=ax[1,2], shrink=.8)
#cbar.set_label("Wt", fontsize=14)
cbar.ax.tick_params(labelsize=10)