In [27]:
import pandas as pd
import torch
import numpy as np
from src.utils.finger import encode, dense2sparse
from src.utils.modelinit import initialize_model

In [33]:
# Fixed seed and sample size so the validation subsets drawn below are the
# same on every Restart Kernel -> Run All; without random_state each run
# would pick different rows and downstream numbers would not be reproducible.
SEED = 42
N_SAMPLES = 10000

# load validation set for klek
df_klek = pd.read_parquet('data/train_data/big_dataset_std_val_10.parquet').sample(N_SAMPLES, random_state=SEED)

# load validation set for ECFP
df_ECFP = pd.read_parquet('data/train_data/big_dataset_ECFP_val_10.parquet').sample(N_SAMPLES, random_state=SEED)

# path to klek model weights and the config file passed to initialize_model
klek_path = 'models/GRUv3_klek_sonic/epoch_200.pt'
klek_config_path = 'models/GRUv3_klek_sonic/hyperparameters.ini'

# path to ECFP model weights and the config file passed to initialize_model
ECFP_path = 'models/GRUv3_ECFP_tola/epoch_150.pt'
ECFP_config_path = 'models/GRUv3_ECFP_tola/hyperparameters.ini'

In [34]:
# Pick the compute device: CUDA when available, CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# For each fingerprint type: build the model from its config path, read the
# checkpoint onto the chosen device, then copy the weights into the model.
# NOTE(review): torch.load unpickles the checkpoint file, so only load
# checkpoints from a trusted source.
klek_model = initialize_model(klek_config_path, dropout=False, device=device)
klek_weights = torch.load(klek_path, map_location=device)
klek_model.load_state_dict(klek_weights)

ECFP_model = initialize_model(ECFP_config_path, dropout=False, device=device)
ECFP_weights = torch.load(ECFP_path, map_location=device)
# Kept as the cell's final expression so the key-matching report is displayed.
ECFP_model.load_state_dict(ECFP_weights)

<All keys matched successfully>

In [35]:
# Encode each validation frame with its matching model. encode returns a
# pair; only the first element (the encoded array) is kept here.
klek_encoded, _ = encode(df_klek, klek_model, device)
ECFP_encoded, _ = encode(df_ECFP, ECFP_model, device)

# Show both shapes, klek first and ECFP second, one per line.
# The recorded run printed (10000, 32) for each.
print(klek_encoded.shape, ECFP_encoded.shape, sep='\n')

100%|██████████| 10/10 [00:07<00:00,  1.39it/s]
100%|██████████| 10/10 [00:05<00:00,  1.87it/s]

(10000, 32)
(10000, 32)





In [52]:
# Convert every row's stored fingerprint with dense2sparse (fp_len is 4860
# for klek and 2048 for ECFP) and stack the per-row results into one 2-D
# numpy array per fingerprint type.
klek_original = np.stack([dense2sparse(fp, fp_len=4860) for fp in df_klek['fps']])
ECFP_original = np.stack([dense2sparse(fp, fp_len=2048) for fp in df_ECFP['fps']])

# Show both shapes, one per line.
# The recorded run printed (10000, 4860) and (10000, 2048).
print(klek_original.shape, ECFP_original.shape, sep='\n')

(10000, 4860)
(10000, 2048)
