In [1]:
import lmdb
import torch
from ocpmodels.datasets import SinglePointLmdbDataset
import pandas as pd
import pickle

In [2]:
#for train sample
dataset_origin_path = "/Users/humonen/Downloads/structures_train.pkl"
dataset_origin_old_path = "../../ocp_datasets/data/is2re/10k/train/data.lmdb"
dataset_target_path = "../../ocp_datasets/data/is2re/10k/train/data_mod.lmdb"

In [3]:
dataset_origin_old = SinglePointLmdbDataset({"src": dataset_origin_old_path})

In [4]:
dataset_origin_old[0]['distances']

tensor([2.8117, 2.8117, 2.8117,  ..., 5.7203, 5.8450, 5.8527])

In [5]:
dataset_origin = pd.read_pickle(dataset_origin_path)

In [6]:
dataset_origin[0]['distances_new']

array([4.70419312, 4.70419312, 2.84391737, ..., 3.74769878, 4.21590567,
       4.21590567])

In [7]:
def update_dataset(dataset_target_path, dataset_origin_old, dataset_origin, features_names=None):
    dataset_target = lmdb.open(
        dataset_target_path,
        map_size=int(1e9*5), #~ 5 Gbyte
        subdir=False,
        meminit=False,
        map_async=True,
    )

    idx = 0

    for ii, data_object_origin_old in enumerate(dataset_origin_old):

            # Substitute: edge_index -> edge_index_new
            data_object = dataset_origin_old[ii]
            for feature_name in features_names:
                feature = torch.from_numpy(dataset_origin[ii][feature_name+'_new'])
                data_object[feature_name] = feature

            # Write to LMDB
            txn = dataset_target.begin(write=True)
            txn.put(f"{idx}".encode("ascii"), pickle.dumps(data_object, protocol=-1))
            txn.commit()
            dataset_target.sync()
            if idx % 1000 == 0:
                print('{} of {} for file {}'.format(idx, len(dataset_origin_old), dataset_target_path))
            idx += 1

    dataset_target.close()
    print("done")

In [8]:
%%time
update_dataset(dataset_target_path, dataset_origin_old, dataset_origin, features_names=['edge_index', 'distances'])

0 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod.lmdb
1000 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod.lmdb
2000 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod.lmdb
3000 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod.lmdb
4000 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod.lmdb
5000 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod.lmdb
6000 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod.lmdb
7000 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod.lmdb
8000 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod.lmdb
9000 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod.lmdb
done
CPU times: user 12 s, sys: 6.86 s, total: 18.9 s
Wall time: 32.4 s


In [9]:
dataset_target = SinglePointLmdbDataset({"src": dataset_target_path})

In [10]:
dataset_target[0]['distances']

tensor([4.7042, 4.7042, 2.8439,  ..., 3.7477, 4.2159, 4.2159],
       dtype=torch.float64)

In [11]:
#for val_ood_both sample
dataset_origin_path = "/Users/humonen/Downloads/structures_val_ood_both.pkl"
dataset_origin_old_path = "../../ocp_datasets/data/is2re/all/val_ood_both/data.lmdb"
dataset_target_path = "../../ocp_datasets/data/is2re/all/val_ood_both/data.lmdb"

dataset_origin_old = SinglePointLmdbDataset({"src": dataset_origin_old_path})
dataset_origin = pd.read_pickle(dataset_origin_path)

In [12]:
%%time
update_dataset(dataset_target_path, dataset_origin_old, dataset_origin, features_names=['edge_index', 'distances'])

0 of 24987 for file ../../ocp_datasets/data/is2re/all/val_ood_both/data.lmdb
1000 of 24987 for file ../../ocp_datasets/data/is2re/all/val_ood_both/data.lmdb
2000 of 24987 for file ../../ocp_datasets/data/is2re/all/val_ood_both/data.lmdb
3000 of 24987 for file ../../ocp_datasets/data/is2re/all/val_ood_both/data.lmdb
4000 of 24987 for file ../../ocp_datasets/data/is2re/all/val_ood_both/data.lmdb
5000 of 24987 for file ../../ocp_datasets/data/is2re/all/val_ood_both/data.lmdb
6000 of 24987 for file ../../ocp_datasets/data/is2re/all/val_ood_both/data.lmdb
7000 of 24987 for file ../../ocp_datasets/data/is2re/all/val_ood_both/data.lmdb
8000 of 24987 for file ../../ocp_datasets/data/is2re/all/val_ood_both/data.lmdb
9000 of 24987 for file ../../ocp_datasets/data/is2re/all/val_ood_both/data.lmdb
10000 of 24987 for file ../../ocp_datasets/data/is2re/all/val_ood_both/data.lmdb
11000 of 24987 for file ../../ocp_datasets/data/is2re/all/val_ood_both/data.lmdb
12000 of 24987 for file ../../ocp_dataset