In [1]:
import gc
import pickle
import zlib

import lmdb
import torch
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import pandas as pd

from DataClasses import lmdb_dataset
from joblib import Parallel, delayed

np.set_printoptions(linewidth=100, precision=4, suppress=True)

from ModelFunctions import to_bins_torch, convert_angles, restore_edge_angles, preprocessing, my_reshape
from torch_geometric.data import Data

In [87]:
# Lookup tables for the settings cell: each maps an integer key to a name
# that becomes part of the dataset path.
dataset_size_list = dict(enumerate(("10k", "100k", "all")))
mode_list = dict(enumerate(("train", "val", "test")))
# Unlike the two tables above, the keys here start at 1.
dataset_list = dict(enumerate(("id", "ood_ads", "ood_cat", "ood_both"), start=1))

In [88]:
# Settings: dataset root plus the size / split / subset to work on,
# each picked by key from the lookup tables.
root = "../../ocp_datasets/data/is2re"
dataset_size, mode, dataset = (
    dataset_size_list[0],
    mode_list[0],
    dataset_list[4],
)

In [98]:
def path_build(root, dataset_size, mode, dataset):
    """Return the directory path of one dataset split.

    The result is ``{root}/{dataset_size}/{mode}``. For every mode other than
    ``mode_list[0]`` the suffix ``_{dataset}`` is appended; for
    ``mode_list[0]`` the ``dataset`` argument is ignored.
    """
    subset_suffix = '' if mode == mode_list[0] else f'_{dataset}'
    return f'{root}/{dataset_size}/{mode}{subset_suffix}'
    
# The three files share one split directory and differ only in file name.
dataset_origin_path_pkl, dataset_origin_path, dataset_target_path = (
    f'{path_build(root, dataset_size, mode, dataset)}/{file_name}'
    for file_name in ('structures.pkl', 'data_mod.lmdb', 'data_mod_conv.lmdb')
)

# Echo the resolved paths, one per line.
print('\n'.join((
    f'dataset_origin_path_pkl: {dataset_origin_path_pkl}',
    f'dataset_origin_path: {dataset_origin_path}',
    f'dataset_target_path: {dataset_target_path}',
)))
#/home/alex/Documents/ocp_datasets/data/is2re/all/val_ood_both
#/home/alex/Documents/ocp_datasets/data/is2re/all/test_ood_both

dataset_origin_path_pkl: ../../ocp_datasets/data/is2re/10k/train/structures.pkl
dataset_origin_path: ../../ocp_datasets/data/is2re/10k/train/data_mod.lmdb
dataset_target_path: ../../ocp_datasets/data/is2re/10k/train/data_mod_conv.lmdb


In [93]:
# dataset_origin_old = SinglePointLmdbDataset({"src": dataset_origin_old_path})

In [94]:
# dataset_origin_old[0]['distances']

In [95]:
# dataset_origin = pd.read_pickle(dataset_origin_path)

In [96]:
# Open the source LMDB (read with compressed=False) and display its first
# record as a quick sanity check of the available fields.
data_10k = lmdb_dataset(dataset_origin_path, compressed=False)
data_10k[0]

Data(atomic_numbers=[86], cell=[1, 3, 3], cell_offsets=[2964, 3], cell_offsets_new=[1214, 3], contact_solid_angles=[1214], direct_neighbor=[1214], distances=[2964], distances_new=[1214], edge_index=[2, 2964], edge_index_new=[2, 1214], fixed=[86], force=[86, 3], natoms=86, pos=[86, 3], pos_relaxed=[86, 3], sid=2472718, spherical_domain_radii=[86], tags=[86], voronoi_surface_areas=[86], voronoi_volumes=[86], y_init=6.282500615000004, y_relaxed=-0.025550085000020317)

### update dataset

In [10]:
def update_dataset(dataset_target_path, dataset_origin_old_path, dataset_origin_path, features_names=None):
    """Copy an LMDB dataset, overwriting selected features with their ``*_new`` versions.

    For record ``i`` of the old LMDB dataset, every name in ``features_names``
    is replaced by ``dataset_origin[i][name + '_new']`` (converted with
    ``torch.from_numpy``). The record is then pickled into the target LMDB
    under the ASCII key ``str(i)``.

    Parameters
    ----------
    dataset_target_path : str
        LMDB file to write (opened with ``subdir=False``, ~5 GB map size).
    dataset_origin_old_path : str
        LMDB dataset whose records are copied.
    dataset_origin_path : str
        File read with ``pd.read_pickle``; indexed by record position and then
        by ``'<feature>_new'`` to obtain the replacement arrays.
    features_names : iterable of str, optional
        Features to overwrite. ``None`` (the default) overwrites nothing, so
        the records are copied unchanged.
    """
    # The default used to crash when iterated; treat it as "nothing to replace".
    features_names = () if features_names is None else tuple(features_names)

    dataset_origin = pd.read_pickle(dataset_origin_path)

    # Fix: the records to copy live in the *old* LMDB. Previously the pickle
    # path was passed here and `dataset_origin_old_path` was silently ignored.
    dataset_origin_old = lmdb_dataset(dataset_origin_old_path)
    n_records = len(dataset_origin_old)

    dataset_target = lmdb.open(
        dataset_target_path,
        map_size=int(1e9*5), #~ 5 Gbyte
        subdir=False,
        meminit=False,
        map_async=True,
    )

    try:
        for idx, data_object in enumerate(dataset_origin_old):
            # Substitute: <feature> <- <feature>_new
            for feature_name in features_names:
                feature = torch.from_numpy(dataset_origin[idx][feature_name+'_new'])
                data_object[feature_name] = feature

            # Write to LMDB; the context manager commits on success and
            # aborts the transaction if an exception is raised.
            with dataset_target.begin(write=True) as txn:
                txn.put(f"{idx}".encode("ascii"), pickle.dumps(data_object, protocol=-1))
            dataset_target.sync()

            if idx % 1000 == 0:
                print('{} of {} for file {}'.format(idx, n_records, dataset_target_path))
    finally:
        # Always release the environment, even when a record fails.
        dataset_target.close()

    print("done")

### update_dataset_pyg2dict

In [121]:
def update_dataset_pyg2dict(dataset_target_path, dataset_origin_path):
    """Re-encode an LMDB dataset as zlib-compressed pickles of plain dicts.

    Each source record is turned into a ``dict``. Its ``'edge_angles'`` list
    keeps only the even-indexed entries, and each kept entry is replaced by
    ``entry.reset_index().values``. The dict is pickled, compressed with
    ``zlib`` at level 1 and stored under the ASCII key ``str(i)``.

    Parameters
    ----------
    dataset_target_path : str
        LMDB file to write (opened with ``subdir=False``).
    dataset_origin_path : str
        Source LMDB dataset, read with ``compressed=False``.
    """
    dataset_origin = lmdb_dataset(dataset_origin_path, compressed=False)
    n_records = len(dataset_origin)

    dataset_target = lmdb.open(
        dataset_target_path,
        map_size=int(1e12),  # upper bound on the database size: ~1 TB
        subdir=False,
        meminit=False,
        map_async=False,
    )

    try:
        for idx, element in enumerate(dataset_origin):
            # Go through list(...) so dict() consumes (key, value) pairs.
            # NOTE(review): presumably dict(element) would misbehave on the
            # record type itself — confirm before simplifying.
            record = dict(list(element))

            # Keep every second entry (indices 0, 2, 4, ...) and store each as
            # the array returned by reset_index().values.
            # NOTE(review): the dropped odd entries look like the ones
            # restore_edge_angles() re-creates — confirm.
            record['edge_angles'] = [
                angles.reset_index().values
                for angles in record['edge_angles'][::2]
            ]

            payload = zlib.compress(pickle.dumps(record, protocol=-1), level=1)

            # Write to LMDB; the context manager commits on success and
            # aborts the transaction if an exception is raised.
            with dataset_target.begin(write=True) as txn:
                txn.put(f"{idx}".encode("ascii"), payload)
            dataset_target.sync()

            if idx % 1000 == 0:
                print('{} of {} for file {}'.format(idx, n_records, dataset_target_path))

        print(dataset_target.info())
    finally:
        # Always release the environment, even when a record fails.
        dataset_target.close()

    print("done")

In [None]:
# Run the conversion for the currently configured paths.
# NOTE(review): the recorded output below names a different target file
# (.../data_mod2_torch) than the dataset_target_path printed earlier in this
# notebook (.../data_mod_conv.lmdb), and it stops at 6000 of 10000. The cell
# was evidently run with other kernel state; confirm the intended target.
update_dataset_pyg2dict(dataset_target_path, dataset_origin_path)

0 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod2_torch
1000 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod2_torch
2000 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod2_torch
3000 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod2_torch
4000 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod2_torch
5000 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod2_torch
6000 of 10000 for file ../../ocp_datasets/data/is2re/10k/train/data_mod2_torch


In [5]:
# Common path stem and file extension; the benchmark cells insert a variant
# tag (e.g. '_dict') between the two to select which LMDB file to read.
dataset_target_path_test ='../../ocp_datasets/data/is2re/10k/train/data_mod2_torch'
suffix = '.lmdb'

### Benchmark of different options for .lmdb

In [99]:
%%time
# Baseline: time one full sequential read of the source LMDB.
# enumerate() only wraps each record in an (index, record) tuple; the loop
# body exists solely to force every record to be loaded.
dataset_target = lmdb_dataset(dataset_origin_path, compressed=False)
for el in enumerate(dataset_target):
    a = el

CPU times: user 6.37 s, sys: 1.29 s, total: 7.66 s
Wall time: 12.9 s


In [90]:
%%time
# Same full-read timing for the '_orig' variant file.
dataset_target = lmdb_dataset(dataset_target_path_test+'_orig'+suffix, compressed=False)
for el in enumerate(dataset_target):
    a = el

CPU times: user 1.24 s, sys: 3.97 ms, total: 1.24 s
Wall time: 1.24 s


In [91]:
%%time
# Same full-read timing for the '_dict' variant file.
dataset_target = lmdb_dataset(dataset_target_path_test+'_dict'+suffix, compressed=False)
for el in enumerate(dataset_target):
    a = el

CPU times: user 1.25 s, sys: 0 ns, total: 1.25 s
Wall time: 1.25 s


In [92]:
%%time
# Same full-read timing for the '_dict_short' variant file.
dataset_target = lmdb_dataset(dataset_target_path_test+'_dict_short'+suffix, compressed=False)
for el in enumerate(dataset_target):
    a = el

CPU times: user 639 ms, sys: 0 ns, total: 639 ms
Wall time: 638 ms


In [93]:
%%time
# Same full-read timing for the '_dict_short_numpy' variant file.
dataset_target = lmdb_dataset(dataset_target_path_test+'_dict_short_numpy'+suffix, compressed=False)
for el in enumerate(dataset_target):
    a = el

CPU times: user 45 ms, sys: 8.04 ms, total: 53 ms
Wall time: 51 ms


In [6]:
%%time
# Same full-read timing for the '_dict_short_numpy_zip' variant file; this is
# the only benchmark cell that reads with compressed=True.
dataset_target = lmdb_dataset(dataset_target_path_test+'_dict_short_numpy_zip'+suffix, compressed=True)
for el in enumerate(dataset_target):
    a = el

CPU times: user 51.9 ms, sys: 8 ms, total: 59.9 ms
Wall time: 58.5 ms


In [None]:
# Open the '_dict_short_numpy_zip' variant (read with compressed=True) and
# show the first 'edge_angles' entry of its first record.
# Fix: the '+' between the path stem and the variant tag was missing, which
# made this cell a SyntaxError.
dataset_target = lmdb_dataset(dataset_target_path_test + '_dict_short_numpy_zip' + suffix, compressed=True)
dataset_target[0]['edge_angles'][0]

#### Compressed pickle

In [54]:
%%time
# Time writing the first record to disk as a plain, uncompressed pickle.
with open('data_10k.pkl', 'wb') as pkl_file:
    record_bytes = pickle.dumps(dataset_target[0])
    pkl_file.write(record_bytes)

CPU times: user 217 ms, sys: 0 ns, total: 217 ms
Wall time: 215 ms


In [60]:
%%time
# Time reading the plain pickle back: raw bytes first, then unpickle into `data`.
with open('data_10k.pkl', 'rb') as pkl_file:
    raw_bytes = pkl_file.read()
data = pickle.loads(raw_bytes)

<class 'bytes'>
CPU times: user 106 ms, sys: 14 µs, total: 106 ms
Wall time: 103 ms


In [56]:
%%time
# Time writing the first record as a zlib-compressed pickle (level 1).
# Despite the '.pbz2' extension, the codec used here is zlib, not bz2.
with open('data_10k.pbz2', 'wb') as packed_file:
    pickled = pickle.dumps(dataset_target[0])
    packed_file.write(zlib.compress(pickled, level=1))

CPU times: user 218 ms, sys: 51 µs, total: 218 ms
Wall time: 216 ms


In [59]:
%%time
# Time reading the zlib-compressed pickle back: read, decompress, unpickle into `data`.
with open('data_10k.pbz2', 'rb') as packed_file:
    compressed_bytes = packed_file.read()
data = pickle.loads(zlib.decompress(compressed_bytes))

<class 'bytes'>
CPU times: user 187 ms, sys: 3.95 ms, total: 191 ms
Wall time: 189 ms


#### Feather file-format

In [118]:
import pyarrow.feather as feather

#### restore_angles

In [None]:
def convert_angles(array):
    """Transform columns 1 and 3 of a 2-D array in place and return it.

    Column 1 becomes ``pi - value`` and column 3 becomes ``-value``; the other
    columns are left untouched. The argument itself is modified (pass a copy
    to keep the original) and the very same object is returned.
    """
    reflected = np.pi - array[:, 1]
    array[:, 1] = reflected

    negated = -array[:, 3]
    array[:, 3] = negated

    return array

def restore_edge_angles(list_of_arrays):
    """Interleave each array with its ``convert_angles`` counterpart.

    For every input array the result holds the array itself, immediately
    followed by ``convert_angles`` applied to a copy of it. The returned list
    is therefore twice as long as the input, and the input arrays are not
    modified.
    """
    restored = []
    for angles in list_of_arrays:
        restored.extend((angles, convert_angles(angles.copy())))
    return restored

### Benchmark preprocessing

#### mod2

In [2]:
# Point at the mod2 dataset and open it with lmdb_dataset's default options.
# NOTE(review): `compressed` is only derived below, after this open, and is
# not passed to it — confirm the default is what is wanted here.
dataset_target_path ='../../ocp_datasets/data/is2re/10k/train/data_mod2.lmdbz'
dataset_target = lmdb_dataset(dataset_target_path)

# The text after the last '.' is the extension; 'lmdbz' selects compressed=True.
suffix = dataset_target_path.rsplit('.', 1)[-1]
print(suffix)

compressed = suffix == 'lmdbz'

print(compressed)

lmdbz
True


In [None]:
%%time
# Time preprocessing() with its default options over every record of the
# dataset at dataset_target_path, read with compressed=True.
print(dataset_target_path)
dataset_target = lmdb_dataset(dataset_target_path, compressed=True)
for el in dataset_target:
    a = preprocessing(el)

#### mod1

In [2]:
# Point the preprocessing benchmark at the mod1 (.lmdb) file.
dataset_target_path ='../../ocp_datasets/data/is2re/10k/train/data_mod.lmdb'
suffix = '.lmdb'

In [3]:
%%time
# Time preprocessing() with opt='edges_only' over every record of the dataset
# at dataset_target_path, read with compressed=False.
print(dataset_target_path)
dataset_target = lmdb_dataset(dataset_target_path, compressed=False)
for el in dataset_target:
    a = preprocessing(el, opt='edges_only')

../../ocp_datasets/data/is2re/10k/train/data_mod.lmdb
CPU times: user 8.79 s, sys: 679 ms, total: 9.46 s
Wall time: 9.46 s


**multiprocessing (does not work with n_jobs > 1)**