In [1]:
import os
import pickle
import lmdb

In [2]:
class LMDBDataset:
    """Read-only, index-addressable view over pickled records in an LMDB file.

    A record is looked up under the UTF-8 encoding of ``str(idx)`` and the
    stored bytes are unpickled before being returned.

    Args:
        db_path: Path to a single-file LMDB database (it is opened with
            ``subdir=False``).

    Raises:
        AssertionError: If ``db_path`` is not an existing regular file.
    """

    def __init__(self, db_path):
        self.db_path = db_path
        assert os.path.isfile(self.db_path), "{} not found".format(self.db_path)
        self.env = self.connect_db(self.db_path)
        # Snapshot the keys once so that __len__ never has to touch the database.
        with self.env.begin() as txn:
            self._keys = list(txn.cursor().iternext(values=False))

    def connect_db(self, lmdb_path, save_to_self=False):
        """Open ``lmdb_path`` as a read-only, lock-free LMDB environment.

        Args:
            lmdb_path: Path to the single-file LMDB database.
            save_to_self: When true, also store the environment on ``self.env``.

        Returns:
            The opened environment. It is returned in both modes so that the
            result can always be used directly.
        """
        env = lmdb.open(
            lmdb_path,
            subdir=False,
            readonly=True,
            lock=False,
            readahead=False,
            meminit=False,
            max_readers=256,
        )
        if save_to_self:
            self.env = env
        return env

    def __len__(self):
        """Return the number of keys found when the dataset was constructed."""
        return len(self._keys)

    def __getitem__(self, idx):
        """Return the unpickled record stored under ``str(idx)``.

        Args:
            idx: Record identifier; it is converted with ``str`` and encoded as
                UTF-8 to form the lookup key.

        Returns:
            The object produced by unpickling the stored value.

        Raises:
            IndexError: If no value is stored under that key. Raising
                ``IndexError`` also lets plain ``for item in dataset`` loops
                terminate instead of failing inside ``pickle``.
        """
        key = str(idx).encode("utf-8")
        # Scope the transaction with ``with`` so it is released after every
        # lookup instead of being left open.
        with self.env.begin() as txn:
            datapoint_pickled = txn.get(key)
        if datapoint_pickled is None:
            raise IndexError(
                "no record stored under key {!r} in {}".format(key, self.db_path)
            )
        # NOTE(security): pickle.loads can execute arbitrary code; only open
        # databases that come from a trusted source.
        return pickle.loads(datapoint_pickled)

In [3]:
# Open the 1920-lib train.lmdb database for inspection.
# Other databases that can be swapped in for db_path:
#   /fs01/home/haotian/SDL-LNP/model/unimol/notebooks/4CR/train.lmdb
#   /scratch/ssd004/datasets/cellxgene/3d_molecule_data/220k-lib/lmdb/0/db.lmdb
lmdb_dataset = LMDBDataset(
    db_path="/scratch/ssd004/datasets/cellxgene/3d_molecule_data/1920-lib/train.lmdb",
)

In [4]:
# Display the whole record stored under key "1"; the per-atom list makes the output long.
lmdb_dataset[1]

{'atoms': ['C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'O',
  'N',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'N',
  'C',
  'C',
  'C',
  'O',
  'N',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H',
  'H'],
 'coordinates': [array([[ 1.4965242e+01, -7.7599216e-01,  6.6564023e-01],
         [ 1.

In [9]:
# List the field names of the record stored under key "0".
lmdb_dataset[0].keys()

dict_keys(['atoms', 'coordinates', 'mol', 'smi', 'target'])