In [25]:
import os
import lmdb
import pickle
import numpy as np
import pymatgen
import torch
from pymatgen.core import Structure
from pymatgen.symmetry.groups import SpaceGroup
import pandas as pd
from tqdm.notebook import tqdm
import re
from ocpmodels.datasets import SinglePointLmdbDataset
import zlib
from DataClasses import lmdb_dataset

In [29]:
# Build `structures.lmdbz`: one zlib-compressed, pickled record per CIF file in
# `cif_filtered/` whose material id is listed in `binary_final.csv`.

# Per-material targets, indexed by material id (the CIF file name without '.cif').
additional_data = pd.read_csv('binary_final.csv', index_col=0).set_index('material_id')
# Replace every space-group entry by the `int_number` pymatgen's SpaceGroup reports for it.
additional_data['space_group'] = list(map(lambda x: SpaceGroup(x).int_number, additional_data['space_group']))
print(additional_data.head())

dataset_target = lmdb.open(
    'structures.lmdbz',
    map_size=int(1e9*50), #~ 50 Gbyte
    subdir=False,
    meminit=False,
    map_async=True,)

# Keys are a running count of the records actually written, so they stay
# contiguous (0..N-1) even when some CIF files are skipped.
n_written = 0
try:
    for file in tqdm(os.listdir('cif_filtered/')):
        # Material id = file name with a trailing '.cif' removed.
        sid = re.sub(r'\.cif$', '', file)
        if sid not in additional_data.index:
            print(file, 'was not found in binary_final.csv')
            continue

        # Look targets up by column name rather than by position, so extra or
        # reordered CSV columns cannot silently shift the values.
        targets = additional_data.loc[sid]
        struct = Structure.from_file('cif_filtered/'+file)

        data = {}
        data['sid'] = sid
        # Wrap the lattice matrix in a single ndarray first (leading axis of
        # length 1); building a tensor from a list of ndarrays is slow.
        data['cell'] = torch.tensor(np.array([struct.lattice.matrix]))
        data['natoms'] = torch.tensor([len(struct)])
        data['atomic_numbers'] = torch.tensor(np.array([species.Z for species in struct.species]))
        data['pos'] = torch.tensor(np.array([site.coords for site in struct]))
        data['band_gap'] = torch.tensor([targets['band_gap']])
        data['energy_above_hull'] = torch.tensor([targets['e_above_hull']])
        data['space_group'] = torch.tensor([targets['space_group']])

        # One write transaction per record: committed on success, aborted if
        # anything inside the `with` block raises.
        with dataset_target.begin(write=True) as txn:
            txn.put(f"{n_written}".encode("ascii"),
                    zlib.compress(pickle.dumps(data, protocol=-1), level=1))
        n_written += 1
finally:
    # Flush once at the end and always release the environment, even if a
    # CIF file fails to parse part-way through the loop.
    dataset_target.sync()
    dataset_target.close()

             band_gap  e_above_hull new_structure  space_group
material_id                                                   
mp-1080341     0.7132      0.517625      Ce8 Se16          132
mp-1219167     1.7192      0.215818        Sm2 O3          115
mp-1244922     0.6826      0.417091      Zn50 S50            1
mp-700         0.8969      0.000000       Ge4 Se4           62
mp-8956        0.6120      0.007877       Ho8 S12           62


  0%|          | 0/2032 [00:00<?, ?it/s]



In [30]:
# Sanity check: reopen 'structures.lmdbz' through the project's `lmdb_dataset`
# class (imported from DataClasses; its implementation is not shown in this
# notebook) and display the record at index 0.
# NOTE(review): presumably `lmdb_dataset` decompresses and unpickles the stored
# bytes on indexing — confirm against DataClasses.
# NOTE(review): `data` was a plain dict inside the writer loop of the previous
# cell and is rebound to a dataset object here.
data = lmdb_dataset('structures.lmdbz')
data[0]

{'map_addr': 0, 'map_size': 10485760, 'last_pgno': 1112, 'last_txnid': 24384, 'max_readers': 1000, 'num_readers': 0}


{'sid': 'mp-9588',
 'cell': tensor([[[ 4.9319e+00,  0.0000e+00, -2.6396e+00],
          [-3.0435e-16,  4.9704e+00,  3.0435e-16],
          [ 0.0000e+00,  0.0000e+00,  1.0251e+01]]], dtype=torch.float64),
 'natoms': tensor([16]),
 'atomic_numbers': tensor([ 3,  3,  3,  3,  3,  3,  3,  3, 15, 15, 15, 15, 15, 15, 15, 15]),
 'pos': tensor([[ 1.0703,  1.9297,  2.8028],
         [ 3.8617,  4.4149, -0.3170],
         [ 3.8617,  3.0406,  4.8084],
         [ 1.0703,  0.5555,  7.9281],
         [ 1.1135,  3.2721, -0.2906],
         [ 3.8185,  0.7869,  2.7764],
         [ 3.8185,  1.6983,  7.9018],
         [ 1.1135,  4.1835,  4.8348],
         [ 1.5617,  4.4480,  2.1668],
         [ 3.3703,  1.9628,  0.3190],
         [ 3.3703,  0.5224,  5.4444],
         [ 1.5617,  3.0076,  7.2922],
         [ 1.4953,  0.7755,  0.3452],
         [ 3.4367,  3.2607,  2.1406],
         [ 3.4367,  4.1949,  7.2660],
         [ 1.4953,  1.7097,  5.4705]], dtype=torch.float64),
 'band_gap': tensor([0.8537], dtype=torc