In [None]:
from ocpmodels.datasets import SinglePointLmdbDataset

import numpy as np
from mendeleev import element
from scipy.spatial import distance_matrix
from scipy.spatial.distance import euclidean
import matplotlib.pyplot as plt
import pandas as pd

def getAtomSequence (sequence) :
    """
    Run-length encode ``sequence`` into a ``{value: run_length}`` dict.

    Consecutive equal items are merged into one run; dict keys keep the
    order in which values first appear.

    NOTE: because the result is a dict, a value that occurs in several
    separate runs (e.g. ``[1, 2, 1]``) keeps only the length of its LAST
    run, stored at the position of its first occurrence. Callers that need
    correct totals must pass a sequence in which equal values are contiguous.

    :param sequence: iterable of hashable, comparable items (e.g. atomic numbers)
    :return: dict mapping each value to the length of its run; ``{}`` for an
             empty sequence
    """
    result = []
    for item in sequence :
        # `result` is empty only for the very first item, which always opens a run
        if result and item == result[-1][0] :
            result[-1][1] += 1
        else :
            result.append([item, 1])
    return dict(result)

def structureToVASP(structure, file='POSCAR', str_name='structure', relaxed=False) :
    """
    Write ``structure`` to a VASP POSCAR-style text file.

    The file contains, in order: the title line, the scale factor ``1.0``,
    the three rows of ``structure.cell[0]``, one line of element symbols,
    one line of per-element counts, the keyword ``Cartesian`` and one line
    per atomic position.

    :param structure: object exposing ``cell``, ``atomic_numbers``, ``pos``
                      and (when ``relaxed`` is true) ``pos_relaxed``
    :param file: output path; ``'_relaxed'`` is appended when ``relaxed`` is true
    :param str_name: title written on the first line
    :param relaxed: write ``structure.pos_relaxed`` instead of ``structure.pos``
    :return: None
    """
    def _format_row (values) :
        # first three components, each followed by three spaces, then a newline
        return ''.join(str(values[axis]) + '   ' for axis in range(3)) + '\n'

    suffix = '_relaxed' if relaxed else ''
    with open(file + suffix, 'w') as out :
        out.write(str_name + '\n')
        out.write('1.0' + '\n')
        for lattice_vector in np.array(structure.cell[0]) :
            out.write(_format_row(lattice_vector))

        # symbols and counts come from the run-length encoding of the atomic numbers
        species = getAtomSequence(np.array(structure.atomic_numbers, dtype=int))
        symbols_line = ''.join('   ' + element(round(number)).symbol for number in species.keys())
        counts_line = ''.join('   ' + str(round(count)) for count in species.values())
        out.write(symbols_line + '\n')
        out.write(counts_line + '\n')

        out.write('Cartesian\n')
        coordinates = structure.pos_relaxed if relaxed else structure.pos
        for position in np.array(coordinates) :
            out.write(_format_row(position))
    return None

def getTranslations (positions, cell) :
    """
    Stack ``positions`` together with their 26 neighbouring periodic images.

    The first block of rows is ``positions`` itself; the following blocks are
    the copies shifted by ``i*a + j*b + k*c`` for ``i, j, k`` in ``-1..1``
    (``k`` varying fastest, the zero shift skipped), where ``a, b, c`` are
    ``cell[0][0]``, ``cell[0][1]`` and ``cell[0][2]``.

    :param positions: array-like of point coordinates
    :param cell: lattice vectors, indexed as ``cell[0][axis]``
    :return: numpy array with 27 vertically stacked blocks
    """
    origin = np.array(positions)
    vec_a = np.array(cell[0][0])
    vec_b = np.array(cell[0][1])
    vec_c = np.array(cell[0][2])

    blocks = [origin]
    for i in (-1, 0, 1) :
        for j in (-1, 0, 1) :
            for k in (-1, 0, 1) :
                if i == 0 and j == 0 and k == 0 :
                    continue
                # keep the left-to-right summation order so rounding is unchanged
                blocks.append(origin + i*vec_a + j*vec_b + k*vec_c)
    return np.vstack(blocks)

def getOffsets (positions) :
    """
    Build the integer cell offsets matching the row layout of ``getTranslations``.

    The first block is all zeros (same shape as ``positions``); each following
    block is filled with one ``[i, j, k]`` triple for ``i, j, k`` in ``-1..1``
    (``k`` varying fastest, ``[0, 0, 0]`` skipped).

    :param positions: array-like whose shape defines the shape of every block
    :return: integer numpy array with 27 vertically stacked blocks
    """
    blocks = [np.zeros_like(positions, dtype=int)]
    for i in (-1, 0, 1) :
        for j in (-1, 0, 1) :
            for k in (-1, 0, 1) :
                if i == 0 and j == 0 and k == 0 :
                    continue
                blocks.append(np.zeros_like(positions, dtype=int) + [i, j, k])
    return np.vstack(blocks)

import functions, polyhedron, graph, geometry
from scipy.spatial import Voronoi
from polyhedron import Polyhedron
import geometry as gm
import numpy as np

 
class Voro:
    """
    Voronoi decomposition of a point set with per-domain descriptors for a
    chosen subset of "central" points.

    On construction the following attributes are filled, in this order:
    ``vor`` (scipy ``Voronoi``), ``p_adjacency``, ``polyhedrons``, ``rsds``,
    ``angles`` and ``direct_neighbors``.
    """

    def __init__(self, points, central_ps, labels):
        """
        :param points: list of points coordinates, [[float, float, float], ...]
        :param central_ps: indexes of centrals points, list of int
        :param labels: list of labels
        """
        self.points = np.array(points)
        self.central_ps = central_ps
        self.labels = labels
        self.vor = Voronoi(points)
        self.p_adjacency = self.calc_p_adjacency()
        self.polyhedrons = self.construct_polyhedrons()
        # radius of the sphere whose volume equals the polyhedron volume
        self.rsds = np.array([(3 * p.volume / (4 * np.pi)) ** (1 / 3.) for p in self.polyhedrons])
        self.angles = self.calc_angles()
        self.direct_neighbors = self.find_direct_neighbors()

    def calc_p_adjacency(self):
        """
        Calculation of points adjacency list (points are adjacent if the domains of the points are adjacent)
        :return: self.p_adjacency: points adjacency list, [[int, ...], ...]
        """
        p_adjacency = [[] for _ in range(len(self.points))]
        # every key of ridge_dict is a pair of input points sharing a Voronoi ridge
        for p1, p2 in self.vor.ridge_dict.keys():
            p_adjacency[p1].append(p2)
            p_adjacency[p2].append(p1)
        self.p_adjacency = p_adjacency
        return self.p_adjacency

    @staticmethod
    def _pack_faces(faces):
        """
        Pack per-face vertex index lists into one numpy array.

        Faces with equal vertex counts give a regular 2-D array. Faces with
        different vertex counts give a 1-D object array of 1-D arrays, built
        explicitly because NumPy >= 1.24 raises ``ValueError`` when asked to
        create a ragged array implicitly.

        :param faces: list of vertex index lists, [[int, ...], ...]
        :return: numpy array of faces
        """
        arrays = [np.array(f) for f in faces]
        if len({len(a) for a in arrays}) <= 1:
            return np.array(arrays)
        packed = np.empty(len(arrays), dtype=object)
        for idx, face in enumerate(arrays):
            packed[idx] = face
        return packed

    def construct_polyhedrons(self):
        """
        Construct of the polyhedrons
        :return: self.polyhedrons, Voronoi polyhedra, list of Polyhedron objects
        :raises RuntimeError: if a central point has an unbounded region or a
                              shared ridge with fewer than three vertices
        """
        self.polyhedrons = []
        for i in self.central_ps:
            faces = []
            region = self.vor.regions[self.vor.point_region[i]]
            # map global Voronoi vertex index -> position inside this region
            new_ind = {o_i: n_i for n_i, o_i in enumerate(region)}
            # scipy marks a vertex at infinity with -1
            if -1 in region:
                raise RuntimeError("The domain for \"" + str(i) + "\" point is not closed!")
            # faces are appended in the same order as p_adjacency[i]
            for j in self.p_adjacency[i]:
                common_vs = self.vor.ridge_dict.get((i, j))
                if common_vs is None:
                    # ridge stored under the opposite key; vertex order is reversed
                    common_vs = self.vor.ridge_dict[(j, i)][::-1]
                if i != j and common_vs is not None and len(common_vs) > 2:
                    faces.append([new_ind[o_i] for o_i in common_vs])
                else:
                    raise RuntimeError("The Voronoi decomposition is failed!")
            faces = self._pack_faces(faces)
            self.polyhedrons.append(Polyhedron(self.vor.vertices[region], region, faces, find_order=False))
        return self.polyhedrons

    def calc_angles(self):
        """
        Calculation of solid angles
        :return: self.angles, solid angles between adjacent Voronoi polyhedra, {(int, int): float, ...}

        For each central point the per-face values are normalised to percent
        of that point's total. Both ``(i1, i2)`` and ``(i2, i1)`` are stored,
        so when both points are central the value written later replaces the
        earlier one.
        """
        self.angles = {}
        for i, i1 in enumerate(self.central_ps):
            angles = []
            cp = self.points[i1]
            # assumes polyhedrons[i].faces keeps the order of p_adjacency[i1] - TODO confirm in Polyhedron
            for j, i2 in enumerate(self.p_adjacency[i1]):
                angles += [sum([abs(gm.calc_solid_angle(cp, s)) for s in self.polyhedrons[i].faces[j].simplexes])]
            angles = 100 * np.array(angles) / sum(angles)
            for j, i2 in enumerate(self.p_adjacency[i1]):
                self.angles[(i1, i2)] = angles[j]
                self.angles[(i2, i1)] = angles[j]
        return self.angles

    def find_direct_neighbors(self):
        """
        Find the direct neighbors
        :return: self.direct_neighbors: direct Voronoi polyhedra neighbors, {(int, int): bool}

        A pair is recorded (in both orders, value True) when the shared face
        reports ``is_inside`` for ``gm.calc_centroid([p1, p2])``; other pairs
        are simply absent from the dict.
        """
        self.direct_neighbors = {}
        for i, i1 in enumerate(self.central_ps):
            p1 = self.points[i1]
            for j, i2 in enumerate(self.p_adjacency[i1]):
                p2 = self.points[i2]
                if self.polyhedrons[i].faces[j].is_inside(gm.calc_centroid([p1, p2])):
                    self.direct_neighbors[(i1, i2)] = True
                    self.direct_neighbors[(i2, i1)] = True
        return self.direct_neighbors

from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import pickle
import lmdb
import torch

In [None]:
import re

def newProperties(data) :
    """
    Compute Voronoi-based descriptors for one structure.

    The atoms are replicated into the 26 neighbouring cell images, a ``Voro``
    decomposition is built with the original atoms as central points, and
    per-atom and per-contact quantities are collected.

    :param data: structure exposing ``pos``, ``cell`` and ``natoms``
    :return: dict with keys ``voronoi_volumes``, ``voronoi_surface_areas``,
             ``spherical_domain_radii``, ``cell_offsets_new``, ``distances_new``,
             ``contact_solid_angles``, ``direct_neighbor``, ``edge_index_new``
    """
    points = getTranslations(data.pos, data.cell)
    # maps every replicated point back to the index of its atom in the original cell
    atom_index = np.array(list(range(data.natoms)) * round(len(points) / len(data.pos)))
    offsets = getOffsets(data.pos)
    voro = Voro(points, range(data.natoms), list(range(len(points))))

    volumes = np.array([domain.volume for domain in voro.polyhedrons])
    surface_areas = np.array([domain.area for domain in voro.polyhedrons])

    contacts = voro.angles.keys()
    df = pd.DataFrame(contacts, columns=['VA_p1', 'VA_p2'])
    # offset of p1's image minus offset of p2's image
    # (original note: sign correspond to original data)
    df['cell_offsets'] = [tuple(delta) for delta in offsets[df['VA_p1']] - offsets[df['VA_p2']]]
    df['VA_p1_corr'] = atom_index[df['VA_p1']]
    df['VA_p2_corr'] = atom_index[df['VA_p2']]
    df['direct_neighbor'] = [1 if pair in voro.direct_neighbors.keys() else 0 for pair in contacts]
    pair_indices = np.array(list(contacts))
    df['distance'] = np.array([euclidean(points[first], points[second]) for first, second in pair_indices])
    df['solid_angle'] = voro.angles.values()

    # keep one row per (atom, atom, cell offset) combination
    first_occurrence = ~df.duplicated(subset=['VA_p1_corr', 'VA_p2_corr', 'cell_offsets'], keep='first')
    df = df[first_occurrence].drop(labels=['VA_p1', 'VA_p2'], axis=1).reset_index(drop=True)

    return {
        'voronoi_volumes': volumes,
        'voronoi_surface_areas': surface_areas,
        'spherical_domain_radii': voro.rsds,
        'cell_offsets_new': np.array([np.array(offset) for offset in df['cell_offsets'].values]),
        'distances_new': df['distance'].values,
        'contact_solid_angles': df['solid_angle'].values,
        'direct_neighbor': df['direct_neighbor'].values,
        'edge_index_new': df[['VA_p1_corr', 'VA_p2_corr']].values.T,
    }

def processDataset (dataset_path, dataset_batch_size=5004) :
    """
    Copy an LMDB dataset to ``*_mod.lmdb``, adding the ``newProperties`` fields
    to every record.

    The source is read through ``SinglePointLmdbDataset`` in batches; each
    batch is processed in parallel with joblib and then written record by
    record (key = global record index as ASCII, value = pickled record).

    :param dataset_path: path of the source LMDB file; every ``.lmdb`` in it is
                         replaced by ``_mod.lmdb`` to form the target path
    :param dataset_batch_size: number of structures loaded and processed at once
    :return: None
    """
    # raw string: '\.' in a normal string literal is an invalid escape sequence
    dataset_path_modified = re.sub(r'\.lmdb', '_mod.lmdb', dataset_path)
    print(dataset_path_modified)
    print()

    dataset = SinglePointLmdbDataset({"src": dataset_path})
    total = len(dataset)
    print('Original dataset size is {}'.format(total))
    print()

    # ceil(total / dataset_batch_size), computed once for both loops below
    n_batches = total // dataset_batch_size + (1 if total % dataset_batch_size != 0 else 0)

    dataset_target = lmdb.open(
        dataset_path_modified,
        map_size=int(1e9*200), #~ 200 Gbyte
        subdir=False,
        meminit=False,
        map_async=True,)

    # close the environment even when a batch fails
    try :
        print('Batch info:')

        for batch in range(n_batches) :
            print('{}: from {} to {}'.format(batch, batch * dataset_batch_size,
                                             min(total, (batch+1) * dataset_batch_size) - 1))

        print()
        for batch in range(n_batches) :
            start = batch * dataset_batch_size
            stop = min(total, (batch+1) * dataset_batch_size)

            dataset_under_process = [dataset[i] for i in range(start, stop)]
            print('Structures from {} to {} are under process...'.format(start,
                                                    start + len(dataset_under_process) - 1))

            res = Parallel(n_jobs=-1)(delayed(newProperties)(structure)
                                      for structure in tqdm(dataset_under_process))
            print('are stored to a file...')

            for structure_id, data in enumerate(tqdm(dataset_under_process)) :
                for new_data_key, new_data_value in res[structure_id].items() :
                    data[new_data_key] = torch.from_numpy(new_data_value)
                # the context manager commits on success and aborts on error
                with dataset_target.begin(write=True) as txn :
                    txn.put(f"{structure_id + start}".encode("ascii"),
                            pickle.dumps(data, protocol=-1))
                dataset_target.sync()
    finally :
        dataset_target.close()
    return None

In [None]:
# processDataset('/Users/Eremin/OneDrive/Share/10k/train/data.lmdb')

# NOTE(review): the original cell processed 'val_ood_cat' twice; the second run
# only recomputed and rewrote identical records, so it is dropped here. If a
# different split was intended for that call, add it to the tuple below.
for dataset_path in (
    '../../../ocp_datasets/data/is2re/100k/train/data.lmdb',
    '../../../ocp_datasets/data/is2re/all/train/data.lmdb',
    '../../../ocp_datasets/data/is2re/all/val_ood_cat/data.lmdb',
    '../../../ocp_datasets/data/is2re/all/val_ood_ads/data.lmdb',
    '../../../ocp_datasets/data/is2re/all/val_ood_both/data.lmdb',
    '../../../ocp_datasets/data/is2re/all/test_ood_cat/data.lmdb',
) :
    processDataset(dataset_path)


# processDataset('../../ocp_datasets/data/is2re/all/test_ood_both/data.lmdb')