In [43]:
import sisl
import pandas as pd
import csv
import ast

from pathlib import Path
from tqdm import tqdm

# Build a table with one row per structure found under ./dataset and pickle it.
# Expected layout: ./dataset/<group>/<structure>/aiida.HSX
path = Path('./dataset')

# First-level subdirectories. The explicit is_dir() check keeps stray files out
# even on Python versions where a trailing "/" in the glob pattern does not
# restrict matches to directories.
n_atoms_paths = [p for p in path.glob('*/') if p.is_dir()]

# Second-level subdirectories: one directory per structure, flattened into a single list.
structure_paths = [
    structure
    for n_atoms_path in n_atoms_paths
    for structure in n_atoms_path.glob('*/')
    if structure.is_dir()
]

# Column order of the resulting pandas DataFrame
columns = ["n_atoms", "atomic_number", "atomic_symbol", "atomic_positions", "lattice_nsc", "lattice_origin", "lattice_vectors", "boundary_condition"]

# Collect plain dict records and build the DataFrame once after the loop;
# concatenating onto the frame inside the loop re-copies every existing row
# on each iteration.
records = []
for structure in tqdm(structure_paths):
    file = sisl.get_sile(structure / "aiida.HSX")
    geometry = file.read_geometry()
    lattice = geometry.lattice

    records.append({
        "n_atoms": len(geometry),
        "atomic_number": geometry.atoms.Z,
        # One symbol per entry yielded by iterating geometry.atoms
        "atomic_symbol": [atom.symbol for atom in geometry.atoms],
        "atomic_positions": geometry.xyz,
        "lattice_nsc": lattice.nsc,
        "lattice_origin": lattice.origin,
        "lattice_vectors": lattice.cell,
        "boundary_condition": lattice.boundary_condition,
    })

# Passing columns= fixes the column order and still yields an empty frame with
# the right header when no structures were found.
df = pd.DataFrame(records, columns=columns)

df.to_pickle("dataset.pkl")

100%|██████████| 1311/1311 [01:34<00:00, 13.92it/s]


In [44]:
# Read the dataset back from the pickle file "dataset.pkl".
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# that were produced locally / come from a trusted source.
df = pd.read_pickle("dataset.pkl")
# Bare last expression so the notebook renders a preview of the first rows.
df.head()

Unnamed: 0,n_atoms,atomic_number,atomic_symbol,atomic_positions,lattice_nsc,lattice_origin,lattice_vectors,boundary_condition
0,2,"[7, 5]","[N, B]","[[0.0, 0.0, 3.3314667459], [0.0, 0.0, 1.071501...","[5, 5, 5]","[0.0, 0.0, 0.0]","[[3.5999899999999996, 0.0, 0.0], [0.0, 3.59998...","[[2, 2], [2, 2], [2, 2]]"
1,2,"[6, 6]","[C, C]","[[0.0, 0.0, 3.4481784217], [0.0, 0.0, 1.836174...","[5, 5, 5]","[0.0, 0.0, 0.0]","[[3.5999899999999996, 0.0, 0.0], [0.0, 3.59998...","[[2, 2], [2, 2], [2, 2]]"
2,2,"[7, 5]","[N, B]","[[0.0, 0.0, 3.3314667459], [0.0, 0.0, 1.827498...","[5, 5, 5]","[0.0, 0.0, 0.0]","[[3.5999899999999996, 0.0, 0.0], [0.0, 3.59998...","[[2, 2], [2, 2], [2, 2]]"
3,2,"[7, 5]","[N, B]","[[0.0, 0.0, 3.3314667459], [0.0, 0.0, 0.720501...","[5, 5, 5]","[0.0, 0.0, 0.0]","[[3.5999899999999996, 0.0, 0.0], [0.0, 3.59998...","[[2, 2], [2, 2], [2, 2]]"
4,2,"[6, 7]","[C, N]","[[0.0, 0.0, 3.3314667459], [0.0, 0.0, 2.394497...","[5, 5, 5]","[0.0, 0.0, 0.0]","[[3.5999899999999996, 0.0, 0.0], [0.0, 3.59998...","[[2, 2], [2, 2], [2, 2]]"


In [50]:
# Summary statistics: which species occur anywhere in the dataset.

# Distinct atomic numbers in order of first appearance
# (dict keys keep insertion order, so fromkeys de-duplicates without reordering).
unique_atom_types = list(
    dict.fromkeys(z for numbers in df["atomic_number"] for z in numbers)
)

# Distinct atomic symbols, likewise in order of first appearance.
unique_atom_symbols = list(
    dict.fromkeys(sym for symbols in df["atomic_symbol"] for sym in symbols)
)

print(f"Analyzed {len(df)} elements. Found:")
print(unique_atom_types)
print(unique_atom_symbols)

Analyzed 1311 elements. Found:
[7, 5, 6]
['N', 'B', 'C']


There are some carbon atoms in the dataset. How many of them are there?