In [1]:
from pathlib import Path
import json
import numpy as np
from scipy import stats
from pymatgen.core.structure import Structure

import pandas as pd
from ase.db import connect

from mlip_arena.models import REGISTRY, MLIPEnum


def load_wbm_structures(db_path="../wbm_structures.db"):
    """
    Load the WBM structures from an ASE DB file.

    Parameters
    ----------
    db_path : optional
        Location of the ASE database, passed unchanged to ``connect``.
        Defaults to ``"../wbm_structures.db"``.

    Yields
    ------
    One atoms object per database row, built with
    ``row.toatoms(add_additional_information=True)`` so that the row's
    key-value pairs travel with the structure in ``atoms.info``.

    Notes
    -----
    This is a generator: the database is opened on first iteration and stays
    open until the generator is exhausted or closed.
    """
    with connect(db_path) as db:
        for row in db.select():
            yield row.toatoms(add_additional_information=True)



In [2]:

# Gather the per-structure JSON results of each model into one parquet file.
# The list of WBM ids is the same for every model, so it is read from the
# database at most once, and only if at least one model needs it.
wbm_ids = None

for model in MLIPEnum:

    # Only models whose registry entry lists the "wbm_ev" task are processed.
    if "wbm_ev" not in REGISTRY[model.name].get("gpu-tasks", []):
        continue

    if wbm_ids is None:
        wbm_ids = [
            atoms.info["key_value_pairs"]["wbm_id"]
            for atoms in load_wbm_structures()
        ]

    # Results are expected at "<model name>/<wbm_id>.json"; missing files are
    # skipped so partially finished runs can still be aggregated.
    result_paths = (Path(model.name) / f"{wbm_id}.json" for wbm_id in wbm_ids)
    all_data = [pd.read_json(fpath) for fpath in result_paths if fpath.exists()]

    if not all_data:
        # pd.concat raises ValueError on an empty list; skip this model
        # instead of aborting the loop for the remaining models.
        print(f"No wbm_ev results found for {model.name}; skipping.")
        continue

    df = pd.concat(all_data, ignore_index=True)
    df.to_parquet(f"{model.name}.parquet")

In [3]:
# Display the frame left over from the loop above, i.e. the one built for the
# last model whose results were concatenated and written to parquet.
df

Unnamed: 0,method,id,eos
0,ALIGNN,WBM-71714,"{'volumes': [40.333815232, 43.4351074938, 46.6..."
1,ALIGNN,WBM-172583,"{'volumes': [92.1431942935, 99.2281420922, 106..."
2,ALIGNN,WBM-10,"{'volumes': [39.098824674, 42.10515774, 45.261..."
3,ALIGNN,WBM-59675,"{'volumes': [117.2610350101, 126.2773092802, 1..."
4,ALIGNN,WBM-29372,"{'volumes': [98.9789198484, 106.5894708574, 11..."
...,...,...,...
995,ALIGNN,WBM-70378,"{'volumes': [117.3132036885, 126.3334892408, 1..."
996,ALIGNN,WBM-236771,"{'volumes': [58.587809792, 63.0926631043, 67.8..."
997,ALIGNN,WBM-144452,"{'volumes': [101.4072920483, 109.2045621134, 1..."
998,ALIGNN,WBM-220368,"{'volumes': [65.1104867795, 70.1168728021, 75...."
