In [1]:
# Named export configurations: configuration name -> dataset name -> tuple of
# transformation chains. Each chain is a tuple of transformation names, the
# same shape that export_data() receives as `transformations`.
# NOTE(review): the key below is spelled "WyFomer" (probably "WyFormer"); it is
# a runtime key, so it is kept as-is here -- confirm before renaming.
export_configs = {
    "WyFomer generated datasets": {
        "mp_20": (
            ("WyckoffTransformer",),
            ("WyckoffTransformer", "DiffCSP++10k"),
            ("WyckoffTransformer", "DiffCSP++10k", "CHGNet_free", "DFT"),
            ("WyckoffTransformer", "DiffCSP++10k", "CHGNet_free", "DFT-GGA-relax-1"),
            ("WyckoffTransformer", "CrySPR", "CHGNet_fix"),
            ("WyckoffTransformer", "CrySPR", "CHGNet_fix", "DFT"),
            ("WyckoffTransformer", "DiffCSP++"),
            ("WyckoffTransformer", "DiffCSP++", "DFT"),
        ),
        "mpts_52": (
            ("WyckoffTransformer",),
            ("WyckoffTransformer", "CrySPR", "CHGNet_fix"),
        ),
    },
}

In [2]:
import sys
sys.path.append("../..")
from evaluation.generated_dataset import GeneratedDataset, DATA_KEYS

In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.


In [3]:
from pathlib import Path
from omegaconf import OmegaConf
# Load the registry describing every generated dataset.
all_data = OmegaConf.load("../../generated/datasets.yaml")
# The FlowMM entry of mp_20 is left out of the export.
del all_data["mp_20"]["FlowMM"]
# These data are not used and might contain errors
for unused_dataset in ("carbon_24", "perov_5"):
    del all_data[unused_dataset]
from collections import defaultdict
# dataset name -> set of transformation chains; filled in by flatten_config().
export_configs["generated_public"] = defaultdict(set)
def flatten_config(dataset, config, prefix=()):
    """Record exportable transformation chains and rewrite storage settings in place.

    Walks ``config`` recursively. A key that is in ``DATA_KEYS`` marks the
    current ``prefix`` as an exportable chain, which is added to
    ``export_configs["generated_public"][dataset]``. Any other key is treated
    as one more chain component and its value is descended into.

    Each ``DATA_KEYS`` entry is edited in place:

    * if it has a ``"path"``, the path is replaced by ``data.csv.gz`` inside the
      parent directory of the original path and ``"storage_type"`` is set to
      ``"monty"``;
    * ``"cache_key"`` and ``"storage_key"`` are removed when present.

    When one level holds both ``"structures"`` and ``"wyckoffs"``, the
    ``"wyckoffs"`` entry is deleted after that level has been processed.

    Args:
        dataset: Key under which the chains are recorded.
        config: Nested mapping to walk; it is mutated in place.
        prefix: Sequence of keys leading to ``config``. Defaults to an empty
            tuple (an immutable default instead of a shared mutable list);
            lists are still accepted.
    """
    prefix = tuple(prefix)
    for key, value in config.items():
        if key in DATA_KEYS:
            export_configs["generated_public"][dataset].add(prefix)
            if "path" in value:
                value["path"] = str(Path(value["path"]).parent / "data.csv.gz")
                value["storage_type"] = "monty"
            # These keys are stripped from the exported registry.
            for removed_key in ("cache_key", "storage_key"):
                if removed_key in value:
                    del value[removed_key]
        else:
            flatten_config(dataset, value, prefix + (key,))
    # Done after the loop so the mapping is not resized while being iterated.
    if "structures" in config and "wyckoffs" in config:
        # No need to export two times
        del config["wyckoffs"]
    
# Collect the exportable chains of every remaining dataset and rewrite its
# storage settings in place.
for dataset_name, dataset_config in all_data.items():
    flatten_config(dataset=dataset_name, config=dataset_config)

In [4]:
from pathlib import Path
from monty.json import MontyEncoder
# Shared encoder instance; to_json() calls its encode() for every non-string value.
encoder = MontyEncoder()
def to_json(obj):
    """Serialize one value for the CSV export.

    Strings are returned unchanged. A frozenset is converted to a tuple first.
    Every non-string value is then passed to the module-level ``encoder`` and
    the encoded result is returned.
    """
    if not isinstance(obj, str):
        payload = tuple(obj) if isinstance(obj, frozenset) else obj
        return encoder.encode(payload)
    return obj

In [5]:
from tqdm.auto import tqdm
from gzip import BadGzipFile
from pickle import UnpicklingError
from scripts.cache_generated_datasets import compute_fields_and_cache
def export_data(export_path, export_config):
    """Export dataset variants as ``<export_path>/<dataset>/<chain...>/data.csv.gz``.

    For every ``(dataset, transformation chain)`` pair the dataset is loaded
    with ``GeneratedDataset.from_cache``. If that raises ``FileNotFoundError``,
    ``UnpicklingError`` or ``BadGzipFile``, it is rebuilt with
    ``GeneratedDataset.from_transformations`` and ``compute_fields_and_cache``.

    Before writing, energy columns are renamed according to the last
    transformation of the chain (a name containing ``"CHGNet"`` or ``"DFT"``),
    a fixed set of columns is dropped when present, and every remaining cell is
    passed through ``to_json``. The index is written as ``material_id``.

    Args:
        export_path: Root directory of the export; created if missing.
        export_config: Mapping of dataset name to an iterable of transformation
            chains (tuples of transformation names).
    """
    export_path = Path(export_path)
    export_path.mkdir(parents=True, exist_ok=True)
    # Columns left out of the exported CSV whenever they are present.
    excluded_columns = [
        "cdvae_crystal", "fingerprint", "composition", "naive_validity",
        "spacegroup_number", "density"]
    for dataset, transformation_tuples in tqdm(export_config.items()):
        for these_transformations in tqdm(transformation_tuples):
            dataset_path = export_path.joinpath(dataset).joinpath(*these_transformations) / "data.csv.gz"
            print(f"Exporting {dataset_path}")
            dataset_path.parent.mkdir(parents=True, exist_ok=True)
            try:
                dataset_processed = GeneratedDataset.from_cache(
                    transformations=these_transformations,
                    dataset=dataset)
            except (FileNotFoundError, UnpicklingError, BadGzipFile):
                # Missing or unreadable cache: rebuild from the transformations.
                dataset_raw = GeneratedDataset.from_transformations(
                    transformations=these_transformations,
                    dataset=dataset)
                dataset_processed = compute_fields_and_cache(dataset_raw)
            # An empty chain has no last transformation; treat it as "no
            # renaming" instead of raising IndexError.
            last_transformation = these_transformations[-1] if these_transformations else ""
            if "CHGNet" in last_transformation:
                renamed_columns = {
                    "energy_per_atom": "chgnet_energy_per_atom",
                    "corrected_chgnet_ehull": "chgnet_e_above_hull_corrected",
                }
            elif "DFT" in last_transformation:
                renamed_columns = {
                    "e_above_hull_corrected": "dft_e_above_hull_corrected",
                    "e_uncorrected": "dft_e_uncorrected",
                    "e_corrected": "dft_e_corrected",
                }
            else:
                renamed_columns = {}
            # Non-mutating chain: the loaded dataset object is left untouched.
            exported = (
                dataset_processed.data
                .rename(columns=renamed_columns)
                .drop(columns=excluded_columns, errors="ignore")
            )
            exported.map(to_json).to_csv(dataset_path, index_label="material_id")

In [6]:
# Write every collected transformation chain below ./generated_public ...
export_data(
    export_path="generated_public",
    export_config=export_configs["generated_public"],
)
# ... and store the rewritten dataset registry next to the exported data.
OmegaConf.save(all_data, "generated_public/datasets.yaml")

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

Exporting generated_public/mp_20/WyckoffLLM-naive/DiffCSP++/data.csv.gz
Exporting generated_public/mp_20/DiffCSP++/CHGNet_fix_release/data.csv.gz
Exporting generated_public/mp_20/DiffCSP/DFT/data.csv.gz
Exporting generated_public/mp_20/split/test/CHGNet_fix/data.csv.gz
Exporting generated_public/mp_20/SymmCD/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer-letters/DiffCSP++/CHGNet_fix/data.csv.gz
Exporting generated_public/mp_20/SymmCD/CHGNet_fix/data.csv.gz
Exporting generated_public/mp_20/CrystalFormer/CHGNet_fix_release/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer/DiffCSP++10k/data.csv.gz
Exporting generated_public/mp_20/UN-DiffCSP++43/data.csv.gz
Exporting generated_public/mp_20/WyckoffLLM-vanilla/DiffCSP++/data.csv.gz
Exporting generated_public/mp_20/WyckoffLLM-naive/DiffCSP++/CHGNet_fix/data.csv.gz
Exporting generated_public/mp_20/DiffCSP/1k-sample/eq-V2_free/DFT/data.csv.gz
Exporting generated_public/mp_20/DiffCSP/1k-sample/CHGNet_free/DFT/data.

spglib: No centring was found.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: s



  0%|          | 0/9887 [00:00<?, ?it/s]

Exporting generated_public/mp_20/WyckoffTransformer/CrySPR/CHGNet_fix/data.csv.gz
Exporting generated_public/mp_20/DiffCSP/CHGNet_fix/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer/CrySPR/CHGNet_fix_release/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer/data.csv.gz
Exporting generated_public/mp_20/WyckoffLLM-site-symmetry/data.csv.gz
Exporting generated_public/mp_20/DiffCSP++/DFT/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer/DiffCSP++/data.csv.gz
Exporting generated_public/mp_20/UN-DiffCSP++17/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer-harmonic/DiffCSP++/CHGNet_fix/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer-letters/DiffCSP++/DFT/data.csv.gz
Exporting generated_public/mp_20/CrystalFormer/DFT/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer-letters/DiffCSP++/data.csv.gz
Exporting generated_public/mp_20/DiffCSP/1k-sample/DFT/data.csv.gz
Exporting generated_public/mp_20/WyckoffTrans

Attempt 0 failed to convert structure Full Formula (K16 Co8 Se32)
Reduced Formula: K2CoSe4
abc   :  11.708235  11.708235  11.708235
angles:  90.000000  90.000000  90.000000
pbc   :       True       True       True
Sites (56)
  #  SP           a         b         c
---  ----  --------  --------  --------
  0  K     0.5       0.5       0.5
  1  K     0.25      0.75      0
  2  K     0.75      0         0.25
  3  K     0         0.25      0.75
  4  K     0.5       0         0
  5  K     0.25      0.25      0.5
  6  K     0.75      0.5       0.75
  7  K     0         0.75      0.25
  8  K     0         0.5       0
  9  K     0.75      0.75      0.5
 10  K     0.25      0         0.75
 11  K     0.5       0.25      0.25
 12  K     0         0         0.5
 13  K     0.75      0.25      0
 14  K     0.25      0.5       0.25
 15  K     0.5       0.75      0.75
 16  Co    0.125     0.125     0.125
 17  Co    0.875     0.375     0.375
 18  Co    0.125     0.625     0.625
 19  Co    0.875     0.8



  0%|          | 0/27135 [00:00<?, ?it/s]

Exporting generated_public/mp_20/MiAD/data.csv.gz
Exporting generated_public/mp_20/MiAD/CHGNet_free/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer/DiffCSP++/DFT/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer-letters/data.csv.gz


  0%|          | 0/8 [00:00<?, ?it/s]

Exporting generated_public/mp_20_biternary/split/train/data.csv.gz
Exporting generated_public/mp_20_biternary/WyCryst/CrySPR/CHGNet_fix/data.csv.gz
Exporting generated_public/mp_20_biternary/split/test/data.csv.gz
Exporting generated_public/mp_20_biternary/split/val/data.csv.gz
Exporting generated_public/mp_20_biternary/WyCryst/CrySPR/CHGNet_fix/DFT/data.csv.gz
Exporting generated_public/mp_20_biternary/WyckoffTransformer/CrySPR/CHGNet_fix/data.csv.gz
Exporting generated_public/mp_20_biternary/WyCryst/data.csv.gz
Exporting generated_public/mp_20_biternary/WyckoffTransformer/data.csv.gz


  0%|          | 0/7 [00:00<?, ?it/s]

Exporting generated_public/mpts_52/split/train/data.csv.gz
Exporting generated_public/mpts_52/SymmCD/data.csv.gz
Exporting generated_public/mpts_52/split/test/data.csv.gz
Exporting generated_public/mpts_52/split/val/data.csv.gz
Exporting generated_public/mpts_52/SymmCD/CHGNet_fix/data.csv.gz
Exporting generated_public/mpts_52/WyckoffTransformer/CrySPR/CHGNet_fix/data.csv.gz
Exporting generated_public/mpts_52/WyckoffTransformer/data.csv.gz


  0%|          | 0/3 [00:00<?, ?it/s]

Exporting generated_public/mp_2022/split/train/data.csv.gz
Exporting generated_public/mp_2022/split/val/data.csv.gz
Exporting generated_public/mp_2022/split/test/data.csv.gz
