In [1]:
import os
from mp_api.client import MPRester
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
from prefect import task, flow
from prefect_dask import DaskTaskRunner
from pymatgen.core.structure import Structure
from dotenv import load_dotenv
from ase import Atoms
from ase.io import write, read
from pathlib import Path
import pandas as pd
from prefect.futures import wait

from mlip_arena.tasks.eos.run import fit as EOS
from mlip_arena.models.utils import REGISTRY, MLIPEnum

# Load variables from a local .env file (when one is present) before looking up the key.
load_dotenv()

# Materials Project API key taken from the environment; None when the variable is unset.
MP_API_KEY = os.getenv("MP_API_KEY")

In [2]:

# Query the Materials Project summary endpoint. The resulting `summary_docs` is the
# input of the cell that converts structures into ASE Atoms objects.
with MPRester(MP_API_KEY) as mpr:
    # Record which database release the results come from.
    print("MP Database version:", mpr.get_database_version())

    # Filters: num_elements=(1, 2) (presumably a (min, max) range, i.e. elemental and
    # binary entries) and is_stable=True. Only the three listed fields are requested.
    summary_docs = mpr.materials.summary.search(
        num_elements=(1, 2),
        is_stable=True,
        fields=["material_id", "structure", "formula_pretty"]
    )


MP Database version: 2023.11.1


Retrieving SummaryDoc documents:   0%|          | 0/5135 [00:00<?, ?it/s]

In [8]:

# Convert every retrieved summary document into an ASE Atoms object.
# Only the structure is carried over; `material_id` and `formula_pretty` are not
# attached to the resulting Atoms.
atoms_list = []

for doc in summary_docs:

    structure = doc.structure
    # Sanity check: the conversion below relies on the pymatgen Structure API.
    assert isinstance(structure, Structure)

    atoms = structure.to_ase_atoms()

    atoms_list.append(atoms)


In [5]:
# Persist all converted structures to a single extxyz file in the working directory;
# the following cell reads this file back into `atoms_list`.
write("all.extxyz", atoms_list)

In [3]:
# Reload the structures from disk; index=':' requests every frame in the file,
# so `atoms_list` is a list rather than a single Atoms object.
atoms_list = read("all.extxyz", index=':')

In [4]:
# Resources requested per SLURM allocation; interpolated into `job_extra_directives`.
nodes_per_alloc = 1
gpus_per_alloc = 4
# NOTE(review): `ntasks` is not referenced anywhere in the visible cells.
ntasks = 1

# Keyword arguments for dask_jobqueue's SLURMCluster. The job script printed below
# shows how they end up in the submitted batch file.
cluster_kwargs = {
    # Shows up as `--nthreads 1` on the generated dask worker command.
    "cores": 1,
    # Shows up as `--memory-limit 59.60GiB` on the generated dask worker command.
    "memory": "64 GB",
    "shebang": "#!/bin/bash",
    "account": "matgen",
    "walltime": "00:30:00",
    # Rendered as `#SBATCH --mem=0` in the generated script.
    "job_mem": "0",
    # Shell lines emitted before the worker command (environment activation).
    "job_script_prologue": [
        "source ~/.bashrc",
        "module load python",
        "source activate /pscratch/sd/c/cyrusyc/.conda/mlip-arena",
    ],
    # Default directives to omit; the generated script contains no `-n` or
    # `--cpus-per-task` line, and the job name comes from the extra directives instead.
    "job_directives_skip": ["-n", "--cpus-per-task", "-J"],
    # Extra `#SBATCH` lines: node count, GPU count, debug queue, GPU constraint, job name.
    "job_extra_directives": [f"-N {nodes_per_alloc}", f"-G {gpus_per_alloc}", "-q debug", "-C gpu", "-J eos"],
}
cluster = SLURMCluster(**cluster_kwargs)

# Print the batch script for inspection before any job is submitted.
print(cluster.job_script())
# Minimum and maximum are both 2, so the adaptive target is pinned at two jobs.
cluster.adapt(minimum_jobs=2, maximum_jobs=2)
# This client's scheduler address is what the `fit_all` flow's DaskTaskRunner connects to.
client = Client(cluster)


#!/bin/bash

#SBATCH -A matgen
#SBATCH --mem=0
#SBATCH -t 00:30:00
#SBATCH -N 1
#SBATCH -G 4
#SBATCH -q debug
#SBATCH -C gpu
#SBATCH -J eos
source ~/.bashrc
module load python
source activate /pscratch/sd/c/cyrusyc/.conda/mlip-arena
/pscratch/sd/c/cyrusyc/.conda/mlip-arena/bin/python -m distributed.cli.dask_worker tcp://128.55.64.41:45323 --name dummy-name --nthreads 1 --memory-limit 59.60GiB --nanny --death-timeout 60



Exception in thread Profile:
Traceback (most recent call last):
  File "/pscratch/sd/c/cyrusyc/.conda/mlip-arena/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/pscratch/sd/c/cyrusyc/.conda/mlip-arena/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/pscratch/sd/c/cyrusyc/.conda/mlip-arena/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/pscratch/sd/c/cyrusyc/.conda/mlip-arena/lib/python3.11/site-packages/distributed/profile.py", line 366, in _watch
    process(frame, None, recent, omit=omit)
  File "/pscratch/sd/c/cyrusyc/.conda/mlip-arena/lib/python3.11/site-packages/distributed/profile.py", line 183, in process
    if any(frame.f_code.co_filename.endswith(o) for o in omit):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/pscratch/sd/c/cyrusyc/.conda/mlip-arena/lib/python3.11/site-packages/distributed/pr

In [5]:


def postprocess(output, model: str, formula: str):
    """Insert or replace one EOS result in the per-model parquet table.

    The table is stored at ``<family>/<model>.parquet``, where ``family`` is
    looked up in ``REGISTRY[model]["family"]``. An existing row with the same
    ``formula`` and ``method`` is superseded by the new one.

    Args:
        output: Mapping that provides ``output["eos"]["volumes"]``,
            ``output["eos"]["energies"]`` and ``output["K"]``.
        model: Model name; used as the ``method`` column and the file stem.
        formula: Chemical formula; used as the ``formula`` column.

    NOTE(review): this is an unsynchronized read-modify-write of a shared file.
    If several workers call it for the same model at once, rows may be lost.
    """
    record = dict(
        formula=formula,
        method=model,
        volumes=output["eos"]["volumes"],
        energies=output["eos"]["energies"],
        K=output["K"],
    )
    target = Path(REGISTRY[model]["family"]) / f"{model}.parquet"

    if target.exists():
        # Append to the stored table; the new row goes last so keep='last' prefers it.
        stored = pd.read_parquet(target)
        table = pd.concat([stored, pd.DataFrame([record])], ignore_index=True)
    else:
        # First result for this model: create the family directory and start a new table.
        target.parent.mkdir(parents=True, exist_ok=True)
        table = pd.DataFrame([record])

    table = table.drop_duplicates(subset=["formula", "method"], keep='last')
    table.to_parquet(target)

# `task_run` is read by `generate_task_run_name`; `flow_run` is not used in the visible cells.
from prefect.runtime import flow_run, task_run
# Rebind EOS to a variant configured with a 240-second timeout and no result storage.
# NOTE(review): re-running this cell applies `with_options` to the already rebound
# object, not to the originally imported one.
EOS = EOS.with_options(timeout_seconds=240, result_storage=None)

def generate_task_run_name():
    """Return the task-run display name as ``"<task name>: <chemical formula>"``.

    Both parts come from ``prefect.runtime.task_run``: the task name, and the
    ``atoms`` entry of the run parameters, which must provide
    ``get_chemical_formula()``.
    """
    name = task_run.task_name
    formula = task_run.parameters["atoms"].get_chemical_formula()
    return f"{name}: {formula}"


@task(task_run_name=generate_task_run_name, result_storage=None)
def fit_one(atoms: Atoms):
    """Fit an equation of state for one structure with every model in ``MLIPEnum``.

    Each model is tried independently and on a best-effort basis: if the EOS
    run or the post-processing step raises an ordinary exception, the failure
    is reported and the loop moves on to the next model.

    Args:
        atoms: Structure to fit.

    Returns:
        A list with one EOS result dict per model that succeeded. Each dict
        gets an extra ``"method"`` key holding the model name. The list is
        empty when no model succeeded.
    """
    # The formula does not depend on the model, so compute it once up front.
    formula = atoms.get_chemical_formula()

    outputs = []
    for model in MLIPEnum:
        try:
            eos = EOS(
                atoms=atoms,
                calculator_name=model.name,
                calculator_kwargs={},
                device=None,
                optimizer="QuasiNewton",
                optimizer_kwargs=None,
                filter="FrechetCell",
                filter_kwargs=None,
                criterion=dict(
                    fmax=0.1,
                ),
                max_abs_strain=0.1,
                npoints=7,
            )
            # Only dict results are persisted and returned; anything else is skipped.
            if isinstance(eos, dict):
                postprocess(output=eos, model=model.name, formula=formula)
                eos["method"] = model.name
                outputs.append(eos)
        except Exception as exc:
            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt, SystemExit and other BaseException-based control
            # signals still propagate and can stop the run. Report the failure
            # instead of dropping it silently.
            print(f"EOS failed for {formula} with {model.name}: {exc!r}")
            continue

    return outputs


@flow(task_runner=DaskTaskRunner(address=client.scheduler.address), log_prints=True, result_storage=None)
def fit_all(atoms_list: list[Atoms]):
    """Submit one ``fit_one`` task per structure and collect the outcomes.

    Args:
        atoms_list: Structures to process.

    Returns:
        One entry per input structure, in submission order. Results are
        retrieved with ``raise_on_failure=False``, so a failed task does not
        raise here.
    """
    # Submit everything first, then block until every task run has finished.
    pending = [fit_one.submit(structure) for structure in atoms_list]
    wait(pending)

    return [job.result(raise_on_failure=False) for job in pending]

In [6]:
# NOTE(review): the commented-out block below is a disabled helper. It would run the
# flow from inside a temporary working directory and delete that directory afterwards.
# Nothing in the visible cells uses it.
# import os
# import tempfile
# import shutil
# from contextlib import contextmanager

# @contextmanager
# def twd():
    
#     pwd = os.getcwd()
#     temp_dir = tempfile.mkdtemp()
    
#     try:
#         os.chdir(temp_dir)
#         yield
#     finally:
#         os.chdir(pwd)
#         shutil.rmtree(temp_dir)

# with twd():

# Launch the flow over every structure in `atoms_list`. The recorded run of this cell
# ended with a KeyboardInterrupt, so the stored results may be partial.
fit_all(atoms_list)

KeyboardInterrupt: 

In [7]:
# NOTE(review): pandas is already imported in the first cell; this repeat is redundant.
import pandas as pd

# Load the stored results table for one model. The path follows the
# `<family>/<model>.parquet` layout that `postprocess` writes to.
df = pd.read_parquet('mace-mp/MACE-MP(M).parquet')

In [8]:
# Display the loaded results table (columns: formula, method, volumes, energies, K).
df

Unnamed: 0,formula,method,volumes,energies,K
0,Ac2O3,MACE-MP(M),"[82.36010147441682, 85.41047560309894, 88.4608...","[-39.47665786743164, -39.65583419799805, -39.7...",95.869141
1,Ac4,MACE-MP(M),"[166.09086069175856, 172.2423740507126, 178.39...","[-16.326059341430664, -16.406923294067383, -16...",25.409891
3,Ac16S24,MACE-MP(M),"[1006.5670668063424, 1043.84732853991, 1081.12...","[-249.42129516601562, -250.79556274414062, -25...",61.372858
