In [54]:
import pandas as pd
import io
import ase.io
from pymatgen.core.structure import Structure
from textcat.slice.slice_encoder import SLICES
from pandarallel import pandarallel


In [2]:
# Initialize pandarallel so the `parallel_apply` call used later in this
# notebook is available; progress_bar=True shows a progress bar while it runs.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [3]:
def ase_atoms_to_cif(ase_atoms):
    """Render an ASE atoms object as CIF text.

    The structure is written with ``ase.io.write(..., format="cif")`` into an
    in-memory byte buffer; the buffer contents are then decoded to ``str``
    with the default codec of ``bytes.decode``.

    Args:
        ase_atoms: Object accepted by ``ase.io.write`` as the structure to
            serialize.

    Returns:
        str: The CIF document produced by the writer.
    """
    buffer = io.BytesIO()
    try:
        ase.io.write(buffer, ase_atoms, format="cif")
        cif_bytes = buffer.getvalue()
    finally:
        # Release the buffer whether or not the writer succeeded.
        buffer.close()
    return cif_bytes.decode()


def cif_to_slice(cif_string, slice_encoder):
    """Convert CIF text into its SLICES representation.

    Args:
        cif_string: CIF document, parsed with
            ``Structure.from_str(..., fmt="cif")``.
        slice_encoder: Object exposing ``structure2SLICES(structure)``.

    Returns:
        Whatever ``slice_encoder.structure2SLICES`` returns for the parsed
        structure.
    """
    structure = Structure.from_str(cif_string, fmt="cif")
    return slice_encoder.structure2SLICES(structure)


def ase_atoms_to_slice(ase_atoms, slice_encoder):
    """Convert an ASE atoms object into its SLICES representation.

    The atoms are first serialized to CIF text with ``ase_atoms_to_cif`` and
    that text is then encoded with ``cif_to_slice``.

    Args:
        ase_atoms: Structure passed to ``ase_atoms_to_cif``.
        slice_encoder: Encoder forwarded unchanged to ``cif_to_slice``.

    Returns:
        The value returned by ``cif_to_slice`` for the generated CIF text.
    """
    return cif_to_slice(ase_atoms_to_cif(ase_atoms), slice_encoder)


In [50]:
# Build slice encoder (shared by every conversion in this cell)
slice_encoder = SLICES()

# This entry is left out of the ASE -> CIF -> SLICES conversion and is encoded
# from a standalone CIF file instead.
# NOTE(review): presumably the generic conversion fails for this id -- confirm
# the reason and record it here.
SPECIAL_BULK_MPID = "mp-33046"


# Read Bulk pickle: an iterable of records exposing "src_id" and "atoms"
bulk = pd.read_pickle("../data/bulk/bulks.pkl")
bulk_dict = {item["src_id"]: item["atoms"] for item in bulk}

df_temp = pd.DataFrame()
df_temp["unique_bulk_mpid"] = [item["src_id"] for item in bulk]


# Add the ASE atoms for every mp-id, looked up in bulk_dict
df_temp["ase_atoms"] = [
    bulk_dict[mpid] for mpid in df_temp["unique_bulk_mpid"].tolist()
]


# Remove the special-cased entry before the parallel conversion
df_temp = df_temp.drop(
    df_temp.loc[df_temp["unique_bulk_mpid"] == SPECIAL_BULK_MPID].index
)


# Encode every remaining bulk (uses the workers set up by pandarallel.initialize)
df_temp["slice"] = df_temp["ase_atoms"].parallel_apply(
    ase_atoms_to_slice, slice_encoder=slice_encoder
)


# Encode the special-cased entry directly from its CIF file and add it back.
# pd.concat is the public replacement for the removed DataFrame.append (and
# its private alias DataFrame._append); the new row has no "ase_atoms" value,
# so that column is missing (NaN) for it.
struct_mp_33046 = Structure.from_file(filename='TiN.cif')
slice_mp_33046 = slice_encoder.structure2SLICES(struct_mp_33046)
df_temp = pd.concat(
    [
        df_temp,
        pd.DataFrame(
            [{"slice": slice_mp_33046, "unique_bulk_mpid": SPECIAL_BULK_MPID}]
        ),
    ],
    ignore_index=True,
)


# Create mapping table for mp_id to slice
map_bulk_id_to_slice = dict(
    zip(df_temp["unique_bulk_mpid"].tolist(), df_temp["slice"].tolist())
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1141), Label(value='0 / 1141'))), …



In [53]:
# Persist the mp-id -> slice mapping as a pickle file
mapping_pickle_path = '../data/bulk/mapping_mpid_to_slice.pkl'
pd.to_pickle(map_bulk_id_to_slice, mapping_pickle_path)


In [29]:
# Load the parquet file of the 100k training split
train_parquet_path = "../data/is2res_train_val_test_dfs/100k/train/data.parquet"
df = pd.read_parquet(train_parquet_path)
# Attach a "slice" column by looking up each row's "bulk_mpid" in the mapping
df = df.assign(slice=df["bulk_mpid"].map(map_bulk_id_to_slice))
# Write the augmented dataframe to disk
df.to_parquet('test.parquet')