In [12]:
import os

import numpy as np
import pandas as pd
import math

from MeLting.statistics_helpers import *

In [13]:
# Folder holding the example CSV files. The path is relative, so it is resolved
# against the working directory the notebook is run from. It is used both for
# reading the initial table and for writing the final one.
src = "../MeLting/data/examples"

In [14]:
# initial data: the raw example table, read from the examples folder
initial_filename = "example_initial_data.csv"
initial_path = os.path.join(src, initial_filename)
initial_data = pd.read_csv(initial_path)

In [15]:
# Preview the first five rows of the loaded table (rich display of the cell's
# last expression).
initial_data.head(n=5)

Unnamed: 0,Compound,Melting point (K),Melting point (C),mp_id,Cohesive energy,Bulk modulus,Shear modulus,Formation energy per atom,Energy per atom,Density,...,row_1,row_2,group_1,group_2,eneg_1,eneg_2,vol_1,vol_2,melting_point_1,melting_point_2
0,AlSb,1333.15,1060.0,mp-2624,3.283296,34.394074,21.663768,-0.159486,-4.096774,4.078412,...,3,5,13,15,1.61,2.05,10,18.19,933.47,903.78
1,AlAs,2013.15,1740.0,mp-2172,3.736597,54.431468,30.184973,-0.484607,-4.686649,3.590699,...,3,4,13,15,1.61,2.18,10,12.95,933.47,1090.0
2,AlBr3,370.6,97.45,mp-23288,3.061638,16.893024,6.077907,-1.117532,-3.281621,2.650776,...,3,4,13,17,1.61,2.96,10,19.78,933.47,265.8
3,Al4C3,2473.15,2200.0,mp-1591,5.523125,132.479679,95.364515,-0.092081,-6.18674,2.930804,...,3,2,13,14,1.61,2.55,10,5.29,933.47,3800.0
4,AlF3,1563.15,1290.0,mp-468,5.464326,116.117921,52.993785,-3.892047,-5.893853,3.021262,...,3,2,13,17,1.61,3.98,10,11.2,933.47,53.53


In [16]:
# constructed model outputs and materials features
# File name of the CSV that the last cell of this notebook writes.
final_filename = "example_final_data.csv"
final_data = pd.DataFrame()  # starts empty; the cells below add its columns one group at a time

In [17]:
# compound/materials/structure information

# (column in initial_data, column in final_data); note that the Materials
# Project id is stored as "mp_id" in the input but as "mp-id" in the output.
identifier_columns = (
    ("Compound", "Compound"),
    ("mp_id", "mp-id"),
)
for source_name, target_name in identifier_columns:
    final_data[target_name] = initial_data[source_name]

In [18]:
# potential model outputs

# Every target below is derived from the tabulated melting point in kelvin.
melt_temp_kelvin = initial_data["Melting point (K)"]

# Vectorised base-10 logarithm over the whole column. Non-positive entries
# would yield -inf/NaN (with a RuntimeWarning) rather than raising; melting
# points in kelvin are expected to be strictly positive.
final_data["log10_melt_temp_K"] = np.log10(melt_temp_kelvin)

# Difference between the compound's melting point and the mean of the two
# elemental melting points weighted by Comp_1 / Comp_2
# (presumably the stoichiometric amounts of each element -- TODO confirm).
final_data["delta_T_vegard"] = melt_temp_kelvin - (
    initial_data["Comp_1"] * initial_data["melting_point_1"]
    + initial_data["Comp_2"] * initial_data["melting_point_2"]
) / (initial_data["Comp_1"] + initial_data["Comp_2"])

# Melting point copied through unchanged, in both units provided by the input.
final_data["melt_temp_K"] = melt_temp_kelvin
final_data["melt_temp_C"] = initial_data["Melting point (C)"]

In [19]:
# compound features

# from DFT: column name in initial_data -> feature name in final_data
dft_feature_names = {
    "Cohesive energy": "coh_en",
    "Bulk modulus": "bulk_m",
    "Shear modulus": "shear_m",
    "Formation energy per atom": "form_e_per_atom",
    "Energy per atom": "e_per_atom",
    "Density": "density",
}
for source_name, feature_name in dft_feature_names.items():
    final_data[feature_name] = initial_data[source_name]

# ionic character features
# this works for binary materials only
# %ic = 100 * (1 - exp(-(eneg_1 - eneg_2)^2 / 4)), computed column-wise
eneg_difference = initial_data["eneg_1"] - initial_data["eneg_2"]
final_data["%ic"] = 100 * (1 - np.exp(-(eneg_difference**2) / 4))

In [20]:
# features constructed from elemental properties using statistical averaging methods

# (stem of the per-element columns in initial_data, stem of the feature in final_data)
# The input columns are "<stem>_1" and "<stem>_2", one per element of the compound.
column_stems = [
    ("M", "M"),
    ("rad_at", "radius"),
    ("atomic_num", "atomic_num"),
    ("row", "row"),
    ("group", "group"),
    ("eneg", "electro"),
    ("vol", "vol"),
    ("melting_point", "melt_temp"),
]
initial_column_names = [source_stem for source_stem, _ in column_stems]
final_column_names = [feature_stem for _, feature_stem in column_stems]

# (suffix appended to the feature name, name of the helper that computes it)
averaging_methods = [
    ("ave", "arithmetic_mean"),
    ("sd", "standard_deviation_mean"),
    ("harm", "harmonic_mean"),
    ("geo", "geometric_mean"),
    ("quad", "quadratic_mean"),
]
prepositions = [suffix for suffix, _ in averaging_methods]
functions = [helper_name for _, helper_name in averaging_methods]

# For every property and every averaging method two columns are produced:
# "<feature>_<suffix>" from the plain call and "<feature>_<suffix>X" from the
# call with weighted=True.
# NOTE(review): helpers are resolved by name through globals(), so this cell
# relies on the star import from MeLting.statistics_helpers providing them.
for source_stem, feature_stem in column_stems:
    element_columns = (source_stem + "_1", source_stem + "_2")
    for suffix, helper_name in averaging_methods:
        helper = globals()[helper_name]
        feature_name = feature_stem + "_" + suffix
        final_data[feature_name] = helper(initial_data, *element_columns)
        final_data[feature_name + "X"] = helper(
            initial_data, *element_columns, weighted=True
        )

In [21]:
# compound class
# we do not report this in the paper but provide for class-specific study

nonmetals = [
    "H", "B", "C", "N", "O", "F",
    "Si", "P", "S", "Cl",
    "Ge", "As", "Se", "Br",
    "Sb", "Te", "I", "At",
]
chalcogenides = ["S", "Se", "Te"]  # without oxides
halides = ["F", "Cl", "Br", "I", "At"]
nitr_gr = ["N", "P", "As", "Sb"]  # N-group nonmetals
carb_gr = ["C", "Si", "Ge"]  # C-group nonmetals


def classify_compound(el_1, el_2):
    """Return the class label of a binary compound made of ``el_1`` and ``el_2``.

    The rules are checked in order and the first match wins:

    1. ``"mix"``     -- both elements are in ``nonmetals``
    2. ``"ox"``      -- either element is ``"O"``
    3. ``"chalc"``   -- either element is in ``chalcogenides``
    4. ``"hal"``     -- either element is in ``halides``
    5. ``"nitr_gr"`` -- either element is in ``nitr_gr``
    6. ``"carb_gr"`` -- either element is in ``carb_gr``
    7. ``"bor"``     -- either element is ``"B"``
    8. ``"hyd"``     -- either element is ``"H"``
    9. ``"other"``   -- none of the above

    Because rule 1 comes first, a compound of two nonmetals is always
    ``"mix"``, even when one of them is oxygen, a halogen, etc.

    Parameters
    ----------
    el_1, el_2 : str
        Element symbols of the two constituents.

    Returns
    -------
    str
        One of the labels listed above.
    """
    elements = (el_1, el_2)

    if all(element in nonmetals for element in elements):
        return "mix"
    if "O" in elements:
        return "ox"

    # Group-based classes, in priority order.
    group_rules = (
        ("chalc", chalcogenides),
        ("hal", halides),
        ("nitr_gr", nitr_gr),
        ("carb_gr", carb_gr),
    )
    for label, members in group_rules:
        if any(element in members for element in elements):
            return label

    if "B" in elements:
        return "bor"
    if "H" in elements:
        return "hyd"
    return "other"


final_data["compound_type"] = [
    classify_compound(el_1, el_2)
    for el_1, el_2 in zip(initial_data["El_1"], initial_data["El_2"])
]

In [22]:
# Preview the first five rows of the assembled feature table (rich display of
# the cell's last expression).
final_data.head(n=5)

Unnamed: 0,Compound,mp-id,log10_melt_temp_K,delta_T_vegard,melt_temp_K,melt_temp_C,coh_en,bulk_m,shear_m,form_e_per_atom,...,melt_temp_aveX,melt_temp_sd,melt_temp_sdX,melt_temp_harm,melt_temp_harmX,melt_temp_geo,melt_temp_geoX,melt_temp_quad,melt_temp_quadX,compound_type
0,AlSb,mp-2624,3.124879,414.525,1333.15,1060.0,3.283296,34.394074,21.663768,-0.159486,...,918.864896,14.845,29.69,918.385104,918.625,918.505044,918.744961,918.74494,918.984773,nitr_gr
1,AlAs,mp-2172,3.303876,1001.415,2013.15,1740.0,3.736597,54.431468,30.184973,-0.484607,...,1017.789362,78.265,156.53,1005.680638,1011.735,1008.703276,1014.76975,1014.757666,1020.776146,nitr_gr
2,AlBr3,mp-23288,2.568905,-62.1175,370.6,97.45,3.061638,16.893024,6.077907,-1.117532,...,785.491074,333.835,667.67,413.778926,599.635,498.112764,706.6242,686.300182,833.006335,hal
3,Al4C3,mp-1591,3.39325,311.167143,2473.15,2200.0,5.523125,132.479679,95.364515,-0.092081,...,3234.702288,1433.265,2866.53,1498.767712,2366.735,1883.397462,2881.046288,2766.890515,3429.895077,carb_gr
4,AlF3,mp-468,3.194001,1289.635,1563.15,1290.0,5.464326,116.117921,52.993785,-3.892047,...,885.746405,439.97,879.94,101.253595,493.5,223.536684,799.405946,661.147375,907.889339,hal


In [23]:
# Write the assembled feature table into the same folder the input was read
# from; index=False keeps the row index out of the CSV.
final_path = os.path.join(src, final_filename)
final_data.to_csv(final_path, index=False)