### 🎯 Objective

This notebook facilitates the transition from raw structural data to a machine-learning-ready dataset. It focuses on transforming physical and chemical properties into numerical descriptors.

#### 🛠 Workflow Summary

- Data Ingestion: Load the initial dataset from data/crystal_datas_v3.csv.
- Magpie Descriptors: Generate three specifically selected features using the Magpie framework to represent compositional properties. 

- SOC Analysis: Produce Spin-Orbit Coupling (SOC) features to capture relativistic electronic effects.
- Feature Consolidation: Merge Magpie and SOC descriptors into a unified intermediate file.
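- Chemical Delta Calculation: Compute the Electronegativity Difference using the Pauling scale to quantify bond ionicity: $$\Delta \chi = |\chi_{A} - \chi_{B}|$$ For compounds with more than two elements, the difference between the largest and the smallest elemental electronegativity is used.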
- K-Fold Target Encoding: Apply robust categorical encoding using a K-Fold strategy to prevent data leakage during model training.
- Final Export: Generate the master dataset designated for model training and validation.

In [None]:
import ast
import json
import os
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

from matminer.featurizers.composition import ElementProperty
from pymatgen.core import Composition
from sklearn.model_selection import KFold

# Input dataset and the two artefacts this notebook writes.
IN_PATH = Path("data") / "crystal_datas_v3.csv"

BASE_OUT = Path("data") / "interim" / "base_features.csv"
MAIN_OUT = Path("data") / "processed" / "main_data.csv"

# Create the output folders up front so the later writes cannot fail on a
# missing directory.
for _out_path in (BASE_OUT, MAIN_OUT):
    _out_path.parent.mkdir(parents=True, exist_ok=True)


In [None]:
# Load the raw table. A leftover "Unnamed: 0" index column (presumably written
# by an earlier notebook export) is dropped when present.
df = pd.read_csv(IN_PATH).drop(columns=["Unnamed: 0"], errors="ignore")

# Later cells read an "ordering_encoded" column, so derive it when the input
# lacks one: the "ordering" labels as strings with "unknown" for missing
# values, or a constant "unknown" when there is no "ordering" column at all.
if "ordering_encoded" not in df.columns:
    has_ordering = "ordering" in df.columns
    df["ordering_encoded"] = (
        df["ordering"].fillna("unknown").astype(str) if has_ordering else "unknown"
    )

df.shape, df.columns[:20]


In [None]:
# Per-element spin-orbit coupling (SOC) weights, keyed by element symbol.
# soc_weighted_avg() below uses this table as its default lookup and treats any
# symbol that is missing here (e.g. H, He, Li, Be) as 0.0.
# NOTE(review): the derived columns are named "soc_eV_*", so the values are
# presumably in eV -- confirm against the source of the table.
# NOTE(review): Np..No are listed with 0.0, which looks like a placeholder for
# unavailable data rather than a measured value -- confirm.
SOC_constants = {
    'B': 0.000117, 'C': 0.000312, 'N': 0.000679, 'O': 0.001292, 'F': 0.002238,
    'Ne': 0.003617, 'Na': 0.006227, 'Mg': 0.009989, 'Al': 0.000574, 'Si': 0.001108,
    'P': 0.001873, 'S': 0.00292, 'Cl': 0.004304, 'Ar': 0.006082, 'K': 0.0092,
    'Ca': 0.013192, 'Sc': 0.017465, 'Ti': 0.022455, 'V': 0.028284, 'Cr': 0.034444,
    'Mn': 0.041882, 'Fe': 0.051856, 'Co': 0.062086, 'Ni': 0.072772, 'Cu': 0.085778,
    'Zn': 0.101469, 'Ga': 0.003926, 'Ge': 0.006323, 'As': 0.009208, 'Se': 0.012632,
    'Br': 0.016646, 'Kr': 0.021299, 'Rb': 0.02933, 'Sr': 0.038614, 'Y': 0.048237,
    'Zr': 0.058686, 'Nb': 0.069149, 'Mo': 0.081638, 'Tc': 0.096648, 'Ru': 0.110458,
    'Rh': 0.126974, 'Pd': 0.143804, 'Ag': 0.16468, 'Cd': 0.188027, 'In': 0.010318,
    'Sn': 0.014687, 'Sb': 0.02005, 'Te': 0.026076, 'I': 0.032819, 'Xe': 0.040329,
    'Cs': 0.053231, 'Ba': 0.067507, 'La': 0.081524, 'Ce': 0.089015, 'Pr': 0.096628,
    'Nd': 0.104454, 'Pm': 0.11255, 'Sm': 0.120959, 'Eu': 0.129712, 'Gd': 0.13884,
    'Tb': 0.148371, 'Dy': 0.15833, 'Ho': 0.168743, 'Er': 0.179638, 'Tm': 0.191041,
    'Yb': 0.202981, 'Lu': 0.215484, 'Hf': 0.242776, 'Ta': 0.272703, 'W': 0.30522,
    'Re': 0.340355, 'Os': 0.378177, 'Ir': 0.418774, 'Pt': 0.458688, 'Au': 0.504948,
    'Hg': 0.55836, 'Tl': 0.032763, 'Pb': 0.046128, 'Bi': 0.060456, 'Po': 0.07598,
    'At': 0.092821, 'Rn': 0.111067, 'Fr': 0.141911, 'Ra': 0.175003, 'Ac': 0.14126,
    'Th': 0.174196, 'Pa': 0.207118, 'U': 0.244961, 'Lr': 0.551552, 'Rf': 0.612087,
    'Db': 0.677678, 'Sg': 0.7484, 'Bh': 0.824425, 'Hs': 0.905992, 'Mt': 0.993387,
    'Ds': 1.086945, 'Rg': 1.187045, 'Cn': 1.294108, 'Nh': 0.09331, 'Fl': 0.125507,
    'Mc': 0.159595, 'Lv': 0.196282, 'Ts': 0.235978, 'Og': 0.278994,
    'Np': 0.0, 'Pu': 0.0, 'Am': 0.0, 'Cm': 0.0, 'Bk': 0.0, 'Cf': 0.0, 'Es': 0.0,
    'Fm': 0.0, 'Md': 0.0, 'No': 0.0
}

def safe_literal_eval(x):
    """Parse a string as a Python literal without raising.

    Non-string inputs are returned unchanged. A string is handed to
    ``ast.literal_eval``; if parsing fails for any reason, ``None`` is
    returned instead of propagating the error.
    """
    if not isinstance(x, str):
        return x

    try:
        parsed = ast.literal_eval(x)
    except Exception:
        parsed = None
    return parsed

def soc_weighted_avg(composition, soc_table=SOC_constants) -> float:
    """Return the atom-count-weighted mean of the per-element SOC values.

    Args:
        composition: Mapping of element symbol to atom count, or a string
            holding the Python literal of such a mapping.
        soc_table: Lookup of element symbol to SOC value. Symbols missing
            from the table contribute 0.0.

    Returns:
        The weighted average, or 0.0 when the composition is not a non-empty
        dict (including strings that fail to parse) or when the atom counts
        sum to zero.
    """
    # Strings are parsed as literals; every other input passes through as-is.
    parsed = safe_literal_eval(composition)

    if not (isinstance(parsed, dict) and parsed):
        return 0.0

    n_atoms = sum(parsed.values())
    if n_atoms == 0:
        return 0.0

    weighted = 0.0
    for symbol in parsed:
        weighted += soc_table.get(symbol, 0.0) * parsed[symbol]

    return weighted / n_atoms


In [None]:
id_col = "material_id"
composition_col = "composition"

# Per-row weighted SOC average, with a progress bar.
tqdm.pandas(desc="SOC hesaplanÄ±yor")
soc_avg = df[composition_col].progress_apply(soc_weighted_avg)

soc_min = soc_avg.min()
soc_span = soc_avg.max() - soc_min

soc_df = df[[id_col]].copy()
soc_df["soc_eV_avg"] = soc_avg

# log10 is undefined at 0, so zero averages are masked to NaN first and the
# resulting gaps are filled with 0.
# NOTE(review): every non-zero average in the table range is below 1, so its
# log10 is negative; rows with a zero average therefore end up with the
# largest value in this column. Confirm that this is intended.
soc_df["soc_eV_log10"] = np.log10(soc_avg.replace(0, np.nan)).fillna(0)

# Min-max scaling; a constant column is mapped to 0.0 to avoid dividing by 0.
soc_df["soc_eV_norm"] = 0.0 if soc_span == 0 else (soc_avg - soc_min) / soc_span

soc_df.head()


In [None]:
# Hand-picked Magpie descriptors that are kept as model features.
selected = [
    "MagpieData maximum Electronegativity",
    "MagpieData range CovalentRadius",
    "MagpieData maximum GSbandgap",
]

ep = ElementProperty.from_preset(preset_name="magpie")
labels = ep.feature_labels()

# Feature labels may differ between matminer versions, so fail early with a
# clear message instead of silently producing all-NaN columns.
for s in selected:
    if s not in labels:
        raise RuntimeError(
            f"Selected Magpie feature label not found: '{s}'. "
            f"Check ep.feature_labels() output; matminer version mismatch olabilir."
        )

magpie_rows = []
failed_ids = []  # ids whose formula could not be featurized
for mid, formula in tqdm(df[[id_col, "formula_pretty"]].itertuples(index=False, name=None),
                         total=len(df), desc="Magpie featurize"):
    row = {id_col: mid}
    try:
        comp = Composition(str(formula))
        feat_map = dict(zip(labels, ep.featurize(comp)))
        row.update({s: feat_map.get(s, np.nan) for s in selected})
    except Exception:
        # Best effort: keep the row with NaN features, but remember the id so
        # the failure is reported below instead of being swallowed silently.
        failed_ids.append(mid)
        row.update({s: np.nan for s in selected})
    magpie_rows.append(row)

if failed_ids:
    print(
        f"Magpie featurization failed for {len(failed_ids)} of {len(df)} rows; "
        f"their features are NaN. First ids: {failed_ids[:5]}"
    )

# Explicit columns keep the schema stable even when there are no rows, so the
# later merge on the id column still works.
magpie_df = pd.DataFrame(magpie_rows, columns=[id_col, *selected])
magpie_df.head()


In [None]:
# Display the SOC feature table for a quick visual check.
soc_df 

In [None]:
# Columns carried over from the raw table (only those actually present),
# in the order they appear in the input.
wanted_cols = {
    "material_id", "band_gap", "formula_pretty", "composition", "ordering", "ordering_encoded"
}
keep_cols = [c for c in df.columns if c in wanted_cols]

base = df[keep_cols].copy()
# Both feature tables were built with one row per input row. A duplicated
# material_id would make these left merges multiply rows silently, so let
# pandas raise a MergeError instead of writing a corrupted table.
base = base.merge(magpie_df, on="material_id", how="left", validate="one_to_one")
base = base.merge(soc_df, on="material_id", how="left", validate="one_to_one")

# Intermediate artefact: raw columns + Magpie + SOC features.
base.to_csv(BASE_OUT, index=False)

# Sidecar metadata describing the file that was just written.
meta = {
    # Timezone-aware UTC timestamp (datetime.utcnow() is deprecated and naive).
    "created_at": datetime.now(timezone.utc).isoformat(),
    "input": str(IN_PATH),
    "base_out": str(BASE_OUT),
    "rows": int(len(base)),
    "cols": list(base.columns),
}
BASE_OUT.with_suffix(".meta.json").write_text(
    json.dumps(meta, indent=2, ensure_ascii=False),
    encoding="utf-8"
)

base.shape, base.columns


In [None]:
def electronegativity_diff_from_formula(formula):
    """Return the spread of Pauling electronegativities in a formula.

    The value is ``max(X) - min(X)`` over the distinct elements of the parsed
    composition, which equals ``|X_A - X_B|`` for a binary compound.

    Args:
        formula: Chemical formula; it is converted with ``str()`` before
            parsing. Missing values (``pd.isna``) are accepted.

    Returns:
        The difference as a float, or NaN when the formula is missing, cannot
        be parsed, or none of its elements has a defined electronegativity.
    """
    if pd.isna(formula):
        return np.nan
    try:
        comp = Composition(str(formula))
        # A missing electronegativity may be reported as None or as NaN,
        # depending on the pymatgen release. NaN must be dropped as well,
        # otherwise max()/min() give order-dependent results.
        xs = [x for x in (el.X for el in comp.elements) if pd.notna(x)]
    except Exception:
        # Best effort: an unparseable formula yields a missing feature.
        return np.nan
    if not xs:
        return np.nan
    return float(max(xs) - min(xs))

# Compute the column only when it is absent, so an existing
# "electronegativity_diff" is never overwritten.
if "electronegativity_diff" not in base.columns:
    formulas = base["formula_pretty"]
    base["electronegativity_diff"] = formulas.apply(electronegativity_diff_from_formula)

preview = base[["formula_pretty", "electronegativity_diff"]]
preview.head()


In [None]:
def target_encode_kfold_single(df, cat_col, target_col, n_splits=5, random_state=42):
    """Add an out-of-fold target encoding of one categorical column.

    For every K-fold split, the per-category mean of ``target_col`` is
    computed on the training part only and assigned to the rows of the
    held-out part, so a row's own target never contributes to its encoding.

    Args:
        df: Input frame; it is copied, not modified.
        cat_col: Name of the categorical column to encode.
        target_col: Name of the numeric target column.
        n_splits: Number of folds passed to ``KFold``.
        random_state: Seed passed to ``KFold`` (shuffling is enabled).

    Returns:
        A copy of ``df`` with an extra float column ``f"{cat_col}_te"``.
    """
    out = df.copy()
    te_col = f"{cat_col}_te"
    encoded = np.full(len(out), np.nan, dtype=float)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for train_idx, val_idx in kf.split(out):
        train_data = out.iloc[train_idx]
        fold_means = train_data.groupby(cat_col)[target_col].mean()
        # Prior for categories (or missing labels) that do not occur in this
        # training fold. Using the full-data category mean here would include
        # the held-out rows' own targets and leak them into the feature.
        fold_prior = train_data[target_col].mean()

        # Work on plain arrays so the assignment is positional and does not
        # depend on the frame's index being unique.
        fold_values = np.asarray(out[cat_col].iloc[val_idx].map(fold_means), dtype=float)
        encoded[val_idx] = np.where(np.isnan(fold_values), fold_prior, fold_values)

    out[te_col] = encoded
    # Last resort: only reached when a training fold had no usable target at all.
    out[te_col] = out[te_col].fillna(out[target_col].mean()).astype(float)

    return out

# Apply the K-fold target encoding of the ordering label against the band gap.
encode_args = {
    "cat_col": "ordering_encoded",
    "target_col": "band_gap",
    "n_splits": 5,
    "random_state": 42,
}
main_df = target_encode_kfold_single(base, **encode_args)

preview_cols = ["ordering_encoded", "ordering_encoded_te", "band_gap"]
main_df[preview_cols].head()


In [None]:
# Persist the final table used for model training and validation.
main_df.to_csv(MAIN_OUT, index=False)

# Sidecar metadata describing the file that was just written.
meta = {
    # Timezone-aware UTC timestamp (datetime.utcnow() is deprecated and naive).
    "created_at": datetime.now(timezone.utc).isoformat(),
    "input_base": str(BASE_OUT),
    "main_out": str(MAIN_OUT),
    "rows": int(len(main_df)),
    "cols": list(main_df.columns),
    # Keep these two in sync with the arguments of the target-encoding call.
    "kfold": 5,
    "seed": 42,
}
MAIN_OUT.with_suffix(".meta.json").write_text(
    json.dumps(meta, indent=2, ensure_ascii=False),
    encoding="utf-8"
)

print("âœ… Saved:", MAIN_OUT)
main_df.shape
