In [None]:
!pip -q install --upgrade pip
!pip -q install ydata-profiling
!pip -q install pandas==2.2.2 numpy==1.26.4 scikit-learn==1.5.2
!pip -q install matplotlib==3.9.2 seaborn==0.13.2
!pip -q install tensorflow==2.17.0
!pip -q install imbalanced-learn==0.12.3


In [None]:
import os, json, warnings, numpy as np, pandas as pd
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns

# Import ProfileReport from ydata_profiling, falling back to pandas_profiling
# when that import fails.
# NOTE(review): this catches every Exception, not only ImportError, so a broken
# ydata_profiling install also triggers the fallback instead of surfacing.
try:
    from ydata_profiling import ProfileReport
except Exception:
    from pandas_profiling import ProfileReport  # fallback

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.impute import SimpleImputer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks

# One fixed seed, applied to both the NumPy and the TensorFlow global RNGs.
RANDOM_STATE = 42
for _seed_fn in (np.random.seed, tf.random.set_seed):
    _seed_fn(RANDOM_STATE)


In [None]:
from google.colab import files

# Ask the user for the dataset. files.upload() returns a dict keyed by the
# name of each uploaded file, which is used below as a fallback path.
print("Upload your CSV now…")
uploaded = files.upload()

CSV_PATH = "new dataset v1 - HEA Phase DataSet v1d(1).csv"  # change if needed

# If the expected name is not on disk but exactly one file was uploaded, use
# that file instead of failing on a mere file-name mismatch.
if not os.path.exists(CSV_PATH) and len(uploaded) == 1:
    CSV_PATH = next(iter(uploaded))

# Explicit raise instead of `assert`, so the check cannot be optimised away.
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV not found at: {CSV_PATH}")

df = pd.read_csv(CSV_PATH)
print("Shape:", df.shape)
df.head()


In [None]:
# Quick look at the schema and at the two label-like columns.
column_names = df.columns.tolist()
print("Columns:", column_names)

# "Microstructure" is optional: an empty Series stands in when it is absent.
print("\nMicrostructure value counts:")
if "Microstructure" in df.columns:
    microstructure = df["Microstructure"]
else:
    microstructure = pd.Series(dtype=object)
print(microstructure.value_counts(dropna=False).head(10))

# "Phases" is accessed directly, so a missing column raises KeyError here.
print("\nPhases value counts:")
phase_counts = df["Phases"].value_counts(dropna=False)
print(phase_counts)


In [None]:
def map_phase_to_3c(x):
    """Collapse a raw ``Phases`` label into one of three target classes.

    The value is stringified, stripped and upper-cased before matching:
    ``FCC_SS`` maps to ``"FCC"``, ``BCC_SS`` maps to ``"BCC"``, and every
    other non-missing value (including ``FCC_PLUS_BCC`` and ``IM``) maps to
    ``"MULTI"``.

    Args:
        x: Raw label; anything ``pd.isna`` treats as missing is allowed.

    Returns:
        ``"FCC"``, ``"BCC"`` or ``"MULTI"``; ``np.nan`` for a missing input.
    """
    if pd.isna(x):
        return np.nan
    single_phase = {"FCC_SS": "FCC", "BCC_SS": "BCC"}
    normalized = str(x).strip().upper()
    # Anything that is not a recognised single-phase label is multi-phase.
    return single_phase.get(normalized, "MULTI")

# The raw phase column is mandatory; stop early with a clear message without it.
has_phase_column = "Phases" in df.columns
if not has_phase_column:
    raise ValueError("Expected column 'Phases' not found.")

# Derive the 3-class target and show its distribution, missing values included.
df["Target3"] = df["Phases"].map(map_phase_to_3c)
target_counts = df["Target3"].value_counts(dropna=False)
print(target_counts)

# Keep only labeled rows and renumber the index from zero.
labeled_rows = df["Target3"].notna()
df = df.loc[labeled_rows].copy().reset_index(drop=True)


In [None]:
# Column names that must never be used as model inputs.
# NOTE(review): names such as "Sythesis_Route" presumably mirror the CSV header
# spelling exactly — confirm against the dataset before "correcting" them.
non_feature_like = {
    "Alloy ID", "Alloy", "References", "Reference",
    "Sythesis_Route", "Hot-Cold_Working",
    "IM_Structure", "Microstructure_", "Microstructure",
    "Phases", "Target3",
    "Multiphase", "HPR", "Quenching",
}
more_non_feats = {
    "Homogenization_Temp", "Homogenization_Time",
    "Annealing_Temp", "Annealing_Time_(min)",
}
drop_cols = non_feature_like | more_non_feats

# Candidate features: every numeric column that is not explicitly excluded.
num_df = df.select_dtypes(include=[np.number]).copy()
feature_cols = [col for col in num_df.columns if col not in drop_cols]

# Candidates whose name is one of the listed element symbols.
ELEMENTS = [
    "Al", "Co", "Cr", "Fe", "Ni", "Cu", "Mn", "Ti", "Zr", "Nb",
    "Mo", "Ta", "V", "Hf", "W", "Si", "C", "N", "B",
]
composition_cols = [col for col in feature_cols if col in ELEMENTS]

# Candidates whose name is one of the listed descriptor columns.
_descriptor_names = ["Hmix", "Sconf", "Omega", "Delta", "VEC",
                     "Atom.Size.Diff", "Elect.Diff", "rA/rX"]
extra_feats = [col for col in feature_cols if col in _descriptor_names]

# Prefer the curated subset (sorted, de-duplicated); if none of those columns
# exist, fall back to every numeric candidate.
chosen_features = sorted(set(composition_cols) | set(extra_feats))
if not chosen_features:
    chosen_features = feature_cols
print(f"{len(chosen_features)} features selected:\n", chosen_features)

X_all = df.loc[:, chosen_features].copy()
y_all = df["Target3"].copy()


In [None]:
# Features with the most missing values, worst first.
print("Missing values (top 20):")
missing_per_feature = X_all.isna().sum().sort_values(ascending=False)
print(missing_per_feature.head(20))

# Per-feature summary statistics with extra tail percentiles, one row per feature.
tail_percentiles = [0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
feature_summary = X_all.describe(percentiles=tail_percentiles).T
display(feature_summary)


In [None]:
# Square correlation heatmap; the side grows with the feature count, capped at 18.
side = min(len(chosen_features) + 4, 18)
fig, ax = plt.subplots(figsize=(side, side))
corr = X_all.corr(numeric_only=True)
sns.heatmap(corr, annot=False, cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Heatmap")
fig.tight_layout()
plt.show()

# HTML profiling report over the selected features plus the target column.
profile_columns = chosen_features + ["Target3"]
profile = ProfileReport(
    df[profile_columns],
    title="HEA Profiling (selected features)",
    explorative=True,
    minimal=True,
)
report_path = "hea_profile_report.html"
profile.to_file(report_path)
print(f"Saved: {report_path}")


In [None]:
# Encode the string targets as integer class ids.
le = LabelEncoder()
y_encoded = le.fit_transform(y_all)
print("Classes:", list(le.classes_))  # expected ['BCC','FCC','MULTI']

# Stratified split in two steps: hold out 30% of the rows, then cut that
# hold-out in half to get validation and test sets.
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X_all,
    y_encoded,
    test_size=0.30,
    stratify=y_encoded,
    random_state=RANDOM_STATE,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp,
    y_tmp,
    test_size=0.50,
    stratify=y_tmp,
    random_state=RANDOM_STATE,
)
print("Shapes ->", *(part.shape for part in (X_train, X_val, X_test)))


In [None]:
from joblib import dump

imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()

# Fit the imputer on the training split only, then apply it to all three splits.
X_train_imp = imputer.fit_transform(X_train)
X_val_imp, X_test_imp = (imputer.transform(part) for part in (X_val, X_test))

# Same rule for the scaler: statistics come from the training split alone.
X_train_s = scaler.fit_transform(X_train_imp)
X_val_s, X_test_s = (scaler.transform(part) for part in (X_val_imp, X_test_imp))

# Network dimensions derived from the prepared data.
input_dim = X_train_s.shape[1]
num_classes = np.unique(y_encoded).size

# Persist the fitted preprocessors and the label encoder for later inference.
for artifact, artifact_path in (
    (imputer, "imputer.joblib"),
    (scaler, "scaler.joblib"),
    (le, "label_encoder.joblib"),
):
    dump(artifact, artifact_path)
print("Preprocessors saved.")


In [None]:
# "balanced" class weights computed from the training labels only, stored as a
# plain {int class id: float weight} dict for model.fit.
classes = np.unique(y_train)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weights = dict(zip(map(int, classes), map(float, cw)))
class_weights


In [None]:
def build_model(input_dim, num_classes,
                hidden_units=(256, 128, 64),
                dropout_rates=(0.30, 0.25, 0.20),
                learning_rate=1e-3):
    """Build and compile a dense softmax classifier.

    Each hidden stage is ``Dense(relu) -> BatchNormalization -> Dropout``.
    With the default arguments the network is identical to the original
    fixed 256/128/64 architecture trained with Adam at 1e-3.

    Args:
        input_dim: Number of input features.
        num_classes: Number of output classes (width of the softmax layer).
        hidden_units: Width of each hidden Dense layer, in order.
        dropout_rates: Dropout rate after each hidden stage; must have the
            same length as ``hidden_units``.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``keras.Sequential`` model using sparse categorical
        cross-entropy loss and an accuracy metric.

    Raises:
        ValueError: If ``hidden_units`` and ``dropout_rates`` differ in length.
    """
    if len(hidden_units) != len(dropout_rates):
        raise ValueError(
            "hidden_units and dropout_rates must have the same length "
            f"(got {len(hidden_units)} and {len(dropout_rates)})."
        )

    stack = [layers.Input(shape=(input_dim,))]
    for units, rate in zip(hidden_units, dropout_rates):
        stack.extend([
            layers.Dense(units, activation="relu"),
            layers.BatchNormalization(),
            layers.Dropout(rate),
        ])
    stack.append(layers.Dense(num_classes, activation="softmax"))

    model = keras.Sequential(stack)
    # Sparse loss: the targets are integer class ids, not one-hot vectors.
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])
    return model

# Instantiate the classifier for the prepared data and print its layer summary.
model = build_model(input_dim=input_dim, num_classes=num_classes)
model.summary()


In [None]:
BATCH, EPOCHS = 64, 300

# Stop after 25 epochs without a val_accuracy improvement, restoring the best weights.
early_stop = callbacks.EarlyStopping(
    monitor="val_accuracy",
    mode="max",
    patience=25,
    restore_best_weights=True,
)
# Halve the learning rate after 10 epochs without a val_loss improvement (floor 1e-6).
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=10,
    min_lr=1e-6,
    verbose=1,
)
# Keep only the best-val_accuracy model on disk.
checkpoint = callbacks.ModelCheckpoint(
    "best_model.keras",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
)

training_callbacks = [early_stop, reduce_lr, checkpoint]
history = model.fit(
    X_train_s,
    y_train,
    batch_size=BATCH,
    epochs=EPOCHS,
    validation_data=(X_val_s, y_val),
    class_weight=class_weights,
    callbacks=training_callbacks,
    verbose=1,
)


In [None]:
from itertools import product

def plot_cm(y_true, y_pred, labels, title):
    """Show an annotated confusion-matrix heatmap.

    Args:
        y_true: True integer class ids.
        y_pred: Predicted integer class ids.
        labels: Class names used for the tick labels; the matrix is computed
            over the ids ``0 .. len(labels) - 1``.
        title: Title of the figure.
    """
    class_ids = range(len(labels))
    matrix = confusion_matrix(y_true, y_pred, labels=class_ids)

    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(matrix, annot=True, fmt="d",
                xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title(title)
    fig.tight_layout()
    plt.show()

def _evaluate_split(header, acc_label, cm_title, X_split, y_split):
    """Print accuracy and a classification report for one split, plot its
    confusion matrix, and return the predicted class ids."""
    print(header)
    preds = np.argmax(model.predict(X_split), axis=1)
    print(acc_label, round(accuracy_score(y_split, preds)*100, 2), "%")
    print(classification_report(y_split, preds, target_names=le.classes_))
    plot_cm(y_split, preds, le.classes_, cm_title)
    return preds


val_preds = _evaluate_split("Validation:", "Val Acc:", "Validation CM", X_val_s, y_val)

test_preds = _evaluate_split("Test:", "Test Acc:", "Test CM", X_test_s, y_test)


In [None]:
# Persist the trained model in the native Keras format.
final_model_path = "final_model.keras"
model.save(final_model_path)
print(f"Saved {final_model_path}")

# Source of a standalone inference helper that reloads the saved artifacts.
# The feature list is spliced in with a plain placeholder + str.replace instead
# of str.format: the template contains literal braces (the helper's own
# f-string, "{missing}"), which str.format would parse as a replacement field
# and fail on with KeyError('missing').
inference_py = r'''
import joblib, numpy as np, pandas as pd, tensorflow as tf
from tensorflow import keras

imputer = joblib.load("imputer.joblib")
scaler  = joblib.load("scaler.joblib")
le      = joblib.load("label_encoder.joblib")
model   = keras.models.load_model("final_model.keras")

EXPECTED = __EXPECTED_FEATURES__

def predict_phase(df_features: pd.DataFrame):
    missing = set(EXPECTED) - set(df_features.columns)
    if missing:
        raise ValueError(f"Missing features: {missing}")
    X = df_features[list(EXPECTED)].copy()
    X_imp = imputer.transform(X)
    X_scl = scaler.transform(X_imp)
    probs = model.predict(X_scl)
    preds = probs.argmax(axis=1)
    labels = le.inverse_transform(preds)
    return labels, probs
'''.replace("__EXPECTED_FEATURES__", json.dumps(chosen_features))

# Explicit encoding so the written file does not depend on the platform default.
with open("predict_helper.py", "w", encoding="utf-8") as f:
    f.write(inference_py)
print("Wrote predict_helper.py")


In [None]:
from joblib import load

# Smoke test: push one synthetic row (the per-feature training medians) through
# the preprocessors reloaded from disk and the trained model.
ex = pd.DataFrame([X_train.median().to_dict()])  # fake row
imputer = load("imputer.joblib"); scaler = load("scaler.joblib")
ex_scaled = scaler.transform(imputer.transform(ex))
ex_pred = np.argmax(model.predict(ex_scaled), axis=1)
# Decode with the already-fitted encoder rather than refitting a throwaway one.
print("Example prediction:", [le.inverse_transform(ex_pred)[0]])
