In [None]:
import json
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error

# === Load JSON ===
# The encoding is pinned so that decoding does not depend on the platform's
# locale default (JSON interchange files are UTF-8).
with open(".../nmc_simulated_site_avg_xas.json", "r", encoding="utf-8") as f:
    data = json.load(f)
# Wrap the parsed JSON in a DataFrame; the code below reads its
# "intensity_avg", "avg_oxidation_state" and "avg_bond_length" columns.
df = pd.DataFrame(data)

# === Define CDF featurization ===
def cdf_featurize(intensity_array, n_bins=100):
    """Turn a spectrum into a fixed-length cumulative-distribution vector.

    Negative intensities are clipped to zero, the running sum is divided by
    its final value (skipped when that total is zero, so an all-zero input
    yields all zeros), and the resulting curve is linearly resampled onto
    ``n_bins`` evenly spaced points.

    Args:
        intensity_array: Sequence of intensity values for one spectrum.
        n_bins: Number of points in the returned feature vector.

    Returns:
        1-D NumPy array of length ``n_bins``.
    """
    clipped = np.maximum(np.array(intensity_array), 0)
    running_total = np.cumsum(clipped)
    final_total = running_total[-1]
    if final_total != 0:
        curve = running_total / final_total
    else:
        curve = running_total
    # Source and target positions both span [0, 1], so inputs of any length
    # are resampled onto the same grid.
    source_positions = np.linspace(0, 1, len(curve))
    target_positions = np.linspace(0, 1, n_bins)
    return np.interp(target_positions, source_positions, curve)

# === Build feature matrix ===
# One row per entry of df["intensity_avg"], each converted to a 100-point
# CDF feature vector.
X = np.array(
    [cdf_featurize(spectrum, n_bins=100) for spectrum in df["intensity_avg"]]
)
# Regression targets, taken from the same DataFrame so they stay row-aligned
# with X.
y_ox, y_bond = (
    df[column].values
    for column in ("avg_oxidation_state", "avg_bond_length")
)

# === Split into Train/Val/Test (70/15/15) ===
def split_data(X, y):
    """Partition features and targets into train, validation and test sets.

    The test set is held out first (15% of all rows). The validation set is
    then drawn from the remainder with a fraction of 0.1765, which is roughly
    0.15 / 0.85, so the overall split is approximately 70/15/15. Both stages
    use the same fixed seed, so the partition is reproducible.

    Args:
        X: Feature matrix, one row per sample.
        y: Target values aligned with the rows of ``X``.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    seed = 42

    # Stage 1: carve off the held-out test set.
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=0.15, random_state=seed
    )

    # Stage 2: divide what is left into training and validation sets.
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=0.1765, random_state=seed
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

# ====================================================
# 1. Oxidation State Regressor
# ====================================================
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y_ox)

# Hyperparameter search space; the bond-length search further down reuses it.
param_dist = {
    "n_estimators": [100, 200, 800],
    "max_depth": [None, 10, 20, 40],
    "max_features": ["sqrt", "log2", None],
    "min_samples_split": [2, 5, 10],
}

# Randomized search: 10 sampled configurations, 3-fold CV on the training set.
rfr_ox = RandomForestRegressor(random_state=42, n_jobs=-1)
rfr_ox_search = RandomizedSearchCV(
    rfr_ox, param_distributions=param_dist,
    n_iter=10, cv=3, verbose=2, n_jobs=-1, random_state=42
)
rfr_ox_search.fit(X_train, y_train)

best_rfr_ox = rfr_ox_search.best_estimator_

# Predict once per split and reuse the result for every metric.
ox_val_pred = best_rfr_ox.predict(X_val)
ox_test_pred = best_rfr_ox.predict(X_test)

print("Oxidation regressor best params:", rfr_ox_search.best_params_)
print("Val R²:", r2_score(y_val, ox_val_pred))
print("Test R²:", r2_score(y_test, ox_test_pred))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, ox_test_pred)))

# Persist the tuned model for later reuse.
with open("oxidation_regressor.pkl", "wb") as f:
    pickle.dump(best_rfr_ox, f)

# ====================================================
# 2. Bond Length Regressor
# ====================================================
# NOTE: this rebinds the X_train/X_val/X_test/y_* names used by the
# oxidation-state section above.
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y_bond)

# Same search setup as the oxidation model: 10 sampled configurations from
# param_dist, 3-fold CV on the training set.
rfr_bond = RandomForestRegressor(random_state=42, n_jobs=-1)
rfr_bond_search = RandomizedSearchCV(
    rfr_bond, param_distributions=param_dist,
    n_iter=10, cv=3, verbose=2, n_jobs=-1, random_state=42
)
rfr_bond_search.fit(X_train, y_train)

best_rfr_bond = rfr_bond_search.best_estimator_

# Predict once per split and reuse the result for every metric.
bond_val_pred = best_rfr_bond.predict(X_val)
bond_test_pred = best_rfr_bond.predict(X_test)

print("Bond regressor best params:", rfr_bond_search.best_params_)
print("Val R²:", r2_score(y_val, bond_val_pred))
print("Test R²:", r2_score(y_test, bond_test_pred))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, bond_test_pred)))

# Persist the tuned model for later reuse.
with open("bondlength_regressor.pkl", "wb") as f:
    pickle.dump(best_rfr_bond, f)
