In [None]:
# Install / upgrade the third-party packages listed below into the kernel's environment.
# NOTE(review): TensorFlow is imported by the following cells but is not part of this
# install list - confirm that the environment already provides it.
%pip install -U pandas numpy scikit-learn matplotlib shap openpyxl scikeras

In [None]:
import sys
import tensorflow as tf

# Report which interpreter, TensorFlow build and GPU devices this kernel sees.
print(sys.executable)
print("TF:", tf.__version__)
gpu_devices = tf.config.list_physical_devices("GPU")
print("GPUs:", gpu_devices)

In [None]:

import os, random, numpy as np, pandas as pd, matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers


# Seed Python's, NumPy's and TensorFlow's random number generators with the same value.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Files written by this notebook go under ./outputs; create it if it is missing.
os.makedirs("outputs", exist_ok=True)

print("TensorFlow:", tf.__version__)


In [None]:


# Name of the target column in the spreadsheet (note: the name contains two spaces).
TARGET = "Grid  Power"
# When True, rows whose target value equals 0 are dropped right after loading.
FILTER_Y_EQ_ZERO = True
# n_neighbors passed to the nearest-neighbour search that produces the initial labels.
K_NEIGHBORS = 1
# Fraction of all labeled-sample pairs to draw for the difference model ...
PAIR_SAMPLE_FRAC = 0.10
# ... and the hard upper bound on the number of drawn pairs.
PAIR_SAMPLE_MAX = 200_000
# Epoch budget for fitting the stage-2 (difference) model.
EPOCHS_STAGE2 = 100
# Epoch budget for fitting the stage-3 models (baseline, CV folds and tuned model).
EPOCHS_STAGE3 = 100


# Number of random-search trials.
TUNING_TRIALS = 20
# Number of KFold splits used to score each trial.
CV_SPLITS     = 3


In [None]:


# Read the spreadsheet and discard the 'times' column when it is present.
raw_data = (
    pd.read_excel('raw data.xlsx')
    .drop(columns=['times'], errors='ignore')
    .copy()
)

# Optionally keep only the rows whose target is non-zero.
if FILTER_Y_EQ_ZERO:
    target_is_nonzero = raw_data[TARGET] != 0
    raw_data = raw_data[target_is_nonzero].copy()

# Coerce every feature column to numeric (unparseable cells become NaN),
# then drop any row that contains a NaN and renumber the rows.
num_cols = [col for col in raw_data.columns if col != TARGET]
raw_data[num_cols] = raw_data[num_cols].apply(pd.to_numeric, errors='coerce')
raw_data = raw_data.dropna().reset_index(drop=True)

print("Data shape:", raw_data.shape)
display(raw_data.head(3))


In [None]:


# 80 % of the rows form the training pool; the remaining rows are the test set.
train_dataset = raw_data.sample(frac=0.8, random_state=0)
test_dataset = raw_data.drop(index=train_dataset.index)

test_y_data = test_dataset[TARGET].copy()
test_x_data = test_dataset.drop(columns=[TARGET]).copy()

# 60 % of the training pool keeps its labels; the rest is treated as unlabeled.
labeled_train_data = train_dataset.sample(frac=0.6, random_state=0)
unlabeled_train_data = train_dataset.drop(index=labeled_train_data.index)

# Separate the target from the features. The true targets of the "unlabeled"
# rows are kept aside in unlabeled_train_data_actual.
labeled_data_labels = labeled_train_data[TARGET].copy()
unlabeled_train_data_actual = unlabeled_train_data[TARGET].copy()
labeled_train_data = labeled_train_data.drop(columns=[TARGET])
unlabeled_train_data = unlabeled_train_data.drop(columns=[TARGET])

len(train_dataset), len(labeled_train_data), len(unlabeled_train_data), len(test_dataset)


In [None]:


# Per-feature summary statistics of the training pool (the target row is removed).
train_stats = (
    train_dataset.describe()
    .transpose()
    .drop(index=TARGET, errors='ignore')
)


def norm(df):
    """Standardise `df` column-wise with the training-pool mean and std.

    A standard deviation of exactly 0 is replaced by 1.0 so that constant
    columns do not cause a division by zero.
    """
    center = train_stats['mean']
    scale = train_stats['std'].replace(0, 1.0)
    return (df - center) / scale


normed_labeled_train_data = norm(labeled_train_data).astype('float32')
normed_unlabeled_train_data = norm(unlabeled_train_data).astype('float32')
normed_test_data = norm(test_x_data).astype('float32')

INPUT_DIM = normed_labeled_train_data.shape[1]
print("Input dim:", INPUT_DIM)


In [None]:

from sklearn.neighbors import NearestNeighbors


# Stage 1: give every unlabeled sample the target of its closest labeled sample
# (Euclidean distance in the standardised feature space).
nbrs = NearestNeighbors(n_neighbors=K_NEIGHBORS, metric="euclidean").fit(normed_labeled_train_data.values)
dists, idxs = nbrs.kneighbors(normed_unlabeled_train_data.values, return_distance=True)

# `idxs` has shape (n_unlabeled, K_NEIGHBORS) with neighbours ordered by increasing
# distance. Only the closest one (column 0) is used, so the result always has exactly
# one label per unlabeled row. Flattening the whole array, as was done before,
# yields n * K labels and misaligns them as soon as K_NEIGHBORS > 1.
nearest_labeled_pos = idxs[:, 0]
initial_labels = labeled_data_labels.iloc[nearest_labeled_pos].reset_index(drop=True)
assert len(initial_labels) == len(normed_unlabeled_train_data), "one initial label per unlabeled row expected"

unlabeled_train_data_actual = unlabeled_train_data_actual.reset_index(drop=True)

# Persist the initial (copied) labels and the held-back true labels for later comparison.
initial_labels.to_frame(name=TARGET).to_excel('outputs/20230315_initial_labels.xlsx', index=False)
unlabeled_train_data_actual.to_frame(name=TARGET).to_excel('outputs/20230315_actual_labels.xlsx', index=False)

print("Stage-1 finished：initial tag num", len(initial_labels))


In [None]:

# Stage 2 (data): sample pairs (i, j), i < j, of labeled rows and build the
# feature-difference / target-difference training set for the modification model.
ZL = normed_labeled_train_data.values.astype('float32')  # (m, p)
yL = labeled_data_labels.values.astype('float32')
m, p = ZL.shape
if m < 2:
    raise ValueError(f"Need at least 2 labeled samples to form pairs, got {m}")

# PAIR_SAMPLE_FRAC and PAIR_SAMPLE_MAX come from the configuration cell; they are
# intentionally NOT re-assigned here so that editing the config actually takes effect.
BATCH_GEN = 50_000  # number of candidate pairs drawn per loop iteration

total_pairs = m * (m - 1) // 2
n_sample = min(max(1, int(PAIR_SAMPLE_FRAC * total_pairs)), PAIR_SAMPLE_MAX)

rng = np.random.default_rng(9)
Xs, ys, made = [], [], 0
while made < n_sample:
    # Never request more candidates than are still missing, so `made` cannot overshoot.
    b = min(BATCH_GEN, n_sample - made)
    i = rng.integers(0, m-1, size=b, endpoint=False)
    j = rng.integers(0, m,   size=b)
    # Rejection step: keep only ordered pairs i < j (drops self-pairs and mirrored pairs).
    mask = i < j
    if not np.any(mask):
        continue
    i = i[mask]; j = j[mask]

    Xs.append(ZL[i] - ZL[j])         # (b, p), float32
    ys.append(yL[i] - yL[j])         # (b,)
    made += len(i)

s_feature_diff_labeled = pd.DataFrame(np.vstack(Xs).astype('float32'))
s_target_diff          = pd.Series(np.concatenate(ys), dtype='float32')
assert len(s_feature_diff_labeled) == len(s_target_diff) == n_sample
print("Stage-2 number of sampled differences:", len(s_target_diff))


In [None]:
def build_model(input_dim,
                layers_: int = 3,
                units: int = 32,
                dropout: float = 0.0,
                l2: float = 0.0,
                activation: str = "relu",
                learning_rate: float = 1e-3):
    """Build and compile a fully connected regression network with one output.

    Args:
        input_dim: Number of input features.
        layers_: Number of hidden Dense layers.
        units: Width of every hidden layer.
        dropout: Dropout rate applied after each hidden activation; values that
            are falsy or <= 0 disable dropout.
        l2: L2 kernel-regularisation factor for the hidden layers; values that
            are falsy or <= 0 disable regularisation.
        activation: Name handed to `layers.Activation`, or "leaky_relu" to use
            a `LeakyReLU(alpha=0.2)` layer instead.
        learning_rate: Learning rate of the Adam optimiser.

    Returns:
        A `keras.Model` compiled with MAE loss and MAE/MSE metrics.
    """
    inputs = keras.Input(shape=(input_dim,))
    x = inputs
    kreg = regularizers.l2(l2) if (l2 and l2 > 0) else None
    for _ in range(layers_):
        # The activation is a separate layer so that LeakyReLU can be swapped in.
        x = layers.Dense(units, kernel_regularizer=kreg)(x)
        if activation == "leaky_relu":
            x = layers.LeakyReLU(alpha=0.2)(x)
        else:
            x = layers.Activation(activation)(x)
        if dropout and dropout > 0:
            x = layers.Dropout(dropout)(x)
    outputs = layers.Dense(1)(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                  loss="mae", metrics=["mae","mse"])
    return model



# Stage-2 network, created with build_model's default hyper-parameters; it is fitted
# further down in this cell on the sampled pairwise differences.
modification_model = build_model(INPUT_DIM)

class PrintDot(keras.callbacks.Callback):
    """Minimal progress indicator: one dot per epoch, a line break every 25 epochs."""

    def on_epoch_end(self, epoch, logs=None):
        # Start a new line on epochs 0, 25, 50, ... before emitting the dot.
        line_break = '\n' if epoch % 25 == 0 else ''
        print(line_break + '.', end='')

# Fit the modification network on (feature difference -> target difference) pairs,
# holding out 20 % of the rows for validation.
history = modification_model.fit(
    x=s_feature_diff_labeled,
    y=s_target_diff,
    epochs=EPOCHS_STAGE2,
    validation_split=0.2,
    verbose=0,
    callbacks=[PrintDot()],
)

# Keep the per-epoch training curves, both in memory and on disk.
hist2 = pd.DataFrame(history.history)
hist2['epoch'] = history.epoch
hist2.to_excel('outputs/stage2_errors.xlsx', index=False)
hist2.tail()


In [None]:

# Architecture overview plus a sanity check of the training-set shapes.
modification_model.summary()
print(
    "X shape:", s_feature_diff_labeled.shape,
    "y shape:", s_target_diff.shape, s_target_diff.dtype,
)

# Score the fitted network on the very data it was trained on (loss, MAE, MSE).
eval_loss, eval_mae, eval_mse = modification_model.evaluate(
    s_feature_diff_labeled,
    s_target_diff,
    verbose=0,
)
print("Eval -> loss/mae/mse:", eval_loss, eval_mae, eval_mse)

In [None]:


# Stage 2 (apply): correct every initial label with the predicted target difference
# between the unlabeled sample and the labeled sample it was copied from.

# Position (within the labeled set) of the closest labeled sample for each unlabeled
# row. Column 0 holds the nearest neighbour, whatever K_NEIGHBORS is.
closestL_idx = idxs[:, 0]

ZL_df = pd.DataFrame(ZL)
ZU_df = normed_unlabeled_train_data.reset_index(drop=True)

# The modification model was trained on  (Z_i - Z_j) -> (y_i - y_j).
# To estimate  y_U - y_L  it must therefore be fed  Z_U - Z_L.
# The previous code fed Z_L - Z_U, whose prediction approximates y_L - y_U, and then
# added it to y_L, so the correction was applied with the wrong sign.
feature_diff_most_sim = (ZU_df.values - ZL_df.iloc[closestL_idx].values).astype('float32')

modification_values = modification_model.predict(feature_diff_most_sim, verbose=0).ravel().astype('float32')

# y_U  ~=  y_L + (y_U - y_L)
final_labels_unlabeled = pd.Series(initial_labels.values + modification_values, name=TARGET).astype('float32')
pd.DataFrame(final_labels_unlabeled).to_excel('outputs/20230315_adjusted_labels.xlsx', index=False)

print("Stage-2 finished：Number of modified labels generated", len(final_labels_unlabeled))


In [None]:


# Stage 3: train the final regressor on labeled rows plus pseudo-labeled rows.
final_xtrain_data = pd.concat(
    [normed_labeled_train_data.reset_index(drop=True),
     normed_unlabeled_train_data.reset_index(drop=True)],
    axis=0, ignore_index=True   # avoid duplicated index labels in the stacked frame
).astype('float32')

final_ytrain_data = pd.concat(
    [labeled_data_labels.reset_index(drop=True).astype('float32'),
     final_labels_unlabeled.reset_index(drop=True).astype('float32')],
    axis=0, ignore_index=True
)

# Keras' `validation_split` carves the validation set from the LAST rows, before any
# shuffling. Without this permutation the validation set would consist solely of
# pseudo-labeled rows (they are stacked last), so early stopping and LR scheduling
# would be driven by pseudo-labels only.
shuffle_order = np.random.default_rng(42).permutation(len(final_xtrain_data))
final_xtrain_data = final_xtrain_data.iloc[shuffle_order].reset_index(drop=True)
final_ytrain_data = final_ytrain_data.iloc[shuffle_order].reset_index(drop=True)

print("Final training data shape:", final_xtrain_data.shape, final_ytrain_data.shape)

final_model = build_model(final_xtrain_data.shape[1])

stage3_callbacks = [
    PrintDot(),
    keras.callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor="val_loss", patience=5, factor=0.5, min_lr=1e-6),
]

# Pass the flat list of callbacks; it was previously wrapped in a second list.
history = final_model.fit(
    final_xtrain_data, final_ytrain_data,
    epochs=EPOCHS_STAGE3, validation_split=0.2, verbose=0, callbacks=stage3_callbacks
)

hist3 = pd.DataFrame(history.history); hist3['epoch'] = history.epoch
hist3.to_excel('outputs/stage3_errors.xlsx', index=False)
hist3.tail()


In [None]:

from sklearn.metrics import r2_score, mean_absolute_percentage_error

# Predict the held-out test set with the stage-3 model.
test_predictions = final_model.predict(normed_test_data, verbose=0).ravel()

# Export predictions, true targets and the (un-normalised) test features.
test_exports = (
    (pd.DataFrame({"y_pred": test_predictions}), 'outputs/20230406_DSSL_predictions.xlsx'),
    (pd.DataFrame(test_y_data), 'outputs/20230406_test_actual.xlsx'),
    (pd.DataFrame(test_x_data), 'outputs/20230406_test_X.xlsx'),
)
for export_frame, export_path in test_exports:
    export_frame.to_excel(export_path, index=False)

# Keras metrics on the test set.
loss, mae, mse = final_model.evaluate(normed_test_data, test_y_data, verbose=0)
print("loss:", loss, "MAE:", mae, "MSE:", mse)

# Additional scikit-learn metrics computed from the predictions.
print("R2 :", r2_score(test_y_data, test_predictions))
print("MAPE:", mean_absolute_percentage_error(test_y_data, test_predictions))

In [None]:

import math
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from tensorflow import keras
from tensorflow.keras import layers, regularizers


def make_callbacks():
    """Return a fresh pair of callbacks, both monitoring `val_loss`.

    The list holds an EarlyStopping (patience 10, best weights restored) followed
    by a ReduceLROnPlateau (patience 5, halves the learning rate, floor 1e-6).
    New instances are created on every call.
    """
    early_stop = keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=10, restore_best_weights=True
    )
    lr_decay = keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", patience=5, factor=0.5, min_lr=1e-6
    )
    return [early_stop, lr_decay]


# Plain float32 NumPy copies of the stage-3 training data and the test data,
# so that they can be indexed by fold positions; targets are flattened to 1-D.
X_train = final_xtrain_data.to_numpy().astype("float32")
y_train = final_ytrain_data.to_numpy().astype("float32").ravel()
X_test = normed_test_data.to_numpy().astype("float32")
y_test = test_y_data.to_numpy().astype("float32").ravel()



def build_model_tunable(input_dim,
                        layers_: int = 3,
                        units: int = 128,
                        dropout: float = 0.2,
                        l2: float = 1e-5,
                        activation: str = "relu",
                        learning_rate: float = 1e-3):
    """Build and compile the dense regression network used during tuning.

    Args:
        input_dim: Number of input features.
        layers_: Number of hidden Dense layers.
        units: Width of each hidden layer.
        dropout: Dropout rate after each hidden activation (falsy or <= 0 disables it).
        l2: L2 kernel-regularisation factor (falsy or <= 0 disables it).
        activation: Name for `layers.Activation`, or "leaky_relu" for
            `LeakyReLU(alpha=0.2)`.
        learning_rate: Learning rate of the Adam optimiser.

    Returns:
        A `keras.Model` compiled with MAE loss and an MAE metric.
    """
    if l2 and l2 > 0:
        weight_penalty = regularizers.l2(l2)
    else:
        weight_penalty = None
    use_dropout = bool(dropout and dropout > 0)

    net_input = keras.Input(shape=(input_dim,))
    hidden = net_input
    for _ in range(layers_):
        hidden = layers.Dense(units, kernel_regularizer=weight_penalty)(hidden)
        if activation == "leaky_relu":
            nonlinearity = layers.LeakyReLU(alpha=0.2)
        else:
            nonlinearity = layers.Activation(activation)
        hidden = nonlinearity(hidden)
        if use_dropout:
            hidden = layers.Dropout(dropout)(hidden)

    prediction = layers.Dense(1)(hidden)
    network = keras.Model(net_input, prediction)
    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="mae",
        metrics=["mae"],
    )
    return network


In [None]:


# Dedicated generator for the hyper-parameter search, seeded for reproducibility.
rng = np.random.default_rng(42)


def sample_params():
    """Draw one random hyper-parameter configuration from the module-level `rng`.

    Returns:
        dict with native Python values for the keys ``layers_``, ``units``,
        ``dropout``, ``l2``, ``activation``, ``learning_rate`` and ``batch_size``.
        ``l2`` and ``learning_rate`` are sampled log-uniformly.
    """
    # The draws happen in a fixed order so that the random stream stays reproducible.
    config = {}
    config["layers_"] = int(rng.choice([2, 3, 4]))
    config["units"] = int(rng.choice([64, 128, 256]))
    config["dropout"] = float(rng.uniform(0.0, 0.5))
    config["l2"] = float(10 ** rng.uniform(-6, -3))
    config["activation"] = str(rng.choice(["relu", "leaky_relu"]))
    config["learning_rate"] = float(10 ** rng.uniform(-4, math.log10(3e-2)))
    config["batch_size"] = int(rng.choice([32, 64, 128]))
    return config


def cv_mae(params, splits=CV_SPLITS):
    """Return the mean validation MAE of `params` over a shuffled K-fold split of X_train.

    Args:
        params: Hyper-parameter dict as produced by `sample_params`; ``batch_size``
            is used for fitting, every other entry is forwarded to
            `build_model_tunable`.
        splits: Number of folds.

    Returns:
        The average, over folds, of the MAE on the held-out fold, as a float.
    """
    model_kwargs = {name: value for name, value in params.items() if name != "batch_size"}
    folds = KFold(n_splits=splits, shuffle=True, random_state=42)

    fold_scores = []
    for train_pos, valid_pos in folds.split(X_train):
        X_valid, y_valid = X_train[valid_pos], y_train[valid_pos]
        # A fresh network per fold; the held-out fold also drives the callbacks.
        fold_model = build_model_tunable(INPUT_DIM, **model_kwargs)
        fold_model.fit(
            X_train[train_pos], y_train[train_pos],
            epochs=EPOCHS_STAGE3,
            batch_size=params["batch_size"],
            validation_data=(X_valid, y_valid),
            callbacks=make_callbacks(),
            verbose=0,
        )
        valid_pred = fold_model.predict(X_valid, verbose=0).ravel()
        fold_scores.append(mean_absolute_error(y_valid, valid_pred))
    return float(np.mean(fold_scores))

In [None]:
# Random search (n = TUNING_TRIALS)
if TUNING_TRIALS < 1:
    raise ValueError("TUNING_TRIALS must be at least 1 to run the random search")

trials = []
for t in range(TUNING_TRIALS):
    # Use a dedicated name: `p` already holds the feature count from an earlier cell.
    trial_params = sample_params()
    score = cv_mae(trial_params, splits=CV_SPLITS)
    trials.append({"trial": t+1, "cv_mae": score, **trial_params})

trials_df = pd.DataFrame(trials).sort_values("cv_mae")

# The sorted frame keeps the original row labels, i.e. positions in `trials`.
# Read the winner from the raw list so the hyper-parameters keep their native Python
# types (int / float / str) instead of being round-tripped through a mixed-dtype row.
best = dict(trials[int(trials_df.index[0])])
best_params = {k: v for k, v in best.items() if k not in ("trial", "cv_mae")}
print("Best CV MAE:", best["cv_mae"])
print("Best params:", best_params)


In [None]:

from sklearn.metrics import mean_absolute_error

# Rebuild the network with the best hyper-parameters found by the random search.
tuned_kwargs = {k: v for k, v in best_params.items() if k != "batch_size"}
# Coerce the integer-valued hyper-parameters the same way batch_size is coerced below,
# in case they arrive as NumPy scalars or floats.
for int_key in ("layers_", "units"):
    if int_key in tuned_kwargs:
        tuned_kwargs[int_key] = int(tuned_kwargs[int_key])

tuned_model = build_model_tunable(INPUT_DIM, **tuned_kwargs)

# `validation_split` uses the LAST 20 % of the rows as given (before shuffling).
# Permute the rows first so that the validation set is a random mix of labeled and
# pseudo-labeled samples rather than whatever happens to be stacked at the end.
tuned_order = np.random.default_rng(42).permutation(len(X_train))

hist_tuned = tuned_model.fit(
    X_train[tuned_order], y_train[tuned_order],
    epochs=EPOCHS_STAGE3,
    batch_size=int(best_params["batch_size"]),
    validation_split=0.2,
    callbacks=make_callbacks(),
    verbose=0
)

# Persist the per-epoch training curves (epochs numbered from 1).
hist_tuned_df = pd.DataFrame(hist_tuned.history)
hist_tuned_df["epoch"] = np.arange(1, len(hist_tuned_df["loss"]) + 1)
os.makedirs("outputs", exist_ok=True)
hist_tuned_df.to_excel("outputs/stage3_tuned_errors.xlsx", index=False)

# Best epoch = lowest val_loss (the quantity EarlyStopping monitors). The value is
# reported from the `val_mae` metric because val_loss also contains the L2 penalty
# and is therefore not a pure MAE; fall back to val_loss if the metric is missing.
val_loss_curve = hist_tuned.history["val_loss"]
best_epoch = int(np.argmin(val_loss_curve)) + 1
val_mae_curve = hist_tuned.history.get("val_mae", val_loss_curve)
best_val   = float(val_mae_curve[best_epoch - 1])
print(f"[tuned] best_epoch={best_epoch}, best_val_MAE={best_val:.6f}")

# Test-set MAE of the tuned model.
tuned_pred = tuned_model.predict(X_test, verbose=0).ravel()
tuned_test_mae = mean_absolute_error(y_test, tuned_pred)
print("Tuned Test MAE:", tuned_test_mae)

tuned_model.save("outputs/stage3_tuned_model.keras")


In [None]:

# Write the full trial table and a one-row summary of the random search to disk.
os.makedirs("outputs", exist_ok=True)
trials_df.to_excel("outputs/stage3_randomsearch_trials.xlsx", index=False)

tuning_summary_row = {
    "Method": "RandomSearch (Stage-3)",
    "CV MAE (best)": best["cv_mae"],
    "Test MAE": tuned_test_mae,
    "Best Params": best_params,
}
tuning_summary = pd.DataFrame([tuning_summary_row])
tuning_summary.to_excel("outputs/stage3_tuning_summary.xlsx", index=False)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

def quick_eval(model, X, y):
    """Score `model` on (X, y) and return the tuple (MAE, MSE, R2, MAPE).

    `model.predict(X, verbose=0)` is flattened to 1-D before the metrics are computed.
    """
    predicted = model.predict(X, verbose=0).ravel()
    return (
        mean_absolute_error(y, predicted),
        mean_squared_error(y, predicted),
        r2_score(y, predicted),
        mean_absolute_percentage_error(y, predicted),
    )

# Side-by-side test metrics (MAE, MSE, R2, MAPE) for the baseline and,
# when it exists in this kernel, the tuned model.
y_test_flat = np.asarray(test_y_data).ravel()

baseline_scores = quick_eval(final_model, normed_test_data, y_test_flat)
print("Baseline (final_model):", *baseline_scores)

if "tuned_model" in globals():
    tuned_scores = quick_eval(tuned_model, normed_test_data, y_test_flat)
    print("Tuned:", *tuned_scores)
