In [None]:
# Install/upgrade the third-party packages this notebook relies on (uses %pip so the
# packages land in the running kernel's environment).
# NOTE(review): versions are unpinned, and `tensorflow` (imported by the following
# cells) is not in this list — confirm the environment already provides it.
%pip install -U pandas numpy scikit-learn matplotlib openpyxl scikeras

In [None]:
import sys, tensorflow as tf
# Environment sanity check: which interpreter runs this kernel, which TensorFlow
# build is loaded, and which GPU devices TensorFlow can see.
_env_report = (
    (sys.executable,),
    ("TF:", tf.__version__),
    ("GPUs:", tf.config.list_physical_devices("GPU")),
)
for _fields in _env_report:
    print(*_fields)

In [None]:

import os, random, numpy as np, pandas as pd, matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers


# Seed every RNG the notebook uses (Python, NumPy, TensorFlow) with the same value
# so repeated runs start from the same random state.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

# All exported spreadsheets are written under ./outputs.
os.makedirs("outputs", exist_ok=True)

print("TensorFlow:", tf.__version__)

In [None]:
# --- Configuration ---------------------------------------------------------
# Name of the regression target column, exactly as used to index the spreadsheet.
# NOTE(review): the name contains two consecutive spaces — confirm it matches the
# header in the workbook.
TARGET = "Grid  Power"
# When True, rows whose target equals 0 are removed (MAPE is reported later in the
# notebook, and it divides by the true target).
FILTER_Y_EQ_ZERO = True

# --- Load ------------------------------------------------------------------
raw_data = pd.read_excel('raw data.xlsx')

# Fail early with a readable message instead of a bare KeyError further down.
if TARGET not in raw_data.columns:
    raise KeyError(
        f"Target column {TARGET!r} not found; available columns: {raw_data.columns.tolist()}"
    )

# 'times' is not used as a model input (presumably a timestamp column — verify);
# errors='ignore' keeps this a no-op when the column is absent.
raw_data = raw_data.drop(columns=['times'], errors='ignore').copy()

# --- Clean -----------------------------------------------------------------
# Coerce EVERY column to numeric, target included. Previously the target was left
# untouched, so a non-numeric target cell survived the zero filter and dropna and
# only failed later at `.astype("float32")`.
num_cols = [c for c in raw_data.columns if c != TARGET]
raw_data = raw_data.apply(pd.to_numeric, errors='coerce')

# Unparseable cells became NaN above; discard those rows.
raw_data = raw_data.dropna()

# Filter zeros only after coercion so the comparison is numeric.
if FILTER_Y_EQ_ZERO:
    raw_data = raw_data[raw_data[TARGET] != 0]

raw_data = raw_data.reset_index(drop=True)

if raw_data.empty:
    raise ValueError("No rows left after cleaning 'raw data.xlsx'; check the file and TARGET.")

print("Data shape:", raw_data.shape)
display(raw_data.head(3))


In [None]:

# Train/Test (80/20): sample 80% of the rows for training; the rows that were not
# sampled form the test set.
train_dataset = raw_data.sample(frac=0.8, random_state=0)
test_dataset = raw_data.drop(index=train_dataset.index)

# Test features / target as separate objects.
test_y_data = test_dataset[TARGET].copy()
test_x_data = test_dataset.drop(columns=[TARGET]).copy()

# Within the training pool, 60% of the rows keep their labels; the remaining 40%
# are treated as unlabeled.
labeled_train_data = train_dataset.sample(frac=0.6, random_state=0).copy()
unlabeled_train_data = train_dataset.drop(index=labeled_train_data.index).copy()

# Detach the target from both subsets. The true values of the "unlabeled" rows are
# kept aside in `unlabeled_train_data_actual`.
unlabeled_train_data_actual = unlabeled_train_data[TARGET].copy()
unlabeled_train_data = unlabeled_train_data.drop(columns=[TARGET])
labeled_data_labels = labeled_train_data[TARGET].copy()
labeled_train_data = labeled_train_data.drop(columns=[TARGET])

# Cell output: sizes of (train, labeled, unlabeled, test).
tuple(len(part) for part in (train_dataset, labeled_train_data, unlabeled_train_data, test_dataset))


In [None]:


from sklearn.preprocessing import StandardScaler


print("Columns:", train_dataset.columns.tolist())

# Every column except the target is a model input.
feature_cols = [col for col in train_dataset.columns if col != TARGET]

# The scaler is fitted on the labeled training rows only and then reused for
# every other subset, so all subsets share the same standardisation.
scaler_fs = StandardScaler()
X_fs_tr = scaler_fs.fit_transform(labeled_train_data[feature_cols]).astype("float32")
y_fs_tr = labeled_data_labels.to_numpy().astype("float32")

# Test arrays go through the already-fitted scaler.
X_fs_te = scaler_fs.transform(test_dataset[feature_cols]).astype("float32")
y_fs_te = test_dataset[TARGET].to_numpy().astype("float32")

# Number of input features the networks must accept.
INPUT_DIM_FS = X_fs_tr.shape[1]
print("Supervised shapes → X_tr:", X_fs_tr.shape, " X_te:", X_fs_te.shape, "  INPUT_DIM_FS:", INPUT_DIM_FS)


In [None]:
# === data prep ===

# Unlabeled features, standardised with the scaler fitted on the labeled rows.
X_u = scaler_fs.transform(unlabeled_train_data[feature_cols]).astype("float32")

# Hold out the last 20% of the labeled rows as a validation set (the labeled rows
# come from DataFrame.sample, so their order is already random).
n_all = len(X_fs_tr)
n_val = int(round(n_all * 0.20))
_cut = n_all - n_val
X_l_tr, X_l_va = X_fs_tr[:_cut], X_fs_tr[_cut:]
y_l_tr, y_l_va = y_fs_tr[:_cut], y_fs_tr[_cut:]

# Cell output: shapes of the unlabeled, labeled-train and labeled-validation arrays.
X_u.shape, X_l_tr.shape, X_l_va.shape

In [None]:

def build_regressor(input_dim, layers_=3, units=32, dropout=0.0, l2=0.0,
                    activation="relu", learning_rate=1e-3, loss="mae",
                    metrics=("mae", "mse")):
    """Build and compile a fully-connected Keras regressor with one output unit.

    Relies on `keras`, `layers` and `regularizers` imported in the notebook's
    import cell.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    layers_ : int
        Number of hidden Dense layers.
    units : int
        Width of every hidden layer.
    dropout : float
        Dropout rate applied after each hidden activation; falsy or <= 0 adds no
        Dropout layers.
    l2 : float
        L2 kernel-regularisation factor for the hidden layers; falsy or <= 0
        disables regularisation.
    activation : str
        Hidden activation. "leaky_relu" inserts a LeakyReLU layer (constructed
        with 0.2); any other value is passed to `layers.Activation`.
    learning_rate : float
        Learning rate of the Adam optimizer.
    loss : str
        Loss passed to `model.compile`.
    metrics : iterable
        Metrics passed (as a list) to `model.compile`.

    Returns
    -------
    keras.Model
        The compiled model.
    """
    inputs = keras.Input(shape=(input_dim,))
    x = inputs
    kreg = regularizers.l2(l2) if (l2 and l2 > 0) else None
    for _ in range(layers_):
        # Dense without activation so the activation layer can be chosen below.
        x = layers.Dense(units, kernel_regularizer=kreg)(x)
        if activation == "leaky_relu":
            x = layers.LeakyReLU(0.2)(x)
        else:
            x = layers.Activation(activation)(x)
        if dropout and dropout > 0:
            x = layers.Dropout(dropout)(x)
    outputs = layers.Dense(1)(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                  loss=loss, metrics=list(metrics))
    return model


# Bind unconditionally. The former `try: build_model / except NameError` guard made
# the notebook depend on leftover kernel state: a stale `build_model` from an
# earlier session would silently be used instead of the definition above.
build_model = build_regressor

import tensorflow as tf
def mc_predict(model, X, T=20, batch_size=512):
    """Monte-Carlo prediction: mean and spread over T stochastic forward passes.

    The model is called with ``training=True`` on every pass (for Keras models this
    keeps Dropout layers active), so repeated passes give different outputs whose
    standard deviation serves as an uncertainty estimate.

    Parameters
    ----------
    model : callable
        Called as ``model(batch, training=True)``; must return an array/tensor of
        shape ``(len(batch), 1)``.
    X : array-like
        Input rows; sliced along the first axis.
    T : int
        Number of stochastic passes (must be >= 1).
    batch_size : int or None
        Maximum number of rows per model call. ``None`` sends all rows at once.
        Previously this argument was accepted but ignored, so the whole of ``X``
        always went through the model in a single call.

    Returns
    -------
    (mean, std) : tuple of np.ndarray
        Per-row mean and standard deviation over the T passes, each of shape
        ``(len(X),)``.

    Raises
    ------
    ValueError
        If ``T < 1`` or ``batch_size < 1``.
    """
    if T < 1:
        raise ValueError(f"T must be >= 1, got {T}")
    n_rows = len(X)
    if n_rows == 0:
        # Nothing to predict: return empty vectors instead of failing in np.stack.
        return np.zeros((0,), dtype="float32"), np.zeros((0,), dtype="float32")
    step = n_rows if batch_size is None else int(batch_size)
    if step < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    preds = []
    for _ in range(T):
        chunks = []
        for start in range(0, n_rows, step):
            y = model(X[start:start + step], training=True)
            # np.asarray also converts eager tensors; drop the trailing unit axis.
            chunks.append(np.squeeze(np.asarray(y), axis=-1))
        preds.append(np.concatenate(chunks, axis=0))
    preds = np.stack(preds, axis=0)  # [T, N]
    return preds.mean(0), preds.std(0)


In [None]:


# --- Warm-up / selection hyper-parameters ---
LR_WARM   = 1e-3                             # Adam learning rate for the warm-up model
EPOCHS_W  = 100                              # upper bound on warm-up epochs
DROPOUT_M = 0.2                              # dropout rate of the warm-up model
T_MC      = 30                               # number of stochastic passes in mc_predict
PCT_KEEP  = 0.3                              # fraction of unlabeled rows to keep
MIN_KEEP  = max(200, int(0.1 * len(X_u)))    # lower bound on the number of kept rows

# Warm-up model: trained on the labeled rows only, with dropout enabled.
warm = build_model(
    INPUT_DIM_FS,
    layers_=3, units=32, dropout=DROPOUT_M, l2=0.0,
    activation="relu", learning_rate=LR_WARM, loss="mae",
)

# Both callbacks watch validation MAE: stop early (restoring the best weights) and
# halve the learning rate on plateaus.
_warm_callbacks = [
    keras.callbacks.EarlyStopping(monitor="val_mae", patience=10, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor="val_mae", factor=0.5, patience=5, min_lr=1e-5),
]
warm.fit(
    X_l_tr, y_l_tr,
    validation_data=(X_l_va, y_l_va),
    epochs=EPOCHS_W,
    batch_size=64,
    verbose=0,
    callbacks=_warm_callbacks,
)

# Predictive mean and spread for every unlabeled row.
u_mean, u_std = mc_predict(warm, X_u, T=T_MC, batch_size=1024)

# Keep the k rows with the smallest spread. Slicing past the end of the array
# simply returns every row, so when k exceeds len(X_u) nothing is filtered out.
k = max(int(len(X_u) * PCT_KEEP), MIN_KEEP)
idx = np.argsort(u_std)[:k]
X_pseudo, y_pseudo, std_pseudo = X_u[idx], u_mean[idx], u_std[idx]

print(f"Unlabeled = {len(X_u)}  →  kept {len(X_pseudo)} pseudo-labels (PCT_KEEP={PCT_KEEP})")


In [None]:
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_absolute_error, mean_squared_error


# Per-sample weight given to pseudo-labeled rows (true-labeled rows get 1.0).
ALPHA_PSEUDO = 0.5

# Training set = labeled rows followed by the selected pseudo-labeled rows.
X_pl_tr = np.concatenate((X_l_tr, X_pseudo), axis=0)
y_pl_tr = np.concatenate((y_l_tr, y_pseudo), axis=0)
w_pl_tr = np.concatenate(
    (np.ones(len(X_l_tr)), np.full(len(X_pseudo), ALPHA_PSEUDO)),
    axis=0,
)

# Final model: same architecture as the warm-up model but without dropout.
final_pl = build_model(
    INPUT_DIM_FS,
    layers_=3, units=32, dropout=0.0, l2=0.0,
    activation="relu", learning_rate=1e-3, loss="mae",
)

# Validation uses only truly labeled rows; both callbacks monitor validation MAE.
_final_callbacks = [
    keras.callbacks.EarlyStopping(monitor="val_mae", patience=12, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor="val_mae", factor=0.5, patience=6, min_lr=1e-5),
]
final_hist = final_pl.fit(
    X_pl_tr, y_pl_tr,
    sample_weight=w_pl_tr,
    validation_data=(X_l_va, y_l_va),
    epochs=100,
    batch_size=64,
    verbose=0,
    callbacks=_final_callbacks,
)

In [None]:

def mb(y_true, y_pred):
    """Compute the regression metrics reported for one data split.

    Returns a dict with keys "Loss", "MAE", "MSE", "RMSE", "R2" and "MAPE".
    "Loss" carries the same value as "MAE"; "RMSE" is the square root of "MSE".
    """
    abs_err = mean_absolute_error(y_true, y_pred)
    sq_err = mean_squared_error(y_true, y_pred)
    return {
        "Loss": abs_err,
        "MAE": abs_err,
        "MSE": sq_err,
        "RMSE": np.sqrt(sq_err),
        "R2": r2_score(y_true, y_pred),
        "MAPE": mean_absolute_percentage_error(y_true, y_pred),
    }

# Flattened predictions of the final model on the labeled-train, validation and
# test features (in that order).
yhat_tr, yhat_va, yhat_te = (
    final_pl.predict(features, verbose=0).ravel()
    for features in (X_l_tr, X_l_va, X_fs_te)
)

# Metrics per split, always measured against true labels.
m_tr = mb(y_l_tr, yhat_tr)
m_va = mb(y_l_va, yhat_va)
m_te = mb(y_fs_te, yhat_te)

# One row per split, columns in a fixed reporting order.
_metric_order = ["Loss", "MAE", "MSE", "RMSE", "R2", "MAPE"]
_per_split = {"Train": m_tr, "Val": m_va, "Test": m_te}
summary_df = pd.DataFrame.from_dict(_per_split, orient="index")[_metric_order]
print("[Pseudo-Labeling] Summary:\n", summary_df)

In [None]:

os.makedirs("outputs", exist_ok=True)

# Test-set predictions in their own workbook.
_test_preds = pd.DataFrame({"y_true": y_fs_te, "y_pred": yhat_te})
_test_preds.to_excel("outputs/pseudolabel_test_preds.xlsx", index=False)


def _residual_frame(y_true, y_pred):
    """Table of true values, predictions and residuals (true minus predicted)."""
    return pd.DataFrame({"y_true": y_true, "y_pred": y_pred, "residual": y_true - y_pred})


# One workbook holding the metric summary, per-split residuals and the selected
# pseudo-labels.
with pd.ExcelWriter("outputs/pseudolabel_errors.xlsx", engine="openpyxl") as w:
    summary_df.to_excel(w, sheet_name="summary")
    for _sheet, _truth, _pred in (
        ("pred_train", y_l_tr, yhat_tr),
        ("pred_val", y_l_va, yhat_va),
        ("pred_test", y_fs_te, yhat_te),
    ):
        _residual_frame(_truth, _pred).to_excel(w, sheet_name=_sheet, index=False)

    # `idx` holds row positions within the unlabeled array X_u.
    pd.DataFrame({"idx_in_unlabeled": idx, "pseudo_y": y_pseudo, "std": std_pseudo}).to_excel(
        w, sheet_name="pseudo_selected", index=False
    )

# Training history of the final model, with a 1-based epoch column.
hist_df = pd.DataFrame(final_hist.history)
hist_df["epoch"] = np.arange(1, len(hist_df) + 1)
hist_df.to_excel("outputs/pseudolabel_history.xlsx", index=False)