In [None]:
#%pip install pandas numpy scikit-learn matplotlib tensorflow shap openpyxl jupyterlab

In [None]:
import sys, tensorflow as tf
# Report which interpreter backs this kernel, the TensorFlow build in use,
# and whichever GPU devices TensorFlow can see.
print(sys.executable)
tf_version = tf.__version__
print("TF:", tf_version)
gpu_devices = tf.config.list_physical_devices("GPU")
print("GPUs:", gpu_devices)

In [None]:

import os, random, numpy as np, pandas as pd, matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


# Seed the Python, NumPy and TensorFlow random generators with the same value.
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(42)

# Every exported spreadsheet and figure in this notebook is written below this directory.
os.makedirs("outputs", exist_ok=True)

print("TensorFlow:", tf.__version__)


In [None]:


# --- Data ---------------------------------------------------------------
# Name of the target column in the spreadsheet (note the double space).
TARGET = "Grid  Power"
# When True, rows whose target equals 0 are removed while loading the data.
FILTER_Y_EQ_ZERO = True

# --- Stage 1: nearest-neighbour initial labels ---------------------------
# n_neighbors handed to NearestNeighbors.
K_NEIGHBORS = 1

# --- Stage 2: pairwise-difference sampling -------------------------------
# Fraction of all labeled pairs to sample, and a hard cap on that number.
PAIR_SAMPLE_FRAC, PAIR_SAMPLE_MAX = 0.10, 200_000

# --- Training length (Stage 2 and Stage 3) -------------------------------
EPOCHS_STAGE2 = EPOCHS_STAGE3 = 100

# --- SHAP ----------------------------------------------------------------
# Background rows, rows to explain, and `nsamples` passed to shap_values().
SHAP_BACKGROUND_N, SHAP_SAMPLE_N, SHAP_NSAMPLES = 200, 500, 200


In [None]:


# Load the spreadsheet; the 'times' column is discarded when it is present.
raw_data = pd.read_excel('raw data.xlsx')
raw_data = raw_data.drop(columns=['times'], errors='ignore')

# Fail early with a readable message instead of a bare KeyError further down.
if TARGET not in raw_data.columns:
    raise KeyError(
        f"Target column {TARGET!r} not found; available columns: {list(raw_data.columns)}"
    )

# Coerce EVERY column to numeric, the target included. Previously the target was
# skipped, so non-numeric target cells survived dropna() and only failed later
# when the labels were cast to float32.
raw_data = raw_data.apply(pd.to_numeric, errors='coerce').dropna()

# Apply the zero filter after coercion so that it compares numbers, not raw cells.
if FILTER_Y_EQ_ZERO:
    raw_data = raw_data[raw_data[TARGET] != 0]

# A clean 0..n-1 index is what the later index-based splits operate on.
raw_data = raw_data.reset_index(drop=True)

print("Data shape:", raw_data.shape)
display(raw_data.head(3))


In [None]:

# Train/Test (80/20): sample 80% of the rows for training, the rest is the test set.
train_dataset = raw_data.sample(frac=0.8, random_state=0)
test_dataset = raw_data.drop(index=train_dataset.index)

# Test features and test target, kept as independent copies.
test_y_data = test_dataset[TARGET].copy()
test_x_data = test_dataset.drop(columns=[TARGET]).copy()

# Within the training pool, 60% of the rows keep their labels ("labeled");
# the remaining rows are treated as unlabeled.
_labeled_rows = train_dataset.sample(frac=0.6, random_state=0)
_unlabeled_rows = train_dataset.drop(index=_labeled_rows.index)

# Separate the target from the features. The true targets of the unlabeled
# rows are kept aside in `unlabeled_train_data_actual`.
labeled_data_labels = _labeled_rows[TARGET].copy()
labeled_train_data = _labeled_rows.drop(columns=[TARGET])
unlabeled_train_data_actual = _unlabeled_rows[TARGET].copy()
unlabeled_train_data = _unlabeled_rows.drop(columns=[TARGET])

len(train_dataset), len(labeled_train_data), len(unlabeled_train_data), len(test_dataset)


In [None]:


# Per-column summary statistics of the training pool; the target row is
# removed (if present) so that only feature statistics remain.
train_stats = (
    train_dataset
    .describe()
    .transpose()
    .drop(index=TARGET, errors='ignore')
)

def norm(df):
    """Standardise `df` column-wise with the mean/std stored in `train_stats`.

    A standard deviation of exactly 0 is replaced by 1.0 so that such columns
    do not cause a division by zero.
    """
    center = train_stats['mean']
    scale = train_stats['std'].replace(0, 1.0)
    return (df - center) / scale

# Standardise the three feature tables with the same statistics and store them as float32.
normed_labeled_train_data, normed_unlabeled_train_data, normed_test_data = (
    norm(_features).astype('float32')
    for _features in (labeled_train_data, unlabeled_train_data, test_x_data)
)

# Number of feature columns; used as the input width of the networks.
INPUT_DIM = normed_labeled_train_data.shape[1]
print("Input dim:", INPUT_DIM)


In [None]:

from sklearn.neighbors import NearestNeighbors


# Stage 1: give every unlabeled row the label of its nearest labeled row
# (Euclidean distance in the standardised feature space).
nbrs = NearestNeighbors(n_neighbors=K_NEIGHBORS, metric="euclidean").fit(normed_labeled_train_data.values)
dists, idxs = nbrs.kneighbors(normed_unlabeled_train_data.values, return_distance=True)

# `idxs` has shape (n_unlabeled, K_NEIGHBORS). Flattening it with ravel() only
# lines up with the unlabeled rows when K_NEIGHBORS == 1; for larger K it would
# yield K labels per row. kneighbors returns neighbours ordered by increasing
# distance, so column 0 is always the nearest one.
nearest_labeled_idx = idxs[:, 0]
initial_labels = labeled_data_labels.iloc[nearest_labeled_idx].reset_index(drop=True)
unlabeled_train_data_actual = unlabeled_train_data_actual.reset_index(drop=True)
assert len(initial_labels) == len(unlabeled_train_data_actual), "one initial label per unlabeled row expected"

# Export the initial (copied) labels next to the true labels for later comparison.
initial_labels.to_frame(name=TARGET).to_excel('outputs/20230315_initial_labels.xlsx', index=False)
unlabeled_train_data_actual.to_frame(name=TARGET).to_excel('outputs/20230315_actual_labels.xlsx', index=False)

print("Stage-1 finished：initial tag num", len(initial_labels))


In [None]:

# Stage 2 training data: differences between randomly chosen pairs of labeled rows.
ZL = normed_labeled_train_data.values.astype('float32')  # (m, p)
yL = labeled_data_labels.values.astype('float32')
m, p = ZL.shape
if m < 2:
    # Without this guard rng.integers() below fails with an opaque "low >= high".
    raise ValueError(f"Stage 2 needs at least two labeled samples to form a pair, got {m}")

# PAIR_SAMPLE_FRAC and PAIR_SAMPLE_MAX are taken from the configuration cell.
# They used to be re-assigned here, which silently overrode any value set there.
BATCH_GEN = 50_000  # number of candidate pairs drawn per loop iteration

total_pairs = m * (m - 1) // 2
n_sample = min(max(1, int(PAIR_SAMPLE_FRAC * total_pairs)), PAIR_SAMPLE_MAX)

rng = np.random.default_rng(9)
Xs, ys, made = [], [], 0
while made < n_sample:
    b = min(BATCH_GEN, n_sample - made)
    # Draw candidate index pairs and keep only those with i < j (rejection
    # sampling); i never needs to reach m-1 because no j could exceed it.
    i = rng.integers(0, m-1, size=b, endpoint=False)
    j = rng.integers(0, m,   size=b)
    mask = i < j
    if not np.any(mask):
        continue
    i = i[mask]; j = j[mask]

    Xs.append(ZL[i] - ZL[j])         # (b, p), float32
    ys.append(yL[i] - yL[j])         # (b,)
    made += len(i)

s_feature_diff_labeled = pd.DataFrame(np.vstack(Xs).astype('float32'))
s_target_diff          = pd.Series(np.concatenate(ys), dtype='float32')
# Feature differences and target differences must stay row-aligned.
assert len(s_feature_diff_labeled) == len(s_target_diff) == n_sample
print("Stage-2 number of sampled differences:", len(s_target_diff))
print("Stage-2 number of sampled differences:", s_feature_diff_labeled.shape[0])


In [None]:

'''
def build_model(input_dim):
    model = keras.Sequential([
        layers.Dense(32, activation="relu", input_shape=(input_dim,)),
        layers.Dense(32, activation="relu"),
        layers.Dense(32, activation="relu"),
        layers.Dense(1)
    ])
    model.compile(loss='mae', optimizer="adam", metrics=['mae','mse'])
    return model
'''

def build_model(input_dim):
    """Build and compile a small fully connected regression network.

    The network has three hidden Dense layers of 32 ReLU units followed by a
    single linear output unit. It is compiled with the Adam optimizer, MAE
    loss, and MAE/MSE metrics.

    Args:
        input_dim: number of input features.

    Returns:
        The compiled keras.Model.
    """
    inputs = keras.Input(shape=(input_dim,))
    hidden = inputs
    for _ in range(3):
        hidden = layers.Dense(32, activation="relu")(hidden)
    outputs = layers.Dense(1)(hidden)

    model = keras.Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mae", metrics=["mae", "mse"])
    return model


# Network that is fitted on the sampled feature differences / target differences (Stage 2).
modification_model = build_model(input_dim=INPUT_DIM)

class PrintDot(keras.callbacks.Callback):
    """Minimal progress indicator: one dot per epoch, a line break every 25 epochs."""

    def on_epoch_end(self, epoch, logs=None):
        # Start a fresh line at epochs 0, 25, 50, ... and then emit the dot.
        line_break = '\n' if epoch % 25 == 0 else ''
        print(line_break + '.', end='')

# Fit the difference model silently (verbose=0); PrintDot provides the progress output.
history = modification_model.fit(
    s_feature_diff_labeled,
    s_target_diff,
    epochs=EPOCHS_STAGE2,
    validation_split=0.2,
    verbose=0,
    callbacks=[PrintDot()],
)

# Per-epoch metrics as a table, with the epoch number as the last column.
hist2 = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist2.to_excel('outputs/stage2_errors.xlsx', index=False)
hist2.tail()


In [None]:

# Architecture overview plus a quick look at the shapes/dtype of the Stage-2 training data.
modification_model.summary()
print("X shape:", s_feature_diff_labeled.shape, "y shape:", s_target_diff.shape, s_target_diff.dtype)

# evaluate() returns the loss followed by the compiled metrics (MAE, MSE).
stage2_scores = modification_model.evaluate(s_feature_diff_labeled, s_target_diff, verbose=0)
eval_loss, eval_mae, eval_mse = stage2_scores
print("Eval -> loss/mae/mse:", eval_loss, eval_mae, eval_mse)

In [None]:


# Index of the nearest labeled row for every unlabeled row. Column 0 is used
# (instead of ravel()) so the result has one entry per unlabeled row for any K.
closestL_idx = idxs[:, 0]

ZL_df = pd.DataFrame(ZL)
ZU_df = normed_unlabeled_train_data.reset_index(drop=True)

# The difference model was trained on (Z_a - Z_b) -> (y_a - y_b). To obtain a
# correction that can be ADDED to the neighbour's label we need y_U - y_L, so
# the input must be Z_U - Z_L. The previous code fed Z_L - Z_U and still added
# the prediction, which moves the label in the wrong direction
# (y_L + (y_L - y_U) instead of y_L + (y_U - y_L)).
feature_diff_most_sim = (ZU_df.values - ZL_df.iloc[closestL_idx].values).astype('float32')

# Predicted target difference between each unlabeled row and its nearest labeled row.
modification_values = modification_model.predict(feature_diff_most_sim, verbose=0).ravel().astype('float32')

# Adjusted pseudo-label = neighbour's label + predicted difference.
final_labels_unlabeled = pd.Series(initial_labels.values + modification_values, name=TARGET).astype('float32')
pd.DataFrame(final_labels_unlabeled).to_excel('outputs/20230315_adjusted_labels.xlsx', index=False)

print("Stage-2 finished：Number of modified labels generated", len(final_labels_unlabeled))


In [None]:


# Stage 3 training set: labeled rows followed by the pseudo-labelled rows.
# ignore_index=True gives a unique 0..n-1 index instead of two overlapping ranges.
final_xtrain_data = pd.concat(
    [normed_labeled_train_data, normed_unlabeled_train_data],
    axis=0, ignore_index=True
).astype('float32')

final_ytrain_data = pd.concat(
    [labeled_data_labels.astype('float32'),
     final_labels_unlabeled.astype('float32')],
    axis=0, ignore_index=True
)
assert len(final_xtrain_data) == len(final_ytrain_data), "features and labels must be row-aligned"

# Keras takes the validation split from the LAST rows of the arrays, before any
# shuffling. Without the shuffle below the validation set would consist only of
# pseudo-labelled rows and those rows would never be trained on. Features and
# labels are permuted with the same fixed-seed order.
shuffle_order = np.random.default_rng(42).permutation(len(final_xtrain_data))
final_xtrain_data = final_xtrain_data.iloc[shuffle_order].reset_index(drop=True)
final_ytrain_data = final_ytrain_data.iloc[shuffle_order].reset_index(drop=True)

print("Final training data shape:", final_xtrain_data.shape, final_ytrain_data.shape)

final_model = build_model(final_xtrain_data.shape[1])
history = final_model.fit(
    final_xtrain_data, final_ytrain_data,
    epochs=EPOCHS_STAGE3, validation_split=0.2, verbose=0, callbacks=[PrintDot()]
)

# Per-epoch metrics as a table, exported for later inspection.
hist3 = pd.DataFrame(history.history); hist3['epoch'] = history.epoch
hist3.to_excel('outputs/stage3_errors.xlsx', index=False)
hist3.tail()


In [None]:


from sklearn.metrics import r2_score, mean_absolute_percentage_error

# Predictions of the final model on the held-out test features, flattened to 1-D.
test_predictions = final_model.predict(normed_test_data, verbose=0).ravel()

# Export predictions, true targets and the (un-normalised) test features.
_exports = (
    (pd.DataFrame({"y_pred": test_predictions}), 'outputs/20230406_DSSL_predictions.xlsx'),
    (pd.DataFrame(test_y_data), 'outputs/20230406_test_actual.xlsx'),
    (pd.DataFrame(test_x_data), 'outputs/20230406_test_X.xlsx'),
)
for _frame, _path in _exports:
    _frame.to_excel(_path, index=False)

# Keras metrics on the test set: loss followed by the compiled MAE and MSE.
test_scores = final_model.evaluate(normed_test_data, test_y_data, verbose=0)
loss, mae, mse = test_scores
print("loss:", loss, "MAE:", mae, "MSE:", mse)

# Additional scikit-learn metrics computed from the same predictions.
print("R2 :", r2_score(test_y_data, test_predictions))
print("MAPE:", mean_absolute_percentage_error(test_y_data, test_predictions))


In [None]:


import shap

# Feature names in the column order of the Stage-3 training table.
feat_names = final_xtrain_data.columns.tolist()

# Never ask for more rows than the training table holds.
_n_train_rows = len(final_xtrain_data)
bg_n = min(SHAP_BACKGROUND_N, _n_train_rows)
smpl_n = min(SHAP_SAMPLE_N, _n_train_rows)

# Background rows for the explainer and the rows to be explained,
# drawn independently with fixed seeds.
background = final_xtrain_data.sample(n=bg_n, random_state=42)
X_sample = final_xtrain_data.sample(n=smpl_n, random_state=7)

def f_predict(x):
    """Prediction function handed to the explainer: silent predict() of the final model."""
    return final_model.predict(x, verbose=0)

explainer = shap.KernelExplainer(f_predict, background)
shap_values = explainer.shap_values(X_sample, nsamples=SHAP_NSAMPLES)
# If a list is returned, keep its first element.
shap_values = shap_values[0] if isinstance(shap_values, list) else shap_values

print("SHAP finished：", np.array(shap_values).shape)


In [None]:
os.makedirs("outputs", exist_ok=True)

# SHAP values as a 2-D (rows, features) array; squeeze() removes a trailing
# singleton output axis if one is present.
sv = np.array(shap_values).squeeze()   # e.g. (500, 25)
assert sv.ndim == 2, f"sv ndim should be 2, got {sv.shape}"

# Expected value of the explainer as a plain float.
base = explainer.expected_value
if isinstance(base, (list, np.ndarray)):
    base = float(np.squeeze(base))

# Global importance = mean absolute SHAP value per feature, largest first.
imp = np.mean(np.abs(sv), axis=0).ravel()      # (M,)
order = np.argsort(imp)[::-1]
top3  = [feat_names[i] for i in order[:3]]

# Bar plot of global importance. The file is named "top10", so the plot is
# limited to 10 features (SHAP's default would show more).
plt.figure()
shap.summary_plot(sv, X_sample, feature_names=feat_names, plot_type="bar",
                  max_display=10, show=False)
plt.tight_layout(); plt.savefig("outputs/shap_top10_bar.png", dpi=220); plt.close("all")

# Beeswarm summary plot.
plt.figure()
shap.summary_plot(sv, X_sample, feature_names=feat_names, show=False)
plt.tight_layout(); plt.savefig("outputs/shap_beeswarm.png", dpi=220); plt.close("all")

# Global importance table, sorted in descending order.
pd.DataFrame({"feature": feat_names, "mean_abs_shap": imp}) \
  .sort_values("mean_abs_shap", ascending=False) \
  .to_csv("outputs/shap_global_importance.csv", index=False)

# Dependence plots for the three most important features.
# No figure is created beforehand: dependence_plot opens its own when no axis is
# passed, so a pre-created one was left open and never closed. close("all")
# releases every figure after saving.
for f in top3:
    shap.dependence_plot(f, sv, X_sample, feature_names=feat_names,
                         interaction_index="auto", show=False)
    # Feature names come from spreadsheet headers and may contain characters
    # that are not valid in file names (e.g. "/"); replace only those.
    safe_name = "".join("_" if (ch in '\\/:*?"<>|' or ord(ch) < 32) else ch for ch in str(f))
    plt.tight_layout(); plt.savefig(f"outputs/shap_dependence_{safe_name}.png", dpi=220); plt.close("all")

# Waterfall plots for the first three explained rows.
for k, (_, row) in enumerate(X_sample.head(3).iterrows(), start=1):
    sv_row = sv[k-1]  # SHAP values of the k-th explained row (k is 1-based)
    exp = shap.Explanation(values=sv_row,
                           base_values=base,
                           data=row.values,
                           feature_names=feat_names)
    shap.plots.waterfall(exp, max_display=15, show=False)
    plt.tight_layout(); plt.savefig(f"outputs/shap_waterfall_case{k}.png", dpi=220); plt.close("all")

# Additivity check: base value + sum of SHAP values should reproduce the prediction.
pred = final_model.predict(X_sample, verbose=0).reshape(-1)
gap  = np.max(np.abs(pred - (base + sv.sum(axis=1))))
print("Top-3 characteristic：", ", ".join(top3))
print("max|pred - (base + sum(shap))| =", float(gap))
print("SHAP image file and CSV export to outputs/ files")
