In [71]:
import sys, os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
import time
import networkx as nx
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader
from torch_geometric.utils import from_networkx

from utils.prep_data import load_data, split_data, mask_data, Experiment
from utils.train import train
from utils.dataset import WindFarmDataset
from GCGRU.GRU import GRU
from GCGRU.GCGRU import GCGRU
from copy import deepcopy

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
full_graph = True

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, median_absolute_error, mean_squared_error
import pandas as pd
import numpy as np
import math

cuda


In [72]:
data = load_data(columns=["TurbID", "Wspd", "Wdir", "Etmp", "Itmp", "Ndir", "Pab1", "Pab2", "Pab3", "Prtv", "Patv", "datetime", "P_norm"])
nan_mask = ~data["Patv"].isna().to_numpy()
# subset of turbines for faster experiments
if full_graph:
    turbines_idx = data.TurbID.unique()
else:
    turbines_idx = [9, 10, 11, 12, 31, 32, 33, 34, 35, 52, 53, 54, 55, 56, 57]
data = data[data["TurbID"].isin(turbines_idx)]

data = data.sort_values(["datetime", "TurbID"]).reset_index(drop=True)
data['T'] = data.groupby("TurbID").cumcount()

# normalize features
features = ["Wspd", "Wdir", "Etmp", "Itmp", "Ndir", "Pab1", "Pab2", "Pab3", "Prtv", "Patv"]
data[features] = data[features].apply(lambda col: ((col - col.min()) / (col.max() - col.min())))

train_data, val_data, test_data = split_data(data, splits=[0.7, 0.2, 0.1])

In [73]:
datasets = {
    "train": train_data.copy(),
    "val": val_data.copy(),
    "test": test_data.copy()
}

In [74]:
turbines = pd.read_csv('../data/turbines.csv', index_col=False)
turbines = turbines[turbines['TurbID'].isin(turbines_idx)]
turbines.info()

<class 'pandas.core.frame.DataFrame'>
Index: 133 entries, 0 to 133
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   TurbID  133 non-null    int64  
 1   x       133 non-null    float64
 2   y       133 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 4.2 KB


In [75]:
x_min = turbines['x'].min()
x_max = turbines['x'].max()
y_min = turbines['y'].min()
y_max = turbines['y'].max()
turbines['x'] = (turbines['x'] - x_min) / (x_max - x_min)
turbines['y'] = (turbines['y'] - y_min) / (y_max - y_min)

In [76]:
def MAE(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)

def RMSE(y_true, y_pred):
    return root_mean_squared_error(y_true, y_pred)

def MedAE(y_true, y_pred):
    return median_absolute_error(y_true, y_pred)

In [77]:
def impute_linear_interpolation(df):
    copy = df.copy()
    copy['Patv'] = (
        df.groupby('TurbID')['Patv']
        .apply(lambda group: group.interpolate(method='linear', limit_direction='both'))
        .reset_index(level=0, drop=True)
    )
    return copy

In [78]:
knn_turbines = turbines.set_index('TurbID')
turbine_ids = knn_turbines.index
distance_df = pd.DataFrame(index=turbine_ids, columns=turbine_ids, dtype=float)

for i in turbine_ids:
    xi, yi = knn_turbines.loc[i, ['x', 'y']].astype(float)
    for j in turbine_ids:
        if i == j:
            distance_df.loc[i, i] = np.inf
        xj, yj = knn_turbines.loc[j, ['x', 'y']].astype(float)
        distance_df.loc[i, j] = np.sqrt((xi - xj) ** 2 + (yi - yj) ** 2)
    
order = np.argsort(distance_df.values, axis=1)
sorted_ids = distance_df.columns.to_numpy()[order]
sorted_neighbors = pd.DataFrame(sorted_ids, index=distance_df.index)

def impute_knn_spatial_mean_parallel(df, k=5):
    result = df.copy()
    patv_mat = result.pivot(index='T', columns='TurbID', values='Patv')

    for tid in sorted_neighbors.index:
        neighbors = sorted_neighbors.loc[tid]
        mask = (result['TurbID'] == tid) & (result['Patv'].isna())
        missing_idx = result.index[mask]

        for idx in missing_idx:
            t = result.at[idx, 'T']
            vals = patv_mat.loc[t, neighbors].dropna()
            if not vals.empty:
                result.at[idx, 'Patv'] = vals.iloc[:k].mean()

    return result

In [79]:
def Baseline_experiment(baseline_method, experiments: dict, experiment: Experiment, target_column: str):
    exp = experiments[experiment]
    for name, masks in exp.items():
        df = datasets[name]
        # keep original order/index; compute y_true once
        nan_mask = df[target_column].notna()
        y_true = df.loc[nan_mask, target_column].to_numpy()

        for size, mask in masks.items():

            # cheap copy + mask in one pass (no deepcopy)
            input_data = df.copy()
            input_data[target_column] = df[target_column].where(mask, np.nan)

            # run baseline; force same row order/index as original
            runtime = time.time()
            result = baseline_method(input_data)
            runtime = time.time() - runtime

            # If baseline returns same index as input (ideal):
            if result.index.equals(df.index):
                y_pred = result.loc[nan_mask, target_column].to_numpy()
            else:
                # Fallback: enforce strict alignment by original index
                result = result.reindex(df.index)
                y_pred = result.loc[nan_mask, target_column].to_numpy()

            # use numpy once; avoids extra pandas copies
            diff = y_true - y_pred
            mse = np.mean(diff ** 2)
            mae = np.mean(np.abs(diff))
            mdae = np.median(np.abs(diff))
            rmse = np.sqrt(mse)

            print(f"{name}, size {size}, MSE {mse}, MAE {mae}, MDAE {mdae}, RMSE {rmse}, time {runtime}s")


In [80]:
random_percentages = [(0.01, None), (0.02, None), (0.05, None), (0.1, None)]
blackout_periods = [(30, 0.01), (60, 0.01), (150, 0.01), (300, 0.01)]
# blackout_periods = [(30, 0.01), (30, 0.02), (30, 0.05), (30, 0.1), (60, 0.01), (60, 0.02), (60, 0.05), (60, 0.1), (150, 0.01), (150, 0.02), (150, 0.05), (150, 0.1), (300, 0.01), (300, 0.02), (300, 0.05), (300, 0.1)]
maintenance_periods = [(1, 0.01), (2, 0.01), (7, 0.01), (14, 0.01)]
# maintenance_periods = [(1, 0.01), (1, 0.02), (1, 0.05), (1, 0.1), (2, 0.01), (2, 0.02), (2, 0.05), (2, 0.1), (7, 0.01), (7, 0.02), (7, 0.05), (7, 0.1), (14, 0.01), (14, 0.02), (14, 0.05), (14, 0.1)]

In [81]:
experiments = {}
for experiment_name in [Experiment.RANDOM, Experiment.BLACKOUT, Experiment.MAINTENANCE]:
    experiments[experiment_name] = {}
    for stage in ["train", "val", "test"]:
        experiments[experiment_name][stage] = pd.read_csv(f"../data/masks_{experiment_name}_{stage}_{full_graph}.csv", index_col=0, header=[0,1]).to_dict(orient="list")

In [82]:
target_column = "Patv"

In [83]:
Baseline_experiment(impute_linear_interpolation, experiments, Experiment.RANDOM, target_column)

train, size ('0.01', 'nan'), MSE 3.4631746530067176e-05, MAE 0.0003241498488932848, MDAE 0.0, RMSE 0.005884874612092972, time 1.8175063133239746s
train, size ('0.02', 'nan'), MSE 6.903387111378834e-05, MAE 0.0006433293456211686, MDAE 0.0, RMSE 0.008308662101626396, time 1.9121384620666504s
train, size ('0.05', 'nan'), MSE 0.00017300104082096368, MAE 0.0015945014311000705, MDAE 0.0, RMSE 0.013152985833585262, time 1.712362289428711s
train, size ('0.1', 'nan'), MSE 0.00034987213439308107, MAE 0.003170161973685026, MDAE 0.0, RMSE 0.018704868853092194, time 1.5889980792999268s
val, size ('0.01', 'nan'), MSE 1.5135005014599301e-05, MAE 0.00019898793834727257, MDAE 0.0, RMSE 0.0038903732784092426, time 0.2716686725616455s
val, size ('0.02', 'nan'), MSE 3.059389564441517e-05, MAE 0.00040585477836430073, MDAE 0.0, RMSE 0.005531175062060356, time 0.2577388286590576s
val, size ('0.05', 'nan'), MSE 7.62489071348682e-05, MAE 0.0010133694158867002, MDAE 0.0, RMSE 0.008732061833143234, time 0.262206

In [84]:
Baseline_experiment(impute_linear_interpolation, experiments, Experiment.BLACKOUT, target_column)

train, size ('30', '0.01'), MSE 5.711649282602593e-05, MAE 0.00042639413732104003, MDAE 0.0, RMSE 0.007557545322924852, time 1.6272509098052979s
train, size ('60', '0.01'), MSE 9.12529430934228e-05, MAE 0.0005450528697110713, MDAE 0.0, RMSE 0.009552640840411186, time 1.6888422966003418s
train, size ('150', '0.01'), MSE 0.00015617629105690867, MAE 0.0007595823262818158, MDAE 0.0, RMSE 0.012497051618993282, time 1.6896514892578125s
train, size ('300', '0.01'), MSE 0.0002564067835919559, MAE 0.001014005159959197, MDAE 0.0, RMSE 0.016012707725167274, time 1.5976872444152832s
val, size ('30', '0.01'), MSE 2.7658034014166333e-05, MAE 0.0002880264073610306, MDAE 0.0, RMSE 0.005259090568870306, time 0.2624821662902832s
val, size ('60', '0.01'), MSE 4.261787762516178e-05, MAE 0.0003673278843052685, MDAE 0.0, RMSE 0.0065282369032502174, time 0.2689025402069092s
val, size ('150', '0.01'), MSE 9.081931784749031e-05, MAE 0.0005373232415877283, MDAE 0.0, RMSE 0.009529916569590569, time 0.26500630378

In [85]:
Baseline_experiment(impute_linear_interpolation, experiments, Experiment.MAINTENANCE, target_column)

train, size ('1', '0.01'), MSE 0.0005810655420646071, MAE 0.0016745227621868253, MDAE 0.0, RMSE 0.02410530112683773, time 1.7002778053283691s
train, size ('2', '0.01'), MSE 0.000790853111539036, MAE 0.002035893499851227, MDAE 0.0, RMSE 0.028122110292315483, time 1.6494288444519043s
train, size ('7', '0.01'), MSE 0.001151049043983221, MAE 0.0025099324993789196, MDAE 0.0, RMSE 0.03392711281776428, time 1.4936809539794922s
train, size ('14', '0.01'), MSE 0.0008460974786430597, MAE 0.0021516738925129175, MDAE 0.0, RMSE 0.02908775396645069, time 1.54876708984375s
val, size ('1', '0.01'), MSE 0.00036724103847518563, MAE 0.001349155674688518, MDAE 0.0, RMSE 0.01916353404521942, time 0.28838539123535156s
val, size ('2', '0.01'), MSE 0.0007494323654100299, MAE 0.001964401686564088, MDAE 0.0, RMSE 0.027375763282179832, time 0.25763869285583496s
val, size ('7', '0.01'), MSE 0.001155832433141768, MAE 0.0024403680581599474, MDAE 0.0, RMSE 0.033997535705566406, time 0.28048038482666016s
val, size ('

In [86]:
Baseline_experiment(impute_knn_spatial_mean_parallel, experiments, Experiment.RANDOM, target_column)

train, size ('0.01', 'nan'), MSE 8.510350016877055e-05, MAE 0.0004488828999456018, MDAE 0.0, RMSE 0.009225156158208847, time 25.241014003753662s
train, size ('0.02', 'nan'), MSE 0.00016578668146394193, MAE 0.0008866882999427617, MDAE 0.0, RMSE 0.012875817716121674, time 42.994091510772705s
train, size ('0.05', 'nan'), MSE 0.000403846672270447, MAE 0.002183789387345314, MDAE 0.0, RMSE 0.020095936954021454, time 99.16823244094849s
train, size ('0.1', 'nan'), MSE 0.0007914036395959556, MAE 0.004290601704269648, MDAE 0.0, RMSE 0.028131896629929543, time 234.54023599624634s
val, size ('0.01', 'nan'), MSE 6.940601451788098e-05, MAE 0.000394445814890787, MDAE 0.0, RMSE 0.008331026881933212, time 11.129739761352539s
val, size ('0.02', 'nan'), MSE 0.00013813329860568047, MAE 0.0007949030841700733, MDAE 0.0, RMSE 0.011753012426197529, time 19.663541078567505s
val, size ('0.05', 'nan'), MSE 0.00032560373074375093, MAE 0.0019357696874067187, MDAE 0.0, RMSE 0.018044492229819298, time 29.19277572631

In [87]:
Baseline_experiment(impute_knn_spatial_mean_parallel, experiments, Experiment.BLACKOUT, target_column)

train, size ('30', '0.01'), MSE 7.807107613189146e-05, MAE 0.00043906827340833843, MDAE 0.0, RMSE 0.008835784159600735, time 26.507506132125854s
train, size ('60', '0.01'), MSE 7.559671939816326e-05, MAE 0.00043390819337219, MDAE 0.0, RMSE 0.008694637566804886, time 27.637680292129517s
train, size ('150', '0.01'), MSE 8.908532618079334e-05, MAE 0.0004636491648852825, MDAE 0.0, RMSE 0.009438502602279186, time 26.700582265853882s
train, size ('300', '0.01'), MSE 7.849034591345116e-05, MAE 0.00044677723781205714, MDAE 0.0, RMSE 0.008859477937221527, time 26.543150663375854s
val, size ('30', '0.01'), MSE 6.859884888399392e-05, MAE 0.00038804044015705585, MDAE 0.0, RMSE 0.008282442577183247, time 7.308529376983643s
val, size ('60', '0.01'), MSE 7.695193198742345e-05, MAE 0.0004125447594560683, MDAE 0.0, RMSE 0.008772225119173527, time 7.395443439483643s
val, size ('150', '0.01'), MSE 8.206293568946421e-05, MAE 0.00041881168726831675, MDAE 0.0, RMSE 0.009058859199285507, time 7.3425855636596

In [88]:
Baseline_experiment(impute_knn_spatial_mean_parallel, experiments, Experiment.MAINTENANCE, target_column)

train, size ('1', '0.01'), MSE 7.134660700103268e-05, MAE 0.000451301340945065, MDAE 0.0, RMSE 0.008446692489087582, time 23.81619954109192s
train, size ('2', '0.01'), MSE 9.926556958816946e-05, MAE 0.00047659731353633106, MDAE 0.0, RMSE 0.009963210672140121, time 23.67454218864441s
train, size ('7', '0.01'), MSE 5.9296686231391504e-05, MAE 0.00040570474811829627, MDAE 0.0, RMSE 0.0077004339545965195, time 23.24683427810669s
train, size ('14', '0.01'), MSE 7.016405288595706e-05, MAE 0.0004217710520606488, MDAE 0.0, RMSE 0.008376398123800755, time 23.251004457473755s
val, size ('1', '0.01'), MSE 4.1572722693672404e-05, MAE 0.00035599598777480423, MDAE 0.0, RMSE 0.006447691470384598, time 7.317458152770996s
val, size ('2', '0.01'), MSE 4.2536339606158435e-05, MAE 0.0003592688008211553, MDAE 0.0, RMSE 0.0065219891257584095, time 8.105987310409546s
val, size ('7', '0.01'), MSE 3.804194784606807e-05, MAE 0.000342783605447039, MDAE 0.0, RMSE 0.0061678155325353146, time 7.259477138519287s
val