In [19]:
import sys, os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
import time
import networkx as nx
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader
from torch_geometric.utils import from_networkx

from utils.prep_data import load_data, split_data, mask_data, Experiment
from utils.train import train
from utils.dataset import WindFarmDataset
from GCGRU.GRU import GRU
from GCGRU.GCGRU import GCGRU
from copy import deepcopy

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
full_graph = False

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, median_absolute_error, mean_squared_error
import pandas as pd
import numpy as np
import math

cuda


In [20]:
data = load_data(columns=["TurbID", "Wspd", "Wdir", "Etmp", "Itmp", "Ndir", "Pab1", "Pab2", "Pab3", "Prtv", "Patv", "datetime", "P_norm"])
nan_mask = ~data["Patv"].isna().to_numpy()
# subset of turbines for faster experiments
if full_graph:
    turbines_idx = data.TurbID.unique()
else:
    turbines_idx = [9, 10, 11, 12, 31, 32, 33, 34, 35, 52, 53, 54, 55, 56, 57]
data = data[data["TurbID"].isin(turbines_idx)]

data = data.sort_values(["datetime", "TurbID"]).reset_index(drop=True)
data['T'] = data.groupby("TurbID").cumcount()

# normalize features
features = ["Wspd", "Wdir", "Etmp", "Itmp", "Ndir", "Pab1", "Pab2", "Pab3", "Prtv", "Patv"]
data[features] = data[features].apply(lambda col: ((col - col.min()) / (col.max() - col.min())))

train_data, val_data, test_data = split_data(data, splits=[0.7, 0.2, 0.1])

In [21]:
datasets = {
    "train": train_data.copy(),
    "val": val_data.copy(),
    "test": test_data.copy()
}

In [22]:
turbines = pd.read_csv('../data/turbines.csv', index_col=False)
turbines = turbines[turbines['TurbID'].isin(turbines_idx)]
turbines.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15 entries, 8 to 56
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   TurbID  15 non-null     int64  
 1   x       15 non-null     float64
 2   y       15 non-null     float64
dtypes: float64(2), int64(1)
memory usage: 480.0 bytes


In [23]:
x_min = turbines['x'].min()
x_max = turbines['x'].max()
y_min = turbines['y'].min()
y_max = turbines['y'].max()
turbines['x'] = (turbines['x'] - x_min) / (x_max - x_min)
turbines['y'] = (turbines['y'] - y_min) / (y_max - y_min)

In [24]:
def MAE(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)

def RMSE(y_true, y_pred):
    return root_mean_squared_error(y_true, y_pred)

def MedAE(y_true, y_pred):
    return median_absolute_error(y_true, y_pred)

In [25]:
def impute_linear_interpolation(df):
    copy = df.copy()
    copy['Patv'] = (
        df.groupby('TurbID')['Patv']
        .apply(lambda group: group.interpolate(method='linear', limit_direction='both'))
        .reset_index(level=0, drop=True)
    )
    return copy

In [26]:
knn_turbines = turbines.set_index('TurbID')
turbine_ids = knn_turbines.index
distance_df = pd.DataFrame(index=turbine_ids, columns=turbine_ids, dtype=float)

for i in turbine_ids:
    xi, yi = knn_turbines.loc[i, ['x', 'y']].astype(float)
    for j in turbine_ids:
        if i == j:
            distance_df.loc[i, i] = np.inf
        xj, yj = knn_turbines.loc[j, ['x', 'y']].astype(float)
        distance_df.loc[i, j] = np.sqrt((xi - xj) ** 2 + (yi - yj) ** 2)
    
order = np.argsort(distance_df.values, axis=1)
sorted_ids = distance_df.columns.to_numpy()[order]
sorted_neighbors = pd.DataFrame(sorted_ids, index=distance_df.index)

def impute_knn_spatial_mean_parallel(df, k=5):
    result = df.copy()
    patv_mat = result.pivot(index='T', columns='TurbID', values='Patv')

    for tid in sorted_neighbors.index:
        neighbors = sorted_neighbors.loc[tid]
        mask = (result['TurbID'] == tid) & (result['Patv'].isna())
        missing_idx = result.index[mask]

        for idx in missing_idx:
            t = result.at[idx, 'T']
            vals = patv_mat.loc[t, neighbors].dropna()
            if not vals.empty:
                result.at[idx, 'Patv'] = vals.iloc[:k].mean()

    return result

In [27]:
def Baseline_experiment(baseline_method, experiments: dict, experiment: Experiment, target_column: str):
    exp = experiments[experiment]
    for name, masks in exp.items():
        df = datasets[name]
        # keep original order/index; compute y_true once
        nan_mask = df[target_column].notna()
        y_true = df.loc[nan_mask, target_column].to_numpy()

        for size, mask in masks.items():

            # cheap copy + mask in one pass (no deepcopy)
            input_data = df.copy()
            input_data[target_column] = df[target_column].where(mask, np.nan)

            # run baseline; force same row order/index as original
            runtime = time.time()
            result = baseline_method(input_data)
            runtime = time.time() - runtime

            # If baseline returns same index as input (ideal):
            if result.index.equals(df.index):
                y_pred = result.loc[nan_mask, target_column].to_numpy()
            else:
                # Fallback: enforce strict alignment by original index
                result = result.reindex(df.index)
                y_pred = result.loc[nan_mask, target_column].to_numpy()

            # use numpy once; avoids extra pandas copies
            test_mask = ~((pd.Series(mask,index=df.index)) & (nan_mask))
            diff = df.loc[test_mask, target_column] - result.loc[test_mask, target_column]
            mse = np.mean(diff ** 2)
            mae = np.mean(np.abs(diff))
            mdae = np.median(np.abs(diff))
            rmse = np.sqrt(mse)

            print(f"{name}, size {size}, MSE {mse}, MAE {mae}, MDAE {mdae}, RMSE {rmse}, time {runtime}s")


In [28]:
random_percentages = [(0.01, None), (0.02, None), (0.05, None), (0.1, None)]
blackout_periods = [(30, 0.01), (60, 0.01), (150, 0.01), (300, 0.01)]
# blackout_periods = [(30, 0.01), (30, 0.02), (30, 0.05), (30, 0.1), (60, 0.01), (60, 0.02), (60, 0.05), (60, 0.1), (150, 0.01), (150, 0.02), (150, 0.05), (150, 0.1), (300, 0.01), (300, 0.02), (300, 0.05), (300, 0.1)]
maintenance_periods = [(1, 0.01), (2, 0.01), (7, 0.01), (14, 0.01)]
# maintenance_periods = [(1, 0.01), (1, 0.02), (1, 0.05), (1, 0.1), (2, 0.01), (2, 0.02), (2, 0.05), (2, 0.1), (7, 0.01), (7, 0.02), (7, 0.05), (7, 0.1), (14, 0.01), (14, 0.02), (14, 0.05), (14, 0.1)]

In [29]:
experiments = {}
for experiment_name in [Experiment.RANDOM, Experiment.BLACKOUT, Experiment.MAINTENANCE]:
    experiments[experiment_name] = {}
    for stage in ["train", "val", "test"]:
        experiments[experiment_name][stage] = pd.read_csv(f"../data/masks_{experiment_name}_{stage}_{full_graph}.csv", index_col=0, header=[0,1]).to_dict(orient="list")

In [30]:
target_column = "Patv"

In [31]:
Baseline_experiment(impute_linear_interpolation, experiments, Experiment.RANDOM, target_column)

train, size ('0.01', 'nan'), MSE 0.00349165010266006, MAE 0.03270787000656128, MDAE nan, RMSE 0.05909018591046333, time 0.08013415336608887s
train, size ('0.02', 'nan'), MSE 0.0035488458815962076, MAE 0.03299558907747269, MDAE nan, RMSE 0.059572190046310425, time 0.07430505752563477s
train, size ('0.05', 'nan'), MSE 0.003608542028814554, MAE 0.03344316780567169, MDAE nan, RMSE 0.06007114052772522, time 0.08127284049987793s
train, size ('0.1', 'nan'), MSE 0.003786897985264659, MAE 0.03423624858260155, MDAE nan, RMSE 0.06153777614235878, time 0.08286881446838379s
val, size ('0.01', 'nan'), MSE 0.001776294782757759, MAE 0.022508366033434868, MDAE nan, RMSE 0.04214611276984215, time 0.03733181953430176s
val, size ('0.02', 'nan'), MSE 0.0016628537559881806, MAE 0.021914981305599213, MDAE nan, RMSE 0.04077810421586037, time 0.023487091064453125s
val, size ('0.05', 'nan'), MSE 0.0016262243734672666, MAE 0.0218808576464653, MDAE nan, RMSE 0.040326472371816635, time 0.025513648986816406s
val, s

In [32]:
Baseline_experiment(impute_linear_interpolation, experiments, Experiment.BLACKOUT, target_column)

train, size ('30', '0.01'), MSE 0.006059122737497091, MAE 0.044501058757305145, MDAE nan, RMSE 0.07784036546945572, time 0.08127427101135254s
train, size ('60', '0.01'), MSE 0.010189984925091267, MAE 0.055213168263435364, MDAE nan, RMSE 0.10094545781612396, time 0.08303236961364746s
train, size ('150', '0.01'), MSE 0.01712166890501976, MAE 0.07974667102098465, MDAE nan, RMSE 0.13084979355335236, time 0.08397436141967773s
train, size ('300', '0.01'), MSE 0.02771264873445034, MAE 0.10702142864465714, MDAE nan, RMSE 0.16647116839885712, time 0.08286070823669434s
val, size ('30', '0.01'), MSE 0.0025306541938334703, MAE 0.02790585719048977, MDAE nan, RMSE 0.05030560865998268, time 0.023787975311279297s
val, size ('60', '0.01'), MSE 0.00394788570702076, MAE 0.036661505699157715, MDAE nan, RMSE 0.06283220648765564, time 0.029929637908935547s
val, size ('150', '0.01'), MSE 0.012604308314621449, MAE 0.06668122112751007, MDAE nan, RMSE 0.11226890981197357, time 0.03208518028259277s
val, size ('3

In [33]:
Baseline_experiment(impute_linear_interpolation, experiments, Experiment.MAINTENANCE, target_column)

train, size ('1', '0.01'), MSE 0.05999856814742088, MAE 0.17983081936836243, MDAE nan, RMSE 0.24494604766368866, time 0.09128761291503906s
train, size ('2', '0.01'), MSE 0.11318053305149078, MAE 0.2508905827999115, MDAE nan, RMSE 0.3364231586456299, time 0.0745387077331543s
train, size ('7', '0.01'), MSE 0.13830654323101044, MAE 0.28828975558280945, MDAE nan, RMSE 0.3718958795070648, time 0.08707976341247559s
train, size ('14', '0.01'), MSE 0.12679354846477509, MAE 0.29942333698272705, MDAE nan, RMSE 0.3560808300971985, time 0.08510017395019531s
val, size ('1', '0.01'), MSE 0.05563722550868988, MAE 0.1633707731962204, MDAE nan, RMSE 0.2358754426240921, time 0.02322101593017578s
val, size ('2', '0.01'), MSE 0.11103416234254837, MAE 0.23702524602413177, MDAE nan, RMSE 0.33321788907051086, time 0.028112173080444336s
val, size ('7', '0.01'), MSE 0.07528451830148697, MAE 0.2272246927022934, MDAE nan, RMSE 0.27438023686408997, time 0.028307199478149414s
val, size ('14', '0.01'), MSE nan, MAE

In [34]:
Baseline_experiment(impute_knn_spatial_mean_parallel, experiments, Experiment.RANDOM, target_column)

train, size ('0.01', 'nan'), MSE 0.007669189944863319, MAE 0.04213070496916771, MDAE nan, RMSE 0.08757390826940536, time 2.5991642475128174s
train, size ('0.02', 'nan'), MSE 0.0073060267604887486, MAE 0.04131431132555008, MDAE nan, RMSE 0.08547529578208923, time 4.437418699264526s
train, size ('0.05', 'nan'), MSE 0.007338338065892458, MAE 0.04120982438325882, MDAE nan, RMSE 0.08566410094499588, time 10.29090404510498s
train, size ('0.1', 'nan'), MSE 0.007029582746326923, MAE 0.04099378362298012, MDAE nan, RMSE 0.08384260535240173, time 20.018139839172363s
val, size ('0.01', 'nan'), MSE 0.006702912971377373, MAE 0.036290984600782394, MDAE nan, RMSE 0.08187132328748703, time 0.9141845703125s
val, size ('0.02', 'nan'), MSE 0.007227784022688866, MAE 0.038168132305145264, MDAE nan, RMSE 0.08501637727022171, time 1.5409562587738037s
val, size ('0.05', 'nan'), MSE 0.005929775070399046, MAE 0.035306405276060104, MDAE nan, RMSE 0.07700503617525101, time 3.2039732933044434s
val, size ('0.1', 'na

In [35]:
Baseline_experiment(impute_knn_spatial_mean_parallel, experiments, Experiment.BLACKOUT, target_column)

train, size ('30', '0.01'), MSE 0.007870710454881191, MAE 0.04225628823041916, MDAE nan, RMSE 0.088717021048069, time 2.3769044876098633s
train, size ('60', '0.01'), MSE 0.005548826884478331, MAE 0.03726775199174881, MDAE nan, RMSE 0.07449045032262802, time 2.399712085723877s
train, size ('150', '0.01'), MSE 0.006158837582916021, MAE 0.040529388934373856, MDAE nan, RMSE 0.07847826182842255, time 2.408906936645508s
train, size ('300', '0.01'), MSE 0.004141385667026043, MAE 0.034774258732795715, MDAE nan, RMSE 0.06435360014438629, time 2.441628932952881s
val, size ('30', '0.01'), MSE 0.006319078151136637, MAE 0.034002020955085754, MDAE nan, RMSE 0.07949262857437134, time 0.9211337566375732s
val, size ('60', '0.01'), MSE 0.004646340385079384, MAE 0.033017031848430634, MDAE nan, RMSE 0.06816407293081284, time 0.9198276996612549s
val, size ('150', '0.01'), MSE 0.006465648766607046, MAE 0.03348272666335106, MDAE nan, RMSE 0.08040925860404968, time 0.9237582683563232s
val, size ('300', '0.01'

In [36]:
Baseline_experiment(impute_knn_spatial_mean_parallel, experiments, Experiment.MAINTENANCE, target_column)

train, size ('1', '0.01'), MSE 0.004190958570688963, MAE 0.036935437470674515, MDAE nan, RMSE 0.06473761051893234, time 2.38981556892395s
train, size ('2', '0.01'), MSE 0.004014675971120596, MAE 0.03354518488049507, MDAE nan, RMSE 0.06336147338151932, time 2.4518654346466064s
train, size ('7', '0.01'), MSE 0.004713601898401976, MAE 0.04067530855536461, MDAE nan, RMSE 0.06865567713975906, time 2.110034227371216s
train, size ('14', '0.01'), MSE 0.0049171228893101215, MAE 0.03522823750972748, MDAE nan, RMSE 0.07012219727039337, time 1.5575640201568604s
val, size ('1', '0.01'), MSE 0.020836368203163147, MAE 0.08367655426263809, MDAE nan, RMSE 0.14434808492660522, time 0.9441657066345215s
val, size ('2', '0.01'), MSE 0.004331232979893684, MAE 0.035110119730234146, MDAE nan, RMSE 0.06581210345029831, time 0.8307609558105469s
val, size ('7', '0.01'), MSE 0.0023317981977015734, MAE 0.028127534314990044, MDAE nan, RMSE 0.04828869551420212, time 0.9321002960205078s
val, size ('14', '0.01'), MSE 