In [35]:
import sys, os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
import time
import networkx as nx
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader
from torch_geometric.utils import from_networkx

from utils.prep_data import load_data, split_data, mask_data, Experiment
from utils.train import train
from utils.dataset import WindFarmDataset
from GCGRU.GRU import GRU
from GCGRU.GCGRU import GCGRU
from copy import deepcopy

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
full_graph = False

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, median_absolute_error, mean_squared_error
import pandas as pd
import numpy as np
import math

cuda


In [36]:
data = load_data(columns=["TurbID", "Wspd", "Wdir", "Etmp", "Itmp", "Ndir", "Pab1", "Pab2", "Pab3", "Prtv", "Patv", "datetime", "P_norm"])
nan_mask = ~data["Patv"].isna().to_numpy()
# subset of turbines for faster experiments
if full_graph:
    turbines_idx = data.TurbID.unique()
else:
    turbines_idx = [9, 10, 11, 12, 31, 32, 33, 34, 35, 52, 53, 54, 55, 56, 57]
data = data[data["TurbID"].isin(turbines_idx)]

data = data.sort_values(["datetime", "TurbID"]).reset_index(drop=True)
data['T'] = data.groupby("TurbID").cumcount()

# normalize features
features = ["Wspd", "Wdir", "Etmp", "Itmp", "Ndir", "Pab1", "Pab2", "Pab3", "Prtv", "Patv"]
data[features] = data[features].apply(lambda col: ((col - col.min()) / (col.max() - col.min())))

train_data, val_data, test_data = split_data(data, splits=[0.7, 0.2, 0.1])

In [37]:
datasets = {
    "train": train_data.copy(),
    "val": val_data.copy(),
    "test": test_data.copy()
}

In [38]:
turbines = pd.read_csv('../data/turbines.csv', index_col=False)
turbines = turbines[turbines['TurbID'].isin(turbines_idx)]
turbines.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15 entries, 8 to 56
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   TurbID  15 non-null     int64  
 1   x       15 non-null     float64
 2   y       15 non-null     float64
dtypes: float64(2), int64(1)
memory usage: 480.0 bytes


In [39]:
x_min = turbines['x'].min()
x_max = turbines['x'].max()
y_min = turbines['y'].min()
y_max = turbines['y'].max()
turbines['x'] = (turbines['x'] - x_min) / (x_max - x_min)
turbines['y'] = (turbines['y'] - y_min) / (y_max - y_min)

In [40]:
def MAE(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)

def RMSE(y_true, y_pred):
    return root_mean_squared_error(y_true, y_pred)

def MedAE(y_true, y_pred):
    return median_absolute_error(y_true, y_pred)

In [41]:
def impute_linear_interpolation(df):
    copy = df.copy()
    copy['Patv'] = (
        df.groupby('TurbID')['Patv']
        .apply(lambda group: group.interpolate(method='linear', limit_direction='both'))
        .reset_index(level=0, drop=True)
    )
    return copy

In [42]:
knn_turbines = turbines.set_index('TurbID')
turbine_ids = knn_turbines.index
distance_df = pd.DataFrame(index=turbine_ids, columns=turbine_ids, dtype=float)

for i in turbine_ids:
    xi, yi = knn_turbines.loc[i, ['x', 'y']].astype(float)
    for j in turbine_ids:
        if i == j:
            distance_df.loc[i, i] = np.inf
        xj, yj = knn_turbines.loc[j, ['x', 'y']].astype(float)
        distance_df.loc[i, j] = np.sqrt((xi - xj) ** 2 + (yi - yj) ** 2)
    
order = np.argsort(distance_df.values, axis=1)
sorted_ids = distance_df.columns.to_numpy()[order]
sorted_neighbors = pd.DataFrame(sorted_ids, index=distance_df.index)

def impute_knn_spatial_mean_parallel(df, k=5):
    result = df.copy()
    patv_mat = result.pivot(index='T', columns='TurbID', values='Patv')

    for tid in sorted_neighbors.index:
        neighbors = sorted_neighbors.loc[tid]
        mask = (result['TurbID'] == tid) & (result['Patv'].isna())
        missing_idx = result.index[mask]

        for idx in missing_idx:
            t = result.at[idx, 'T']
            vals = patv_mat.loc[t, neighbors].dropna()
            if not vals.empty:
                result.at[idx, 'Patv'] = vals.iloc[:k].mean()

    return result

In [43]:
def Baseline_experiment(baseline_method, experiments: dict, experiment: Experiment, target_column: str):
    exp = experiments[experiment]
    for name, masks in exp.items():
        df = datasets[name]
        # keep original order/index; compute y_true once
        nan_mask = df[target_column].notna()
        y_true = df.loc[nan_mask, target_column].to_numpy()

        for size, mask in masks.items():

            # cheap copy + mask in one pass (no deepcopy)
            input_data = df.copy()
            input_data[target_column] = df[target_column].where(mask, np.nan)

            # run baseline; force same row order/index as original
            result = baseline_method(input_data)

            # If baseline returns same index as input (ideal):
            if result.index.equals(df.index):
                y_pred = result.loc[nan_mask, target_column].to_numpy()
            else:
                # Fallback: enforce strict alignment by original index
                result = result.reindex(df.index)
                y_pred = result.loc[nan_mask, target_column].to_numpy()

            # use numpy once; avoids extra pandas copies
            diff = y_true - y_pred
            mse = np.mean(diff ** 2)
            mae = np.mean(np.abs(diff))
            mdae = np.median(np.abs(diff))
            rmse = np.sqrt(mse)

            print(f"{name}, size {size}, MSE {mse}, MAE {mae}, MDAE {mdae}, RMSE {rmse}")


In [44]:
random_percentages = [(0.01, None), (0.02, None), (0.05, None), (0.1, None)]
blackout_periods = [(30, 0.01), (60, 0.01), (150, 0.01), (300, 0.01)]
# blackout_periods = [(30, 0.01), (30, 0.02), (30, 0.05), (30, 0.1), (60, 0.01), (60, 0.02), (60, 0.05), (60, 0.1), (150, 0.01), (150, 0.02), (150, 0.05), (150, 0.1), (300, 0.01), (300, 0.02), (300, 0.05), (300, 0.1)]
maintenance_periods = [(1, 0.01), (2, 0.01), (7, 0.01), (14, 0.01)]
# maintenance_periods = [(1, 0.01), (1, 0.02), (1, 0.05), (1, 0.1), (2, 0.01), (2, 0.02), (2, 0.05), (2, 0.1), (7, 0.01), (7, 0.02), (7, 0.05), (7, 0.1), (14, 0.01), (14, 0.02), (14, 0.05), (14, 0.1)]

In [45]:
experiments = {}
for experiment_name in [Experiment.RANDOM, Experiment.BLACKOUT, Experiment.MAINTENANCE]:
    experiments[experiment_name] = {}
    for stage in ["train", "val", "test"]:
        experiments[experiment_name][stage] = pd.read_csv(f"../data/masks_{experiment_name}_{stage}_{full_graph}.csv", index_col=0, header=[0,1]).to_dict(orient="list")

In [46]:
target_column = "Patv"

In [47]:
Baseline_experiment(impute_linear_interpolation, experiments, Experiment.RANDOM, target_column)

train, size ('0.01', 'nan'), MSE 3.47112218150869e-05, MAE 0.0003251557645853609, MDAE 0.0, RMSE 0.0058916229754686356
train, size ('0.02', 'nan'), MSE 7.018256292212754e-05, MAE 0.0006525261560454965, MDAE 0.0, RMSE 0.008377503603696823
train, size ('0.05', 'nan'), MSE 0.00017604346794541925, MAE 0.0016315318644046783, MDAE 0.0, RMSE 0.013268137350678444
train, size ('0.1', 'nan'), MSE 0.0003604000376071781, MAE 0.0032582725398242474, MDAE 0.0, RMSE 0.018984204158186913
val, size ('0.01', 'nan'), MSE 1.7647678760113195e-05, MAE 0.00022362297750078142, MDAE 0.0, RMSE 0.004200913943350315
val, size ('0.02', 'nan'), MSE 3.291392931714654e-05, MAE 0.0004337772843427956, MDAE 0.0, RMSE 0.0057370662689208984
val, size ('0.05', 'nan'), MSE 7.924259989522398e-05, MAE 0.001066209631972015, MDAE 0.0, RMSE 0.008901831693947315
val, size ('0.1', 'nan'), MSE 0.00016475153097417206, MAE 0.0021250457502901554, MDAE 0.0, RMSE 0.012835557572543621
test, size ('0.01', 'nan'), MSE 9.441423571843188e-06,

In [48]:
Baseline_experiment(impute_linear_interpolation, experiments, Experiment.BLACKOUT, target_column)

train, size ('30', '0.01'), MSE 6.035055048414506e-05, MAE 0.0004432430141605437, MDAE 0.0, RMSE 0.007768561597913504
train, size ('60', '0.01'), MSE 0.00010193928028456867, MAE 0.0005523453000932932, MDAE 0.0, RMSE 0.010096498765051365
train, size ('150', '0.01'), MSE 0.00017114296497311443, MAE 0.0007971234736032784, MDAE 0.0, RMSE 0.013082162477076054
train, size ('300', '0.01'), MSE 0.00027617672458291054, MAE 0.0010665463050827384, MDAE 0.0, RMSE 0.01661856658756733
val, size ('30', '0.01'), MSE 2.5239205569960177e-05, MAE 0.0002783160307444632, MDAE 0.0, RMSE 0.005023863632231951
val, size ('60', '0.01'), MSE 3.948717494495213e-05, MAE 0.00036669219844043255, MDAE 0.0, RMSE 0.006283882074058056
val, size ('150', '0.01'), MSE 0.00012401872663758695, MAE 0.0006561026675626636, MDAE 0.0, RMSE 0.011136369779706001
val, size ('300', '0.01'), MSE 0.00019709811022039503, MAE 0.0009471580851823092, MDAE 0.0, RMSE 0.01403916347771883
test, size ('30', '0.01'), MSE 1.2774959031958133e-05, 

In [49]:
Baseline_experiment(impute_linear_interpolation, experiments, Experiment.MAINTENANCE, target_column)

train, size ('1', '0.01'), MSE 0.000587958435062319, MAE 0.0017622595187276602, MDAE 0.0, RMSE 0.024247854948043823
train, size ('2', '0.01'), MSE 0.0010650239419192076, MAE 0.0023608694318681955, MDAE 0.0, RMSE 0.03263470530509949
train, size ('7', '0.01'), MSE 0.0011383052915334702, MAE 0.0023727132938802242, MDAE 0.0, RMSE 0.03373878076672554
train, size ('14', '0.01'), MSE 0.000696045346558094, MAE 0.0016437132144346833, MDAE 0.0, RMSE 0.026382671669125557
val, size ('1', '0.01'), MSE 0.000536785984877497, MAE 0.0015761953545734286, MDAE 0.0, RMSE 0.023168642073869705
val, size ('2', '0.01'), MSE 0.0009182174107991159, MAE 0.0019601238891482353, MDAE 0.0, RMSE 0.030302101746201515
val, size ('7', '0.01'), MSE 0.0007263424340635538, MAE 0.0021922558080404997, MDAE 0.0, RMSE 0.026950741186738014
val, size ('14', '0.01'), MSE 0.0, MAE 0.0, MDAE 0.0, RMSE 0.0
test, size ('1', '0.01'), MSE 0.00010687011672416702, MAE 0.0005574191454797983, MDAE 0.0, RMSE 0.010337800718843937
test, size 

In [50]:
Baseline_experiment(impute_knn_spatial_mean_parallel, experiments, Experiment.RANDOM, target_column)

train, size ('0.01', 'nan'), MSE 7.624101272085682e-05, MAE 0.00041883010999299586, MDAE 0.0, RMSE 0.008731610141694546
train, size ('0.02', 'nan'), MSE 0.00014448519505094737, MAE 0.0008170385845005512, MDAE 0.0, RMSE 0.012020199559628963
train, size ('0.05', 'nan'), MSE 0.00035800228943116963, MAE 0.0020104297436773777, MDAE 0.0, RMSE 0.01892094872891903
train, size ('0.1', 'nan'), MSE 0.0006690070731565356, MAE 0.003901388496160507, MDAE 0.0, RMSE 0.02586517110466957
val, size ('0.01', 'nan'), MSE nan, MAE nan, MDAE nan, RMSE nan
val, size ('0.02', 'nan'), MSE nan, MAE nan, MDAE nan, RMSE nan
val, size ('0.05', 'nan'), MSE nan, MAE nan, MDAE nan, RMSE nan
val, size ('0.1', 'nan'), MSE nan, MAE nan, MDAE nan, RMSE nan
test, size ('0.01', 'nan'), MSE 4.4900418288307264e-05, MAE 0.0002537746331654489, MDAE 0.0, RMSE 0.0067007774487137794
test, size ('0.02', 'nan'), MSE 7.466707029379904e-05, MAE 0.00048409809824079275, MDAE 0.0, RMSE 0.008641011081635952
test, size ('0.05', 'nan'), MSE

In [51]:
Baseline_experiment(impute_knn_spatial_mean_parallel, experiments, Experiment.BLACKOUT, target_column)

train, size ('30', '0.01'), MSE 7.839446334401146e-05, MAE 0.00042088443296961486, MDAE 0.0, RMSE 0.008854065090417862
train, size ('60', '0.01'), MSE 5.550972855417058e-05, MAE 0.0003728217270690948, MDAE 0.0, RMSE 0.00745048513635993
train, size ('150', '0.01'), MSE 6.156186282169074e-05, MAE 0.0004051194409839809, MDAE 0.0, RMSE 0.007846136577427387
train, size ('300', '0.01'), MSE 4.127192005398683e-05, MAE 0.00034655077615752816, MDAE 0.0, RMSE 0.006424322724342346
val, size ('30', '0.01'), MSE 6.302264227997512e-05, MAE 0.0003391154750715941, MDAE 0.0, RMSE 0.007938680239021778
val, size ('60', '0.01'), MSE 4.647319656214677e-05, MAE 0.0003302398545201868, MDAE 0.0, RMSE 0.006817125249654055
val, size ('150', '0.01'), MSE 6.361805571941659e-05, MAE 0.0003294497146271169, MDAE 0.0, RMSE 0.007976092398166656
val, size ('300', '0.01'), MSE 0.0001048460544552654, MAE 0.0004395963333081454, MDAE 0.0, RMSE 0.0102394362911582
test, size ('30', '0.01'), MSE 3.410198405617848e-05, MAE 0.0

In [52]:
Baseline_experiment(impute_knn_spatial_mean_parallel, experiments, Experiment.MAINTENANCE, target_column)

train, size ('1', '0.01'), MSE 4.10694774473086e-05, MAE 0.0003619503404479474, MDAE 0.0, RMSE 0.006408547051250935
train, size ('2', '0.01'), MSE 3.7777928810101e-05, MAE 0.00031565871904604137, MDAE 0.0, RMSE 0.006146375089883804
train, size ('7', '0.01'), MSE 3.879439100273885e-05, MAE 0.0003347702440805733, MDAE 0.0, RMSE 0.006228514481335878
train, size ('14', '0.01'), MSE 2.6993018764187582e-05, MAE 0.0001933887688210234, MDAE 0.0, RMSE 0.0051954807713627815
val, size ('1', '0.01'), MSE 0.00020102853886783123, MAE 0.0008073084754869342, MDAE 0.0, RMSE 0.014178453013300896
val, size ('2', '0.01'), MSE 3.581792407203466e-05, MAE 0.0002903495915234089, MDAE 0.0, RMSE 0.005984807852655649
val, size ('7', '0.01'), MSE 2.249710632895585e-05, MAE 0.0002713734284043312, MDAE 0.0, RMSE 0.004743111319839954
val, size ('14', '0.01'), MSE 0.0, MAE 0.0, MDAE 0.0, RMSE 0.0
test, size ('1', '0.01'), MSE 2.336430588911753e-05, MAE 0.00022235691722016782, MDAE 0.0, RMSE 0.004833663813769817
test,