In [1]:
import sys, os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
import time
import networkx as nx
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader
from torch_geometric.utils import from_networkx

from utils.prep_data import load_data, split_data, mask_data, Experiment
from utils.train import train
from utils.dataset import WindFarmDataset
from GCGRU.GRU import GRU
from GCGRU.GCGRU import GCGRU
from copy import deepcopy

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
full_graph = True

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, median_absolute_error, mean_squared_error
import pandas as pd
import numpy as np
import math

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [2]:
data = load_data(columns=["TurbID", "Wspd", "Wdir", "Etmp", "Itmp", "Ndir", "Pab1", "Pab2", "Pab3", "Prtv", "Patv", "datetime", "P_norm"])
nan_mask = ~data["Patv"].isna().to_numpy()
# subset of turbines for faster experiments
if full_graph:
    turbines_idx = data.TurbID.unique()
else:
    turbines_idx = [9, 10, 11, 12, 31, 32, 33, 34, 35, 52, 53, 54, 55, 56, 57]
data = data[data["TurbID"].isin(turbines_idx)]

data = data.sort_values(["datetime", "TurbID"]).reset_index(drop=True)
data['T'] = data.groupby("TurbID").cumcount()

# normalize features
features = ["Wspd", "Wdir", "Etmp", "Itmp", "Ndir", "Pab1", "Pab2", "Pab3", "Prtv", "Patv"]
data[features] = data[features].apply(lambda col: ((col - col.min()) / (col.max() - col.min())))

train_data, val_data, test_data = split_data(data, splits=[0.7, 0.2, 0.1])

In [3]:
datasets = {
    "train": train_data.copy(),
    "val": val_data.copy(),
    "test": test_data.copy()
}

In [4]:
turbines = pd.read_csv('../data/turbines.csv', index_col=False)
turbines = turbines[turbines['TurbID'].isin(turbines_idx)]
turbines.info()

<class 'pandas.core.frame.DataFrame'>
Index: 133 entries, 0 to 133
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   TurbID  133 non-null    int64  
 1   x       133 non-null    float64
 2   y       133 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 4.2 KB


In [5]:
x_min = turbines['x'].min()
x_max = turbines['x'].max()
y_min = turbines['y'].min()
y_max = turbines['y'].max()
turbines['x'] = (turbines['x'] - x_min) / (x_max - x_min)
turbines['y'] = (turbines['y'] - y_min) / (y_max - y_min)

In [6]:
def MAE(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)

def RMSE(y_true, y_pred):
    return root_mean_squared_error(y_true, y_pred)

def MedAE(y_true, y_pred):
    return median_absolute_error(y_true, y_pred)

In [7]:
def impute_linear_interpolation(df):
    copy = df.copy()
    copy['Patv'] = (
        df.groupby('TurbID')['Patv']
        .apply(lambda group: group.interpolate(method='linear', limit_direction='both'))
        .reset_index(level=0, drop=True)
    )
    return copy

In [8]:
knn_turbines = turbines.set_index('TurbID')
turbine_ids = knn_turbines.index
distance_df = pd.DataFrame(index=turbine_ids, columns=turbine_ids, dtype=float)

for i in turbine_ids:
    xi, yi = knn_turbines.loc[i, ['x', 'y']].astype(float)
    for j in turbine_ids:
        if i == j:
            distance_df.loc[i, i] = np.inf
        xj, yj = knn_turbines.loc[j, ['x', 'y']].astype(float)
        distance_df.loc[i, j] = np.sqrt((xi - xj) ** 2 + (yi - yj) ** 2)
    
order = np.argsort(distance_df.values, axis=1)
sorted_ids = distance_df.columns.to_numpy()[order]
sorted_neighbors = pd.DataFrame(sorted_ids, index=distance_df.index)

def impute_knn_spatial_mean_parallel(df, k=5):
    result = df.copy()
    patv_mat = result.pivot(index='T', columns='TurbID', values='Patv')

    for tid in sorted_neighbors.index:
        neighbors = sorted_neighbors.loc[tid]
        mask = (result['TurbID'] == tid) & (result['Patv'].isna())
        missing_idx = result.index[mask]

        for idx in missing_idx:
            t = result.at[idx, 'T']
            vals = patv_mat.loc[t, neighbors].dropna()
            if not vals.empty:
                result.at[idx, 'Patv'] = vals.iloc[:k].mean()

    return result

In [9]:
def Baseline_experiment(baseline_method, experiments: dict, experiment: Experiment, target_column: str):
    exp = experiments[experiment]
    for name, masks in exp.items():
        df = datasets[name]
        # keep original order/index; compute y_true once
        nan_mask = df[target_column].notna()
        y_true = df.loc[nan_mask, target_column].to_numpy()

        for size, mask in masks.items():

            # cheap copy + mask in one pass (no deepcopy)
            input_data = df.copy()
            input_data[target_column] = df[target_column].where(mask, np.nan)

            # run baseline; force same row order/index as original
            runtime = time.time()
            result = baseline_method(input_data)
            runtime = time.time() - runtime

            # If baseline returns same index as input (ideal):
            if result.index.equals(df.index):
                y_pred = result.loc[nan_mask, target_column].to_numpy()
            else:
                # Fallback: enforce strict alignment by original index
                result = result.reindex(df.index)
                y_pred = result.loc[nan_mask, target_column].to_numpy()

            # use numpy once; avoids extra pandas copies
            test_mask = ~((pd.Series(mask,index=df.index)) & (nan_mask))
            diff = df.loc[test_mask, target_column] - result.loc[test_mask, target_column]
            mse = np.mean(diff ** 2)
            mae = np.mean(np.abs(diff))
            mdae = np.median(np.abs(diff))
            rmse = np.sqrt(mse)

            print(f"{name}, size {size}, MSE {mse}, MAE {mae}, MDAE {mdae}, RMSE {rmse}, time {runtime}s")


In [10]:
random_percentages = [(0.01, None), (0.02, None), (0.05, None), (0.1, None)]
blackout_periods = [(30, 0.01), (60, 0.01), (150, 0.01), (300, 0.01)]
# blackout_periods = [(30, 0.01), (30, 0.02), (30, 0.05), (30, 0.1), (60, 0.01), (60, 0.02), (60, 0.05), (60, 0.1), (150, 0.01), (150, 0.02), (150, 0.05), (150, 0.1), (300, 0.01), (300, 0.02), (300, 0.05), (300, 0.1)]
maintenance_periods = [(1, 0.01), (2, 0.01), (7, 0.01), (14, 0.01)]
# maintenance_periods = [(1, 0.01), (1, 0.02), (1, 0.05), (1, 0.1), (2, 0.01), (2, 0.02), (2, 0.05), (2, 0.1), (7, 0.01), (7, 0.02), (7, 0.05), (7, 0.1), (14, 0.01), (14, 0.02), (14, 0.05), (14, 0.1)]

In [11]:
experiments = {}
for experiment_name in [Experiment.RANDOM, Experiment.BLACKOUT, Experiment.MAINTENANCE]:
    experiments[experiment_name] = {}
    for stage in ["train", "val", "test"]:
        experiments[experiment_name][stage] = pd.read_csv(f"../data/masks_{experiment_name}_{stage}_{full_graph}.csv", index_col=0, header=[0,1]).to_dict(orient="list")

In [12]:
target_column = "Patv"

In [13]:
Baseline_experiment(impute_linear_interpolation, experiments, Experiment.RANDOM, target_column)

train, size ('0.01', 'nan'), MSE 0.0034801913425326347, MAE 0.032574255019426346, MDAE nan, RMSE 0.0589931458234787, time 1.7894737720489502s
train, size ('0.02', 'nan'), MSE 0.0034878894221037626, MAE 0.032503776252269745, MDAE nan, RMSE 0.059058357030153275, time 1.543189525604248s
train, size ('0.05', 'nan'), MSE 0.0035488163121044636, MAE 0.03270843252539635, MDAE nan, RMSE 0.059571944177150726, time 1.7212767601013184s
train, size ('0.1', 'nan'), MSE 0.0036766340490430593, MAE 0.03331367298960686, MDAE nan, RMSE 0.06063525378704071, time 1.5197515487670898s
val, size ('0.01', 'nan'), MSE 0.0015214182203635573, MAE 0.02000289037823677, MDAE nan, RMSE 0.03900536149740219, time 0.26654648780822754s
val, size ('0.02', 'nan'), MSE 0.001543801510706544, MAE 0.020479874685406685, MDAE nan, RMSE 0.03929124027490616, time 0.23034191131591797s
val, size ('0.05', 'nan'), MSE 0.0015615444863215089, MAE 0.02075336128473282, MDAE nan, RMSE 0.039516381919384, time 0.27513623237609863s
val, size 

In [14]:
Baseline_experiment(impute_linear_interpolation, experiments, Experiment.BLACKOUT, target_column)

train, size ('30', '0.01'), MSE 0.005712533835321665, MAE 0.04264602065086365, MDAE nan, RMSE 0.07558130472898483, time 1.4627208709716797s
train, size ('60', '0.01'), MSE 0.009125585667788982, MAE 0.05450702831149101, MDAE nan, RMSE 0.09552793204784393, time 1.5304770469665527s
train, size ('150', '0.01'), MSE 0.015618128702044487, MAE 0.0759606659412384, MDAE nan, RMSE 0.12497251480817795, time 1.3586702346801758s
train, size ('300', '0.01'), MSE 0.02566596120595932, MAE 0.1015004962682724, MDAE nan, RMSE 0.16020599007606506, time 1.5022873878479004s
val, size ('30', '0.01'), MSE 0.0027670827694237232, MAE 0.02881596051156521, MDAE nan, RMSE 0.05260306969285011, time 0.25208449363708496s
val, size ('60', '0.01'), MSE 0.004262839909642935, MAE 0.03674185648560524, MDAE nan, RMSE 0.0652904286980629, time 0.3080027103424072s
val, size ('150', '0.01'), MSE 0.009113645181059837, MAE 0.0539199523627758, MDAE nan, RMSE 0.09546541422605515, time 0.2763824462890625s
val, size ('300', '0.01'),

In [15]:
Baseline_experiment(impute_linear_interpolation, experiments, Experiment.MAINTENANCE, target_column)

train, size ('1', '0.01'), MSE 0.05824809893965721, MAE 0.16786019504070282, MDAE nan, RMSE 0.24134643375873566, time 1.623286247253418s
train, size ('2', '0.01'), MSE 0.07916086167097092, MAE 0.2037838250398636, MDAE nan, RMSE 0.2813554108142853, time 1.5149908065795898s
train, size ('7', '0.01'), MSE 0.1163896769285202, MAE 0.25379475951194763, MDAE nan, RMSE 0.34115931391716003, time 1.3850679397583008s
train, size ('14', '0.01'), MSE 0.08550097793340683, MAE 0.21743382513523102, MDAE nan, RMSE 0.29240551590919495, time 1.4765796661376953s
val, size ('1', '0.01'), MSE 0.03734096139669418, MAE 0.13718175888061523, MDAE nan, RMSE 0.19323809444904327, time 0.22762608528137207s
val, size ('2', '0.01'), MSE 0.07593579590320587, MAE 0.19904184341430664, MDAE nan, RMSE 0.2755644917488098, time 0.2936210632324219s
val, size ('7', '0.01'), MSE 0.11831554025411606, MAE 0.24980565905570984, MDAE nan, RMSE 0.34397026896476746, time 0.2834615707397461s
val, size ('14', '0.01'), MSE 0.15825904905

In [16]:
Baseline_experiment(impute_knn_spatial_mean_parallel, experiments, Experiment.RANDOM, target_column)

train, size ('0.01', 'nan'), MSE 0.008552166633307934, MAE 0.0451088547706604, MDAE nan, RMSE 0.09247792512178421, time 23.46452569961548s
train, size ('0.02', 'nan'), MSE 0.008376258425414562, MAE 0.0447993203997612, MDAE nan, RMSE 0.09152190387248993, time 40.92345333099365s
train, size ('0.05', 'nan'), MSE 0.00828421488404274, MAE 0.0447966568171978, MDAE nan, RMSE 0.09101766347885132, time 93.7574508190155s
train, size ('0.1', 'nan'), MSE 0.008316470310091972, MAE 0.04508782550692558, MDAE nan, RMSE 0.09119468182325363, time 178.35557913780212s
val, size ('0.01', 'nan'), MSE 0.006976909935474396, MAE 0.03965092450380325, MDAE nan, RMSE 0.08352790027856827, time 7.359233140945435s
val, size ('0.02', 'nan'), MSE 0.006970355752855539, MAE 0.04011167958378792, MDAE nan, RMSE 0.08348865807056427, time 12.503087997436523s
val, size ('0.05', 'nan'), MSE 0.0066682216711342335, MAE 0.03964371979236603, MDAE nan, RMSE 0.08165918290615082, time 27.34220862388611s
val, size ('0.1', 'nan'), MSE

In [17]:
Baseline_experiment(impute_knn_spatial_mean_parallel, experiments, Experiment.BLACKOUT, target_column)

train, size ('30', '0.01'), MSE 0.007808317895978689, MAE 0.04391363263130188, MDAE nan, RMSE 0.08836469054222107, time 23.45202398300171s
train, size ('60', '0.01'), MSE 0.007559914141893387, MAE 0.043392207473516464, MDAE nan, RMSE 0.08694776892662048, time 23.500245809555054s
train, size ('150', '0.01'), MSE 0.00890881847590208, MAE 0.04636640101671219, MDAE nan, RMSE 0.09438654035329819, time 23.55931568145752s
train, size ('300', '0.01'), MSE 0.00785677507519722, MAE 0.044721778482198715, MDAE nan, RMSE 0.08863845467567444, time 24.956033945083618s
val, size ('30', '0.01'), MSE 0.00686305807903409, MAE 0.038821991533041, MDAE nan, RMSE 0.08284357935190201, time 7.948244094848633s
val, size ('60', '0.01'), MSE 0.007697091903537512, MAE 0.04126465693116188, MDAE nan, RMSE 0.08773307502269745, time 7.509190320968628s
val, size ('150', '0.01'), MSE 0.008234948851168156, MAE 0.042027413845062256, MDAE nan, RMSE 0.09074661880731583, time 7.498984336853027s
val, size ('300', '0.01'), MSE

In [18]:
Baseline_experiment(impute_knn_spatial_mean_parallel, experiments, Experiment.MAINTENANCE, target_column)

train, size ('1', '0.01'), MSE 0.007152040489017963, MAE 0.045240070670843124, MDAE nan, RMSE 0.08456973731517792, time 24.628334045410156s
train, size ('2', '0.01'), MSE 0.00993603840470314, MAE 0.047705259174108505, MDAE nan, RMSE 0.09967967867851257, time 24.007489681243896s
train, size ('7', '0.01'), MSE 0.005995853804051876, MAE 0.041023313999176025, MDAE nan, RMSE 0.07743290066719055, time 23.242685079574585s
train, size ('14', '0.01'), MSE 0.007090313360095024, MAE 0.04262137785553932, MDAE nan, RMSE 0.08420399576425552, time 23.42939782142639s
val, size ('1', '0.01'), MSE 0.004227102734148502, MAE 0.036197565495967865, MDAE nan, RMSE 0.06501617282629013, time 7.31433367729187s
val, size ('2', '0.01'), MSE 0.004309969954192638, MAE 0.03640270233154297, MDAE nan, RMSE 0.06565035879611969, time 7.338760137557983s
val, size ('7', '0.01'), MSE 0.003894123714417219, MAE 0.0350886806845665, MDAE nan, RMSE 0.062402915209531784, time 7.255992650985718s
val, size ('14', '0.01'), MSE 0.00