In [1]:
import sys, os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
import time
import networkx as nx
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader
from torch_geometric.utils import from_networkx

from utils.prep_data import load_data, split_data, mask_data, Experiment
from utils.train import train
from utils.dataset import WindFarmDataset
from GCGRU.GRU import GRU
from GCGRU.GCGRU import GCGRU
from copy import deepcopy

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, median_absolute_error, mean_squared_error
import pandas as pd
import numpy as np
import math

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [2]:
data = load_data(columns=["TurbID", "Wspd", "Wdir", "Etmp", "Itmp", "Ndir", "Pab1", "Pab2", "Pab3", "Prtv", "Patv", "datetime", "P_norm"])
nan_mask = ~data["Patv"].isna().to_numpy()
# subset of turbines for faster experiments
turbines_idx = data.TurbID.unique()
turbines_idx = [9, 10, 11, 12, 31, 32, 33, 34, 35, 52, 53, 54, 55, 56, 57]
data = data[data["TurbID"].isin(turbines_idx)]

data = data.sort_values(["datetime", "TurbID"]).reset_index(drop=True)
data['T'] = data.groupby("TurbID").cumcount()

# normalize features
features = ["Wspd", "Wdir", "Etmp", "Itmp", "Ndir", "Pab1", "Pab2", "Pab3", "Prtv", "Patv"]
data[features] = data[features].apply(lambda col: ((col - col.min()) / (col.max() - col.min())))

train_data, val_data, test_data = split_data(data, splits=[0.7, 0.2, 0.1])

In [3]:
datasets = {
    "train": train_data.copy(),
    "val": val_data.copy(),
    "test": test_data.copy()
}

In [4]:
turbines = pd.read_csv('../data/turbines.csv', index_col=False)
turbines = turbines[turbines['TurbID'].isin(turbines_idx)]
turbines.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15 entries, 8 to 56
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   TurbID  15 non-null     int64  
 1   x       15 non-null     float64
 2   y       15 non-null     float64
dtypes: float64(2), int64(1)
memory usage: 480.0 bytes


In [5]:
x_min = turbines['x'].min()
x_max = turbines['x'].max()
y_min = turbines['y'].min()
y_max = turbines['y'].max()
turbines['x'] = (turbines['x'] - x_min) / (x_max - x_min)
turbines['y'] = (turbines['y'] - y_min) / (y_max - y_min)

In [6]:
def MAE(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)

def RMSE(y_true, y_pred):
    return root_mean_squared_error(y_true, y_pred)

def MedAE(y_true, y_pred):
    return median_absolute_error(y_true, y_pred)

In [7]:
def impute_linear_interpolation(df):
    copy = df.copy()
    copy['Patv'] = (
        df.groupby('TurbID')['Patv']
        .apply(lambda group: group.interpolate(method='linear', limit_direction='both'))
        .reset_index(level=0, drop=True)
    )
    return copy

In [8]:
distance_df = pd.DataFrame(index=turbines.TurbID.unique(), columns=turbines.TurbID.unique(), dtype=turbines.TurbID.dtype)
turbine_ids = turbines.index.values

for i in turbine_ids:
    xi, yi = turbines.loc[i, ['x', 'y']]
    for j in turbine_ids:
        if i == j:
            continue
        else:
            xj, yj = turbines.loc[j, ['x', 'y']]
            dist = np.sqrt((xi - xj) ** 2 + (yi - yj) ** 2)
            distance_df.loc[i, j] = dist

max_dist = distance_df.max().max()
for i in turbines.index.values:
    distance_df.loc[i, i] = max_dist + 1
    

nearest_neighbors = {}
for tid in distance_df.index:
    distances = distance_df.loc[tid].drop(tid)
    nearest = distances.nsmallest(5)
    nearest_neighbors[tid] = nearest.index.tolist()

In [9]:
from joblib import Parallel, delayed
import pandas as pd
import numpy as np

def impute_knn_spatial_mean_parallel(df, k=5, n_jobs=-1):
    # Step 1: Precompute Patv lookup
    patv_lookup = df.set_index(['TurbID', 'T'])['Patv'].to_dict()

    # Step 2: Precompute nearest neighbors (no weights)
    nearest_neighbors = {}
    for tid in distance_df.index:
        distances = distance_df.loc[tid].drop(tid)
        nearest = distances.nsmallest(k)
        nearest_neighbors[tid] = nearest.index.tolist()

    # Step 3: Define per-turbine imputation function
    def impute_turbine(tid):
        target_df = df[df['TurbID'] == tid].copy()
        missing_mask = target_df['Patv'].isna()

        for idx in target_df[missing_mask].index:
            timestep = target_df.loc[idx, 'T']

            neighbor_vals = [
                patv_lookup.get((neighbor_id, timestep), np.nan)
                for neighbor_id in nearest_neighbors[tid]
            ]
            neighbor_vals = [val for val in neighbor_vals if not pd.isna(val)]

            if neighbor_vals:
                target_df.at[idx, 'Patv'] = float(np.mean(neighbor_vals))
            else:
                target_df.at[idx, 'Patv'] = 0

        print(f"Imputed {tid}", end="\r")
        return target_df

    # Step 4: Run imputation in parallel
    turbine_ids = distance_df.index.tolist()
    imputed_list = Parallel(n_jobs=n_jobs)(
        delayed(impute_turbine)(tid) for tid in turbine_ids
    )

    # Step 5: Combine all turbine data back together
    final_df = pd.concat(imputed_list, ignore_index=True)
    final_df = final_df.sort_values(["datetime", "TurbID"]).reset_index(drop=True)
    return final_df


In [10]:
def Baseline_experiment(baseline_method, experiments: dict, experiment: Experiment, target_column: str):
    experiment_masks = experiments[experiment]
    for name, masks in experiment_masks.items():
        nan_mask = datasets[name][target_column].notna()
        for size, mask in masks.items():
            input_data = deepcopy(datasets[name])
            input_data.loc[~mask, target_column] = np.nan
            result = baseline_method(input_data)
            y_pred = result.set_index("TurbID").loc[datasets[name].loc[nan_mask, "TurbID"], target_column]
            mse = mean_squared_error(datasets[name][nan_mask][target_column], y_pred)
            mae = mean_absolute_error(datasets[name][nan_mask][target_column], y_pred)
            mdae = median_absolute_error(datasets[name][nan_mask][target_column], y_pred)
            rmse = root_mean_squared_error(datasets[name][nan_mask][target_column], y_pred)
            print(f"{name}, size {size}, MSE {mse}, MAE {mae}, MDAE {mdae}, RMSE {rmse}")

In [11]:
random_percentages = [(0.01, None), (0.02, None), (0.05, None), (0.1, None)]
blackout_periods = [(30, 0.01), (60, 0.01), (150, 0.01), (300, 0.01)]
# blackout_periods = [(30, 0.01), (30, 0.02), (30, 0.05), (30, 0.1), (60, 0.01), (60, 0.02), (60, 0.05), (60, 0.1), (150, 0.01), (150, 0.02), (150, 0.05), (150, 0.1), (300, 0.01), (300, 0.02), (300, 0.05), (300, 0.1)]
maintenance_periods = [(1, 0.01), (2, 0.01), (7, 0.01), (14, 0.01)]
# maintenance_periods = [(1, 0.01), (1, 0.02), (1, 0.05), (1, 0.1), (2, 0.01), (2, 0.02), (2, 0.05), (2, 0.1), (7, 0.01), (7, 0.02), (7, 0.05), (7, 0.1), (14, 0.01), (14, 0.02), (14, 0.05), (14, 0.1)]

In [12]:
train_masks_random = { (size, fraction): mask_data(train_data, base_mask=None, experiment=Experiment.RANDOM, size = size, fraction=fraction) for (size, fraction) in random_percentages }
val_masks_random = { (size, fraction): mask_data(val_data, base_mask=None, experiment=Experiment.RANDOM, size = size, fraction=fraction) for (size, fraction) in random_percentages }
test_masks_random = { (size, fraction): mask_data(test_data, base_mask=None, experiment=Experiment.RANDOM, size = size, fraction=fraction) for (size, fraction) in random_percentages }

In [13]:
train_masks_blackout = { (size, fraction): mask_data(train_data, base_mask=None, experiment=Experiment.BLACKOUT, size=size, fraction=fraction) for (size, fraction) in blackout_periods }
val_masks_blackout = { (size, fraction): mask_data(val_data, base_mask=None, experiment=Experiment.BLACKOUT, size=size, fraction=fraction) for (size, fraction) in blackout_periods }
test_masks_blackout = { (size, fraction): mask_data(test_data, base_mask=None, experiment=Experiment.BLACKOUT, size=size, fraction=fraction) for (size, fraction) in blackout_periods }

In [14]:
train_masks_maintenance = { (size, fraction): mask_data(train_data, base_mask=None, experiment=Experiment.MAINTENANCE, size=size, fraction=fraction) for (size, fraction) in maintenance_periods }
val_masks_maintenance = { (size, fraction): mask_data(val_data, base_mask=None, experiment=Experiment.MAINTENANCE, size=size, fraction=fraction) for (size, fraction) in maintenance_periods }
test_masks_maintenance = { (size, fraction): mask_data(test_data, base_mask=None, experiment=Experiment.MAINTENANCE, size=size, fraction=fraction) for (size, fraction) in maintenance_periods }

In [15]:
experiments = {
    Experiment.RANDOM: {
        "train": train_masks_random,
        "val": val_masks_random,
        "test": test_masks_random,
    },
    Experiment.BLACKOUT: {
        "train": train_masks_blackout,
        "val": val_masks_blackout,
        "test": test_masks_blackout,
    },
    Experiment.MAINTENANCE: {
        "train": train_masks_maintenance,
        "val": val_masks_maintenance,
        "test": test_masks_maintenance,
    }
}

In [None]:
target_column = "Patv"

: 

In [None]:
Baseline_experiment(impute_linear_interpolation, experiments, Experiment.RANDOM, target_column)

In [None]:
Baseline_experiment(impute_linear_interpolation, experiments, Experiment.BLACKOUT, target_column)

train, size (30, 0.01), MSE 6.035055048414506e-05, MAE 0.0004432430141605437, MDAE 0.0, RMSE 0.007768561597913504
train, size (60, 0.01), MSE 0.00010193928028456867, MAE 0.0005523453000932932, MDAE 0.0, RMSE 0.010096498765051365
train, size (150, 0.01), MSE 0.00017114296497311443, MAE 0.0007971234736032784, MDAE 0.0, RMSE 0.013082162477076054
train, size (300, 0.01), MSE 0.00027617672458291054, MAE 0.0010665463050827384, MDAE 0.0, RMSE 0.01661856658756733
val, size (30, 0.01), MSE 2.5239205569960177e-05, MAE 0.0002783160307444632, MDAE 0.0, RMSE 0.005023863632231951
val, size (60, 0.01), MSE 3.948717494495213e-05, MAE 0.00036669219844043255, MDAE 0.0, RMSE 0.006283882074058056
val, size (150, 0.01), MSE 0.00012401872663758695, MAE 0.0006561026675626636, MDAE 0.0, RMSE 0.011136369779706001
val, size (300, 0.01), MSE 0.00019709811022039503, MAE 0.0009471580851823092, MDAE 0.0, RMSE 0.01403916347771883
test, size (30, 0.01), MSE 1.2774959031958133e-05, MAE 0.00017005913832690567, MDAE 0.0

In [None]:
Baseline_experiment(impute_linear_interpolation, experiments, Experiment.MAINTENANCE, target_column)

train, size (1, 0.01), MSE 0.000587958435062319, MAE 0.0017622595187276602, MDAE 0.0, RMSE 0.024247854948043823
train, size (2, 0.01), MSE 0.0010650239419192076, MAE 0.0023608694318681955, MDAE 0.0, RMSE 0.03263470530509949
train, size (7, 0.01), MSE 0.0011383052915334702, MAE 0.0023727132938802242, MDAE 0.0, RMSE 0.03373878076672554
train, size (14, 0.01), MSE 0.000696045346558094, MAE 0.0016437132144346833, MDAE 0.0, RMSE 0.026382671669125557
val, size (1, 0.01), MSE 0.000536785984877497, MAE 0.0015761953545734286, MDAE 0.0, RMSE 0.023168642073869705
val, size (2, 0.01), MSE 0.0009182174107991159, MAE 0.0019601238891482353, MDAE 0.0, RMSE 0.030302101746201515
val, size (7, 0.01), MSE 0.0007263424340635538, MAE 0.0021922558080404997, MDAE 0.0, RMSE 0.026950741186738014
val, size (14, 0.01), MSE 0.0, MAE 0.0, MDAE 0.0, RMSE 0.0
test, size (1, 0.01), MSE 0.00010687011672416702, MAE 0.0005574191454797983, MDAE 0.0, RMSE 0.010337800718843937
test, size (2, 0.01), MSE 2.2564208848052658e-0

In [None]:
Baseline_experiment(impute_knn_spatial_mean_parallel, experiments, Experiment.RANDOM, target_column)

Imputed 51

: 

In [None]:
Baseline_experiment(impute_knn_spatial_mean_parallel, experiments, Experiment.BLACKOUT, target_column)

train size 30 MSE: 0.005349527113139629
train size 60 MSE: 0.01811414398252964
train size 150 MSE: 0.0367899052798748
train size 300 MSE: 0.0446394719183445
val size 30 MSE: 0.0031557055190205574
val size 60 MSE: 0.010852232575416565
val size 150 MSE: 0.03259248286485672
val size 300 MSE: 0.04761580377817154
test size 30 MSE: 0.001336216228082776
test size 60 MSE: 0.011795250698924065
test size 150 MSE: 0.030071230605244637
test size 300 MSE: 0.036378972232341766


In [None]:
Baseline_experiment(impute_knn_spatial_mean_parallel, experiments, Experiment.MAINTENANCE, target_column)

train size 1 MSE: 0.0564543716609478
train size 2 MSE: 0.06006365641951561
train size 7 MSE: 0.0605817511677742
train size 14 MSE: 0.06616128236055374
val size 1 MSE: 0.05690144747495651
val size 2 MSE: 0.0576503612101078
val size 7 MSE: 0.06566920876502991
val size 14 MSE: 0.08713645488023758
test size 1 MSE: 0.04370870813727379
test size 2 MSE: 0.045181598514318466
test size 7 MSE: 0.043030161410570145
test size 14 MSE: 0.05615140125155449
