# Experiments

In [1]:
import sys, os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils.prep_data import load_data, split_data, mask_data, Experiment
from utils.train import train
from utils.dataset import WindFarmDataset
from GCGRU.GRU import GRU
import torch
from torch.utils.data import DataLoader
import numpy as np

device = "cuda" if torch.cuda.is_available() else "cpu"

### Preprocess data

#### Load the data

In [2]:
data = load_data(columns=["TurbID", "P_norm", "datetime"])
nan_mask = ~data["P_norm"].isna().to_numpy()
# Assign mean to nan values
data_mean = data["P_norm"].mean()
data.loc[~nan_mask, "P_norm"] = data_mean
# subset of turbines for faster experiments
turbines = [9, 10, 11, 12, 31, 32, 33, 34, 35, 52, 53, 54, 55, 56, 57]
data = data[data["TurbID"].isin(turbines)]
train_data, val_data, test_data = split_data(data, splits=[0.7, 0.2, 0.1])

Reset data split indices

In [3]:
train_data.reset_index(drop=True, inplace=True)
val_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

#### Create masks for different splits

In [4]:
random_percentages = [0.01, 0.02, 0.05, 0.1]
blackout_periods = [30, 60, 150, 300]
maintenance_periods = [1, 2, 7, 14]

In [5]:
train_masks_random = { size: mask_data(train_data, base_mask=None, experiment=Experiment.RANDOM, size = size) for size in random_percentages }
val_masks_random = { size: mask_data(val_data, base_mask=None, experiment=Experiment.RANDOM, size = size) for size in random_percentages }
test_masks_random = { size: mask_data(test_data, base_mask=None, experiment=Experiment.RANDOM, size = size) for size in random_percentages }

In [6]:
train_masks_blackout = { size: mask_data(train_data, base_mask=None, experiment=Experiment.BLACKOUT, size=size) for size in blackout_periods }
val_masks_blackout = { size: mask_data(val_data, base_mask=None, experiment=Experiment.BLACKOUT, size=size) for size in blackout_periods }
test_masks_blackout = { size: mask_data(test_data, base_mask=None, experiment=Experiment.BLACKOUT, size=size) for size in blackout_periods }

In [7]:
train_masks_maintenance = { size: mask_data(train_data, base_mask=None, experiment=Experiment.MAINTENANCE, size=size) for size in maintenance_periods }
val_masks_maintenance = { size: mask_data(val_data, base_mask=None, experiment=Experiment.MAINTENANCE, size=size) for size in maintenance_periods }
test_masks_maintenance = { size: mask_data(test_data, base_mask=None, experiment=Experiment.MAINTENANCE, size=size) for size in maintenance_periods }

### RNN-based models

In [8]:
def format_sliding_window(data: np.ndarray, data_mask: np.ndarray, nan_mask: np.ndarray, t: int = 6 * 24):
    X_window, y_window = [], []
    for i in range(data.shape[0] - t - 1):
        xw = data[i:(i+t)]
        # Simulate missing data on X
        xw[~data_mask[i:(i+t)]] = data_mean
        X_window.append(xw)
        # Keep unmasked true data in y
        y_window.append(np.stack([data[(i+1):(i+1+t)], data_mask[(i+1):(i+1+t)], nan_mask[(i+1):(i+1+t)]], axis=-1))

    # conversion to np.ndarray -> then torch.Tensor because of python list of np.ndarray
    return torch.tensor(np.array(X_window)).unsqueeze(-1).to(device), torch.tensor(np.array(y_window)).to(device)

In [9]:
data_9 = train_data[train_data["TurbID"] == 9]["P_norm"].to_numpy()
mask_9 = train_masks_random[0.05][train_data[train_data["TurbID"] == 9].index]
nan_mask_9 = nan_mask[train_data[train_data["TurbID"] == 9].index]

In [10]:
data_9[~nan_mask_9] = 0.5

In [11]:
train_X, train_y = format_sliding_window(data_9, mask_9, nan_mask_9)
train_y.shape

torch.Size([24366, 144, 3])

#### GRU

In [12]:
models = {turb_id: GRU(1, 16, 1) for turb_id in data['TurbID'].unique()}

lr = 0.01
epochs = 100
batch_size = 512
turbine_train_losses, turbine_val_losses = {}, {}
for turb_id, model in models.items():
    optimizer = torch.optim.Adam(list(model.parameters()), lr=lr)
    criterion = torch.nn.MSELoss()

    model.to(device)
    criterion.to(device)

    turb_train_data = train_data[train_data['TurbID'] == turb_id]['P_norm'].to_numpy()
    turb_train_mask = train_masks_random[0.05][train_data[train_data["TurbID"] == turb_id].index]
    turb_train_nan_mask = nan_mask[train_data[train_data["TurbID"] == turb_id].index]

    X_train_window, y_train_window = format_sliding_window(turb_train_data, turb_train_mask, turb_train_nan_mask)
    train_loader = DataLoader(WindFarmDataset(X_train_window, y_train_window), batch_size=batch_size)


    turb_val_data = val_data[val_data['TurbID'] == turb_id]['P_norm'].to_numpy()
    turb_val_mask = val_masks_random[0.05][val_data[val_data["TurbID"] == turb_id].index]
    turb_val_nan_mask = nan_mask[val_data[val_data["TurbID"] == turb_id].index]

    X_val_window, y_val_window = format_sliding_window(turb_val_data, turb_val_mask, turb_val_nan_mask)
    val_loader = DataLoader(WindFarmDataset(X_val_window, y_val_window), batch_size=batch_size)


    print(f"Training GRU for turbine {turb_id}")
    models[turb_id], train_losses, val_losses = train(model, train_loader, val_loader, optimizer, criterion, epochs=epochs, patience=10)
    print("\n\n")
    turbine_train_losses[turb_id] = train_losses
    turbine_val_losses[turb_id] = val_losses

Training GRU for turbine 9
Epoch: 1;	training loss: 0.0028;	validation loss: 0.0011
Epoch: 10;	training loss: 0.0006;	validation loss: 0.0008
Epoch: 20;	training loss: 0.0006;	validation loss: 0.0008
Stopping early on epoch 26



Training GRU for turbine 10
Epoch: 1;	training loss: 0.0321;	validation loss: 0.0023
Epoch: 10;	training loss: 0.0006;	validation loss: 0.0007
Epoch: 20;	training loss: 0.0006;	validation loss: 0.0007
Epoch: 30;	training loss: 0.0006;	validation loss: 0.0007
Epoch: 40;	training loss: 0.0006;	validation loss: 0.0007
Epoch: 50;	training loss: 0.0006;	validation loss: 0.0007
Epoch: 60;	training loss: 0.0006;	validation loss: 0.0007
Epoch: 70;	training loss: 0.0006;	validation loss: 0.0007
Epoch: 80;	training loss: 0.0006;	validation loss: 0.0007
Epoch: 90;	training loss: 0.0006;	validation loss: 0.0007
Stopping early on epoch 96



Training GRU for turbine 11
Epoch: 1;	training loss: 0.0127;	validation loss: 0.0011
Epoch: 10;	training loss: 0.0007;	validation los

In [14]:
for turb_id, model in models.items():
    torch.save(model.state_dict(), f"../data/params/turbine_{turb_id}_params.pth")
    with open("../data/params/train_losses.csv", "a") as f:
        line = f"{turb_id}," + ",".join(map(str, turbine_train_losses[turb_id])) + "\n"
        f.write(line)
    with open("../data/params/val_losses.csv", "a") as f:
        line = f"{turb_id}," + ",".join(map(str, turbine_val_losses[turb_id])) + "\n"
        f.write(line)
