In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, median_absolute_error
import pandas as pd
import numpy as np

In [2]:
# Read the interpolated measurements, the turbine layout (indexed by TurbID)
# and the two NaN masks, then report the shape of each table.
data, pos_data, noise_mask, interval_noise_mask = (
    pd.read_csv('../data/data_interpolated.csv'),
    pd.read_csv('../data/turbines.csv', index_col='TurbID'),
    pd.read_csv('../data/noise_mask.csv'),
    pd.read_csv('../data/interval_noise_mask.csv'),
)

for loaded_frame in (data, pos_data, noise_mask, interval_noise_mask):
    print(loaded_frame.shape)

(4727520, 16)
(134, 2)
(4727520, 1)
(4727520, 1)


In [3]:
# Build two corrupted copies of the data: Patv is set to NaN wherever the
# corresponding mask is True and left untouched everywhere else.
data_noise = data.assign(
    Patv=data['Patv'].where(~noise_mask['NaN_Mask'])
)

data_interval_noise = data.assign(
    Patv=data['Patv'].where(~interval_noise_mask['NaN_Mask'])
)

In [4]:
def MAE(y_true, y_pred):
    """Return the mean absolute error between ``y_true`` and ``y_pred``."""
    error = mean_absolute_error(y_true, y_pred)
    return error

def RMSE(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    error = root_mean_squared_error(y_true, y_pred)
    return error

def MedAE(y_true, y_pred):
    """Return the median absolute error between ``y_true`` and ``y_pred``."""
    error = median_absolute_error(y_true, y_pred)
    return error

In [5]:
def impute_linear_interpolation(df):
    """Fill missing ``Patv`` values by linear interpolation within each turbine.

    Rows are grouped by ``TurbID`` and every group's ``Patv`` series is
    interpolated linearly in row order.  ``limit_direction='both'`` also
    fills gaps at the start and end of a turbine's series.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``TurbID`` and ``Patv``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (same rows, order and index) with ``Patv`` filled.
        ``df`` itself is not modified.
    """
    # ``transform`` always returns a Series indexed exactly like ``df``.
    # ``apply`` followed by ``reset_index(level=0, drop=True)`` only does so
    # when pandas prepends the group key; otherwise the reset throws away the
    # real index and the assignment aligns on a fresh RangeIndex.
    filled_patv = df.groupby('TurbID')['Patv'].transform(
        lambda series: series.interpolate(method='linear', limit_direction='both')
    )

    result = df.copy()
    result['Patv'] = filled_patv
    return result

# Fill the noise-masked Patv values by per-turbine linear interpolation.
imputed_linear = impute_linear_interpolation(data_noise)

In [6]:
# Pairwise Euclidean distances between all turbines, labelled by turbine id on
# both axes.  Computed with NumPy broadcasting instead of a double Python loop
# over scalar ``.loc`` reads/writes, and stored explicitly as float (the
# integer dtype of the index cannot represent the distances).
turbine_ids = pos_data.index.values
coords = pos_data[['x', 'y']].to_numpy(dtype=float)

# (n, 1) minus (1, n) gives the coordinate difference of every turbine pair.
dx = coords[:, 0][:, None] - coords[:, 0][None, :]
dy = coords[:, 1][:, None] - coords[:, 1][None, :]
pairwise_dist = np.sqrt(dx ** 2 + dy ** 2)

max_dist = pairwise_dist.max()
# Put each self-distance above every real distance (presumably so a turbine
# never ranks as its own nearest neighbour).
np.fill_diagonal(pairwise_dist, max_dist + 1)

distance_df = pd.DataFrame(pairwise_dist, index=turbine_ids, columns=turbine_ids)
    

In [7]:
from joblib import Parallel, delayed
import pandas as pd
import numpy as np

def impute_knn_spatial_parallel(df, distance_df, k=5, p=2, n_jobs=-1):
    """Fill missing ``Patv`` values from the ``k`` spatially nearest turbines.

    Every missing reading of a turbine is replaced by the inverse-distance
    weighted average (weight ``1 / distance ** p``) of the readings its ``k``
    nearest turbines have at the same timestep ``T``.  Neighbours whose own
    reading is missing are skipped and the remaining weights renormalised;
    if no neighbour has a reading the value is set to 0.  Neighbour readings
    are always taken from the original ``df``, never from imputed values.

    The computation is vectorised per turbine rather than looping over
    missing rows in Python.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``TurbID``, ``T`` and ``Patv``.
    distance_df : pandas.DataFrame
        Square turbine-by-turbine distance table whose index and columns are
        turbine ids.
    k : int, default 5
        Number of nearest turbines to use.
    p : float, default 2
        Exponent of the inverse-distance weight.
    n_jobs : int, default -1
        Accepted for backward compatibility only; the vectorised
        implementation runs in-process and ignores it.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``Patv`` imputed for every turbine listed in
        ``distance_df.index``.  Row order and index of ``df`` are preserved so
        the result can be compared position-by-position with the input.
    """
    # Smallest distance used for weighting: co-located turbines (distance 0)
    # would otherwise get an infinite weight and turn the estimate into NaN.
    min_distance = 1e-9

    # Timestep x turbine table of the observed readings.  keep='last' resolves
    # duplicate (TurbID, T) pairs the same way a dict lookup would.
    observed = (
        df.drop_duplicates(subset=['TurbID', 'T'], keep='last')
        .pivot(index='T', columns='TurbID', values='Patv')
    )

    patv = df['Patv'].to_numpy(dtype=float, copy=True)
    missing_pos = np.flatnonzero(np.isnan(patv))
    missing_turbine = df['TurbID'].to_numpy()[missing_pos]
    missing_time = df['T'].to_numpy()[missing_pos]

    for tid in distance_df.index:
        selected = missing_turbine == tid
        if not selected.any():
            continue

        nearest = distance_df.loc[tid].drop(tid).astype(float).nsmallest(k)
        clamped = np.maximum(nearest.to_numpy(dtype=float), min_distance)
        weights = 1.0 / clamped ** p
        weights /= weights.sum()

        # One row per missing timestep, one column per neighbour; neighbours
        # or timesteps without data show up as NaN.
        neighbor_vals = observed.reindex(
            index=missing_time[selected], columns=nearest.index
        ).to_numpy(dtype=float)
        available = ~np.isnan(neighbor_vals)

        weight_sum = available.astype(float) @ weights
        value_sum = np.where(available, neighbor_vals, 0.0) @ weights

        # Rows without any available neighbour keep the fallback value 0.
        estimates = np.zeros_like(weight_sum)
        np.divide(value_sum, weight_sum, out=estimates, where=weight_sum > 0)
        patv[missing_pos[selected]] = estimates

    imputed = df.copy()
    imputed['Patv'] = patv
    return imputed

# Impute the masked Patv values from the 5 nearest turbines, weighting each
# neighbour by 1 / distance**2.
imputed_weighted = impute_knn_spatial_parallel(data_noise, distance_df, k=5, p=2)

Imputed 134

In [8]:
from joblib import Parallel, delayed
import pandas as pd
import numpy as np

def impute_knn_spatial_mean_parallel(df, distance_df, k=5, n_jobs=-1):
    """Fill missing ``Patv`` values with the mean of the ``k`` nearest turbines.

    Every missing reading of a turbine is replaced by the unweighted mean of
    the readings its ``k`` nearest turbines have at the same timestep ``T``.
    Neighbours whose own reading is missing are left out of the mean; if no
    neighbour has a reading the value is set to 0.  Neighbour readings are
    always taken from the original ``df``, never from imputed values.

    The computation is vectorised per turbine rather than looping over
    missing rows in Python.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``TurbID``, ``T`` and ``Patv``.
    distance_df : pandas.DataFrame
        Square turbine-by-turbine distance table whose index and columns are
        turbine ids.
    k : int, default 5
        Number of nearest turbines to average.
    n_jobs : int, default -1
        Accepted for backward compatibility only; the vectorised
        implementation runs in-process and ignores it.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``Patv`` imputed for every turbine listed in
        ``distance_df.index``.  Row order and index of ``df`` are preserved so
        the result can be compared position-by-position with the input.
    """
    # Timestep x turbine table of the observed readings.  keep='last' resolves
    # duplicate (TurbID, T) pairs the same way a dict lookup would.
    observed = (
        df.drop_duplicates(subset=['TurbID', 'T'], keep='last')
        .pivot(index='T', columns='TurbID', values='Patv')
    )

    patv = df['Patv'].to_numpy(dtype=float, copy=True)
    missing_pos = np.flatnonzero(np.isnan(patv))
    missing_turbine = df['TurbID'].to_numpy()[missing_pos]
    missing_time = df['T'].to_numpy()[missing_pos]

    for tid in distance_df.index:
        selected = missing_turbine == tid
        if not selected.any():
            continue

        # Cast to float first so ranking also works on an object-dtype row.
        neighbor_ids = distance_df.loc[tid].drop(tid).astype(float).nsmallest(k).index

        # One row per missing timestep, one column per neighbour; neighbours
        # or timesteps without data show up as NaN.
        neighbor_vals = observed.reindex(
            index=missing_time[selected], columns=neighbor_ids
        ).to_numpy(dtype=float)

        counts = (~np.isnan(neighbor_vals)).sum(axis=1)
        totals = np.nansum(neighbor_vals, axis=1)

        # Rows without any available neighbour keep the fallback value 0.
        estimates = np.zeros(len(counts), dtype=float)
        np.divide(totals, counts, out=estimates, where=counts > 0)
        patv[missing_pos[selected]] = estimates

    imputed = df.copy()
    imputed['Patv'] = patv
    return imputed

# Impute the masked Patv values with the plain mean of the 5 nearest turbines.
imputed_mean = impute_knn_spatial_mean_parallel(data_noise, distance_df, k=5)


Imputed 134

In [9]:
def evaluate_imputation(true_df, imputed_df, mask_df, method_name=''):
    """
    Print MAE, RMSE, and MedAE of the imputed ``Patv`` values at masked rows.

    Rows are matched by position, so ``true_df``, ``imputed_df`` and
    ``mask_df`` must have the same length and row order.

    Parameters
    ----------
    true_df : pandas.DataFrame
        Ground truth; its ``Patv`` column supplies the reference values.
    imputed_df : pandas.DataFrame
        Imputation result; its ``Patv`` column supplies the predictions.
    mask_df : pandas.DataFrame
        Its ``NaN_Mask`` column is truthy on the rows to evaluate.
    method_name : str, default ''
        Label printed in the report header.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Coerce to a real boolean array: an integer 0/1 mask would otherwise be
    # interpreted by NumPy as row positions and silently select wrong rows.
    mask = mask_df['NaN_Mask'].to_numpy(dtype=bool)

    y_true = true_df['Patv'].to_numpy()[mask]
    y_pred = imputed_df['Patv'].to_numpy()[mask]

    print(f"\n📊 Error Metrics for {method_name}:")
    print(f"  MAE   = {MAE(y_true, y_pred):.3f}")
    print(f"  RMSE  = {RMSE(y_true, y_pred):.3f}")
    print(f"  MedAE = {MedAE(y_true, y_pred):.3f}")

# Report the error of every imputation method on the rows hidden by noise_mask.
for method_label, method_result in (
    ('Linear Interpolation', imputed_linear),
    ('kNN Mean (k=5)', imputed_mean),
    ('kNN Weighted (k=5, p=2)', imputed_weighted),
):
    evaluate_imputation(data, method_result, noise_mask, method_name=method_label)


📊 Error Metrics for Linear Interpolation:
  MAE   = 45.341
  RMSE  = 86.134
  MedAE = 18.160

📊 Error Metrics for kNN Mean (k=5):
  MAE   = 66.229
  RMSE  = 137.395
  MedAE = 26.100

📊 Error Metrics for kNN Weighted (k=5, p=2):
  MAE   = 64.093
  RMSE  = 140.009
  MedAE = 23.231
