In [7]:
import pandas as pd
import numpy as np

# Load desired dataset to compute extra features

In [None]:
# Raw dataset variant: source CSV and destination CSV for the engineered features.
# NOTE: the next load cell rebinds input_ds / output_ds / df to the smoothed-outliers
# variant, so on a top-to-bottom run that later cell decides what is used downstream.
input_ds, output_ds = "../data/ml-engineer-dataset.csv", "../data/FE_full_dataset.csv"
df = pd.read_csv(filepath_or_buffer=input_ds)

In [3]:
# Smoothed-outliers dataset variant: source CSV and destination CSV for the engineered
# features. Rebinds input_ds / output_ds / df, replacing whatever an earlier load cell set.
input_ds, output_ds = (
    "../data/ml-engineer-dataset-smoothed_outliers.csv",
    "../data/FE_full_dataset_smoothed_outliers.csv",
)
df = pd.read_csv(filepath_or_buffer=input_ds)

In [5]:
# Parse both timestamp columns into timezone-aware (UTC) datetimes so the `.dt`
# accessor can be used on them later.
df = df.assign(
    **{
        ts_col: pd.to_datetime(df[ts_col], utc=True)
        for ts_col in ("contract-delivery", "day-ahead-auction-time")
    }
)

# Feature Engineering

Time-Based Features

    Hour of the Day – Prices vary across the day.
    Day of the Week – Weekday vs. weekend effect.
    Month & Seasonality – Capture seasonal trends.
    Holiday Indicator? – Prices may spike on holidays.
    Sinusoidal Encoding – Cyclical representation of hour, day of week, and month.

Lag Features

    Previous Day’s Price – Autoregressive signal.
    Previous Week’s Price – Weekly patterns.

Rolling Statistics

    Rolling Mean (7-day, 30-day) – Smooth trends.
    Rolling Standard Deviation – Captures volatility.

Volatility Features

    Wind/Solar Generation Variance – Renewable fluctuations.
    Price Volatility – Standard deviation of past prices.

Interaction Features

    Wind x Solar Generation – Combined effect.
    Demand / Renewable Ratio – Supply-demand dynamics.

In [9]:
# --- TIME-BASED FEATURES ---
# Calendar features are read from the delivery timestamp.
# NOTE(review): "contract-delivery" is parsed with utc=True earlier in the notebook, so
# hour / day values are UTC clock values, not local market time — confirm this is intended.
df["hour"] = df["contract-delivery"].dt.hour  # Hour of the day (0-23)
df["day_of_week"] = df["contract-delivery"].dt.dayofweek  # Monday=0, Sunday=6
df["month"] = df["contract-delivery"].dt.month  # Month (1-12)
df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)  # Weekend indicator
df["quarter"] = df["contract-delivery"].dt.quarter
df["year"] = df["contract-delivery"].dt.year
df["dayofyear"] = df["contract-delivery"].dt.dayofyear
df["dayofmonth"] = df["contract-delivery"].dt.day
# ISO week number, computed vectorised instead of a per-row Python lambda.
# Nullable Int64 keeps integer values even if a timestamp failed to parse (NaT).
df["weekofyear"] = df["contract-delivery"].dt.isocalendar().week.astype("Int64")

# Time-Based Features: Sinusoidal encoding
# Each feature is divided by its own period (24 hours, 7 days, 12 months) so the values
# span one full turn of the circle and the wrap-around points (23h -> 0h, Sunday -> Monday,
# December -> January) end up adjacent. Dividing everything by 24 would break that.
df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
df["day_of_week_sin"] = np.sin(2 * np.pi * df["day_of_week"] / 7)
df["day_of_week_cos"] = np.cos(2 * np.pi * df["day_of_week"] / 7)
df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)

# --- LAG FEATURES ---
# shift() is positional, so the frame must be in chronological order first.
# NOTE(review): shifting by 24 * k rows equals "k days ago" only if the rows are strictly
# hourly with no gaps or duplicate timestamps — TODO confirm for the input dataset.
df = df.sort_values("contract-delivery")  # Ensure sorted by time
df["lag_1"] = df["day-ahead-auction-price"].shift(24)  # Previous day price
df["lag_2"] = df["day-ahead-auction-price"].shift(24 * 2)  # Two days ago price
df["lag_3"] = df["day-ahead-auction-price"].shift(24 * 3)  # Three days ago price
df["lag_7"] = df["day-ahead-auction-price"].shift(24 * 7)  # One week ago price
df["lag_14"] = df["day-ahead-auction-price"].shift(24 * 14)  # Two weeks ago price
df["lag_21"] = df["day-ahead-auction-price"].shift(24 * 21)  # Three weeks ago price


# --- ROLLING STATISTICS ---
# Price statistics roll over lag_1 (the price shifted by 24 h), so every window ends at
# the same hour of the previous day and never contains the current row's own price.
# Rolling over the unshifted price would leak the value being predicted into its features
# (assumes day-ahead-auction-price is the prediction target, as the lag features suggest).
df["rolling_mean_7"] = df["lag_1"].rolling(window=7 * 24, min_periods=1).mean()
df["rolling_std_7"] = df["lag_1"].rolling(window=7 * 24, min_periods=1).std()

# --- VOLATILITY FEATURES ---
# Forecast columns are rolled as-is (trailing 24 rows including the current one);
# only the price needs the shift described above.
df["wind_volatility"] = df["wind-forecast"].rolling(window=24, min_periods=1).std()
df["solar_volatility"] = df["solar-forecast"].rolling(window=24, min_periods=1).std()
df["price_volatility"] = df["lag_1"].rolling(window=24, min_periods=1).std()

# --- INTERACTION FEATURES ---
df["wind_solar_interaction"] = df["wind-forecast"] * df["solar-forecast"]
df["demand_to_renewable_ratio"] = df["demand-forecast"] / (df["wind-forecast"] + df["solar-forecast"] + 1e-6)  # Avoid division by zero

# Drop rows with NaN values created due to shifting and rolling windows.
# dropna() checks every column, so rows with missing values in the input columns
# are removed here as well.
df = df.dropna()


In [10]:
# Write the engineered feature table to the output path set in the load cell,
# leaving the row index out of the CSV.
df.to_csv(path_or_buf=output_ds, index=False)