# Price Data Preprocessing

**Objectives:**
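1.  **Clean:** Resample to hourly (mean of sub-hourly values) and fill gaps (forward fill, then backward fill for leading gaps). Outlier handling is not implemented yet (see "Outlier detection" below).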
2.  **Timezone:** Convert to Europe/Vienna.
3.  **Reshape:** Transform into a **Wide Format** (Date x 24 Hours).
4.  **DST Handling:** Average the duplicated fall-back hour and fill the missing spring-forward hour from the neighbouring hours.
5.  **Formatting:** Ensure all values are Floats rounded to 2 decimal places.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# 1. Setup Paths
# Reuse DATA_DIR when the name is already bound in this namespace;
# otherwise fall back to the relative default location.
DATA_DIR = globals().get('DATA_DIR', Path('../../data'))

RAW_DIR, PROCESSED_DIR = DATA_DIR / 'raw', DATA_DIR / 'processed'
# Make sure the output folder exists before any zone is written.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Configuration
# Zone label -> area code string (labels are used to build the file names).
ZONES = {
    'ES': '10YES-REE------0',
    'NO2': '10YNO-2--------T',
    'NO4': '10YNO-4--------9',
    'DK1': '10YDK-1--------W',
}

def process_and_reshape(zone_name):
    """Clean one zone's raw price CSV and save it as a date x 24-hour table.

    Reads ``RAW_DIR/<zone_name>_raw.csv`` (columns ``timestamp`` and
    ``price_eur_mwh``), averages the prices to hourly resolution in UTC,
    fills gaps, converts to Europe/Vienna local time and pivots to one row
    per local date with the columns ``h00`` ... ``h23``. The table is
    rounded to 2 decimals and written to
    ``PROCESSED_DIR/<zone_name>_processed.csv``.

    Args:
        zone_name: Zone key used to build the input and output file names.

    Returns:
        None. A message is printed and nothing is written when the input
        file is missing or contains no rows.
    """
    input_path = RAW_DIR / f"{zone_name}_raw.csv"
    if not input_path.exists():
        print(f"Skipping {zone_name}: File not found.")
        return

    # --- Part 1: Standard Cleaning ---

    df = pd.read_csv(input_path)
    if df.empty:
        # Without rows there is no time span to build; bail out instead of
        # failing later on NaT bounds.
        print(f"Skipping {zone_name}: File contains no rows.")
        return

    df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)

    # Work on the price column only so that any extra (possibly non-numeric)
    # columns in the raw file cannot break the hourly mean.
    prices = df.set_index('timestamp')['price_eur_mwh'].sort_index()

    # Mean of sub-hourly periods. Resampling already yields a gap-free hourly
    # index from the first to the last observation (empty bins become NaN),
    # so no separate reindex is required.
    hourly = prices.resample('1h').mean()

    # Forward fill gaps; back fill covers NaNs at the very start.
    hourly = hourly.ffill().bfill()

    # Convert Timezone (UTC -> Vienna)
    hourly.index = hourly.index.tz_convert('Europe/Vienna')

    # --- Part 2: Reshape to Wide Format ---
    local = pd.DataFrame({
        'date': hourly.index.date,
        'hour': hourly.index.hour,
        'price_eur_mwh': hourly.to_numpy(),
    })

    # The mean merges the local hour that occurs twice on the fall-back day.
    df_wide = (
        local.groupby(['date', 'hour'])['price_eur_mwh']
        .mean()
        .unstack('hour')
    )

    # Always expose all 24 hour columns, even if an hour never occurs in the
    # data (e.g. input shorter than one day).
    df_wide = df_wide.reindex(columns=range(24))
    df_wide.columns = [f'h{h:02d}' for h in df_wide.columns]

    # Fill hours without data from their neighbours within the same day:
    # the skipped spring-forward hour and the missing hours of partial
    # first/last days.
    df_wide = df_wide.ffill(axis=1).bfill(axis=1)

    # --- Part 3: Final Formatting ---
    df_wide = df_wide.astype(float).round(2)

    # Save
    output_path = PROCESSED_DIR / f"{zone_name}_processed.csv"
    df_wide.to_csv(output_path)

    print(f"Processed {zone_name}:")
    print(f"  Shape: {df_wide.shape} (Days x Hours)")
    print(f"  Date Range: {df_wide.index.min()} to {df_wide.index.max()}")
    print(f"  Saved to: {output_path.name}\n")

# Execute: run the pipeline once per configured zone, in dict order.
print("Starting Cleaning, Reshaping & Rounding Pipeline...\n")
for zone in ZONES:
    process_and_reshape(zone)
print("Preprocessing Complete.")

Starting Cleaning, Reshaping & Rounding Pipeline...

Processed ES:
  Shape: (3651, 24) (Days x Hours)
  Date Range: 2015-11-29 to 2025-11-26
  Saved to: ES_processed.csv

Processed NO2:
  Shape: (3651, 24) (Days x Hours)
  Date Range: 2015-11-29 to 2025-11-26
  Saved to: NO2_processed.csv

Processed NO4:
  Shape: (3651, 24) (Days x Hours)
  Date Range: 2015-11-29 to 2025-11-26
  Saved to: NO4_processed.csv

Processed DK1:
  Shape: (3651, 24) (Days x Hours)
  Date Range: 2015-11-29 to 2025-11-26
  Saved to: DK1_processed.csv

Preprocessing Complete.


## Outlier detection (tbc)