In [12]:
import requests
import pandas as pd

In [13]:
from datetime import datetime, timedelta
import time

In [14]:
import json
import os
from typing import List, Dict, Optional

In [15]:
import pandas as pd
import numpy as np
import glob
import os
from datetime import datetime
import re

# Directory that merge_all_country_files() searches for 'snapshots_*.csv' files
# (note the trailing slash).
INPUT_DIR = '/Users/annie_lok_yan_wong/Downloads/CFE%/'
# Merge outputs go to a sub-folder of the input directory; because INPUT_DIR
# already ends with a separator, this evaluates to '<INPUT_DIR>merged_output'.
OUTPUT_DIR = os.path.join(INPUT_DIR, 'merged_output')
# Create the output folder up front; exist_ok=True keeps re-running this cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Maps raw CSV header names to the short snake_case names used in this notebook.
# load_and_process_file() renames with this mapping and then keeps only the
# renamed columns, so any header that does not match a key exactly is dropped.
#
# The subscript two in "gCO2eq" (U+2082) is spelled as an escape sequence so the
# keys survive a re-save or export with the wrong text encoding. The previous
# literal had degraded to mojibake, which can never match the CSV header.
# NOTE(review): U+2082 is what the garbled bytes decode to - confirm against the
# header row of one raw CSV.
COLUMNS_TO_KEEP = {
    'Datetime (UTC)': 'timestamp',
    'Zone id': 'zone',
    'Country': 'country',
    'Carbon-free energy percentage (CFE%)': 'carbon_free_pct',
    'Renewable energy percentage (RE%)': 'renewable_pct',
    'Carbon intensity gCO\u2082eq/kWh (direct)': 'carbon_intensity_direct',
    'Carbon intensity gCO\u2082eq/kWh (Life cycle)': 'carbon_intensity_lifecycle',
    'Data estimated': 'is_estimated'
}

def extract_metadata_from_filename(filename):
    """
    Pull the zone code and the year out of a snapshot file name.

    Only the last path component is inspected. It must contain
    ``_<ZZ>-<YYYY>-`` where ``ZZ`` is two uppercase letters and ``YYYY`` is
    four digits.
    Example: 'snapshots_2025-07-03_DK-2023-hourly.csv' -> ('DK', 2023)

    Returns:
        (zone, year) with ``year`` as an int, or (None, None) when the name
        does not contain that pattern.
    """
    found = re.search(r'_([A-Z]{2})-(\d{4})-', os.path.basename(filename))
    if found is None:
        return None, None

    zone_code, year_text = found.groups()
    return zone_code, int(year_text)


def load_and_process_file(filepath):
    """
    Load one snapshot CSV and reduce it to the notebook's standard columns.

    Steps: read the CSV, rename headers via COLUMNS_TO_KEEP, keep only the
    renamed columns that are present, parse ``timestamp``, derive ``year``
    from it, record the source file name and normalise ``is_estimated`` to
    booleans.

    Args:
        filepath: Path of the CSV file to load.

    Returns:
        (df, zone, year), where zone and year come from the file name via
        extract_metadata_from_filename (both are None when the name does not
        match). If anything fails, the error is printed and
        (None, None, None) is returned so the caller can skip the file.
    """
    try:
        # Load CSV
        df = pd.read_csv(filepath)

        # Get metadata from filename
        zone_from_file, year_from_file = extract_metadata_from_filename(filepath)

        # Rename columns to standard names
        df = df.rename(columns=COLUMNS_TO_KEEP)

        # Keep only the columns we want; copy so the assignments below work on
        # an independent frame rather than a slice of the raw one.
        cols_present = [col for col in COLUMNS_TO_KEEP.values() if col in df.columns]
        df = df[cols_present].copy()

        # Fail with a readable message instead of a bare KeyError('timestamp').
        if 'timestamp' not in df.columns:
            raise ValueError("no timestamp column found (expected a 'Datetime (UTC)' header)")

        # Convert timestamp to datetime
        df['timestamp'] = pd.to_datetime(df['timestamp'])

        # Extract year from data
        df['year'] = df['timestamp'].dt.year

        # Add metadata
        df['source_file'] = os.path.basename(filepath)

        # Convert is_estimated to boolean. Strings are trimmed and lower-cased
        # first so 'True', 'FALSE' or ' true ' are recognised; previously only
        # the exact strings 'true'/'false' were, and every other spelling
        # silently became NaN. Values that are still unrecognised stay NaN.
        if 'is_estimated' in df.columns:
            estimated_lookup = {'true': True, 'false': False, True: True, False: False}
            df['is_estimated'] = df['is_estimated'].map(
                lambda value: estimated_lookup.get(
                    value.strip().lower() if isinstance(value, str) else value,
                    float('nan'),
                )
            )

        return df, zone_from_file, year_from_file

    except Exception as e:
        # Deliberate best effort: one unreadable file must not abort the whole merge.
        print(f"Error processing {os.path.basename(filepath)}: {e}")
        return None, None, None


In [16]:
#Merge all country CSV files into one dataset.
def merge_all_country_files(input_dir, file_pattern='snapshots_*.csv'):
    """
    Load every CSV matching ``file_pattern`` in ``input_dir`` and print a per-file summary.

    NOTE(review): this draft stops after the loading loop. It never
    concatenates the loaded frames and falls off the end, so it returns None
    even on success. A later cell defines ``merge_all_country_files`` again
    with the merge step included, and that definition replaces this one once
    both cells have run.

    Args:
        input_dir: Directory to search.
        file_pattern: Glob pattern, relative to ``input_dir``.

    Returns:
        None in every case.
    """
    # Find all CSV files
    search_glob = os.path.join(input_dir, file_pattern)
    matched_paths = glob.glob(search_glob)

    if len(matched_paths) == 0:
        print(f"No files found matching pattern: {search_glob}")
        return None

    total_files = len(matched_paths)
    print(f"Found {total_files} files to merge")

    # Load and process each file, in sorted path order
    all_dataframes = []
    file_summaries = []

    for position, path in enumerate(sorted(matched_paths), 1):
        name = os.path.basename(path)
        print(f"\n[{position}/{total_files}] Processing: {name}")

        frame, zone, year = load_and_process_file(path)
        if frame is None:
            print("Failed to load")
            continue

        # NOTE(review): get_data_summary is not defined in the visible cells;
        # it has to exist in the kernel before this function is called.
        summary = get_data_summary(frame)
        print(f"Loaded: {summary['total_rows']} rows")
        print(f"Zone: {frame['zone'].iloc[0]}, Year: {frame['year'].unique().tolist()}")
        print(f"Date range: {summary['date_range']}")
        print(f"Missing data: {summary['missing_pct']}")
        print(f"Avg carbon-free: {summary['avg_carbon_free']}")

        all_dataframes.append(frame)
        per_file = {'file': name, 'zone': zone, 'year': year}
        per_file.update(summary)
        file_summaries.append(per_file)

In [17]:
#Merge all country CSV files into one dataset.
def merge_all_country_files(input_dir, file_pattern='snapshots_*.csv'):
    """
    Merge every per-country snapshot CSV in ``input_dir`` into one long DataFrame.

    Each matching file is loaded with ``load_and_process_file`` and summarised.
    The loaded frames are concatenated, sorted by zone and timestamp, and
    de-duplicated on (zone, timestamp). A per-file summary is written to
    ``merge_summary_report.csv`` inside the module-level ``OUTPUT_DIR``.

    Args:
        input_dir: Directory to search.
        file_pattern: Glob pattern, relative to ``input_dir``.

    Returns:
        The merged DataFrame with a fresh 0..n-1 index, or None when no file
        matches the pattern or no matching file yields any rows.
    """
    # Find all CSV files (sorted so the processing order is reproducible)
    file_pattern_full = os.path.join(input_dir, file_pattern)
    csv_files = sorted(glob.glob(file_pattern_full))

    if not csv_files:
        print(f"No files found matching pattern: {file_pattern_full}")
        return None

    print(f"Found {len(csv_files)} files to merge")

    # Load and process each file
    all_dataframes = []
    file_summaries = []

    for i, filepath in enumerate(csv_files, 1):
        filename = os.path.basename(filepath)
        print(f"\n[{i}/{len(csv_files)}] Processing: {filename}")

        df, zone, year = load_and_process_file(filepath)

        if df is None:
            print("Failed to load")
            continue
        if df.empty:
            # A header-only file would make df['zone'].iloc[0] below raise IndexError.
            print("Skipped: file contains no data rows")
            continue

        # NOTE(review): get_data_summary is not defined in the visible cells;
        # it has to exist in the kernel before this function is called.
        summary = get_data_summary(df)
        print(f"Loaded: {summary['total_rows']} rows")
        print(f"Zone: {df['zone'].iloc[0]}, Year: {df['year'].unique().tolist()}")
        print(f"Date range: {summary['date_range']}")
        print(f"Missing data: {summary['missing_pct']}")
        print(f"Avg carbon-free: {summary['avg_carbon_free']}")

        all_dataframes.append(df)
        file_summaries.append({
            'file': filename,
            'zone': zone,
            'year': year,
            **summary
        })

    # pd.concat raises ValueError on an empty list, so stop here when every
    # file failed to load instead of crashing.
    if not all_dataframes:
        print("No files could be loaded; nothing to merge")
        return None

    # Combine all dataframes
    print(f"Merging {len(all_dataframes)} dataframes...")
    merged_df = pd.concat(all_dataframes, ignore_index=True)

    # Sort by zone and timestamp
    merged_df = merged_df.sort_values(['zone', 'timestamp'])

    # Remove duplicates (if any); reset the index afterwards so it has no gaps.
    duplicates_before = len(merged_df)
    merged_df = merged_df.drop_duplicates(subset=['zone', 'timestamp']).reset_index(drop=True)
    duplicates_removed = duplicates_before - len(merged_df)

    if duplicates_removed > 0:
        print(f"Removed {duplicates_removed} duplicate rows")

    print(f"Merged dataset: {len(merged_df)} rows")

    # Summary by country
    print("\n" + "=" * 70)
    print("SUMMARY BY COUNTRY")
    print("=" * 70)

    country_summary = merged_df.groupby('zone').agg({
        'timestamp': ['min', 'max', 'count'],
        'carbon_free_pct': ['mean', 'std'],
        'country': 'first'
    }).round(2)

    print(country_summary.to_string())

    # Save summary report
    summary_df = pd.DataFrame(file_summaries)
    summary_path = os.path.join(OUTPUT_DIR, 'merge_summary_report.csv')
    summary_df.to_csv(summary_path, index=False)
    print(f"\n💾 Summary report saved to: {summary_path}")

    return merged_df


In [18]:
def create_wide_format(merged_df):
    """
    Create wide format where each country is a column.

    Pivots the long frame so that every zone becomes one column of
    carbon-free percentages, indexed by timestamp.

    Args:
        merged_df: Long-format frame with ``timestamp``, ``zone`` and
            ``carbon_free_pct`` columns. Each (timestamp, zone) pair must be
            unique, because ``DataFrame.pivot`` raises ValueError on
            duplicate entries.

    Returns:
        DataFrame indexed by timestamp with one ``<zone>_carbon_free_pct``
        column per zone.
    """
    # Pivot carbon-free percentage
    wide_cf = merged_df.pivot(
        index='timestamp',
        columns='zone',
        values='carbon_free_pct'
    )

    # Add suffix to column names
    wide_cf.columns = [f'{col}_carbon_free_pct' for col in wide_cf.columns]

    # The multiplication sign had been corrupted into mojibake in this message.
    print(f" Wide format: {len(wide_cf)} rows × {len(wide_cf.columns)} columns")

    return wide_cf


def check_data_quality(merged_df):
    """
    Perform comprehensive data quality checks.

    Prints four checks on the merged frame: the share of missing
    ``carbon_free_pct`` values, values outside 0-100, per-zone gaps longer
    than one hour between consecutive timestamps, and the per-zone date
    coverage.

    Args:
        merged_df: Long-format frame with ``zone``, ``timestamp`` and
            ``carbon_free_pct`` columns.

    Returns:
        list[str]: one description per detected issue (empty when there are
        none). Earlier versions built this list but discarded it.
    """
    print("\n🔍 DATA QUALITY CHECKS")
    print("=" * 70)

    issues = []
    total_rows = len(merged_df)

    # Check 1: Missing data (more than 5% missing is flagged)
    missing_counts = merged_df['carbon_free_pct'].isna().sum()
    # Guard against 0/0 when the frame is empty.
    missing_pct = (missing_counts / total_rows) * 100 if total_rows else 0.0
    print(f"Missing carbon-free data: {missing_counts} ({missing_pct:.2f}%)")
    if missing_pct > 5:
        issues.append(f"High missing data: {missing_pct:.2f}%")

    # Check 2: Invalid values (should be 0-100)
    invalid = ((merged_df['carbon_free_pct'] < 0) | (merged_df['carbon_free_pct'] > 100)).sum()
    print(f"Invalid values (<0 or >100): {invalid}")
    if invalid > 0:
        issues.append(f"{invalid} invalid percentage values")

    # Check 3: Time gaps (more than 10 gaps in one zone is flagged).
    # One groupby pass instead of re-filtering the whole frame for every zone;
    # sort=False keeps the zones in order of first appearance.
    print("\nTime gaps by country:")
    for zone, zone_df in merged_df.groupby('zone', sort=False):
        time_diffs = zone_df['timestamp'].sort_values().diff()
        gaps = (time_diffs > pd.Timedelta(hours=1)).sum()
        print(f"  {zone}: {gaps} gaps")
        if gaps > 10:
            issues.append(f"{zone} has {gaps} time gaps")

    # Check 4: Date coverage
    print("\nDate coverage by country:")
    coverage = merged_df.groupby('zone')['timestamp'].agg(['min', 'max', 'count'])
    print(coverage)

    # Surface the collected issues instead of silently dropping them.
    if issues:
        print("\nIssues found:")
        for issue in issues:
            print(f"  - {issue}")
    else:
        print("\nNo issues found")

    return issues
    

In [19]:
def main():
    """
    Run the carbon-free merge stage end to end.

    Merges the snapshot files found in INPUT_DIR, writes the long and wide
    CSVs into OUTPUT_DIR, runs the data quality checks and prints a final
    summary. Returns early, writing nothing, when the merge yields None.

    NOTE(review): a later cell defines another ``main`` for the pricing
    stage, so this name is rebound once that cell has run.
    """
    # Step 1: Merge all files
    merged = merge_all_country_files(INPUT_DIR)
    if merged is None:
        return

    # Step 2: Save merged dataset
    merged_csv = os.path.join(OUTPUT_DIR, 'all_countries_merged.csv')
    merged.to_csv(merged_csv, index=False)
    print(f"Merged dataset saved to: {merged_csv}")

    # Step 3: Data quality checks
    check_data_quality(merged)

    # Step 4: Create wide format (optional); the timestamp index is kept in the file
    wide = create_wide_format(merged)
    wide_csv = os.path.join(OUTPUT_DIR, 'all_countries_wide_format.csv')
    wide.to_csv(wide_csv)
    print(f"Wide format saved to: {wide_csv}")

    # Step 5: Final summary
    banner = "=" * 70
    print("\n" + banner)
    print("MERGE COMPLETE!")
    print(banner)
    print(f"Total countries: {merged['zone'].nunique()}")
    print(f"Total rows: {len(merged):,}")
    first_ts = merged['timestamp'].min()
    last_ts = merged['timestamp'].max()
    print(f"Date range: {first_ts} to {last_ts}")
    print("\nFiles created:")
    created_files = (
        merged_csv,
        wide_csv,
        os.path.join(OUTPUT_DIR, 'merge_summary_report.csv'),
    )
    for number, created in enumerate(created_files, start=1):
        print(f"  {number}. {created}")


if __name__ == "__main__":
    main()

Found 40 files to merge

[1/40] Processing: snapshots_2025-07-03_AT-2023-hourly.csv
Loaded: 8760 rows
Zone: AT, Year: [2023]
Date range: 2023-01-01 00:00:00 to 2023-12-31 23:00:00
Missing data: 0.00%
Avg carbon-free: 83.38%

[2/40] Processing: snapshots_2025-07-03_AT-2024-hourly.csv
Loaded: 8784 rows
Zone: AT, Year: [2024]
Date range: 2024-01-01 00:00:00 to 2024-12-31 23:00:00
Missing data: 0.00%
Avg carbon-free: 86.98%

[3/40] Processing: snapshots_2025-07-03_BE-2023-hourly.csv
Loaded: 8760 rows
Zone: BE, Year: [2023]
Date range: 2023-01-01 00:00:00 to 2023-12-31 23:00:00
Missing data: 0.00%
Avg carbon-free: 73.03%

[4/40] Processing: snapshots_2025-07-03_BE-2024-hourly.csv
Loaded: 8784 rows
Zone: BE, Year: [2024]
Date range: 2024-01-01 00:00:00 to 2024-12-31 23:00:00
Missing data: 0.00%
Avg carbon-free: 77.54%

[5/40] Processing: snapshots_2025-07-03_CH-2023-hourly.csv
Loaded: 8760 rows
Zone: CH, Year: [2023]
Date range: 2023-01-01 00:00:00 to 2023-12-31 23:00:00
Missing data: 0.00%


In [33]:
import pandas as pd
import numpy as np
import os
import glob

# Paths
# Long-format CSV written by the carbon-free merge stage ('all_countries_merged.csv').
CARBON_FREE_DATA = '/Users/annie_lok_yan_wong/Downloads/CFE%/merged_output/all_countries_merged.csv'
# Directory with pricing CSV files
PRICING_DIR = '/Users/annie_lok_yan_wong/Downloads/european_wholesale_electricity_price_data_hourly/For_merging'
# Output directory, a sub-folder of PRICING_DIR.
# NOTE(review): this rebinds the OUTPUT_DIR set by the earlier merge cell. Any
# function from that stage that reads the global OUTPUT_DIR will write here if
# it is called after this cell has run.
OUTPUT_DIR = f'{PRICING_DIR}/final_merged'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ISO3 to ISO2 mapping (3-letter to 2-letter zone codes)
# Keys are the three-letter codes found in the pricing files' 'ISO3 Code'
# column; values are the two-letter zone codes used by the carbon-free data.
# load_pricing_file() drops rows whose code is not listed here.
ISO3_TO_ISO2 = dict(
    NLD='NL',
    DEU='DE',
    FRA='FR',
    ESP='ES',
    ITA='IT',
    GBR='GB',
    BEL='BE',
    POL='PL',
    AUT='AT',
    CHE='CH',
    SWE='SE',
    NOR='NO',
    DNK='DK',
    FIN='FI',
    PRT='PT',
    GRC='GR',
    CZE='CZ',
    ROU='RO',
    IRL='IE',
    HUN='HU',
)

# Calendar years kept when the pricing data is loaded.
TARGET_YEARS = [2023, 2024]

def load_pricing_file(filepath):
    """
    Load and standardize a pricing CSV file.

    Expected columns:
    - Country
    - ISO3 Code
    - Datetime (UTC)
    - Datetime (Local)
    - Price (EUR/MWhe)

    The ISO3 code is trimmed and upper-cased before it is translated to a
    zone through ISO3_TO_ISO2. Rows whose code has no mapping are dropped,
    and the affected codes are listed in the warning.

    Args:
        filepath: Path of the pricing CSV.

    Returns:
        DataFrame with the columns ``timestamp``, ``zone``, ``country_name``,
        ``price`` and ``year``, or None when loading fails (the error is printed).
    """
    try:
        df = pd.read_csv(filepath)

        # Standardize column names
        df = df.rename(columns={
            'Datetime (UTC)': 'timestamp',
            'ISO3 Code': 'iso3',
            'Price (EUR/MWhe)': 'price',
            'Country': 'country_name'
        })

        # Convert timestamp
        df['timestamp'] = pd.to_datetime(df['timestamp'])

        # Extract year
        df['year'] = df['timestamp'].dt.year

        # Map ISO3 to ISO2 (zone code). Normalise first so ' aut ' or 'Aut' is
        # not discarded as unmapped; non-string values pass through untouched.
        iso3_codes = df['iso3'].map(
            lambda code: code.strip().upper() if isinstance(code, str) else code
        )
        df['zone'] = iso3_codes.map(ISO3_TO_ISO2)

        # Drop rows where zone mapping failed, naming the codes responsible
        unmapped_mask = df['zone'].isna()
        if unmapped_mask.any():
            unmapped_codes = sorted(set(iso3_codes[unmapped_mask].dropna().astype(str)))
            print(
                f"  ⚠️  Dropped {int(unmapped_mask.sum())} rows with unmapped ISO3 codes: "
                f"{unmapped_codes}"
            )

        # Keep only relevant columns (copy so callers get an independent frame)
        df = df.loc[~unmapped_mask, ['timestamp', 'zone', 'country_name', 'price', 'year']].copy()

        return df

    except Exception as e:
        # Deliberate best effort: the caller skips files that return None.
        print(f"  ❌ Error loading {os.path.basename(filepath)}: {e}")
        return None


def load_all_pricing_data(pricing_dir, target_years=None):
    """
    Load all pricing CSV files from a directory.

    Every ``*.csv`` in ``pricing_dir`` is loaded with ``load_pricing_file``,
    optionally restricted to ``target_years``, then the results are
    concatenated and de-duplicated on (timestamp, zone).

    Args:
        pricing_dir: Directory containing the pricing CSVs.
        target_years: Optional collection of years to keep; a falsy value
            keeps every year.

    Returns:
        The combined DataFrame, or None when the directory has no CSV files
        or no file contributes any rows.
    """
    print("=" * 70)
    print("LOADING PRICING DATA")
    print("=" * 70)

    # Find all CSV files (sorted so the processing order is reproducible)
    csv_files = sorted(glob.glob(os.path.join(pricing_dir, '*.csv')))

    if not csv_files:
        print(f"No CSV files found in: {pricing_dir}")
        return None

    print(f"\n📁 Found {len(csv_files)} pricing files")

    all_pricing = []

    for i, filepath in enumerate(csv_files, 1):
        filename = os.path.basename(filepath)
        print(f"\n[{i}/{len(csv_files)}] Loading: {filename}")

        df = load_pricing_file(filepath)
        if df is None:
            continue

        # Filter by year if specified
        if target_years:
            before = len(df)
            df = df[df['year'].isin(target_years)]
            print(f"  ✅ Loaded: {len(df):,} rows (filtered from {before:,} for years {target_years})")
        else:
            print(f"  ✅ Loaded: {len(df):,} rows")

        # With no rows left, df['zone'].iloc[0] below would raise IndexError
        # and abort the whole load, so skip the file instead.
        if df.empty:
            print("  ⚠️  No rows left for this file; skipping")
            continue

        # Show date range
        print(f"     Zone: {df['zone'].iloc[0]}")
        print(f"     Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
        print(f"     Price range: €{df['price'].min():.2f} to €{df['price'].max():.2f}")

        # Check for negative prices
        neg_count = (df['price'] < 0).sum()
        if neg_count > 0:
            neg_pct = (neg_count / len(df)) * 100
            print(f"     ⚡ Negative prices: {neg_count} ({neg_pct:.2f}%)")

        all_pricing.append(df)

    # pd.concat raises ValueError on an empty list; report and bail out instead.
    if not all_pricing:
        print("\nNo pricing data could be loaded")
        return None

    # Combine all pricing data
    print(f"\n🔗 Combining {len(all_pricing)} pricing files...")
    combined = pd.concat(all_pricing, ignore_index=True)

    # Remove duplicates
    before = len(combined)
    combined = combined.drop_duplicates(subset=['timestamp', 'zone'])
    after = len(combined)

    if before > after:
        print(f"  ⚠️  Removed {before - after} duplicate rows")

    print(f"Combined pricing data: {len(combined):,} rows")
    print(f" Countries: {combined['zone'].nunique()}")
    print(f" Date range: {combined['timestamp'].min()} to {combined['timestamp'].max()}")

    return combined


def merge_carbon_and_pricing(carbon_df, pricing_df):
    """
    Merge carbon-free data with pricing data on timestamp and zone.

    A left join keeps every carbon-free row and adds a ``price`` column,
    which is NaN where no price exists for that (timestamp, zone). Duplicate
    (timestamp, zone) keys on the pricing side are reduced to their first
    occurrence before joining, so the result always has exactly as many rows
    as ``carbon_df``.

    Args:
        carbon_df: Frame with at least ``timestamp`` and ``zone`` columns.
        pricing_df: Frame with ``timestamp``, ``zone`` and ``price`` columns.

    Returns:
        ``carbon_df`` with the added ``price`` column.
    """
    print("\n" + "=" * 70)
    print("MERGING CARBON-FREE AND PRICING DATA")
    print("=" * 70)

    print("\nBefore merge:")
    print(f"  Carbon-free data: {len(carbon_df):,} rows, {carbon_df['zone'].nunique()} countries")
    print(f"  Pricing data: {len(pricing_df):,} rows, {pricing_df['zone'].nunique()} countries")

    # Check which countries are in both datasets
    carbon_zones = set(carbon_df['zone'].unique())
    pricing_zones = set(pricing_df['zone'].unique())

    common_zones = carbon_zones & pricing_zones
    carbon_only = carbon_zones - pricing_zones
    pricing_only = pricing_zones - carbon_zones

    print("\n📊 Zone overlap:")
    print(f"  Common zones: {len(common_zones)} - {sorted(common_zones)}")

    if carbon_only:
        print(f"  ⚠️  Carbon data only: {sorted(carbon_only)}")
    if pricing_only:
        print(f"  ⚠️  Pricing data only: {sorted(pricing_only)}")

    # A left join repeats a carbon row once per matching pricing row, so make
    # the pricing keys unique first.
    price_lookup = pricing_df[['timestamp', 'zone', 'price']]
    duplicate_keys = price_lookup.duplicated(subset=['timestamp', 'zone']).sum()
    if duplicate_keys:
        print(f"  ⚠️  Ignoring {duplicate_keys:,} duplicate pricing rows (keeping the first per timestamp/zone)")
        price_lookup = price_lookup.drop_duplicates(subset=['timestamp', 'zone'])

    # Merge
    print("\n🔗 Performing merge...")
    merged = carbon_df.merge(
        price_lookup,
        on=['timestamp', 'zone'],
        how='left'  # Keep all carbon-free data
    )

    # Analyze merge results
    total_rows = len(merged)
    matched_rows = merged['price'].notna().sum()
    # Guard against 0/0 when the carbon frame is empty.
    match_rate = (matched_rows / total_rows) * 100 if total_rows else 0.0

    print("Merge complete!")
    print(f"  Total rows: {total_rows:,}")
    print(f"  Rows with pricing: {matched_rows:,} ({match_rate:.1f}%)")
    print(f"  Rows without pricing: {total_rows - matched_rows:,} ({100 - match_rate:.1f}%)")

    # Breakdown by country: one groupby pass, zones in sorted order
    print("\n📊 Match rate by country:")
    for zone, zone_df in merged.groupby('zone'):
        zone_matched = zone_df['price'].notna().sum()
        zone_rate = (zone_matched / len(zone_df)) * 100
        status = "✅" if zone_rate > 90 else "⚠️ " if zone_rate > 50 else "❌"
        print(f"  {status} {zone}: {zone_rate:.1f}% ({zone_matched:,}/{len(zone_df):,})")

    return merged


def analyze_merged_data(merged_df):
    """
    Perform analysis on the merged dataset.

    Prints overall size and date range, price statistics (including the share
    of negative prices), carbon-free statistics and the Pearson correlation
    between ``carbon_free_pct`` and ``price`` over rows where both are present.

    Args:
        merged_df: Frame with ``zone``, ``timestamp``, ``carbon_free_pct`` and
            ``price`` columns.

    Returns:
        None; all results are printed.
    """
    print("\n" + "=" * 70)
    print("DATA ANALYSIS")
    print("=" * 70)

    # Overall statistics
    print("\n📊 Overall Statistics:")
    print(f"  Total rows: {len(merged_df):,}")
    print(f"  Countries: {merged_df['zone'].nunique()}")
    print(f"  Date range: {merged_df['timestamp'].min()} to {merged_df['timestamp'].max()}")

    # Price statistics
    price_data = merged_df['price'].dropna()
    print("\n💰 Price Statistics:")
    if price_data.empty:
        # Avoid a column of NaN statistics and a 0/0 for the negative share.
        print("  No price data available")
    else:
        print(f"  Mean: €{price_data.mean():.2f}/MWh")
        print(f"  Median: €{price_data.median():.2f}/MWh")
        print(f"  Std Dev: €{price_data.std():.2f}/MWh")
        print(f"  Min: €{price_data.min():.2f}/MWh")
        print(f"  Max: €{price_data.max():.2f}/MWh")

        # Negative prices
        neg_count = (price_data < 0).sum()
        neg_pct = (neg_count / len(price_data)) * 100
        print(f"  Negative prices: {neg_count:,} ({neg_pct:.2f}%)")

    # Carbon-free statistics
    print("\n🌱 Carbon-Free Statistics:")
    print(f"  Mean: {merged_df['carbon_free_pct'].mean():.2f}%")
    print(f"  Median: {merged_df['carbon_free_pct'].median():.2f}%")
    print(f"  Std Dev: {merged_df['carbon_free_pct'].std():.2f}%")

    # Correlation
    complete_data = merged_df[['carbon_free_pct', 'price']].dropna()
    if len(complete_data) > 0:
        correlation = complete_data['carbon_free_pct'].corr(complete_data['price'])
        print("\n🔗 Correlation:")

        if pd.isna(correlation):
            # A NaN result previously fell through to the "Weak correlation"
            # branch, which reads as a finding rather than as "not computable".
            print("  Carbon-free % vs Price: not defined (too few or constant values)")
        else:
            print(f"  Carbon-free % vs Price: {correlation:.3f}")

            if correlation < -0.2:
                print(" Negative correlation: Higher carbon-free % → Lower prices")
            elif correlation > 0.2:
                print(" Positive correlation: Higher carbon-free % → Higher prices")
            else:
                print("  ➡️  Weak correlation")


def save_results(merged_df, output_dir):
    """
    Write the merged dataset and two derived files into ``output_dir``.

    Files written:
    - ``carbon_free_and_pricing_merged.csv``: every row of ``merged_df``.
    - ``complete_data_only.csv``: rows where both ``carbon_free_pct`` and
      ``price`` are present.
    - ``country_summary.csv``: per-zone aggregates, rounded to 2 decimals.

    Args:
        merged_df: Frame with ``zone``, ``timestamp``, ``country``,
            ``carbon_free_pct`` and ``price`` columns.
        output_dir: Existing directory to write into.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("SAVING RESULTS")
    print(rule)

    # Full merged dataset
    full_csv = os.path.join(output_dir, 'carbon_free_and_pricing_merged.csv')
    merged_df.to_csv(full_csv, index=False)
    print(f"Full dataset saved to: {full_csv}")

    # Only the rows that carry both a carbon-free value and a price
    both_present = merged_df.dropna(subset=['carbon_free_pct', 'price'])
    complete_csv = os.path.join(output_dir, 'complete_data_only.csv')
    both_present.to_csv(complete_csv, index=False)
    print(f"Complete data only: {complete_csv}")
    print(f"   ({len(both_present):,} rows with both carbon-free % and price)")

    # Per-zone summary; the zone index is written as the first CSV column
    per_zone_spec = {
        'timestamp': ['min', 'max', 'count'],
        'carbon_free_pct': ['mean', 'std'],
        'price': ['mean', 'std', 'count'],
        'country': 'first',
    }
    per_zone = merged_df.groupby('zone').agg(per_zone_spec).round(2)
    per_zone.to_csv(os.path.join(output_dir, 'country_summary.csv'))

    print(f"All files saved to: {output_dir}/")
def main():
    """
    Run the pricing stage: load both datasets, join them, analyse and save.

    Reads the merged carbon-free CSV from CARBON_FREE_DATA, loads the pricing
    files from PRICING_DIR restricted to TARGET_YEARS, and writes the results
    into OUTPUT_DIR. Stops early when no pricing data could be loaded.
    """
    # Step 1: Load carbon-free data (timestamps come back from CSV as text)
    carbon = pd.read_csv(CARBON_FREE_DATA)
    carbon['timestamp'] = pd.to_datetime(carbon['timestamp'])
    zone_count = carbon['zone'].nunique()
    print(f"Loaded carbon-free data: {len(carbon):,} rows, {zone_count} countries")

    # Step 2: Load pricing data
    pricing = load_all_pricing_data(PRICING_DIR, target_years=TARGET_YEARS)
    if pricing is None:
        print("Failed to load pricing data")
        return

    # Step 3: Merge datasets
    combined = merge_carbon_and_pricing(carbon, pricing)

    # Step 4: Analyze
    analyze_merged_data(combined)

    # Step 5: Save results
    save_results(combined, OUTPUT_DIR)


if __name__ == "__main__":
    main()



Loaded carbon-free data: 350,880 rows, 20 countries
LOADING PRICING DATA

📁 Found 20 pricing files

[1/20] Loading: Austria.csv
  ✅ Loaded: 17,544 rows (filtered from 95,120 for years [2023, 2024])
     Zone: AT
     Date range: 2023-01-01 00:00:00 to 2024-12-31 23:00:00
     Price range: €-270.42 to €711.69
     ⚡ Negative prices: 335 (1.91%)

[2/20] Loading: Belgium.csv
  ✅ Loaded: 17,544 rows (filtered from 95,120 for years [2023, 2024])
     Zone: BE
     Date range: 2023-01-01 00:00:00 to 2024-12-31 23:00:00
     Price range: €-140.00 to €565.46
     ⚡ Negative prices: 625 (3.56%)

[3/20] Loading: Czechia.csv
  ✅ Loaded: 17,544 rows (filtered from 95,120 for years [2023, 2024])
     Zone: CZ
     Date range: 2023-01-01 00:00:00 to 2024-12-31 23:00:00
     Price range: €-138.75 to €844.63
     ⚡ Negative prices: 449 (2.56%)

[4/20] Loading: Denmark.csv
  ✅ Loaded: 17,544 rows (filtered from 95,120 for years [2023, 2024])
     Zone: DK
     Date range: 2

In [22]:
# 'complete_data_only.csv' is the file save_results() writes with only the
# rows that have both a carbon-free percentage and a price.
DATA_FILE = (
    '/Users/annie_lok_yan_wong/Downloads/european_wholesale_electricity_price_data_hourly'
    '/For_merging/final_merged/complete_data_only.csv'
)

In [23]:
# Load the complete-case dataset; timestamps arrive as plain strings here and
# are parsed in a later cell.
df = pd.read_csv(DATA_FILE)
# Plain-text preview of the first five rows.
print(df.head(n=5))

             timestamp zone  country  carbon_free_pct  renewable_pct  \
0  2023-01-01 00:00:00   AT  Austria            83.29          70.09   
1  2023-01-01 01:00:00   AT  Austria            82.78          69.40   
2  2023-01-01 02:00:00   AT  Austria            82.37          68.71   
3  2023-01-01 03:00:00   AT  Austria            81.61          67.24   
4  2023-01-01 04:00:00   AT  Austria            81.20          66.00   

   carbon_intensity_direct  carbon_intensity_lifecycle  is_estimated  year  \
0                   158.86                      196.54         False  2023   
1                   164.07                      201.87         False  2023   
2                   167.60                      206.23         False  2023   
3                   173.77                      213.41         False  2023   
4                   177.88                      218.07         False  2023   

                               source_file  price  
0  snapshots_2025-07-03_AT-2023-hourly.csv  -7

In [24]:
import pandas as pd

# Parse the timestamps and derive the hour of day.
# Both steps modify df in place; re-running the cell gives the same result.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour

# Mean price and mean carbon-free share for every (year, hour) combination,
# averaged over all zones and days present in df.
hourly_avg = df.groupby(['year', 'hour'], as_index=False).agg(
    avg_price=('price', 'mean'),
    avg_carbon_free_pct=('carbon_free_pct', 'mean'),
)
print(hourly_avg)

# Per-year views of the same table
hourly_2023 = hourly_avg.loc[hourly_avg['year'].eq(2023)]
hourly_2024 = hourly_avg.loc[hourly_avg['year'].eq(2024)]

print(hourly_2023)
print(hourly_2024)

    year  hour   avg_price  avg_carbon_free_pct
0   2023     0   83.110585            65.443371
1   2023     1   80.202653            65.518263
2   2023     2   78.581741            65.549703
3   2023     3   81.712102            65.331976
4   2023     4   91.812519            65.167965
5   2023     5  104.881183            65.730796
6   2023     6  110.761566            66.971624
7   2023     7  107.639033            68.587244
8   2023     8   98.816591            70.423947
9   2023     9   91.098140            71.814973
10  2023    10   85.541589            72.605054
11  2023    11   80.655758            72.936796
12  2023    12   79.029695            72.727434
13  2023    13   83.338651            71.821944
14  2023    14   91.370701            70.158416
15  2023    15  104.216591            68.014015
16  2023    16  119.913853            66.051209
17  2023    17  133.516630            64.842553
18  2023    18  132.785534            64.187560
19  2023    19  121.174861            63

In [25]:
# Persist the (year, hour) averages next to the other final outputs, without the index column.
hourly_avg.to_csv(
    '/Users/annie_lok_yan_wong/Downloads/european_wholesale_electricity_price_data_hourly/For_merging/final_merged/hourly_average_price_and_cfe.csv',
    index=False,
)