In [2]:
import os
import pandas as pd
import numpy as np

In [5]:
def granularity_converter(path, output_path=None):
    """Rewrite a parquet file with four 15-minute rows per original row.

    Every input row produces four output rows whose timestamps are the
    original timestamp plus 0, 15, 30 and 45 minutes, and whose
    ``carbon_intensity`` is the original value divided by 4. Only the
    ``timestamp`` and ``carbon_intensity`` columns are written, with a fresh
    0..N-1 index; any other input columns are dropped.

    Parameters
    ----------
    path : str or path-like
        Parquet file to read. It must contain ``timestamp`` and
        ``carbon_intensity`` columns.
    output_path : str or path-like, optional
        Destination of the converted data. Defaults to ``path``, i.e. the
        input file is overwritten in place.

    Returns
    -------
    None

    Notes
    -----
    When the input is overwritten in place the call is not idempotent:
    running it again on the same file splits the already-converted rows a
    second time. Pass ``output_path`` to keep the source file untouched.
    """
    parts = 4          # output rows generated per input row
    step_minutes = 15  # spacing between consecutive generated rows

    df = pd.read_parquet(path)

    # Convert the 'timestamp' column to datetime if not already
    timestamps = pd.to_datetime(df['timestamp'])

    # Offsets 0, 15, 30, 45 minutes, cycled once per source row so they line
    # up positionally with the repeated timestamps below.
    offsets = pd.to_timedelta(
        np.tile(np.arange(parts) * step_minutes, len(df)), unit='min'
    )

    # NOTE(review): each value is split into equal quarters. That suits a
    # per-interval total; if carbon_intensity is a rate, the value should
    # presumably be repeated unchanged instead - confirm with the data owner.
    quarter_values = df['carbon_intensity'] / parts

    # repeat() keeps the source index labels, so reset to a positional index
    # before combining; this also makes duplicate input labels harmless.
    new_df = pd.DataFrame({
        'timestamp': timestamps.repeat(parts).reset_index(drop=True) + offsets,
        'carbon_intensity': quarter_values.repeat(parts).reset_index(drop=True),
    })

    # Save the new DataFrame to a parquet file
    new_df.to_parquet(path if output_path is None else output_path)

In [17]:
# Snapshot the entry names of the current working directory; the loops in
# later cells iterate over this list.
# NOTE(review): the recorded output of this cell contains no *.parquet
# names, so with this exact listing those loops would process nothing -
# confirm the working directory before running them.
all_files = os.listdir(os.curdir)
all_files

['carbon_usage.txt',
 'grabber2.ipynb',
 'EnergyData.py',
 'co2_granularity',
 'variables.py',
 'util.ipynb',
 '__pycache__',
 'grabber.ipynb',
 'consistency.ipynb',
 'parquets']

In [9]:
# Run granularity_converter on every listed entry whose name ends in
# ".parquet". A failure on one file is printed and the loop moves on.
for file in all_files:
    if not file.endswith(".parquet"):
        continue  # not a parquet file, nothing to convert
    try:
        granularity_converter(file)
    except Exception as e:
        print(f"Error processing {file}: {e}")

In [7]:
# Convert this one file to 15-minute rows. granularity_converter writes its
# result back to the path it read, so running this cell again splits the
# already-converted rows a second time.
granularity_converter("data/entso-e-IT-2023_1-2023_12.parquet")

In [8]:
# Load the file converted above and display it to check the result: the
# recorded output shows 15-minute timestamps.
pd.read_parquet("data/entso-e-IT-2023_1-2023_12.parquet")

Unnamed: 0,timestamp,carbon_intensity
0,2022-12-31 23:00:00+01:00,47.311868
1,2022-12-31 23:15:00+01:00,47.311868
2,2022-12-31 23:30:00+01:00,47.311868
3,2022-12-31 23:45:00+01:00,47.311868
4,2023-01-01 00:00:00+01:00,48.484459
...,...,...
34939,2023-12-30 21:45:00+01:00,26.454862
34940,2023-12-30 22:00:00+01:00,26.954670
34941,2023-12-30 22:15:00+01:00,26.954670
34942,2023-12-30 22:30:00+01:00,26.954670


In [38]:
# Trim each listed parquet file to rows whose timestamp lies between
# 2023-01-01 00:00:00 and 2023-12-31 23:45:00 (both ends included), then
# write the result back over the same file. Failures are printed per file.
for file in all_files:
    if not file.endswith(".parquet"):
        continue  # skip anything that is not a parquet file
    try:
        df = pd.read_parquet(file)
        df = df[df['timestamp'].between('2023-01-01 00:00:00', '2023-12-31 23:45:00')]
        df.to_parquet(file)
    except Exception as e:
        print(f"Error processing {file}: {e}")

In [8]:
# Load this parquet file from the working directory and display it.
pd.read_parquet("entso-e-DE-2023_1-2023_12.parquet")

Unnamed: 0,timestamp,carbon_intensity
0,2023-01-01 00:00:00+01:00,108.917555
1,2023-01-01 00:15:00+01:00,108.349605
2,2023-01-01 00:30:00+01:00,106.946341
3,2023-01-01 00:45:00+01:00,108.492066
4,2023-01-01 01:00:00+01:00,106.033925
...,...,...
34939,2023-12-30 22:45:00+01:00,126.913995
34940,2023-12-30 23:00:00+01:00,122.441263
34941,2023-12-30 23:15:00+01:00,122.994395
34942,2023-12-30 23:30:00+01:00,123.970878


In [14]:
# Load this parquet file from the sibling "parquets" directory and display it.
# NOTE(review): the recorded run of this cell failed with ArrowInvalid
# ("Parquet magic bytes not found in footer"), i.e. the file could not be
# read as parquet. It presumably needs to be regenerated - confirm.
pd.read_parquet("../parquets/entso-e-ME-2023_1-2023_12.parquet")

ArrowInvalid: Could not open Parquet input source '<Buffer>': Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.