In [1]:
!pip install pandas
!pip install fastparquet
!pip install openpyxl



In [2]:
# Imports
import pandas as pd
import pathlib

In [3]:
# Load data from files to dataframes
# All files were saved with their default name from provided emails / source sites in the "data" directory.
data_path = pathlib.Path.cwd() / "data"

# Provided modeled generation
modeled_generation_df = pd.read_csv(data_path / "windGenTS.csv")

# Power price workbooks: every file in the data directory whose name starts with "rpt.".
# Path.glob yields entries in an unspecified (filesystem-dependent) order, so sort the
# paths to make the row order of the combined table reproducible across machines and runs.
power_price_paths = sorted(data_path.glob("rpt.*"))
if not power_price_paths:
    # Fail here with a clear message instead of a cryptic error further down the notebook.
    raise FileNotFoundError(f"No power price files matching 'rpt.*' found in {data_path}")

# sheet_name=None makes read_excel return a dict {worksheet name: DataFrame} holding
# every worksheet of the workbook, so the result is one such dict per file.
power_prices_data_unflat = [pd.read_excel(pathname, sheet_name=None) for pathname in power_price_paths]

In [4]:
# power_prices_data_unflat is a List[Dict[str, pd.DataFrame]]: one dict of worksheet tables per file.
# All of the tables are expected to share the same schema, so flatten them into a single
# list of frames and stack them into one table with a fresh 0..n-1 index.
all_sheet_frames = []
for workbook_sheets in power_prices_data_unflat:
    all_sheet_frames.extend(workbook_sheets.values())

power_prices_data = pd.concat(all_sheet_frames, ignore_index=True)

In [5]:
# Data cleaning
# Drop incomplete rows *before* the numeric conversion: a column that still contains
# NaN has to stay floating point, so downcast="integer" would silently do nothing.
# .copy() yields an independent frame so the assignments below cannot trigger
# pandas' SettingWithCopyWarning.
power_prices_data = power_prices_data.dropna().copy()

# Store the hour / interval columns as the smallest integer dtype that fits their values.
for column in ["Delivery Hour", "Delivery Interval"]:
    power_prices_data[column] = pd.to_numeric(power_prices_data[column], downcast="integer")

# The power prices dataset contains a "Delivery Hour" of 24; it is remapped to 0 (midnight).
# NOTE(review): only the hour is rewritten, the row's delivery date is left untouched. If
# "Delivery Hour" is an hour-ending label (1-24), hour 24 belongs at the END of its day and
# this remap would place it at the START of the same day -- confirm against the data source.
power_prices_data.loc[power_prices_data["Delivery Hour"] == 24, "Delivery Hour"] = 0

In [6]:
# Persist the cleaned power prices as a gzip-compressed parquet file in the data directory.
# The DataFrame index is not written (index=False).
power_prices_parquet_path = data_path / "power_prices_data.gzip"
power_prices_data.to_parquet(
    power_prices_parquet_path,
    compression='gzip',
    index=False,
)