## Extracting data from NEM

- Related reading: https://adgefficiency.com/hackers-aemo/
- Packages used: https://github.com/UNSW-CEEM/NEMOSIS

In [19]:
import os
from nemosis import dynamic_data_compiler
import pandas as pd
# Everything this notebook downloads or derives is kept under ./data,
# resolved against the directory the kernel was started in.
working_dir = os.getcwd()
data_folder = os.path.join(working_dir, 'data')
os.makedirs(data_folder, exist_ok=True)  # no error if it already exists

In [2]:
# Table to pull and the time window to pull it for.
table = 'DISPATCHPRICE'
start_time = '2019/01/01 00:00:01'
end_time = '2024/12/31 23:59:59'

# Directory handed to NEMOSIS as its raw-data cache; the extracted
# PUBLIC_PRICES CSV files are written here too.
raw_data_cache = os.path.join(data_folder, 'raw_data_cache')
os.makedirs(raw_data_cache, exist_ok=True)


In [3]:
# Compile the requested table over the configured window, using
# raw_data_cache as the NEMOSIS cache directory.
price_data = dynamic_data_compiler(
    start_time,
    end_time,
    table,
    raw_data_cache,
)

INFO: Compiling data for table DISPATCHPRICE
INFO: Downloading data for table DISPATCHPRICE, year 2018, month 12
INFO: Creating feather file for DISPATCHPRICE, 2018, 12
INFO: Downloading data for table DISPATCHPRICE, year 2019, month 01
INFO: Creating feather file for DISPATCHPRICE, 2019, 01
INFO: Downloading data for table DISPATCHPRICE, year 2019, month 02
INFO: Creating feather file for DISPATCHPRICE, 2019, 02
INFO: Downloading data for table DISPATCHPRICE, year 2019, month 03
INFO: Creating feather file for DISPATCHPRICE, 2019, 03
INFO: Downloading data for table DISPATCHPRICE, year 2019, month 04
INFO: Creating feather file for DISPATCHPRICE, 2019, 04
INFO: Downloading data for table DISPATCHPRICE, year 2019, month 05
INFO: Creating feather file for DISPATCHPRICE, 2019, 05
INFO: Downloading data for table DISPATCHPRICE, year 2019, month 06
INFO: Creating feather file for DISPATCHPRICE, 2019, 06
INFO: Downloading data for table DISPATCHPRICE, year 2019, month 07
INFO: Creating feat

Required columns:
- Date: "SETTLEMENTDATE"
- Region: "REGIONID"
- Intervention: "INTERVENTION"
- RRP: "RRP"
- EEP: "EEP"
- ROP: "ROP"
- Administered Price Cap: "APCFLAG"
- Market Suspended: "MARKETSUSPENDEDFLAG"

Only data from 2019.01 to 2024.11 is available on https://nemweb.com.au/Data_Archive/Wholesale_Electricity/MMSDM

The data for 2024.12 needs to be downloaded from https://nemweb.com.au/Reports/Current/Public_Prices/ and processed separately.

In [13]:
import requests
from bs4 import BeautifulSoup

# URL of the public prices page
url = "https://nemweb.com.au/Reports/Current/Public_Prices/"

# Get the page content.
# - timeout: requests waits indefinitely by default, which would hang the kernel.
# - raise_for_status: without it an HTTP error page is parsed like a normal
#   listing and silently produces an empty link list.
response = requests.get(url, timeout=60)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Find all links that contain 'PUBLIC_PRICES' and extract href strings
price_links = [
    link['href']
    for link in soup.find_all('a', href=lambda href: href and 'PUBLIC_PRICES' in href)
]

# Print how many zip file links were found
print("Available Public Prices ZIP files:", len(price_links))


Filtered Public Prices ZIP files (0):


In [15]:
from datetime import datetime

# Inclusive window applied to the 8-digit date stamp in each file name.
start_date = datetime.strptime('20241130', '%Y%m%d')
end_date = datetime.strptime('20241231', '%Y%m%d')

# use regex to pull the date stamp out of each link
import re
pattern = re.compile(r'PUBLIC_PRICES_([0-9]{8})')

# Keep only the links whose date stamp falls inside the window.
# A link that contains 'PUBLIC_PRICES' but no 8-digit stamp cannot be placed
# in the window, so it is dropped instead of crashing on `None.group(1)`.
links_in_range = []
for link in price_links:
    match = pattern.search(link)
    if match is None:
        continue
    link_date = datetime.strptime(match.group(1), '%Y%m%d')
    if start_date <= link_date <= end_date:
        links_in_range.append(link)
price_links = links_in_range


In [24]:
# download and extract the zip files into the raw_data_cache folder
import zipfile
import io

# The hrefs are appended to the site root as-is
# (presumably they are root-relative paths — verify against the page listing).
root_url = "https://nemweb.com.au"
for link in price_links:
    # timeout: avoid hanging the kernel on a stalled connection.
    response = requests.get(root_url + link, timeout=120)
    # Fail with the HTTP error itself; otherwise an error page would be handed
    # to ZipFile and surface as a confusing BadZipFile.
    response.raise_for_status()
    # Unpack the archive straight from memory into the cache directory.
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        z.extractall(raw_data_cache)


In [38]:
# Combine all downloaded PUBLIC_PRICES csv files into one DataFrame
# How to:
# 1. skip the first row of every csv file
# 2. use the second row of every csv file as that file's column header
# 3. read the data rows of the first table only (rows starting with "D,"),
#    stopping at the first row that is not a data row
# 4. keep these columns only: ['SETTLEMENTDATE', 'REGIONID', 'RRP', 'EEP', 'ROP', 'INTERVENTION', 'APCFLAG', 'MARKETSUSPENDEDFLAG']
# 5. concatenate all files, sort by SETTLEMENTDATE and keep December 2024 only
import io
import os
import pandas as pd

required_columns = ['SETTLEMENTDATE', 'REGIONID', 'RRP', 'EEP', 'ROP', 'INTERVENTION', 'APCFLAG', 'MARKETSUSPENDEDFLAG']

# Get list of all PUBLIC_PRICES CSV files in raw_data_cache.
# sorted() gives a deterministic processing order (os.listdir order is arbitrary).
csv_files = sorted(
    f for f in os.listdir(raw_data_cache)
    if f.endswith('.CSV') and f.startswith('PUBLIC_PRICES')
)

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not csv_files:
    raise FileNotFoundError(f"No PUBLIC_PRICES*.CSV files found in {raw_data_cache}")

# Initialize empty list to store processed DataFrames
processed_dfs = []

# Process each CSV file
for csv_file in csv_files:

    with open(os.path.join(raw_data_cache, csv_file), 'r') as file:
        # Skip the first line (line 1)
        next(file, None)
        # Line 2 holds the column names of the first table
        header_line = next(file, None)
        if header_line is None:
            raise ValueError(f"{csv_file} has no header row on line 2")

        # Collect the data rows that follow the header. AEMO report files mark
        # data rows with a leading "D," and start each further table with a new
        # "I," header row, so stopping at the first non-"D" row reads exactly
        # the first table. A fixed count of 1441 lines would crash on shorter
        # files and truncate longer tables.
        lines = [header_line]
        for line in file:
            if not line.startswith('D,'):
                break
            lines.append(line)

    # Convert the selected lines into a Pandas-readable buffer and create a DataFrame
    df = pd.read_csv(io.StringIO(''.join(lines)))

    # Select only required columns
    df = df[required_columns]

    # Append to processed_dfs list
    processed_dfs.append(df)

# Concatenate all DataFrames
combined_df_202412 = pd.concat(processed_dfs, ignore_index=True)

# order by SETTLEMENTDATE
combined_df_202412 = combined_df_202412.sort_values(by='SETTLEMENTDATE')

# SETTLEMENTDATE range from 2024/12/01 00:05:00 to 2025/01/01 00:00:00
# NOTE(review): no date parsing is requested from read_csv, so these are
# presumably string comparisons; they order correctly only while the values
# keep the fixed 'YYYY/MM/DD HH:MM:SS' layout — confirm against the files.
in_december_2024 = (
    (combined_df_202412['SETTLEMENTDATE'] >= '2024/12/01 00:05:00')
    & (combined_df_202412['SETTLEMENTDATE'] <= '2025/01/01 00:00:00')
)
combined_df_202412 = combined_df_202412[in_december_2024]
