In [2]:
import requests
import os
import time
import os
import pandas as pd

# Working directories: raw downloads, concatenated outputs, analysis results
data_dir = 'data/aemo_data'
out_dir = 'data/concatenated_data'
analysis_dir = 'data/analysis'

# Create each directory if it is missing (exist_ok makes re-running safe)
for _directory in (out_dir, data_dir, analysis_dir):
    os.makedirs(_directory, exist_ok=True)
del _directory

# States/regions processed by every step below
regions = ['NSW1', 'QLD1', 'SA1', 'TAS1', 'VIC1']




## Price

In [2]:
# Monthly price-and-demand CSV endpoint; the placeholders are YYYYMM and region
base_url = "https://aemo.com.au/aemo/data/nem/priceanddemand/PRICE_AND_DEMAND_{}_{}.csv"

# Browser-like User-Agent sent with every request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Seconds to wait on a single request; without a timeout requests.get can
# block forever on a stalled connection.
request_timeout = 30

# Download one file per (year, month, region) for 2019-01 through 2024-12
for year in range(2019, 2025):
    for month in range(1, 13):
        month_str = f"{year}{month:02d}"
        for region in regions:
            # Built from data_dir so downloads land where the concat step reads
            filename = f"{data_dir}/PRICE_AND_DEMAND_{month_str}_{region}.csv"
            # Skip if file already exists (makes the cell cheap to re-run)
            if os.path.exists(filename):
                print(f"Skipping {filename} - already exists")
                continue

            url = base_url.format(month_str, region)
            try:
                response = requests.get(url, headers=headers, timeout=request_timeout)
                response.raise_for_status()
                with open(filename, 'wb') as f:
                    f.write(response.content)
                print(f"Downloaded {filename}")
            except requests.exceptions.RequestException as e:
                # Timeouts are RequestException subclasses, so they land here too
                print(f"Failed to download {url}: {e}")
            time.sleep(1)  # Wait 1 second between requests

Downloaded data/aemo_data/PRICE_AND_DEMAND_201901_NSW1.csv
Downloaded data/aemo_data/PRICE_AND_DEMAND_201901_QLD1.csv
Downloaded data/aemo_data/PRICE_AND_DEMAND_201901_SA1.csv
Downloaded data/aemo_data/PRICE_AND_DEMAND_201901_TAS1.csv
Downloaded data/aemo_data/PRICE_AND_DEMAND_201901_VIC1.csv
Downloaded data/aemo_data/PRICE_AND_DEMAND_201902_NSW1.csv
Downloaded data/aemo_data/PRICE_AND_DEMAND_201902_QLD1.csv
Downloaded data/aemo_data/PRICE_AND_DEMAND_201902_SA1.csv
Downloaded data/aemo_data/PRICE_AND_DEMAND_201902_TAS1.csv
Downloaded data/aemo_data/PRICE_AND_DEMAND_201902_VIC1.csv
Downloaded data/aemo_data/PRICE_AND_DEMAND_201903_NSW1.csv
Downloaded data/aemo_data/PRICE_AND_DEMAND_201903_QLD1.csv
Downloaded data/aemo_data/PRICE_AND_DEMAND_201903_SA1.csv
Downloaded data/aemo_data/PRICE_AND_DEMAND_201903_TAS1.csv
Downloaded data/aemo_data/PRICE_AND_DEMAND_201903_VIC1.csv
Downloaded data/aemo_data/PRICE_AND_DEMAND_201904_NSW1.csv
Downloaded data/aemo_data/PRICE_AND_DEMAND_201904_QLD1.csv


In [3]:
def concat_files_by_year_and_region(years=None, region_list=None, input_dir=None, output_dir=None):
    """Concatenate the monthly CSVs of each (year, region) pair into one file.

    For every year and region, files in ``input_dir`` named
    ``PRICE_AND_DEMAND_<year>..._<region>.csv`` are read in sorted filename
    order (the YYYYMM part of the name makes that chronological), stacked, and
    written to ``output_dir`` as ``PRICE_AND_DEMAND_<year>_<region>.csv``.

    Args:
        years: Iterable of years to process. Defaults to ``range(2019, 2025)``.
        region_list: Region identifiers. Defaults to the notebook's ``regions``.
        input_dir: Directory holding the monthly files. Defaults to ``data_dir``.
        output_dir: Directory for the results. Defaults to ``out_dir``.

    Returns:
        None. Progress and read failures are reported with ``print``; files
        that cannot be read are skipped.
    """
    # Fall back to the notebook-level configuration when no override is given.
    years = range(2019, 2025) if years is None else years
    region_list = regions if region_list is None else region_list
    input_dir = data_dir if input_dir is None else input_dir
    output_dir = out_dir if output_dir is None else output_dir

    # os.listdir returns names in arbitrary order; sort once so months are
    # concatenated chronologically.
    available = sorted(os.listdir(input_dir))

    for year in years:
        for region in region_list:
            prefix = f"PRICE_AND_DEMAND_{year}"
            suffix = f"_{region}.csv"
            region_files = [
                os.path.join(input_dir, name)
                for name in available
                if name.startswith(prefix) and name.endswith(suffix)
            ]

            # If no files found, skip
            if not region_files:
                print(f"No files found for {year} - {region}")
                continue

            # Load every readable file; report and skip the rest
            data_frames = []
            for file in region_files:
                try:
                    data_frames.append(pd.read_csv(file))
                except Exception as e:
                    print(f"Failed to read {file}: {e}")

            # Concatenate and save the result
            if data_frames:
                concatenated_df = pd.concat(data_frames, ignore_index=True)
                output_path = os.path.join(output_dir, f"PRICE_AND_DEMAND_{year}_{region}.csv")
                concatenated_df.to_csv(output_path, index=False)
                print(f"Concatenated data saved to {output_path}")

In [4]:
# Write one concatenated CSV per (year, region) into out_dir.
concat_files_by_year_and_region()

Concatenated data saved to data/concatenated_data\PRICE_AND_DEMAND_2019_NSW1.csv
Concatenated data saved to data/concatenated_data\PRICE_AND_DEMAND_2019_QLD1.csv
Concatenated data saved to data/concatenated_data\PRICE_AND_DEMAND_2019_SA1.csv
Concatenated data saved to data/concatenated_data\PRICE_AND_DEMAND_2019_TAS1.csv
Concatenated data saved to data/concatenated_data\PRICE_AND_DEMAND_2019_VIC1.csv
Concatenated data saved to data/concatenated_data\PRICE_AND_DEMAND_2020_NSW1.csv
Concatenated data saved to data/concatenated_data\PRICE_AND_DEMAND_2020_QLD1.csv
Concatenated data saved to data/concatenated_data\PRICE_AND_DEMAND_2020_SA1.csv
Concatenated data saved to data/concatenated_data\PRICE_AND_DEMAND_2020_TAS1.csv
Concatenated data saved to data/concatenated_data\PRICE_AND_DEMAND_2020_VIC1.csv
Concatenated data saved to data/concatenated_data\PRICE_AND_DEMAND_2021_NSW1.csv
Concatenated data saved to data/concatenated_data\PRICE_AND_DEMAND_2021_QLD1.csv
Concatenated data saved to dat

In [5]:
def concat_files_by_year():
    """Merge all monthly files of each region into a single all-years CSV.

    For each entry in ``regions``, every file in ``data_dir`` whose name ends
    with ``_<region>.csv`` is read and stacked. Rows are ordered by
    ``SETTLEMENTDATE`` and the calendar columns YEAR, MONTH, DAY, HOUR, MINUTE
    and WEEKDAY are derived from that timestamp. The result is written to
    ``out_dir`` as ``PRICE_AND_DEMAND_ALL_YEARS_<region>.csv``.

    Regions without files and files that cannot be read are reported with
    ``print`` and skipped.
    """
    calendar_columns = (
        ('YEAR', 'year'),
        ('MONTH', 'month'),
        ('DAY', 'day'),
        ('HOUR', 'hour'),
        ('MINUTE', 'minute'),
        ('WEEKDAY', 'weekday'),
    )

    for region in regions:
        # Gather every file belonging to this region, whatever the year
        suffix = f"_{region}.csv"
        matching_paths = []
        for name in os.listdir(data_dir):
            if name.endswith(suffix):
                matching_paths.append(os.path.join(data_dir, name))

        if len(matching_paths) == 0:
            print(f"No files found for {region}")
            continue

        # Read what we can; a broken file is reported but does not stop the run
        frames = []
        for path in matching_paths:
            try:
                frames.append(pd.read_csv(path))
            except Exception as e:
                print(f"Failed to read {path}: {e}")

        if not frames:
            continue

        merged = pd.concat(frames, ignore_index=True)

        # Parse the timestamp so sorting is chronological rather than textual
        merged['SETTLEMENTDATE'] = pd.to_datetime(merged['SETTLEMENTDATE'])
        merged = merged.sort_values('SETTLEMENTDATE')

        # Break the timestamp into separate calendar columns
        for column, attribute in calendar_columns:
            merged[column] = getattr(merged['SETTLEMENTDATE'].dt, attribute)

        output_path = os.path.join(out_dir, f"PRICE_AND_DEMAND_ALL_YEARS_{region}.csv")
        merged.to_csv(output_path, index=False)
        print(f"Concatenated data for all years saved to {output_path}")

# Write one all-years CSV per region into out_dir.
concat_files_by_year()

Concatenated data for all years saved to data/concatenated_data\PRICE_AND_DEMAND_ALL_YEARS_NSW1.csv
Concatenated data for all years saved to data/concatenated_data\PRICE_AND_DEMAND_ALL_YEARS_QLD1.csv
Concatenated data for all years saved to data/concatenated_data\PRICE_AND_DEMAND_ALL_YEARS_SA1.csv
Concatenated data for all years saved to data/concatenated_data\PRICE_AND_DEMAND_ALL_YEARS_TAS1.csv
Concatenated data for all years saved to data/concatenated_data\PRICE_AND_DEMAND_ALL_YEARS_VIC1.csv


In [6]:
# Calculate the mean, median, min and max of the price (RRP) and of TOTALDEMAND for each year-month (2019-01, 2019-02, ..., 2024-12)

def calculate_price_stats_by_month():
    """Write per-month price and demand statistics for every region.

    Reads ``PRICE_AND_DEMAND_ALL_YEARS_<region>.csv`` from ``out_dir``, groups
    the rows by ``YYYY-MM`` of ``SETTLEMENTDATE`` and computes mean, median,
    min and max of ``RRP`` and ``TOTALDEMAND`` (rounded to 2 decimals). YEAR
    and MONTH columns are added as text taken from the group key, rows whose
    year is 2025 are dropped, and the table is saved to ``analysis_dir`` as
    ``PRICE_STATS_BY_MONTH_<region>.csv``.

    Any failure for a region is reported with ``print`` and the loop moves on.
    """
    aggregations = {
        'RRP_mean': ('RRP', 'mean'),
        'RRP_median': ('RRP', 'median'),
        'RRP_min': ('RRP', 'min'),
        'RRP_max': ('RRP', 'max'),
        'TOTALDEMAND_mean': ('TOTALDEMAND', 'mean'),
        'TOTALDEMAND_median': ('TOTALDEMAND', 'median'),
        'TOTALDEMAND_min': ('TOTALDEMAND', 'min'),
        'TOTALDEMAND_max': ('TOTALDEMAND', 'max'),
    }

    for region in regions:
        source_path = os.path.join(out_dir, f"PRICE_AND_DEMAND_ALL_YEARS_{region}.csv")
        target_path = os.path.join(analysis_dir, f"PRICE_STATS_BY_MONTH_{region}.csv")

        try:
            frame = pd.read_csv(source_path)
            frame['SETTLEMENTDATE'] = pd.to_datetime(frame['SETTLEMENTDATE'])
            frame['YEAR_MONTH'] = frame['SETTLEMENTDATE'].dt.strftime('%Y-%m')

            monthly = frame.groupby('YEAR_MONTH').agg(**aggregations).round(2)

            # The group key is 'YYYY-MM'; split it into its two components
            key_parts = monthly.index.str.split('-')
            monthly['YEAR'] = key_parts.str[0]
            monthly['MONTH'] = key_parts.str[1]

            # Keep everything except rows stamped with year 2025
            monthly = monthly[monthly['YEAR'] != '2025']

            monthly.to_csv(target_path)
            print(f"Price statistics by month saved to {target_path}")
        except Exception as e:
            print(f"Failed to calculate price statistics for {region}: {e}")
        

# Write one monthly statistics CSV per region into analysis_dir.
calculate_price_stats_by_month()

Price statistics by month saved to data/analysis\PRICE_STATS_BY_MONTH_NSW1.csv
Price statistics by month saved to data/analysis\PRICE_STATS_BY_MONTH_QLD1.csv
Price statistics by month saved to data/analysis\PRICE_STATS_BY_MONTH_SA1.csv
Price statistics by month saved to data/analysis\PRICE_STATS_BY_MONTH_TAS1.csv
Price statistics by month saved to data/analysis\PRICE_STATS_BY_MONTH_VIC1.csv


In [7]:
def calculate_price_stats_by_week():
    """Write per-week price and demand statistics for every region.

    Reads ``PRICE_AND_DEMAND_ALL_YEARS_<region>.csv`` from ``out_dir``, groups
    the rows by the ``%Y-%U`` rendering of ``SETTLEMENTDATE`` (``%U`` is the
    week of the year with Sunday as the first day of the week) and computes
    mean, median, min and max of ``RRP`` and ``TOTALDEMAND`` (rounded to 2
    decimals). YEAR and WEEK columns are added as text taken from the group
    key, rows whose year is 2025 are dropped, and the table is saved to
    ``analysis_dir`` as ``PRICE_STATS_BY_WEEK_<region>.csv``.

    Any failure for a region is reported with ``print`` and the loop moves on.
    """
    aggregations = {
        'RRP_mean': ('RRP', 'mean'),
        'RRP_median': ('RRP', 'median'),
        'RRP_min': ('RRP', 'min'),
        'RRP_max': ('RRP', 'max'),
        'TOTALDEMAND_mean': ('TOTALDEMAND', 'mean'),
        'TOTALDEMAND_median': ('TOTALDEMAND', 'median'),
        'TOTALDEMAND_min': ('TOTALDEMAND', 'min'),
        'TOTALDEMAND_max': ('TOTALDEMAND', 'max'),
    }

    for region in regions:
        source_path = os.path.join(out_dir, f"PRICE_AND_DEMAND_ALL_YEARS_{region}.csv")
        target_path = os.path.join(analysis_dir, f"PRICE_STATS_BY_WEEK_{region}.csv")

        try:
            frame = pd.read_csv(source_path)
            frame['SETTLEMENTDATE'] = pd.to_datetime(frame['SETTLEMENTDATE'])
            frame['YEAR_WEEK'] = frame['SETTLEMENTDATE'].dt.strftime('%Y-%U')

            weekly = frame.groupby('YEAR_WEEK').agg(**aggregations).round(2)

            # The group key is 'YYYY-WW'; split it into its two components
            key_parts = weekly.index.str.split('-')
            weekly['YEAR'] = key_parts.str[0]
            weekly['WEEK'] = key_parts.str[1]

            # Keep everything except rows stamped with year 2025
            weekly = weekly[weekly['YEAR'] != '2025']

            weekly.to_csv(target_path)
            print(f"Price statistics by week saved to {target_path}")
        except Exception as e:
            print(f"Failed to calculate price statistics for {region}: {e}")

# Write one weekly statistics CSV per region into analysis_dir.
calculate_price_stats_by_week()

Price statistics by week saved to data/analysis\PRICE_STATS_BY_WEEK_NSW1.csv
Price statistics by week saved to data/analysis\PRICE_STATS_BY_WEEK_QLD1.csv
Price statistics by week saved to data/analysis\PRICE_STATS_BY_WEEK_SA1.csv
Price statistics by week saved to data/analysis\PRICE_STATS_BY_WEEK_TAS1.csv
Price statistics by week saved to data/analysis\PRICE_STATS_BY_WEEK_VIC1.csv


In [8]:
def calculate_price_stats_by_day():
    """Write per-day price and demand statistics for every region.

    Reads ``PRICE_AND_DEMAND_ALL_YEARS_<region>.csv`` from ``out_dir``, groups
    the rows by ``YYYY-MM-DD`` of ``SETTLEMENTDATE`` and computes mean, median,
    min and max of ``RRP`` and ``TOTALDEMAND`` (rounded to 2 decimals). YEAR,
    MONTH and DAY columns are added as text taken from the group key, WEEKDAY
    is the pandas ``dayofweek`` of the date (Monday=0), rows whose year is
    2025 are dropped, and the table is saved to ``analysis_dir`` as
    ``PRICE_STATS_BY_DAY_<region>.csv``.

    Any failure for a region is reported with ``print`` and the loop moves on.
    """
    aggregations = {
        'RRP_mean': ('RRP', 'mean'),
        'RRP_median': ('RRP', 'median'),
        'RRP_min': ('RRP', 'min'),
        'RRP_max': ('RRP', 'max'),
        'TOTALDEMAND_mean': ('TOTALDEMAND', 'mean'),
        'TOTALDEMAND_median': ('TOTALDEMAND', 'median'),
        'TOTALDEMAND_min': ('TOTALDEMAND', 'min'),
        'TOTALDEMAND_max': ('TOTALDEMAND', 'max'),
    }

    for region in regions:
        source_path = os.path.join(out_dir, f"PRICE_AND_DEMAND_ALL_YEARS_{region}.csv")
        target_path = os.path.join(analysis_dir, f"PRICE_STATS_BY_DAY_{region}.csv")

        try:
            frame = pd.read_csv(source_path)
            frame['SETTLEMENTDATE'] = pd.to_datetime(frame['SETTLEMENTDATE'])
            frame['YEAR_MONTH_DAY'] = frame['SETTLEMENTDATE'].dt.strftime('%Y-%m-%d')

            daily = frame.groupby('YEAR_MONTH_DAY').agg(**aggregations).round(2)

            # The group key is 'YYYY-MM-DD'; split it into its three components
            key_parts = daily.index.str.split('-')
            daily['YEAR'] = key_parts.str[0]
            daily['MONTH'] = key_parts.str[1]
            daily['DAY'] = key_parts.str[2]
            daily['WEEKDAY'] = pd.to_datetime(daily.index).dayofweek

            # Keep everything except rows stamped with year 2025
            daily = daily[daily['YEAR'] != '2025']

            daily.to_csv(target_path)
            print(f"Price statistics by day saved to {target_path}")
        except Exception as e:
            print(f"Failed to calculate price statistics for {region}: {e}")

# Write one daily statistics CSV per region into analysis_dir.
calculate_price_stats_by_day()

Price statistics by day saved to data/analysis\PRICE_STATS_BY_DAY_NSW1.csv
Price statistics by day saved to data/analysis\PRICE_STATS_BY_DAY_QLD1.csv
Price statistics by day saved to data/analysis\PRICE_STATS_BY_DAY_SA1.csv
Price statistics by day saved to data/analysis\PRICE_STATS_BY_DAY_TAS1.csv
Price statistics by day saved to data/analysis\PRICE_STATS_BY_DAY_VIC1.csv


In [9]:
def calculate_price_stats_by_hour(region_list=None, input_dir=None, output_dir=None):
    """Write per-hour price and demand statistics for every region.

    Reads ``PRICE_AND_DEMAND_ALL_YEARS_<region>.csv`` from ``input_dir``,
    groups the rows by ``YYYY-MM-DD HH:00`` of ``SETTLEMENTDATE`` and computes
    mean, median, min and max of ``RRP`` and ``TOTALDEMAND`` (rounded to 2
    decimals). YEAR, MONTH, DAY and HOUR columns are added as zero-padded
    text, WEEKDAY is the pandas ``dayofweek`` of the hour (Monday=0), rows
    whose year is 2025 are dropped, and the table is saved to ``output_dir``
    as ``PRICE_STATS_BY_HOUR_<region>.csv``.

    Args:
        region_list: Region identifiers. Defaults to the notebook's ``regions``.
        input_dir: Directory with the all-years files. Defaults to ``out_dir``.
        output_dir: Directory for the results. Defaults to ``analysis_dir``.

    Returns:
        None. Any failure for a region is reported with ``print`` and the
        loop moves on to the next region.
    """
    # Fall back to the notebook-level configuration when no override is given.
    region_list = regions if region_list is None else region_list
    input_dir = out_dir if input_dir is None else input_dir
    output_dir = analysis_dir if output_dir is None else output_dir

    for region in region_list:
        input_path = os.path.join(input_dir, f"PRICE_AND_DEMAND_ALL_YEARS_{region}.csv")
        output_path = os.path.join(output_dir, f"PRICE_STATS_BY_HOUR_{region}.csv")

        try:
            df = pd.read_csv(input_path)
            df['SETTLEMENTDATE'] = pd.to_datetime(df['SETTLEMENTDATE'])
            df['YEAR_MONTH_DAY_HOUR'] = df['SETTLEMENTDATE'].dt.strftime('%Y-%m-%d %H:00')
            price_stats = df.groupby('YEAR_MONTH_DAY_HOUR').agg(
                RRP_mean=('RRP', 'mean'),
                RRP_median=('RRP', 'median'),
                RRP_min=('RRP', 'min'),
                RRP_max=('RRP', 'max'),
                TOTALDEMAND_mean=('TOTALDEMAND', 'mean'),
                TOTALDEMAND_median=('TOTALDEMAND', 'median'),
                TOTALDEMAND_min=('TOTALDEMAND', 'min'),
                TOTALDEMAND_max=('TOTALDEMAND', 'max')
            ).round(2)

            # Derive the components from the parsed key. Splitting the text on
            # '-' leaves the time attached to the day ('01 00:00'), which is
            # what the DAY column used to contain.
            hour_start = pd.to_datetime(price_stats.index, format='%Y-%m-%d %H:%M')
            price_stats['YEAR'] = hour_start.strftime('%Y')
            price_stats['MONTH'] = hour_start.strftime('%m')
            price_stats['DAY'] = hour_start.strftime('%d')
            price_stats['HOUR'] = hour_start.strftime('%H')
            price_stats['WEEKDAY'] = hour_start.dayofweek

            # Keep everything except rows stamped with year 2025
            price_stats = price_stats[price_stats['YEAR'] != '2025']

            price_stats.to_csv(output_path)
            print(f"Price statistics by hour saved to {output_path}")
        except Exception as e:
            print(f"Failed to calculate price statistics for {region}: {e}")

# Write one hourly statistics CSV per region into analysis_dir.
calculate_price_stats_by_hour()

Price statistics by hour saved to data/analysis\PRICE_STATS_BY_HOUR_NSW1.csv
Price statistics by hour saved to data/analysis\PRICE_STATS_BY_HOUR_QLD1.csv
Price statistics by hour saved to data/analysis\PRICE_STATS_BY_HOUR_SA1.csv
Price statistics by hour saved to data/analysis\PRICE_STATS_BY_HOUR_TAS1.csv
Price statistics by hour saved to data/analysis\PRICE_STATS_BY_HOUR_VIC1.csv


## Outage

In [4]:

# HTTP request headers: a browser-like User-Agent for the outage download
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    )
}

In [6]:
# Download the latest data of NETWORK.NETWORK_OUTAGEDETAIL in MMS Data Model
# NETWORK.NETWORK_OUTAGEDETAIL stores the details of network outages in the NEM from 2003 to the present,
# so there is no need to download the table for each year and month.
# The latest data can be found from the following URL: https://visualisations.aemo.com.au/aemo/nemweb/#mms-data-model
import zipfile

# 2025 January data
outage_detail_url = "https://nemweb.com.au/Data_Archive/Wholesale_Electricity/MMSDM/2025/MMSDM_2025_01/MMSDM_Historical_Data_SQLLoader/DATA/PUBLIC_ARCHIVE%23NETWORK_OUTAGEDETAIL%23FILE01%23202501010000.zip"

# Local name of the downloaded archive and of the CSV expected inside it
outage_detail_zipname = "data/aemo_data/NETWORK_OUTAGEDETAIL_202501.zip"
outage_detail_csvname = "PUBLIC_ARCHIVE#NETWORK_OUTAGEDETAIL#FILE01#202501010000.CSV"

try:
    # A timeout keeps a stalled connection from hanging the cell forever
    response = requests.get(outage_detail_url, headers=headers, timeout=60)
    response.raise_for_status()
    with open(outage_detail_zipname, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded {outage_detail_zipname}")

    # Unzip the file next to the archive
    with zipfile.ZipFile(outage_detail_zipname, 'r') as zip_ref:
        zip_ref.extractall("data/aemo_data")
    print(f"Unzipped {outage_detail_zipname}")
except requests.exceptions.RequestException as e:
    print(f"Failed to download {outage_detail_url}: {e}")
except zipfile.BadZipFile as e:
    # The response was saved but is not a valid zip archive
    print(f"Failed to unzip {outage_detail_zipname}: {e}")

Downloaded data/aemo_data/NETWORK_OUTAGEDETAIL_202501.zip
Unzipped data/aemo_data/NETWORK_OUTAGEDETAIL_202501.zip


In [9]:
# Prepare the outage data for analysis

## drop the first line and the last line of the csv file
## STARTTIME,ENDTIME,SUBMITTEDDATE look like "2003/03/15 07:00:00","2003/06/13 17:00:00","2003/04/29 12:13:51"
## keep rows whose STARTTIME falls in 2022 or later
## save the prepared data to data/analysis/NETWORK_OUTAGEDETAIL_202201_202501.csv

def prepare_outage_detail_data(
    input_filename="data/aemo_data/PUBLIC_ARCHIVE#NETWORK_OUTAGEDETAIL#FILE01#202501010000.CSV",
    output_filename="data/analysis/NETWORK_OUTAGEDETAIL_202201_202501.csv",
    start_year=2022,
):
    """Filter the extracted outage-detail CSV to recent outages and save it.

    The first and last lines of the input are skipped, ``STARTTIME`` is parsed
    as a timestamp, and only rows with ``STARTTIME`` in ``start_year`` or later
    are written to ``output_filename`` (without the index).

    Args:
        input_filename: Path of the extracted outage-detail CSV.
        output_filename: Path of the filtered CSV to write. The file is
            written uncompressed unless the name carries a compression
            extension such as ``.gz``, which pandas infers by default.
        start_year: Earliest ``STARTTIME`` year to keep.

    Returns:
        None. Success or failure is reported with ``print``.
    """
    try:
        # skipfooter is only supported by the python parser engine
        df = pd.read_csv(input_filename, skiprows=1, skipfooter=1, engine='python')
        df['STARTTIME'] = pd.to_datetime(df['STARTTIME'])
        df = df[df['STARTTIME'].dt.year >= start_year]
        df.to_csv(output_filename, index=False)
        print(f"Prepared data saved to {output_filename}")
    except Exception as e:
        print(f"Failed to prepare data: {e}")

# Filter the extracted outage CSV and write the result under data/analysis.
prepare_outage_detail_data()

Prepared data saved to data/analysis/NETWORK_OUTAGEDETAIL_202201_202501.csv.gz
