# Data Transformation

## Table A. Energy Supply & Consumption Balance

In [1]:
import pandas as pd

# Load both sheets
xls = pd.ExcelFile("../data/raw/table_a_supply_consumption.xlsx")

# Optional: Check names if not sure
print(xls.sheet_names)


['Title page', '2021-22', '2022-23']


In [2]:
# Step 1: Define flow type mapping (simplified to start)
flow_map = {
    "Primary indigenous supply": "primary_supply",
    "plus all imports": "imports",
    "less all exports": "exports",
    "less stock changes": "stock_changes",
    "less discrepancies": "statistical_discrepancies",
    "Total primary energy supply": "primary_energy_supply",

    "Coke ovens": "conversion",
    "LNG plants": "conversion",
    "Petroleum refining": "conversion",
    "Gas manufacturing": "conversion",
    "Electricity generation": "conversion",
    "Other conversion": "conversion",
    "Fuel use in conversion": "conversion_losses",

    "Total final energy consumption": "final_consumption",

    "Agriculture": "sector_agriculture",
    "Mining": "sector_mining",
    "Food, beverages, textiles": "sector_industry",
    "Wood, paper And printing": "sector_industry",
    "Chemical": "sector_industry",
    "Iron and steel": "sector_industry",
    "Non-ferrous metals": "sector_industry",
    "Other industry": "sector_industry",
    "Water and waste": "sector_services",
    "Construction": "sector_services",
    "Transport": "sector_transport",
    "Commercial and services": "sector_commercial",
    "Residential": "sector_residential",
    "Lubes, bitumen, solvents": "sector_other"
}

In [3]:
sheets = {"2021-22": "2021-22", "2022-23": "2022-23"}
dfs = []

for sheet_name, year in sheets.items():
    df = xls.parse(sheet_name=sheet_name, skiprows=4)
    df = df.dropna(axis=1, how='all')
    
    # Clean column names
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace(r'[^a-z0-9_]', '', regex=True)
    id_col = df.columns[0]
    
    # Drop total row and nans
    df = df[~df[id_col].str.lower().str.contains("total", na=False)]
    df = df.dropna(subset=[id_col])
    
    # Assign flow_type based on matching
    def assign_flow(label):
        for key, val in flow_map.items():
            if key.lower() in str(label).lower():
                return val
        return "unclassified"

    df["flow_type"] = df[id_col].apply(assign_flow)
    df["period"] = year
    
    # Melt to long format
    df_melt = df.melt(id_vars=[id_col, "flow_type", "period"], var_name="fuel", value_name="petajoules")
    dfs.append(df_melt)



In [4]:
# Combine and save
df_final = pd.concat(dfs, ignore_index=True)
df_final = df_final.dropna(subset=["petajoules"])
df_final = df_final.rename(columns={"unnamed_1": "flow_description"})
df_final.head()


Unnamed: 0,flow_description,flow_type,period,fuel,petajoules
0,Primary indigenous supply,primary_supply,2021-22,black_coal,11215.668
1,plus all imports,imports,2021-22,black_coal,2.636
2,less all exports,exports,2021-22,black_coal,10173.475
3,less stock changes,stock_changes,2021-22,black_coal,-202.904
4,less discrepancies,statistical_discrepancies,2021-22,black_coal,66.587


In [5]:
df_final.to_csv("../data/processed/table_a_energy_supply_consumption.csv", index=False)
print("Data processing complete. Output saved to ../data/processed/energy_supply_consumption.csv")

Data processing complete. Output saved to ../data/processed/energy_supply_consumption.csv


## Table B. Population GDP Consumption

In [6]:
xls = pd.ExcelFile("../data/raw/table_b_population_gdp_consumption.xlsx")
print("Sheets:", xls.sheet_names)

Sheets: ['Title page', 'AUS', 'NSW', 'VIC', 'QLD', 'SA', 'WA', 'TAS', 'NT']


In [7]:
import pandas as pd

def load_table_b_clean(path, sheet_code, sheet_map):
    """
    Cleans one sheet from Table B (Population, GDP, and Energy Consumption).
    Combines title + unit for column names, cleans year, adds state.
    """
    df_all = pd.read_excel(path, sheet_name=sheet_code, header=None)

    # Extract column headers from row 4 and units from row 5
    title_row = df_all.iloc[4]
    unit_row = df_all.iloc[5]

    # Build combined column names
    combined_columns = []
    for idx, (title, unit) in enumerate(zip(title_row, unit_row)):
        title = str(title).strip()
        unit = str(unit).strip()

        if idx == 1:
            combined_columns.append("year")
        elif title and title.lower() != "nan":
            col_name = f"{title} [{unit}]" if unit and unit.lower() != "nan" else title
            combined_columns.append(col_name)
        else:
            combined_columns.append(f"unnamed_{idx}")

    # Load data starting from row 6 onward
    df = df_all.iloc[6:].copy()
    df.columns = combined_columns

    # Drop the first unnamed column if it exists (e.g., "unnamed_0")
    if "unnamed_0" in df.columns:
        df = df.drop(columns=["unnamed_0"])

    # Keep only rows with a valid year
    df = df[df["year"].astype(str).str.match(r"\d{4}-\d{2}", na=False)]

    # Convert "1962-63" to 1962 (int)
    df["year"] = df["year"].astype(str).str.extract(r"^(\d{4})").astype(int)

    # Add state column
    df["state"] = sheet_map.get(sheet_code, sheet_code)

    return df

In [8]:
# Sheet mapping
sheet_map = {
    "AUS": "Australia",
    "NSW": "New South Wales",
    "VIC": "Victoria",
    "QLD": "Queensland",
    "SA": "South Australia",
    "WA": "Western Australia",
    "TAS": "Tasmania",
    "NT": "Northern Territory"
}

# File path
path = "../data/raw/table_b_population_gdp_consumption.xlsx"

# Clean and combine all sheets
all_dfs = []
for sheet_code in sheet_map:
    df_clean = load_table_b_clean(path, sheet_code, sheet_map)
    all_dfs.append(df_clean)

df_b_final = pd.concat(all_dfs, ignore_index=True)


df_b_final.columns = [
    col.lower().strip().replace(" ", "_") if "[" not in col else
    col.split("[")[0].strip().lower().replace(" ", "_") + " [" + col.split("[")[1].lower()
    for col in df_b_final.columns
]


# Save
df_b_final.to_csv("../data/processed/table_b_population_gdp_consumption.csv", index=False)
print("Table B cleaned and saved to: data/processed/table_b_population_gdp_consumption.csv")

Table B cleaned and saved to: data/processed/table_b_population_gdp_consumption.csv


In [9]:
df_b_final.head()

Unnamed: 0,year,population [number],gdp [$ million],energy_consumption [pj],energy_consumption_per_capita [gj/person],energy_intensity [gj/$ million],energy_productivity [$ million/pj],state
0,1960,10548267,308362,1336.6,126.713,4334.516,230.706,Australia
1,1961,10742291,312353,1365.8,127.142,4372.617,228.696,Australia
2,1962,10950379,331770,1432.2,130.79,4316.846,231.651,Australia
3,1963,11166702,354929,1531.4,137.14,4314.666,231.768,Australia
4,1964,11387665,376155,1625.3,142.725,4320.825,231.437,Australia


## Table C. Consumption by Fuel

In [10]:
def load_table_c_clean(path, sheet_code, sheet_map):
    """
    Cleans one sheet from Table C (Consumption by Fuel).
    Combines title + unit, cleans year, adds state.
    """
    import pandas as pd

    df_all = pd.read_excel(path, sheet_name=sheet_code, header=None)

    # Extract column titles (row 4) and units (row 5)
    title_row = df_all.iloc[4]
    unit_row = df_all.iloc[5]

    # Build clean column names
    combined_columns = []
    for idx, (title, unit) in enumerate(zip(title_row, unit_row)):
        title = str(title).strip()
        unit = str(unit).strip()

        if idx == 1:
            combined_columns.append("year")
        elif title and title.lower() != "nan":
            col_name = f"{title} [{unit}]" if unit and unit.lower() != "nan" else title
            combined_columns.append(col_name.lower().replace(" ", "_"))
        else:
            combined_columns.append(f"unnamed_{idx}")

    # Load data from row 6 onward
    df = df_all.iloc[6:].copy()
    df.columns = combined_columns

    # Drop the unnamed_0 column (filler)
    if "unnamed_0" in df.columns:
        df = df.drop(columns=["unnamed_0"])

    # Filter valid year rows
    df = df[df["year"].astype(str).str.match(r"\d{4}-\d{2}", na=False)]

    # Convert to 4-digit year
    df["year"] = df["year"].astype(str).str.extract(r"^(\d{4})").astype(int)

    # Add state column
    df["state"] = sheet_map.get(sheet_code, sheet_code)

    return df

In [11]:
sheet_map = {
    "AUS": "Australia",
    "NSW": "New South Wales",
    "VIC": "Victoria",
    "QLD": "Queensland",
    "SA": "South Australia",
    "WA": "Western Australia",
    "TAS": "Tasmania",
    "NT": "Northern Territory"
}

path = "../data/raw/table_c_consumption_by_fuel.xlsx"
dfs = []

for sheet_code in sheet_map:
    df_clean = load_table_c_clean(path, sheet_code, sheet_map)
    dfs.append(df_clean)

df_c_final = pd.concat(dfs, ignore_index=True)

# Optional: normalize column names to snake_case + keep units
df_c_final.columns = [
    col.lower().strip().replace(" ", "_") if "[" not in col else
    col.split("[")[0].strip().lower().replace(" ", "_") + " [" + col.split("[")[1].lower()
    for col in df_c_final.columns
]

# Save final cleaned table
df_c_final.to_csv("../data/processed/table_c_consumption_by_fuel.csv", index=False)
print("Table C cleaned and saved.")

Table C cleaned and saved.


In [12]:
df_c_final.head()

Unnamed: 0,year,coal_ [pj],oil_ [pj],gas_ [pj],renewables_ [pj],statistical_discrepancy_b_ [pj],total_ [pj],state
0,1960,na,510.0,0.0,189.1,na,1336.6,Australia
1,1961,na,537.3,0.0,185.6,na,1365.8,Australia
2,1962,na,581.9,0.1,194.6,na,1432.2,Australia
3,1963,na,656.8,0.1,188.9,na,1531.4,Australia
4,1964,na,719.9,0.1,198.2,na,1625.3,Australia


In [13]:
import numpy as np
df_melted = pd.melt(
    df_c_final,
    id_vars=['year', 'state'],
    value_vars=[
        'coal_ [pj]',
        'oil_ [pj]',
        'gas_ [pj]',
        'renewables_ [pj]',
        'statistical_discrepancy_b_ [pj]',
        'total_ [pj]'
    ],
    var_name='source',
    value_name='energy_pj'
)

# Optional: Clean the 'source' column to remove brackets and underscores
df_melted['source'] = df_melted['source'].str.replace(r'_ \[pj\]', '', regex=True)

df_melted.head(20)

df_melted["energy_pj"] = df_melted["energy_pj"].replace("na", np.nan).astype(float)
df_melted["energy_pj"] = df_melted["energy_pj"].fillna(0)

df_melted.head()

  df_melted["energy_pj"] = df_melted["energy_pj"].replace("na", np.nan).astype(float)


Unnamed: 0,year,state,source,energy_pj
0,1960,Australia,coal,0.0
1,1961,Australia,coal,0.0
2,1962,Australia,coal,0.0
3,1963,Australia,coal,0.0
4,1964,Australia,coal,0.0


In [14]:
# lets remove the statistical discrepancy from source
df_melted = df_melted[df_melted["source"] != "statistical_discrepancy_b"]
df_melted = df_melted[df_melted["source"] != "total"]

df_melted["source"].unique()

array(['coal', 'oil', 'gas', 'renewables'], dtype=object)

In [15]:
df_melted.to_csv("../data/processed/table_c_consumption_melted.csv", index=False)
print("Table C cleaned and saved.")

Table C cleaned and saved.


## Table D. Consumption Detailed Fuel

In [16]:
def clean_table_d_by_state(path: str, sheet_name: str, state_name: str) -> pd.DataFrame:
    import pandas as pd

    # Read entire sheet
    df_all = pd.read_excel(path, sheet_name=sheet_name, header=None)

    # Slice from row 7 down (actual data)
    df = df_all.iloc[7:].copy()

    # Assign clean column names manually
    df.columns = [
        "drop_0", "year_raw",
        "black_coal [pj]", "brown_coal [pj]", "coke [pj]", "coal_by_products [pj]",
        "liquid_gas_biofuels [pj]", "wood_woodwaste [pj]", "bagasse [pj]",
        "refinery_input [pj]", "petroleum_products [pj]", "natural_gas [pj]",
        "town_gas [pj]", "electricity [pj]", "solar_hotwater [pj]", "drop_15",
        "coke_derived [pj]", "coal_by_products_derived [pj]", "petroleum_products_a [pj]",
        "town_gas_derived [pj]", "thermal_electricity [pj]", "drop_21",
        "total_energy_consumption [pj]"
    ]

    # Extract 4-digit year
    df["year"] = df["year_raw"].astype(str).str.extract(r"^(\d{4})")
    df["year"] = df["year"].astype("Int64")

    # Drop unused columns
    df_clean = df.drop(columns=["drop_0", "year_raw", "drop_15", "drop_21"])

    # Melt to long format
    df_long = df_clean.melt(id_vars="year", var_name="fuel", value_name="pj")

    # Assign category
    df_long["category"] = df_long["fuel"].apply(
        lambda x: "consumption_of_fuels" if "derived" not in x and "total" not in x
        else ("production_of_derived_fuels" if "derived" in x else "total")
    )

    # Clean fuel name
    df_long["fuel"] = df_long["fuel"].str.replace("_derived", "", regex=False)

    # Convert pj to float
    df_long["pj"] = pd.to_numeric(df_long["pj"], errors="coerce")
    df_long.rename(columns={"pj": "energy_[pj]"}, inplace=True)

    # Add state
    df_long["state"] = state_name

    return df_long

In [17]:
# Map sheet codes to readable state names
sheet_map = {
    "AUS": "Australia",
    "NSW": "New South Wales",
    "VIC": "Victoria",
    "QLD": "Queensland",
    "SA": "South Australia",
    "WA": "Western Australia",
    "TAS": "Tasmania",
    "NT": "Northern Territory"
}

path = "../data/raw/table_d_consumption_detailed_fuel.xlsx"
all_states_df = []

# Loop through each sheet/state
for sheet_code, state_name in sheet_map.items():
    print(f"Processing: {state_name}")
    df_state = clean_table_d_by_state(path, sheet_code, state_name)
    all_states_df.append(df_state)

# Combine all
df_d_final = pd.concat(all_states_df, ignore_index=True)

# Save final dataset
df_d_final.to_csv("../data/processed/table_d_consumption_detailed_fuel_all_states.csv", index=False)
print("All states processed and saved.")

Processing: Australia
Processing: New South Wales
Processing: Victoria
Processing: Queensland
Processing: South Australia
Processing: Western Australia
Processing: Tasmania
Processing: Northern Territory
All states processed and saved.


In [18]:
df_d_final.head()

Unnamed: 0,year,fuel,energy_[pj],category,state
0,1960,black_coal [pj],495.3,consumption_of_fuels,Australia
1,1961,black_coal [pj],492.3,consumption_of_fuels,Australia
2,1962,black_coal [pj],494.3,consumption_of_fuels,Australia
3,1963,black_coal [pj],517.0,consumption_of_fuels,Australia
4,1964,black_coal [pj],531.5,consumption_of_fuels,Australia


## Table E: Consumption by Industry

In [19]:
def load_table_e_all_states(path: str, sheet_map: dict) -> pd.DataFrame:
    import pandas as pd

    all_dfs = []

    for sheet_code, state in sheet_map.items():
        df_raw = pd.read_excel(path, sheet_name=sheet_code, header=None)

        # Step 1: Manually assign headers using rows 4–6
        cat_row = df_raw.iloc[4]
        ind_row = df_raw.iloc[5]
        unit_row = df_raw.iloc[6]

        # Step 2: Extract data from row 7 onwards
        df = df_raw.iloc[7:].reset_index(drop=True)

        # Step 3: Generate column names
        column_names = []
        for i in range(df.shape[1]):
            cat = str(cat_row[i]).strip().lower().replace(" ", "_") if pd.notna(cat_row[i]) else ""
            ind = str(ind_row[i]).strip().lower().replace(" ", "_") if pd.notna(ind_row[i]) else ""
            unit = str(unit_row[i]).strip().lower() if pd.notna(unit_row[i]) else ""

            if i == 1:
                column_names.append("year_raw")
            elif ind:
                column_names.append(f"{cat}_{ind} [{unit}]".strip("_"))
            else:
                column_names.append(f"drop_{i}")

        df.columns = column_names

        # Step 4: Clean and extract year
        df = df[df["year_raw"].notna()].copy()
        df["year"] = df["year_raw"].astype(str).str.extract(r"(\d{4})")
        df = df[df["year"].notna()]
        df["year"] = df["year"].astype(int)

        # Step 5: Drop unused columns and melt
        df = df.drop(columns=[col for col in df.columns if col.startswith("drop_") or col == "year_raw"])

        df_long = df.melt(id_vars="year", var_name="industry", value_name="energy")

        # Step 6: Clean industry names and extract unit
        df_long["unit"] = df_long["industry"].str.extract(r"\[(.*?)\]")
        df_long["industry"] = df_long["industry"].str.replace(r"\s*\[.*?\]", "", regex=True)
        df_long["state"] = state

        all_dfs.append(df_long)

    return pd.concat(all_dfs, ignore_index=True)

In [20]:
sheet_map = {
    "AUS": "Australia",
    "NSW": "New South Wales",
    "VIC": "Victoria",
    "QLD": "Queensland",
    "SA": "South Australia",
    "WA": "Western Australia",
    "TAS": "Tasmania",
    "NT": "Northern Territory"
}

path = "../data/raw/table_e_consumption_by_industry.xlsx"
df_e_final = load_table_e_all_states(path, sheet_map)

# Preview
print(df_e_final.head())

# Save
df_e_final.to_csv("../data/processed/table_e_consumption_by_industry.csv", index=False)


   year        industry energy  unit      state
0  1974  agriculture_pj   39.4  38.7  Australia
1  1975  agriculture_pj   39.9  38.7  Australia
2  1976  agriculture_pj   41.2  38.7  Australia
3  1977  agriculture_pj   43.3  38.7  Australia
4  1978  agriculture_pj   44.9  38.7  Australia


## Table H - Final Consumption 

### Table H3 - Total Final Energy Consumption by Industry 

In [21]:
def load_table_h_fuel_industry(path, sheet_name="TFEC fuel and industry"):
    # Read the sheet with no header so we can inspect manually
    df_raw = pd.read_excel(path, sheet_name=sheet_name, header=None)

    # Step 1: Identify where the actual data starts
    df_data = df_raw.iloc[6:, 1:]  # starting from row 6, drop first column (mostly empty or index)

    # Step 2: Set proper column names
    years = df_raw.iloc[4, 2:].tolist()  # Row 4 contains the years (e.g., 2002-03)
    df_data.columns = ["category"] + years

    # Step 3: Fill down the industry sectors
    df_data["sector"] = df_data["category"].where(~df_data["category"].isin(
        ["Coal", "Gas", "Oil", "Electricity", "Renewables", "Total"]
    ))
    df_data["sector"] = df_data["sector"].fillna(method="ffill")

    # Step 4: Keep only fuel-type rows (not "Total" rows)
    fuel_types = ["Coal", "Gas", "Oil", "Electricity", "Renewables"]
    df_fuel = df_data[df_data["category"].isin(fuel_types)].copy()

    # Step 5: Reshape to long format
    df_long = df_fuel.melt(id_vars=["sector", "category"], var_name="year", value_name="energy_PJ")

    # Step 6: Clean up
    df_long["year"] = df_long["year"].astype(str).str.extract(r"(\d{4})").astype(int)
    df_long.rename(columns={"category": "fuel"}, inplace=True)
    df_long["state"] = "Australia"  # This table is at the national level

    # Final column order
    df_long = df_long[["year", "sector", "fuel", "energy_PJ", "state"]]

    return df_long

# Example usage:
path = "../data/raw/table_h_total_final_consumption.xlsx"
df_fuel_industry = load_table_h_fuel_industry(path)

df_fuel_industry.head()

  df_data["sector"] = df_data["sector"].fillna(method="ffill")


Unnamed: 0,year,sector,fuel,energy_PJ,state
0,2002,Agriculture,Coal,,Australia
1,2002,Agriculture,Gas,0.098,Australia
2,2002,Agriculture,Oil,87.726,Australia
3,2002,Agriculture,Electricity,9.92,Australia
4,2002,Agriculture,Renewables,,Australia


In [22]:
df_fuel_industry.to_csv("../data/processed/table_h_fuel_industry.csv", index=False)

In [25]:
df_fuel_industry_total = df_fuel_industry[df_fuel_industry["state"] == "Australia"]
df_fuel_industry_total = df_fuel_industry[df_fuel_industry["sector"] == "Total final consumption"]

df_fuel_industry_total.tail()

Unnamed: 0,year,sector,fuel,energy_PJ,state
2200,2022,Total final consumption,Coal,99.631,Australia
2201,2022,Total final consumption,Gas,561.046,Australia
2202,2022,Total final consumption,Oil,2190.041,Australia
2203,2022,Total final consumption,Electricity,855.866,Australia
2204,2022,Total final consumption,Renewables,193.851,Australia


In [27]:
df_fuel_industry_total.to_csv("../data/processed/table_h_fuel_industry_total.csv", index=False)

In [30]:
total_energy_2022 = df_fuel_industry_total[df_fuel_industry_total["year"] == 2022]['energy_PJ'].sum()
total_energy_2022

3900.4350000000004

## Table O - Electricity Generation

In [192]:
def load_table_o_all_states(path: str) -> pd.DataFrame:
    xls = pd.ExcelFile(path)
    
    # Map sheet names to state codes
    sheet_map = {s: s.replace(" FY", "") for s in xls.sheet_names if "FY" in s and s != "ACT FY"}

    # Map of short codes to full state names
    state_name_map = {
        "AUS": "Australia",
        "NSW": "New South Wales",
        "VIC": "Victoria",
        "QLD": "Queensland",
        "SA": "South Australia",
        "WA": "Western Australia",
        "TAS": "Tasmania",
        "NT": "Northern Territory"
    }

    all_dfs = []

    for sheet_name, state_code in sheet_map.items():
        df_raw = pd.read_excel(path, sheet_name=sheet_name, header=None)

        # Extract years and units
        years = df_raw.iloc[4, 2:].tolist()
        df = df_raw.iloc[6:, 1:].copy()
        df.columns = ['fuel_name'] + years

        # Reshape
        df = df.melt(id_vars="fuel_name", var_name="year", value_name="generation")
        df["unit"] = "gwh"
        df["state"] = state_name_map.get(state_code, state_code)

        # Assign type
        fuel_type = None
        cleaned_rows = []

        for _, row in df.iterrows():
            name = str(row["fuel_name"]).strip()

            if name.lower() == "non-renewable fuels":
                fuel_type = "non-renewable"
                continue
            elif name.lower() == "renewable fuels":
                fuel_type = "renewable"
                continue
            elif "total" in name.lower() or "note" in name.lower() or name.lower().startswith("a "):
                continue

            # Assign type and keep row
            new_row = row.copy()
            new_row["type"] = fuel_type
            cleaned_rows.append(new_row)

        df_clean = pd.DataFrame(cleaned_rows)

        # Convert year from "2022-23" → 2022
        df_clean["year"] = df_clean["year"].astype(str).str.extract(r"(\d{4})").astype(float).astype("Int64")
        df_clean["generation"] = pd.to_numeric(df_clean["generation"], errors="coerce")

        all_dfs.append(df_clean)

    # Combine and reorder
    df_final = pd.concat(all_dfs, ignore_index=True)
    df_final = df_final[["fuel_name", "year", "generation", "unit", "state", "type"]]

    return df_final

In [194]:
path = "../data/raw/table_o_electricity_generation.xlsx"
df_generation = load_table_o_all_states(path)

df_generation.head(10)

Unnamed: 0,fuel_name,year,generation,unit,state,type
0,Black coal,1989,87573.0,gwh,Australia,non-renewable
1,Brown coal,1989,33594.0,gwh,Australia,non-renewable
2,Natural gas,1989,14359.0,gwh,Australia,non-renewable
3,Oil products,1989,3552.0,gwh,Australia,non-renewable
4,Other a,1989,,gwh,Australia,non-renewable
5,,1989,,gwh,Australia,non-renewable
6,"Bagasse, wood",1989,750.0,gwh,Australia,renewable
7,Biogas,1989,,gwh,Australia,renewable
8,Wind,1989,,gwh,Australia,renewable
9,Hydro,1989,14880.0,gwh,Australia,renewable


In [195]:
df_generation.to_csv("../data/processed/table_o_electricity_generation.csv", index=False)
print("Table O cleaned and saved to: data/processed/table_o_electricity_generation.csv")

Table O cleaned and saved to: data/processed/table_o_electricity_generation.csv


## Table R - Renewable Energy Consumption

In [None]:
path = "../data/raw/table_r_renewables_consumption.xlsx"
sheet_name = "Consumption by fuel"

# Read raw data
df_raw = pd.read_excel(path, sheet_name=sheet_name, header=None)

# Extract year labels and clean them to be 4-digit starting years
years_raw = df_raw.iloc[4, 2:].tolist()  # Skip first two columns
years_cleaned = [str(y).split("-")[0] if isinstance(y, str) and "-" in y else str(y) for y in years_raw]

# Extract unit from the units row (usually row 5, column 2)
unit = df_raw.iloc[5, 2]

# Extract fuel names and their corresponding values
df_fuels = df_raw.iloc[6:].reset_index(drop=True)
df_fuels = df_fuels[[1] + list(range(2, 2 + len(years_cleaned)))]
df_fuels.columns = ['fuel'] + years_cleaned

# Melt the table to long format
df_long = df_fuels.melt(id_vars='fuel', var_name='year', value_name='consumption')

# Add unit column
df_long['unit'] = unit

# Remove any rows where fuel or year is missing
df_long = df_long[df_long['fuel'].notna() & df_long['year'].notna()].reset_index(drop=True)

# Preview the cleaned dataset
df_long.head()

Unnamed: 0,fuel,year,consumption,unit
0,Biomass,2015,196.075,PJ
1,wood and other b,2015,93.857,PJ
2,bagasse,2015,102.218,PJ
3,Municpal and industrial waste,2015,2.543,PJ
4,Biogas,2015,15.845,PJ


In [None]:
# Define path and sheet names
path = "../data/raw/table_r_renewables_consumption.xlsx"

# R1 – Consumption by fuel
sheet_r1 = "Consumption by fuel"
df_r1_raw = pd.read_excel(path, sheet_name=sheet_r1, header=None)

years_r1 = df_r1_raw.iloc[4, 2:].tolist()
years_r1_clean = [str(y).split("-")[0] if isinstance(y, str) and "-" in y else str(y) for y in years_r1]
unit_r1 = df_r1_raw.iloc[5, 2]

df_r1 = df_r1_raw.iloc[6:].reset_index(drop=True)
df_r1 = df_r1[[1] + list(range(2, 2 + len(years_r1_clean)))]
df_r1.columns = ['fuel'] + years_r1_clean

r1 = df_r1.melt(id_vars='fuel', var_name='year', value_name='consumption')
r1['unit'] = unit_r1
r1 = r1[r1['fuel'].notna() & r1['year'].notna()].reset_index(drop=True)

# R2 – Consumption by industry
sheet_r2 = "Consumption by industry"
df_r2_raw = pd.read_excel(path, sheet_name=sheet_r2, header=None)

years_r2 = df_r2_raw.iloc[4, 2:].tolist()
years_r2_clean = [str(y).split("-")[0] if isinstance(y, str) and "-" in y else str(y) for y in years_r2]
unit_r2 = df_r2_raw.iloc[5, 2]

df_r2 = df_r2_raw.iloc[6:].reset_index(drop=True)
df_r2 = df_r2[[1] + list(range(2, 2 + len(years_r2_clean)))]
df_r2.columns = ['sector'] + years_r2_clean

r2 = df_r2.melt(id_vars='sector', var_name='year', value_name='consumption')
r2['unit'] = unit_r2
r2 = r2[r2['sector'].notna() & r2['year'].notna()].reset_index(drop=True)

# -----------------------
# R3 – Consumption by activity
# -----------------------
sheet_r3 = "Consumption by activity"
df_r3_raw = pd.read_excel(path, sheet_name=sheet_r3, header=None)

# Combine multi-level headers from rows 4 and 5
r3_headers = []
for col in range(2, df_r3_raw.shape[1]):
    main = df_r3_raw.iloc[4, col]
    sub = df_r3_raw.iloc[5, col]
    if pd.notna(main) and pd.notna(sub):
        label = f"{main} - {sub}"
    elif pd.notna(main):
        label = str(main)
    elif pd.notna(sub):
        label = str(sub)
    else:
        label = f"Unnamed_{col}"
    r3_headers.append(label)

df_r3 = df_r3_raw.iloc[7:].reset_index(drop=True)
df_r3 = df_r3[[1] + list(range(2, 2 + len(r3_headers)))]
df_r3.columns = ['fuel'] + r3_headers

r3 = df_r3.melt(id_vars='fuel', var_name='activity', value_name='consumption')
r3['unit'] = 'PJ'
r3 = r3[r3['fuel'].notna() & r3['activity'].notna()].reset_index(drop=True)

# Save all three as CSV
r1.to_csv("../data/processed/table_r1_consumption_by_fuel.csv", index=False)
r2.to_csv("../data/processed/table_r2_consumption_by_industry.csv", index=False)
r3.to_csv("../data/processed/table_r3_consumption_by_activity.csv", index=False)