In [8]:
import pandas as pd
import numpy as np
from utils.github_commit import push_df_to_github
import os

# --- CONFIG ---
# NOTE(review): this unconditionally overwrites any GITHUB_TOKEN already set in
# the environment with the literal below; never commit a real token here.
os.environ["GITHUB_TOKEN"] = "ghp_#########"  # Replace or inject securely in Colab

# Report years to fetch: 2010 through 2025 inclusive.
years = [*range(2010, 2026)]

# URL template for the yearly workbook; "{}" is filled with the year.
base_url = "https://raw.githubusercontent.com/blainehodder/WCSB_Supply_Demand/main/raw_data/st39/ST39-{}.xlsx"
sheet_name = "VAR0800-ST39Extracts_xls"

# First-column labels that mark the start of a product block in the sheet.
# They are compared verbatim (after strip), so internal spacing is significant.
product_anchors = [
    "Oil Sands (tonnes)",
    "Synthetic Crude Oil  (m3)",
    "Diluent Naphtha (m3)",
    "Process Gas (103m3)",
    "Bitumen (m3)",
    "Electricity (MWh)",
    "Sulphur (tonnes)",
]

# Month name -> calendar month number (January = 1 ... December = 12).
month_map = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            "January", "February", "March", "April",
            "May", "June", "July", "August",
            "September", "October", "November", "December",
        ),
        start=1,
    )
}

def is_operator_row(row, month_cols):
    """Tell whether *row* is flagged as an operator row.

    A row is flagged when its "Metric" cell is not missing and at least four
    of the cells under *month_cols* can be parsed as numbers.

    Args:
        row: A row (pandas Series) containing a "Metric" entry and the
            entries named in *month_cols*.
        month_cols: Labels of the month cells to inspect.

    Returns:
        True when both conditions hold, otherwise False.
    """
    # Unparseable cells become NaN and are therefore not counted.
    parsed_months = pd.to_numeric(row[month_cols], errors='coerce')
    numeric_count = int(parsed_months.notna().sum())

    if pd.isna(row["Metric"]):
        return False
    return numeric_count >= 4

# Accumulates one concatenated DataFrame per year that yielded at least one block.
all_years_data = []

# Download and parse each yearly workbook. Failures are reported with print()
# and never abort the run: a bad block is skipped, a bad year is skipped.
for year in years:
    print(f"Processing {year}...")
    try:
        url = base_url.format(year)
        # header=None: no row is treated as a header, so columns are addressed
        # by integer position (0, 1, 2, ...).
        df = pd.read_excel(url, sheet_name=sheet_name, header=None)

        # (row label, stripped first-cell text) for every row whose first cell
        # is one of the product anchors; each marks the start of a block.
        block_starts = [
            (i, str(row[0]).strip())
            for i, row in df.iterrows()
            if str(row[0]).strip() in product_anchors
        ]

        year_blocks = []

        for idx, (start_idx, product_name) in enumerate(block_starts):
            # A block runs until the next anchor row, or to the end of the sheet.
            end_idx = block_starts[idx + 1][0] if idx + 1 < len(block_starts) else len(df)

            try:
                # The row right after the anchor supplies the column names
                # (columns 1-12) and the year value (column 13).
                # presumably columns 1-12 hold month names matching month_map keys — TODO confirm
                # NOTE: header_row is a module-level name, so after the loop it
                # keeps the value from the most recently visited block, even
                # one that is skipped by the check below.
                header_row = df.iloc[start_idx + 1, 1:13].tolist()
                year_val = df.iloc[start_idx + 1, 13]

                # Skip blocks whose header row is incomplete.
                if any(pd.isna(header_row)) or pd.isna(year_val):
                    continue

                # Data rows: everything after the header row, first 14 columns.
                # The 14th column (the one that held the year in the header row)
                # is labelled 'Drop'.
                block = df.iloc[start_idx + 2:end_idx, 0:14].copy()
                block.columns = ['Metric'] + header_row + ['Drop']
                block["Product"] = product_name
                block["Year"] = int(year_val)

                month_cols = header_row
                # Flag operator rows, copy their 'Metric' text into 'Operator',
                # then forward-fill so the rows that follow inherit that name.
                block["Is_Operator_Row"] = block.apply(lambda r: is_operator_row(r, month_cols), axis=1)
                block["Operator"] = np.where(block["Is_Operator_Row"], block["Metric"], np.nan)
                block["Operator"] = block["Operator"].ffill()

                # Keep only the rows that were NOT flagged as operator rows.
                usable = block[~block["Is_Operator_Row"]].copy()
                year_blocks.append(usable)

            except Exception as e:
                # Best effort: report the problem and move on to the next block.
                print(f"⚠️ Skipping block at {start_idx} in {product_name}: {e}")

        if year_blocks:
            all_years_data.append(pd.concat(year_blocks, ignore_index=True))
        else:
            print(f"⚠️ No valid blocks found in {year}")

    except Exception as e:
        # Any failure for this year (including the download/read) is reported
        # and the loop continues with the next year.
        print(f"❌ Failed to process {year}: {e}")

# Combine all
# Stop early when no year produced any usable block.
if not all_years_data:
    raise ValueError("❌ No valid ST39 data parsed.")

combined = pd.concat(all_years_data, ignore_index=True)

# Melt
# Take the month columns from the combined frame itself instead of relying on
# a variable left over from the parsing loop: that leftover only reflects the
# last block visited, which may have been skipped (header containing NaN) or
# may belong to an earlier year if the final year failed to load.
non_month_columns = {"Operator", "Metric", "Product", "Year", "Drop", "Is_Operator_Row"}
month_columns = [col for col in combined.columns if col not in non_month_columns]

# Wide -> long: one row per (Operator, Metric, Product, Year, Month).
melted = pd.melt(
    combined,
    id_vars=["Operator", "Metric", "Product", "Year"],
    value_vars=month_columns,
    var_name="Month",
    value_name="Value"
)

# Build a first-of-month date from the year and the mapped month number.
melted["Month_Num"] = melted["Month"].map(month_map)
melted["Date"] = pd.to_datetime(dict(year=melted["Year"], month=melted["Month_Num"], day=1))
# Non-numeric cells become NaN and are dropped below.
melted["Value"] = pd.to_numeric(melted["Value"], errors="coerce")

final_st39_df = melted.dropna(subset=["Value"]).sort_values(by=["Date", "Operator", "Product"])

# --- Push to GitHub ---

push_df_to_github(
    df=final_st39_df,
    user="blainehodder",
    repo="WCSB_Supply_Demand",
    path="clean_data/st39/st39_cleaned.csv",
    commit_message="Upload cleaned ST39 data (multi-year)"
)

print("✅ Successfully processed and pushed ST39 data.")


Processing 2010...
Processing 2011...
Processing 2012...
Processing 2013...
Processing 2014...
Processing 2015...
Processing 2016...
Processing 2017...
Processing 2018...
Processing 2019...
Processing 2020...
Processing 2021...
Processing 2022...
Processing 2023...
Processing 2024...
Processing 2025...
✅ File pushed to GitHub: https://github.com/blainehodder/WCSB_Supply_Demand/blob/main/clean_data/st39/st39_cleaned.csv
✅ Successfully processed and pushed ST39 data.
