In [1]:
# --- Clone repo to get push utility ---
# The leading "!" is an IPython shell escape, so this cell only runs inside
# Jupyter/Colab, not as a plain Python script.
# NOTE(review): re-running the cell makes `git clone` fail because the target
# directory already exists; the rest of the cell still runs against the
# existing checkout.
!git clone https://github.com/blainehodder/WCSB_Supply_Demand.git
import sys
# Make the cloned repo importable. The absolute path assumes the clone landed
# under /content (presumably the Colab working directory — confirm when
# running elsewhere).
sys.path.append("/content/WCSB_Supply_Demand")

# --- Import push function ---
# Resolved through the sys.path entry added above, so it must come after the
# clone and the append.
from utils.github_commit import push_df_to_github

# --- Core imports ---
import pandas as pd
import requests
from io import BytesIO
import os

# --- GitHub token ---
# Do not paste a real token into the notebook source: it would be saved with
# the notebook and could end up committed. Keep a token that is already set in
# the environment; otherwise ask for it interactively without echoing it.
# (push_df_to_github presumably reads GITHUB_TOKEN from the environment —
# confirm in utils/github_commit.)
from getpass import getpass

if not os.environ.get("GITHUB_TOKEN"):
    os.environ["GITHUB_TOKEN"] = getpass("GitHub token: ")

# --- CONFIG ---
years = list(range(2010, 2026))
base_url = "https://raw.githubusercontent.com/blainehodder/WCSB_Supply_Demand/main/raw_data/st53/ST53_{}.xls"
# Seconds to wait on the download; without a timeout a stalled connection
# would hang the cell indefinitely.
request_timeout = 60

# Columns carried through unchanged when the sheet is reshaped to long format.
id_cols = ['Operator', 'Scheme Name', 'Area', 'Approval Number', 'Recovery Method']
# Month column labels as they appear in the sheet header -> calendar month.
month_numbers = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
}
months = list(month_numbers)

# One cleaned long-format frame per successfully processed year.
all_data = []

for year in years:
    # Each year is best-effort: any failure is reported and the loop moves on
    # to the next year instead of aborting the whole run.
    try:
        print(f"\n📦 Processing {year}...")
        url = base_url.format(year)
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()

        # Load XLS and parse BITUMEN sheet (no header yet: the real header row
        # sits somewhere below the top of the sheet and is located next).
        with pd.ExcelFile(BytesIO(response.content)) as xls:
            df = pd.read_excel(xls, sheet_name="BITUMEN", header=None)

        # Detect the header row dynamically: first row with a cell that
        # contains "Operator" (case-insensitive).
        has_operator = df.apply(
            lambda row: row.astype(str).str.contains("Operator", case=False).any(),
            axis=1,
        )
        header_rows = df[has_operator].index
        if header_rows.empty:
            raise ValueError("no header row containing 'Operator' found in BITUMEN sheet")
        header_idx = header_rows[0]
        df.columns = df.iloc[header_idx]
        df = df.iloc[header_idx + 1:].reset_index(drop=True)

        # Normalize column names
        df.columns = df.columns.str.strip()

        # Skip if recovery method is completely missing
        if "Recovery Method" not in df.columns:
            print(f"⚠️ No 'recovery method' in {year}")
            continue

        print("✅ Recovery methods seen:", df["Recovery Method"].dropna().unique()[:10])

        # Only reshape the month columns that actually exist in this sheet;
        # passing an absent column to melt would raise and lose the whole year.
        present_months = [m for m in months if m in df.columns]
        if not present_months:
            raise ValueError("no monthly production columns found in BITUMEN sheet")

        # Coerce production values; non-numeric cells become NaN and are
        # dropped below.
        for m in present_months:
            df[m] = pd.to_numeric(df[m], errors="coerce")

        # Melt to long format: one row per (scheme, month).
        melted = df.melt(id_vars=id_cols, value_vars=present_months,
                         var_name="Month", value_name="Bitumen Production")

        # Build a first-of-month timestamp from the year and the month label.
        melted["Year"] = year
        melted["Month_Num"] = melted["Month"].map(month_numbers)
        melted["Date"] = pd.to_datetime(dict(year=melted["Year"], month=melted["Month_Num"], day=1))

        # Clean and keep valid records. The month columns were already coerced
        # to numeric before the melt, so no second conversion is needed here.
        cleaned = melted[['Date'] + id_cols + ['Bitumen Production']].copy()
        cleaned = cleaned.dropna(subset=["Bitumen Production", "Operator", "Recovery Method"])

        all_data.append(cleaned)

    except Exception as e:
        print(f"❌ Failed {year}: {e}")

# Final concat and push
# Abort before touching GitHub if not a single year produced data.
if not all_data:
    raise ValueError("No data loaded from any year.")

final_df = pd.concat(all_data, ignore_index=True)

print(f"\n🎯 Final shape: {final_df.shape}")
print(f"📅 Dates: {final_df['Date'].min()} → {final_df['Date'].max()}")
print(f"🔍 Recovery Methods: {final_df['Recovery Method'].dropna().unique()}")

# Years that failed or were skipped above are simply absent from final_df.
# Call that out explicitly, because the push below replaces the cleaned file
# with whatever was loaded in this run.
loaded_years = set(pd.to_datetime(final_df["Date"]).dt.year.tolist())
missing_years = [y for y in years if y not in loaded_years]
if missing_years:
    print(f"⚠️ No rows loaded for years: {missing_years} — the pushed file will not contain them")

# --- Push to GitHub ---
push_df_to_github(
    df=final_df,
    user="blainehodder",
    repo="WCSB_Supply_Demand",
    path="clean_data/st53/st53_cleaned.csv",
    commit_message="Full ST53 cleaned with Primary rows preserved and recovery method fix"
)


Cloning into 'WCSB_Supply_Demand'...
remote: Enumerating objects: 198, done.
remote: Counting objects: 100% (198/198), done.
remote: Compressing objects: 100% (178/178), done.
remote: Total 198 (delta 58), reused 0 (delta 0), pack-reused 0 (from 0)
Receiving objects: 100% (198/198), 5.70 MiB | 3.43 MiB/s, done.
Resolving deltas: 100% (58/58), done.

📦 Processing 2010...
✅ Recovery methods seen: ['Commercial' 'Commercial-CSS' 'Commercial-SAGD' 'Enhanced Recovery'
 'Experimental' 'Primary' 'Total ']

📦 Processing 2011...
✅ Recovery methods seen: ['Commercial' 'Commercial-CSS' 'Commercial-SAGD' 'Enhanced Recovery'
 'Experimental' 'Primary' 'Total ']

📦 Processing 2012...
✅ Recovery methods seen: ['Commercial-CSS' 'Commercial-SAGD' 'Enhanced Recovery' 'Experimental'
 'Primary' 'Total ']

📦 Processing 2013...
✅ Recovery methods seen: ['Commercial-CSS' 'Commercial-SAGD' 'Enhanced Recovery' 'Experimental'
 'Primary' 'Total ']

📦 Processing 2014...
✅ Recovery methods seen: ['Commer