In [2]:
# --- Clone repo to get push utility ---
!git clone https://github.com/blainehodder/WCSB_Supply_Demand.git
import sys
sys.path.append("/content/WCSB_Supply_Demand")

# --- Import push function ---
from utils.github_commit import push_df_to_github

# --- Core imports ---
import pandas as pd
import requests
from io import BytesIO
import os

# --- GitHub token ---
# Do not paste a real token into the notebook source: it would be saved (and
# possibly committed) with the file. Reuse a token that is already present in
# the environment; otherwise ask for it interactively without echoing it.
if not os.environ.get("GITHUB_TOKEN"):
    from getpass import getpass

    os.environ["GITHUB_TOKEN"] = getpass("GitHub token: ")


# --- CONFIG ---
years = list(range(2010, 2026))
base_url = "https://raw.githubusercontent.com/blainehodder/WCSB_Supply_Demand/main/raw_data/st53/ST53_{}.xls"

# Seconds to wait for GitHub before giving up on a year (requests has no
# default timeout and would otherwise block forever on a stalled connection).
request_timeout = 60

# Columns carried through unchanged when the monthly columns are unpivoted.
id_cols = ['Operator', 'Scheme Name', 'Area', 'Approval Number', 'Recovery Method']
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_numbers = {name: number for number, name in enumerate(months, start=1)}


def load_st53_table(raw):
    """Return the scheme table found inside a raw, header-less sheet.

    The header row is the first row in which any cell contains "Operator"
    (case-insensitive). Rows above it are discarded, its cells become the
    column labels (string labels are whitespace-stripped), and rows lacking
    an Operator or Scheme Name are dropped.

    Args:
        raw: DataFrame read with ``header=None``.

    Returns:
        DataFrame with labelled columns and a fresh 0-based index.

    Raises:
        ValueError: if the sheet is empty, no header row is found, or any of
            the identifier/month columns is missing from the header.
    """
    if raw.empty:
        raise ValueError("Sheet is empty.")

    has_operator = raw.apply(
        lambda row: row.astype(str).str.contains("Operator", case=False).any(),
        axis=1,
    )
    if not has_operator.any():
        raise ValueError('No header row containing "Operator" found.')
    # argmax on a boolean array gives the position of the first True.
    header_pos = int(has_operator.to_numpy().argmax())

    table = raw.iloc[header_pos + 1:].reset_index(drop=True)
    table.columns = [
        label.strip() if isinstance(label, str) else label
        for label in raw.iloc[header_pos]
    ]

    # Validate before touching any column so the failure names what is missing.
    missing = [col for col in id_cols + months if col not in table.columns]
    if missing:
        raise ValueError(f"Missing expected column(s) in header: {missing}")

    # Drop garbage rows (blank lines, totals, footnotes).
    return table.dropna(subset=["Operator", "Scheme Name"])


def melt_st53_monthly(table, year):
    """Unpivot the Jan..Dec columns of *table* into one row per month.

    Args:
        table: DataFrame containing the ``id_cols`` and ``months`` columns.
        year: Calendar year used to build the ``Date`` column.

    Returns:
        DataFrame with columns ``Date`` (first day of the month), the
        identifier columns, and a numeric ``Bitumen Production``; rows whose
        production value is not numeric are dropped.
    """
    melted = table.melt(id_vars=id_cols, value_vars=months,
                        var_name="Month", value_name="Bitumen Production")
    melted["Date"] = pd.to_datetime({
        "year": year,
        "month": melted["Month"].map(month_numbers),
        "day": 1,
    })

    tidy = melted[['Date'] + id_cols + ['Bitumen Production']].copy()
    tidy["Bitumen Production"] = pd.to_numeric(tidy["Bitumen Production"], errors="coerce")
    return tidy.dropna(subset=["Bitumen Production"])


all_data = []
failed_years = []

for year in years:
    try:
        print(f"\n📦 Processing {year}...")
        url = base_url.format(year)
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()

        # Load XLS
        raw_sheet = pd.read_excel(BytesIO(response.content), sheet_name="BITUMEN", header=None)
        df = load_st53_table(raw_sheet)

        print("✅ Recovery methods seen:", df["Recovery Method"].dropna().unique()[:10])

        all_data.append(melt_st53_monthly(df, year))

    except Exception as e:
        # Deliberately best-effort: one bad year must not abort the others.
        print(f"❌ Failed {year}: {e}")
        failed_years.append(year)

if failed_years:
    print(f"\n⚠️ Years skipped due to errors: {failed_years}")

# Combine and push
# pd.concat raises an opaque "No objects to concatenate" on an empty list, so
# stop early with an actionable message when no year could be loaded. This
# also guarantees nothing is pushed in that case.
if not all_data:
    raise ValueError("No ST53 data was loaded for any year; nothing to push.")

final_df = pd.concat(all_data, ignore_index=True)

# Quick sanity summary of what is about to be published.
print(f"\n🎯 Final shape: {final_df.shape}")
print(f"📅 Dates: {final_df['Date'].min()} → {final_df['Date'].max()}")
print(f"🔍 Sample Recovery Methods: {final_df['Recovery Method'].dropna().unique()[:10]}")

# --- Push to GitHub ---
push_df_to_github(
    df=final_df,
    user="blainehodder",
    repo="WCSB_Supply_Demand",
    path="clean_data/st53/st53_cleaned.csv",
    commit_message="Full ST53 cleaned with Primary recovery method fix"
)


fatal: destination path 'WCSB_Supply_Demand' already exists and is not an empty directory.

📦 Processing 2010...
✅ Recovery methods seen: ['Commercial-CSS' 'Commercial-SAGD' 'Enhanced Recovery' 'Experimental']

📦 Processing 2011...
✅ Recovery methods seen: ['Commercial' 'Commercial-CSS' 'Commercial-SAGD' 'Enhanced Recovery'
 'Experimental']

📦 Processing 2012...
✅ Recovery methods seen: ['Commercial-CSS' 'Commercial-SAGD' 'Enhanced Recovery' 'Experimental']

📦 Processing 2013...
✅ Recovery methods seen: ['Commercial-CSS' 'Commercial-SAGD' 'Enhanced Recovery' 'Experimental']

📦 Processing 2014...
✅ Recovery methods seen: ['Commercial-CSS' 'Commercial-SAGD' 'Enhanced Recovery' 'Experimental']

📦 Processing 2015...
✅ Recovery methods seen: ['Commercial-CSS' 'Commercial-SAGD' 'Enhanced Recovery' 'Experimental']

📦 Processing 2016...
✅ Recovery methods seen: ['Commercial-CSS' 'Commercial-SAGD' 'Enhanced Recovery']

📦 Processing 2017...
✅ Recovery methods seen: ['Commercial-CSS' 'Commercial-