In [3]:
# ===============================================================
# 🛰️ SAR Smart Analyzer — ML + Cloud Integration + Live CSV Export
# ===============================================================
# Features:
#  - Reads all .tif from GCS bucket (nasa_sar_spacetron)
#  - Applies PCA + KMeans clustering (change / biomass detection)
#  - Computes summary metrics per AOI/year/quarter
#  - Appends results into /content/drive/MyDrive/live.csv

!pip install rasterio google-cloud-storage scikit-learn pandas numpy -q

import os
import re
import rasterio
import numpy as np
import pandas as pd
from io import BytesIO
from google.colab import auth, drive
from google.cloud import storage
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# ---------------------------------------------------------------
# 🔑 Authentication and setup
# ---------------------------------------------------------------
auth.authenticate_user()
drive.mount('/content/drive')
client = storage.Client(project="YOUR_PROJECT_ID")   # replace if needed

# ---------------------------------------------------------------
# 📂 Configuration
# ---------------------------------------------------------------
BUCKET_NAME = "nasa_sar_spacetron"
PREFIX = "CLEAN/SAR_Exports/"
LIVE_CSV = "/content/drive/MyDrive/live.csv"
N_CLUSTERS = 3  # for KMeans segmentation
EXPECTED_BANDS = 2  # VV and VH

# ---------------------------------------------------------------
# 🧩 Helper: read .tif directly from GCS
# ---------------------------------------------------------------
def read_tif_from_gcs(blob):
    content = blob.download_as_bytes()
    with rasterio.open(BytesIO(content)) as src:
        data = src.read().astype(float)
        meta = src.meta
        data[data <= 0] = np.nan
    return data, meta

# ---------------------------------------------------------------
# 🧠 ML Analysis: PCA + KMeans
# ---------------------------------------------------------------
def analyze_tif(data):
    """
    data: numpy array [bands, height, width]
    returns: dict with PCA variance ratio and cluster-based biomass %
    """
    bands, h, w = data.shape
    X = np.moveaxis(data, 0, -1).reshape(-1, bands)
    X = X[~np.isnan(X).any(axis=1)]  # remove NaN rows

    if X.shape[0] < 50:
        return {"pca_var": 0, "biomass_pct": 0, "clusters": 0}

    # --- PCA ---
    pca = PCA(n_components=min(bands, 3))
    X_pca = pca.fit_transform(X)
    var_ratio = np.sum(pca.explained_variance_ratio_)

    # --- KMeans ---
    km = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init='auto')
    labels = km.fit_predict(X_pca)
    cluster_means = np.array([np.mean(X_pca[labels == i, 0]) for i in range(N_CLUSTERS)])
    biomass_pct = np.clip(np.mean(cluster_means > np.percentile(cluster_means, 50)) * 100, 0, 100)

    return {
        "pca_var": round(var_ratio, 4),
        "biomass_pct": round(biomass_pct, 2),
        "clusters": N_CLUSTERS
    }

# ---------------------------------------------------------------
# 🧮 Metadata extraction from file name
# ---------------------------------------------------------------
def parse_metadata(filename):
    # Expected pattern: AOI_1_2020_Q1.tif
    m = re.match(r".*AOI_(\d+)_(\d{4})_Q(\d)", filename)
    if m:
        return {"AOI": int(m.group(1)), "Year": int(m.group(2)), "Quarter": int(m.group(3))}
    return {"AOI": None, "Year": None, "Quarter": None}

# ---------------------------------------------------------------
# 📊 Process each .tif and export analysis
# ---------------------------------------------------------------
def process_and_export():
    print(f"🛰️ Scanning bucket '{BUCKET_NAME}/{PREFIX}' ...")

    bucket = client.bucket(BUCKET_NAME)
    blobs = [b for b in client.list_blobs(BUCKET_NAME, prefix=PREFIX) if b.name.endswith(".tif")]

    # Load or create live.csv
    if os.path.exists(LIVE_CSV):
        df = pd.read_csv(LIVE_CSV)
        processed_files = set(df["file"])
    else:
        df = pd.DataFrame(columns=[
            "file", "AOI", "Year", "Quarter",
            "bands", "pca_var", "biomass_pct", "clusters",
            "vv_mean", "vh_mean", "valid_px", "nan_ratio"
        ])
        processed_files = set()

    new_entries = []

    for blob in blobs:
        if blob.name in processed_files:
            continue
        print(f"📂 Processing {blob.name} ...")
        try:
            data, meta = read_tif_from_gcs(blob)
            bands, h, w = data.shape
            vv_mean = np.nanmean(data[0])
            vh_mean = np.nanmean(data[1]) if bands > 1 else np.nan
            valid_px = np.count_nonzero(np.isfinite(data))
            nan_ratio = round(1 - (valid_px / data.size), 4)

            ml = analyze_tif(data)
            meta_info = parse_metadata(blob.name)

            entry = {
                "file": blob.name,
                "AOI": meta_info["AOI"],
                "Year": meta_info["Year"],
                "Quarter": meta_info["Quarter"],
                "bands": bands,
                "pca_var": ml["pca_var"],
                "biomass_pct": ml["biomass_pct"],
                "clusters": ml["clusters"],
                "vv_mean": round(vv_mean, 4),
                "vh_mean": round(vh_mean, 4),
                "valid_px": valid_px,
                "nan_ratio": nan_ratio
            }
            new_entries.append(entry)
            print(f"✅ Done → PCA={ml['pca_var']}  Biomass%={ml['biomass_pct']}  NaN%={nan_ratio*100:.2f}")
        except Exception as e:
            print(f"⚠️ Error: {e}")

    # Save results
    if new_entries:
        df = pd.concat([df, pd.DataFrame(new_entries)], ignore_index=True)
        df.to_csv(LIVE_CSV, index=False)
        print(f"💾 Updated {LIVE_CSV} with {len(new_entries)} new analyses")
    else:
        print("✅ No new .tif files found.")

# ---------------------------------------------------------------
# 🚀 Run full analysis once
# ---------------------------------------------------------------
process_and_export()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🛰️ Scanning bucket 'nasa_sar_spacetron/CLEAN/SAR_Exports/' ...
📂 Processing CLEAN/SAR_Exports/AOI_5_2024_Q1.tif ...
✅ Done → PCA=0  Biomass%=0  NaN%=70.16
📂 Processing CLEAN/SAR_Exports/AOI_5_2024_Q4.tif ...
✅ Done → PCA=0  Biomass%=0  NaN%=70.16
📂 Processing CLEAN/SAR_Exports/AOI_5_2025_Q1.tif ...
✅ Done → PCA=0  Biomass%=0  NaN%=70.16
📂 Processing CLEAN/SAR_Exports/AOI_5_2025_Q2.tif ...
✅ Done → PCA=0  Biomass%=0  NaN%=70.16
📂 Processing CLEAN/SAR_Exports/AOI_5_2025_Q3.tif ...
✅ Done → PCA=0  Biomass%=0  NaN%=70.16
📂 Processing CLEAN/SAR_Exports/AOI_5_2025_Q4.tif ...
✅ Done → PCA=0  Biomass%=0  NaN%=70.15
📂 Processing CLEAN/SAR_Exports/AOI_6_2020_Q1.tif ...
✅ Done → PCA=0.9997  Biomass%=33.33  NaN%=65.09
📂 Processing CLEAN/SAR_Exports/AOI_6_2020_Q2.tif ...
✅ Done → PCA=0.9997  Biomass%=33.33  NaN%=65.10
📂 Processing CLEAN/SAR_Exports/AOI_6_2020_Q3.tif ...
✅