In [1]:
# ensemble_biomass_dino_siglip_optimized.py

import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["XLA_FLAGS"] = "--xla_gpu_cuda_data_dir="
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

import json
import numpy as np
import pandas as pd
from pathlib import Path
from PIL import Image
from tqdm import tqdm
import torch
from transformers import AutoImageProcessor, AutoModel, AutoProcessor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import catboost
import polars as pl
import matplotlib.pyplot as plt
import warnings
import lightgbm as lgb

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

E0000 00:00:1763897657.506299      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763897657.562932      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [2]:
# =============================
# Configuration
# =============================
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DATA_PATH = Path('/kaggle/input/csiro-biomass')
DINO_MODEL = "/kaggle/input/dinov2/pytorch/large/1"
SIGLIP_MODEL = "/kaggle/input/google-siglip-so400m-patch14-384/transformers/default/1"

LABELS = ['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g']

In [3]:
# =============================
# Load Models
# =============================
print("Loading DINOv2 and SigLIP models...")
dino_model = AutoModel.from_pretrained(DINO_MODEL).to(DEVICE).eval()
dino_proc = AutoImageProcessor.from_pretrained(DINO_MODEL)

siglip_model = AutoModel.from_pretrained(SIGLIP_MODEL).to(DEVICE).eval()
siglip_proc = AutoProcessor.from_pretrained(SIGLIP_MODEL)

Loading DINOv2 and SigLIP models...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [4]:
# =============================
# Load Data
# =============================
train = pl.read_csv(DATA_PATH / 'train.csv')
test = pl.read_csv(DATA_PATH / 'test.csv')

# Pivot targets to wide (one row per image)
train_pivot = (
    train.with_columns([
        pl.when(pl.col('target_name') == label).then(pl.col('target')).alias(label)
        for label in LABELS
    ])
    .group_by('image_path')
    .agg([
        pl.col(label).mean() for label in LABELS
    ])
    .sort('image_path')
)

test_pivot = test.group_by('image_path').len().sort('image_path')

In [5]:
# =====================================================
# Color Index Features (hand-crafted, very helpful for Dry_Dead_g)
# =====================================================
def compute_color_indices(img: Image.Image):

    arr = np.asarray(img).astype(np.float32)
    if arr.ndim == 2:  # grayscale safeguard
        arr = np.stack([arr, arr, arr], axis=-1)

    R = arr[..., 0]
    G = arr[..., 1]
    B = arr[..., 2]

    exg = 2 * G - R - B
    vari = (G - R) / (G + R - B + 1e-6)
    gcc = G / (R + G + B + 1e-6)

    features = [
        exg.mean(), exg.std(),
        vari.mean(), vari.std(),
        gcc.mean(), gcc.std()
    ]
    return features

def build_color_feature_df(image_paths, save_path):
    records = []
    for p in tqdm(image_paths, desc=f"Computing color indices -> {save_path}"):
        img = Image.open(DATA_PATH / p).convert("RGB")
        feats = compute_color_indices(img)
        img.close()
        records.append([p] + feats)

    cols = [
        "image_path",
        "exg_mean", "exg_std",
        "vari_mean", "vari_std",
        "gcc_mean", "gcc_std",
    ]
    df = pd.DataFrame(records, columns=cols)
    df.to_csv(save_path, index=False)
    return df

In [6]:
# =======================================
# Feature Extraction (Token mean + std)
# =======================================
def extract_features(model, processor, image_paths, save_path, model_name):
    feat_list = []

    with torch.no_grad():
        for img_path in tqdm(image_paths, desc=f"Extracting {model_name} features"):
            img = Image.open(DATA_PATH / img_path).convert("RGB")
            inputs = processor(images=img, return_tensors="pt").to(DEVICE)
            img.close()

            outputs = model(**inputs)

            if hasattr(outputs, "last_hidden_state"):
                tokens = outputs.last_hidden_state.squeeze(0)  # (seq, dim)
                if tokens.ndim == 1:
                    tokens = tokens.unsqueeze(0)
                mean_feat = tokens.mean(dim=0)
                std_feat = tokens.std(dim=0)
                feats = torch.cat([mean_feat, std_feat], dim=0)
            elif hasattr(model, "get_image_features"):
                feats = model.get_image_features(**inputs).squeeze(0)
            else:
                raise RuntimeError(f"{model_name} outputs not understood.")

            feat_list.append(feats.cpu().numpy())

    feat_arr = np.stack(feat_list, axis=0)
    df = pd.DataFrame(
        feat_arr,
        columns=[f"{model_name}_{i}" for i in range(feat_arr.shape[1])]
    )
    df["image_path"] = list(image_paths)
    df.to_csv(save_path, index=False)
    return df

In [7]:
# =============================
# Build All Features
# =============================
print("Extracting features and color indices...")

# Image path lists
train_image_paths = train_pivot["image_path"].to_list()
test_image_paths = test_pivot["image_path"].to_list()

# Vision-model features
dino_train_df = extract_features(
    dino_model, dino_proc, train_image_paths, "dino_train_feats_opt.csv", "dino"
)
siglip_train_df = extract_features(
    siglip_model, siglip_proc, train_image_paths, "siglip_train_feats_opt.csv", "siglip"
)

dino_test_df = extract_features(
    dino_model, dino_proc, test_image_paths, "dino_test_feats_opt.csv", "dino"
)
siglip_test_df = extract_features(
    siglip_model, siglip_proc, test_image_paths, "siglip_test_feats_opt.csv", "siglip"
)

# Color indices
color_train_df = build_color_feature_df(train_image_paths, "color_train_feats.csv")
color_test_df = build_color_feature_df(test_image_paths, "color_test_feats.csv")

# Merge feature sets
train_feats = (
    dino_train_df
    .merge(siglip_train_df, on="image_path")
    .merge(color_train_df, on="image_path")
)
test_feats = (
    dino_test_df
    .merge(siglip_test_df, on="image_path")
    .merge(color_test_df, on="image_path")
)

train_targets = train_pivot.to_pandas()[LABELS]


Extracting features and color indices...


Extracting dino features: 100%|██████████| 357/357 [01:15<00:00,  4.76it/s]
Extracting siglip features:   0%|          | 0/357 [00:00<?, ?it/s]


ValueError: You have to specify input_ids

In [None]:
# =============================
# Train Ensemble Regressors
# =============================
def ensemble_train(X, y, groups, X_test, target_names):
    n_targets = y.shape[1]
    n_test = X_test.shape[0]

    oof = np.zeros_like(y, dtype=np.float32)
    test_preds = np.zeros((n_test, n_targets), dtype=np.float32)

    kf = GroupKFold(n_splits=5)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, groups=groups)):
        print(f"\n========== Fold {fold + 1} / 5 ==========")
        X_tr, X_val = X[train_idx], X[val_idx]

        for t in range(n_targets):
            name = target_names[t]
            print(f"  → Training target [{name}]")

            y_tr = y[train_idx, t]
            y_val = y[val_idx, t]

            # Stronger configuration for Dry_Dead_g
            if name == "Dry_Dead_g":
                cb = catboost.CatBoostRegressor(
                    iterations=800,
                    depth=10,
                    learning_rate=0.03,
                    loss_function="RMSE",
                    verbose=0,
                )
                gb = GradientBoostingRegressor(
                    n_estimators=400,
                    learning_rate=0.03,
                    max_depth=3,
                    subsample=0.9,
                )

                # Higher weight for larger dead biomass values
                sample_weight = 1.0 / (1.0 + np.exp(-0.1 * y_tr))  # smooth weighting

                cb.fit(X_tr, y_tr, sample_weight=sample_weight)
                gb.fit(X_tr, y_tr, sample_weight=sample_weight)

                pred_val_cb = cb.predict(X_val)
                pred_val_gb = gb.predict(X_val)
                pred_test_cb = cb.predict(X_test)
                pred_test_gb = gb.predict(X_test)

                if HAS_LGB:
                    lgbm = lgb.LGBMRegressor(
                        num_leaves=64,
                        n_estimators=800,
                        learning_rate=0.02,
                        subsample=0.8,
                        colsample_bytree=0.8,
                    )
                    lgbm.fit(X_tr, y_tr, sample_weight=sample_weight)
                    pred_val_lgb = lgbm.predict(X_val)
                    pred_test_lgb = lgbm.predict(X_test)

                    pred_val = (pred_val_cb + pred_val_gb + pred_val_lgb) / 3.0
                    pred_test = (pred_test_cb + pred_test_gb + pred_test_lgb) / 3.0
                else:
                    pred_val = (pred_val_cb + pred_val_gb) / 2.0
                    pred_test = (pred_test_cb + pred_test_gb) / 2.0

            else:
                # Default models for other targets
                cb = catboost.CatBoostRegressor(
                    iterations=500,
                    depth=8,
                    learning_rate=0.05,
                    loss_function="RMSE",
                    verbose=0,
                )
                gb = GradientBoostingRegressor(
                    n_estimators=300,
                    learning_rate=0.05,
                    max_depth=3,
                    subsample=0.9,
                )

                cb.fit(X_tr, y_tr)
                gb.fit(X_tr, y_tr)

                pred_val = (cb.predict(X_val) + gb.predict(X_val)) / 2.0
                pred_test = (cb.predict(X_test) + gb.predict(X_test)) / 2.0

            oof[val_idx, t] = pred_val
            test_preds[:, t] += pred_test / kf.n_splits

    return oof, test_preds

# Prepare numpy arrays
X = train_feats.drop(columns=["image_path"]).values.astype(np.float32)
y = train_targets[LABELS].values.astype(np.float32)
X_test = test_feats.drop(columns=["image_path"]).values.astype(np.float32)

# Map image_path to Sampling_Date for GroupKFold
sampling_dates = train.select(["image_path", "Sampling_Date"]).unique().to_pandas()
sampling_map = dict(zip(sampling_dates["image_path"], sampling_dates["Sampling_Date"]))
groups = train_feats["image_path"].map(sampling_map).values

# Train ensemble and get predictions
oof, test_preds = ensemble_train(X, y, groups, X_test, LABELS)

In [None]:
# =================================
# Evaluation & Plotting
# =================================
def evaluate_and_plot_preds(y_true, y_pred, target_names):
    # Weight mapping
    weights = {
        "Dry_Green_g": 0.1,
        "Dry_Dead_g": 0.1,
        "Dry_Clover_g": 0.1,
        "GDM_g": 0.2,
        "Dry_Total_g": 0.5,
    }

    metrics_table = []
    weighted_rmse = 0.0

    for i, name in enumerate(target_names):
        true_vals = y_true[:, i]
        pred_vals = y_pred[:, i]

        mae = mean_absolute_error(true_vals, pred_vals)
        rmse = np.sqrt(mean_squared_error(true_vals, pred_vals))
        r2 = r2_score(true_vals, pred_vals)

        metrics_table.append([name, mae, rmse, r2])
        weighted_rmse += weights[name] * rmse

    metrics_df = pd.DataFrame(metrics_table, columns=["Target", "MAE", "RMSE", "R2"])
    print(metrics_df)
    print("\nUnweighted Average RMSE:", metrics_df["RMSE"].mean())
    print("Weighted RMSE:", weighted_rmse)

    # Plot scatter comparisons
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    axes = axes.flatten()

    for i, name in enumerate(target_names):
        ax = axes[i]
        ax.scatter(y_true[:, i], y_pred[:, i], s=12, alpha=0.5)
        max_val = max(y_true[:, i].max(), y_pred[:, i].max())
        ax.plot([0, max_val], [0, max_val], "r--", linewidth=2)
        ax.set_title(f"{name} (RMSE={metrics_df.iloc[i]['RMSE']:.2f})")
        ax.set_xlabel("Actual (g)")
        ax.set_ylabel("Predicted (g)")
        ax.grid(True, alpha=0.3)

    # Remove last empty plot
    fig.delaxes(axes[5])
    plt.tight_layout()
    plt.show()

    return metrics_df, weighted_rmse

metrics_df, weighted_rmse = evaluate_and_plot_preds(
    y, oof, target_names=["Dry_Green_g", "Dry_Dead_g", "Dry_Clover_g", "GDM_g", "Dry_Total_g"]
)

print("\nOptimized metrics table:")
print(metrics_df)
print("Optimized weighted RMSE:", weighted_rmse)

In [None]:
evaluate_and_plot_preds(y, oof, target_names=['Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g', 'Dry_Total_g'])

In [None]:
# =============================
# Submission Function
# =============================
def generate_submission(test_preds, test_df_long, test_pivot, output_csv="submission.csv"):
    # Convert pivot to pandas
    pivot_pd = test_pivot.to_pandas()

    # Create dataframe for predictions
    preds_df = pd.DataFrame(test_preds, columns=LABELS)
    preds_df["image_path"] = pivot_pd["image_path"]

    # Convert to long format
    preds_long = preds_df.melt(
        id_vars=["image_path"],
        value_vars=LABELS,
        var_name="target_name",
        value_name="target",
    )

    # Extract filenames only
    preds_long["image_name"] = preds_long["image_path"].apply(os.path.basename)
    test_df_long["image_name"] = test_df_long["image_path"].apply(os.path.basename)

    # Merge on image_name + target_name
    merged = test_df_long.merge(
        preds_long[["image_name", "target_name", "target"]],
        on=["image_name", "target_name"],
        how="left",
    )

    submission = merged[["sample_id", "target"]]
    submission.to_csv(output_csv, index=False)
    print(f"Submission saved to {output_csv}")
    return submission.head()

In [None]:
test_df_long = test.to_pandas()
submission_head = generate_submission(test_preds, test_df_long, test_pivot)
print("\nSubmission head:")
print(submission_head)