In [None]:
# Load cubes
import os
import xarray as xr
import numpy as np
import pandas as pd
import random
import gc
import matplotlib.pyplot as plt

path_dir = "processed_data_final/"
# os.listdir returns entries in arbitrary order and also lists hidden files
# (e.g. ".DS_Store"), which are not zarr stores. Sort the names so every
# later step that relies on the cube order is deterministic, and skip
# hidden entries so open_zarr is only called on real stores.
data_dir = sorted(entry for entry in os.listdir(path_dir) if not entry.startswith("."))
print(data_dir)

# Maps the store name (e.g. "ds_1") to its lazily opened dataset
cubes = {}

for key in data_dir:
    load_path = os.path.join(path_dir, key)
    cubes[key] = xr.open_zarr(load_path)

## Neue Strategie: Selektiere 2000 Pixel pro LC und trainiere auf den gesamten Zeitreihen

In [None]:
# Settings
# Land-cover codes that pixels are sampled from (the sampling functions
# compare these values against the ESA_LC layer of each cube).
VEGETATION_CLASSES = [10, 20, 30, 40, 60, 90, 95, 100]
# NOTE(review): not referenced anywhere in the visible code — the sampling
# functions use their own default of 2000 pixels per class.
N_PIXEL_PER_LC = 2000
# NOTE(review): not referenced anywhere in the visible code.
TARGET_VAR = "NDVI_strict"

def get_feature_list(ds):
    """Return the names of the data variables of ``ds`` that serve as features.

    Every name in ``ds.data_vars`` is kept, in its original order, unless it
    is one of the excluded variables listed below (static layers, NDVI
    targets, masks and the normalised SAR bands).
    """
    excluded = (
        "COP_DEM", "ESA_LC", "NDVI_strict", "NDVI_basic", "is_veg",
        "quality_mask_basic", "quality_mask_strict", "vh_norm", "vv_norm",
    )
    selected = []
    for var_name in ds.data_vars:
        if var_name in excluded:
            continue
        selected.append(var_name)
    return selected

# Initialise the feature list from the data variables of cube "ds_1".
# NOTE(review): assumes every cube exposes the same data variables as
# "ds_1" — confirm.
features = get_feature_list(cubes["ds_1"])
# Last expression of the cell: show the resulting list
features

In [None]:
def create_training_df(cubes, train_keys, features, n_pixel_per_lc=2000, n_lags=5, seed=None):
    """Build a per-pixel time-series table for training or validation.

    For every cube in ``train_keys`` and every land-cover code in
    ``VEGETATION_CLASSES``, up to ``n_pixel_per_lc`` pixels are drawn at
    random without replacement. Each row pairs the feature values of one
    pixel at step ``t`` with the NDVI of the same pixel at step ``t + 1``
    (``target_basic`` / ``target_strict``).

    Args:
        cubes: Mapping from cube key to an opened dataset.
        train_keys: Keys of the cubes to sample from.
        features: Names of the dataset variables to extract as features.
        n_pixel_per_lc: Maximum number of pixels drawn per land-cover class
            and cube.
        n_lags: Number of lag columns (``<name>_minus_1`` ...
            ``<name>_minus_<n_lags>``) created for every feature and for
            ``NDVI_basic_t`` / ``NDVI_strict_t``. Defaults to 5, the same
            range the test-set builder produces, so feature lists that ask
            for lags up to 4 or 5 can be selected from this frame. Lower it
            if memory is tight.
        seed: Optional seed for the pixel sampling. ``None`` keeps drawing
            from NumPy's global random state.

    Returns:
        A DataFrame with one row per (pixel, timestep) whose ``target_basic``
        is not NaN. ``lc_class`` is categorical with the fixed category list
        ``VEGETATION_CLASSES``.

    Raises:
        ValueError: If no rows could be extracted from any cube.
    """
    # Fall back to the global NumPy RNG when no seed is given
    rng = np.random if seed is None else np.random.RandomState(seed)

    all_cube_samples = []
    global_pixel_counter = 0
    lag_sources = list(features) + ["NDVI_basic_t", "NDVI_strict_t"]

    for key in train_keys:
        print(f"Verarbeite Cube {key}...")
        ds = cubes[key]

        n_t = ds.sizes["time_sentinel_2_l2a"]
        if n_t < 2:
            # No (t, t + 1) pair can be formed from fewer than two timesteps
            continue

        # 1. Prepare the DEM: collapse its own time dimension if it has one
        dem_data = ds["COP_DEM"]
        if "time_cop_dem_glo_30_dged_cog" in dem_data.dims:
            dem_data = dem_data.mean(dim="time_cop_dem_glo_30_dged_cog", skipna=True)

        # Read the land-cover layer once per cube instead of once per class.
        # NOTE(review): assumes the layer is 2-D with dims ordered (y, x), as
        # the column use of the argwhere result below implies — confirm.
        lc_map = ds.ESA_LC.isel(time_esa_worldcover=0).values

        # Offsets 0 .. n_t - 2 inside one pixel's block of the stacked axis
        step_offsets = np.arange(n_t - 1)

        for lc_val in VEGETATION_CLASSES:
            # 2. Locate all pixels of this class and draw a random subset
            coords = np.argwhere(lc_map == lc_val)
            if len(coords) == 0:
                continue

            n_to_draw = min(len(coords), n_pixel_per_lc)
            idx = rng.choice(len(coords), n_to_draw, replace=False)
            selected_coords = coords[idx]

            # Vectorised (pointwise) indexing along a new "pixel_id" dimension
            y_idx = xr.DataArray(selected_coords[:, 0], dims="pixel_id")
            x_idx = xr.DataArray(selected_coords[:, 1], dims="pixel_id")

            pixel_bundle = ds.isel(y=y_idx, x=x_idx)
            stacked = pixel_bundle.stack(sample=("pixel_id", "time_sentinel_2_l2a"))

            # Positions of step t and step t + 1 on the stacked axis. Pixel p
            # occupies [p * n_t, (p + 1) * n_t), so the last step of a pixel
            # is never used as t and the first is never used as target.
            block_starts = np.arange(n_to_draw)[:, None] * n_t
            idx_t = (block_starts + step_offsets).ravel()
            idx_target = idx_t + 1

            # 3. Fill the column dictionary (downcast to float32)
            data_dict = {
                f: stacked[f].isel(sample=idx_t).values.astype(np.float32) for f in features
            }

            pixel_ids = np.arange(global_pixel_counter, global_pixel_counter + n_to_draw)

            data_dict.update({
                "NDVI_basic_t": stacked["NDVI_basic"].isel(sample=idx_t).values.astype(np.float32),
                "target_basic": stacked["NDVI_basic"].isel(sample=idx_target).values.astype(np.float32),
                "NDVI_strict_t": stacked["NDVI_strict"].isel(sample=idx_t).values.astype(np.float32),
                "target_strict": stacked["NDVI_strict"].isel(sample=idx_target).values.astype(np.float32),
                "pixel_group": np.repeat(pixel_ids, n_t - 1),
                "timestep": np.tile(np.arange(n_t - 1, dtype=np.uint16), n_to_draw),
                "lc_class": lc_val,
                "cube_origin": key,
                "COP_DEM": np.repeat(dem_data.isel(y=y_idx, x=x_idx).values.astype(np.float32), n_t - 1)
            })

            global_pixel_counter += n_to_draw
            df_t = pd.DataFrame(data_dict)

            # 4. Lag columns, shifted within each pixel so values never leak
            # from one pixel into the next. Collected first and attached in a
            # single concat to avoid fragmenting the frame.
            grouped = df_t.groupby("pixel_group", sort=False)
            lag_columns = {
                f"{col}_minus_{lag}": grouped[col].shift(lag)
                for col in lag_sources
                for lag in range(1, n_lags + 1)
            }
            if lag_columns:
                df_t = pd.concat([df_t, pd.DataFrame(lag_columns, index=df_t.index)], axis=1)

            # 5. Cleaning: rows are dropped only when target_basic is NaN.
            # NOTE(review): the original comment states that the strict NDVI
            # is always NaN wherever the basic NDVI is NaN — confirm.
            df_t = df_t.dropna(subset=["target_basic"])
            if not df_t.empty:
                all_cube_samples.append(df_t)

            del stacked, pixel_bundle
            gc.collect()

    if not all_cube_samples:
        raise ValueError(
            f"No samples extracted from cubes {list(train_keys)}: "
            "no pixel of the requested land-cover classes has a valid target_basic."
        )

    # Final merge of all per-class frames
    df_final = pd.concat(all_cube_samples, ignore_index=True)

    # Fixed category list: the integer code of a class is then identical in
    # every frame built by this notebook, even if a cube lacks some classes.
    df_final["lc_class"] = pd.Categorical(df_final["lc_class"], categories=VEGETATION_CLASSES)

    return df_final

In [None]:
# Hold-out cubes that are used neither for training nor for validation
test_keys = ["ds_5", "ds_10"]
# Sort the remaining keys: the validation cube is picked by position below,
# so the split must not depend on the order in which the cubes were loaded.
train_keys = sorted(k for k in cubes.keys() if k not in test_keys)

# Strategy: the last remaining cube is used for validation only,
# all the others for training.
train_keys_subset = train_keys[:-1]
val_key_subset = [train_keys[-1]]

# Build two separate DataFrames
df_train_final = create_training_df(cubes, train_keys_subset, features)
df_val_final = create_training_df(cubes, val_key_subset, features)
print(f"Fertig! {len(df_train_final)} Zeilen extrahiert.")

## Save data

In [None]:
# Save as parquet. Create the output directory first so the cell also works
# when the directory does not exist yet.
os.makedirs('XGBoost/new_data', exist_ok=True)
df_train_final.to_parquet('XGBoost/new_data/df_train_final_strict_target.parquet', index=False)
df_val_final.to_parquet('XGBoost/new_data/df_val_final_strict_target.parquet', index=False)

## Load data

In [None]:
# Load the previously saved training and validation frames from parquet
df_train_final = pd.read_parquet('XGBoost/new_data/df_train_final_strict_target.parquet')
df_val_final   = pd.read_parquet('XGBoost/new_data/df_val_final_strict_target.parquet')

## Create final sets (BASIC)

In [None]:
# 1. Start from the base features plus the current basic NDVI and the
#    static variables.
features_basic = [*features, "NDVI_basic_t", "COP_DEM", "lc_class"]

# 2. Append the lag columns (t-1, t-2) of every variable in `features`,
#    named the same way create_training_df names them.
features_basic.extend(f"{feat}_minus_{lag}" for feat in features for lag in range(1, 3))

# 3. Append the lag columns of the current basic NDVI (NDVI_basic_t).
features_basic.extend(f"NDVI_basic_t_minus_{lag}" for lag in range(1, 3))

In [None]:
# # Training Set Basic
# X_train_basic = df_train_final[features_basic]
# y_train_basic = df_train_final["target_basic"]

# # Validation Set Basic
# X_val_basic = df_val_final[features_basic]
# y_val_basic = df_val_final["target_basic"]

In [None]:
# Show every column when a DataFrame is rendered. This is a global pandas
# option, so it also applies to frames displayed by later cells.
pd.set_option('display.max_columns', None)
# Last expression of the cell: render the training frame for inspection
df_train_final

## Create final sets (STRICT)

In [None]:
# 1. Start from the base features plus the current strict NDVI and the
#    static variables.
features_strict = [*features, "NDVI_strict_t", "COP_DEM", "lc_class"]

# 2. Append the lag columns (t-1, t-2) of every variable in `features`,
#    named the same way create_training_df names them.
features_strict.extend(f"{feat}_minus_{lag}" for feat in features for lag in range(1, 3))

# 3. Append the lag columns of the current strict NDVI (NDVI_strict_t).
features_strict.extend(f"NDVI_strict_t_minus_{lag}" for lag in range(1, 3))

In [None]:
# Training set for the strict target: keep only rows whose target_strict is
# not NaN. The remaining lines of this cell are disabled.
df_train_strict = df_train_final.dropna(subset=["target_strict"])
# X_train_strict = df_train_strict[features_strict]
# y_train_strict = df_train_strict["target_strict"]

# # Validation Set Strict
# df_val_strict = df_val_final.dropna(subset=["target_strict"])
# X_val_strict = df_val_strict[features_strict]
# y_val_strict = df_val_strict["target_strict"]

# Train models

In [None]:
import joblib
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split


def train_and_save_model(df_train, df_val, features, target_col, model_name, save_path="XGBoost/sar_experiments_strict/"):
    """Train an XGBoost regressor on one target column and save it to disk.

    Args:
        df_train: Frame holding the training rows.
        df_val: Frame holding the validation rows; used as ``eval_set`` for
            early stopping.
        features: Names of the columns used as model inputs.
        target_col: Name of the target column (e.g. the basic or strict
            target). Rows where it is NaN are dropped from both frames.
        model_name: Name stored in the package and used as the file stem.
        save_path: Directory the ``<model_name>.pkl`` file is written to;
            created if it does not exist.

    Returns:
        The fitted ``XGBRegressor``.

    Raises:
        KeyError: If a feature or the target column is absent from
            ``df_train`` or ``df_val``.
        ValueError: If no training or validation rows remain after dropping
            rows with a NaN target.
    """
    print(f"\n--- Starte Training f√ºr: {model_name} (Target: {target_col}) ---")

    # Fail early and name the missing columns (e.g. a lag that was never
    # built) instead of surfacing a bare indexing error later on.
    required = list(features) + [target_col]
    for label, frame in (("df_train", df_train), ("df_val", df_val)):
        missing = [col for col in required if col not in frame.columns]
        if missing:
            raise KeyError(f"{label} is missing required columns: {missing}")

    # 1. Keep only the rows where this specific target exists
    train_clean = df_train.dropna(subset=[target_col])
    val_clean = df_val.dropna(subset=[target_col])
    if train_clean.empty or val_clean.empty:
        raise ValueError(
            f"No rows with a valid '{target_col}' left "
            f"(train: {len(train_clean)}, val: {len(val_clean)})."
        )

    X_train = train_clean[features]
    y_train = train_clean[target_col]
    X_val = val_clean[features]
    y_val = val_clean[target_col]

    # 2. Model setup
    model = XGBRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method="hist",
        n_jobs=-1,
        early_stopping_rounds=50,
        enable_categorical=True
    )

    # 3. Fit; the validation frame is the evaluation set for early stopping
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=100
    )

    # 4. Save the model together with the metadata needed to reuse it
    model_package = {
        "model": model,
        "features": features,
        "model_name": model_name,
        "target_variable": target_col
    }

    os.makedirs(save_path, exist_ok=True)
    # os.path.join also works when save_path has no trailing separator
    filename = os.path.join(save_path, f"{model_name}.pkl")
    joblib.dump(model_package, filename)
    print(f"Modell gespeichert unter: {filename}")

    # Release the training matrices
    del X_train, X_val, y_train, y_val
    gc.collect()

    return model

## New tests

In [None]:
# Climate variables that are part of every experiment.
# NOTE(review): a later cell of this notebook uses the names 't2mmax_mean',
# 't2mmin_mean', 'tp_dailymax_mean' and 'tp_rollingmax_mean' instead of the
# '_max' / '_min' variants listed here — confirm which names the frames hold.
climate_vars = [
    't2m_mean',
    't2mmax_max',
    't2mmin_min',
    'tp_dailymean_mean',
    'tp_dailymax_max',
    'tp_rollingmax_max',
    'pei_30_mean',
    'pei_90_mean',
]

# Climate variables followed by their lag columns t-1 .. t-4
climate_core = list(climate_vars)
climate_core.extend(f"{var}_minus_{lag}" for var in climate_vars for lag in range(1, 5))

# Static variables plus the strict NDVI and its lag columns t-1 .. t-4
basis = ['COP_DEM', 'lc_class', "NDVI_strict_t"]
basis.extend(f"NDVI_strict_t_minus_{lag}" for lag in range(1, 5))

# The "always included" package
full_basis = [*basis, *climate_core]

In [None]:
def _sar_with_lags(*bands):
    """Return full_basis plus the given SAR columns and their lags t-1 .. t-4."""
    lagged = [f"{band}_minus_{lag}" for band in bands for lag in range(1, 5)]
    return full_basis + list(bands) + lagged


# (experiment name, feature list) pairs, trained in this order
sar_experiments = [
    ("Full_Basis", full_basis),
    ("Basis_only", basis),
    ("Climate_only", climate_core),
    # Single raw SAR signals
    ("VV_only", _sar_with_lags("vv")),
    ("VH_only", _sar_with_lags("vh")),
    # Both raw bands together
    ("Dual_Raw", _sar_with_lags("vv", "vh")),
    # Single SAR indices
    ("Index_VHVVR", _sar_with_lags("VHVVR")),
    ("Index_VVVHS", _sar_with_lags("VVVHS")),
    ("Index_DpRVIVV", _sar_with_lags("DpRVIVV")),
]

for name, feat_list in sar_experiments:
    print(f"\nüöÄ Starte Training f√ºr Experiment: {name}")

    # One model per feature set, all trained on the strict target.
    # The returned model is not kept; it is saved to disk by the function.
    train_and_save_model(
        df_train_final,
        df_val_final,
        feat_list,
        "target_strict",
        f"Model_Strict_{name}",
    )

## Define feature combinations for training

In [None]:
# --- BASIC BUILDING BLOCKS ---
# Structure and category columns (always included)
base_structural = ['vh', 'vv', 'COP_DEM', 'lc_class']

# Spectral history: the current NDVI followed by its lags t-1 and t-2
ndvi_basic_core = ["NDVI_basic_t", *(f"NDVI_basic_t_minus_{lag}" for lag in range(1, 3))]
ndvi_strict_core = ["NDVI_strict_t", *(f"NDVI_strict_t_minus_{lag}" for lag in range(1, 3))]

# Climate blocks (ERA5): temperature
temp_features = ['t2m_mean', 't2mmax_mean', 't2mmin_mean']
temp_features_with_lags = list(temp_features)
temp_features_with_lags.extend(
    f"{var}_minus_{lag}" for var in temp_features for lag in range(1, 5)
)

# Climate blocks (ERA5): precipitation / water
precip_features = [
    'tp_dailymean_mean',
    'tp_rollingmax_mean',
    'tp_dailymax_mean',
    'pei_30_mean',
    'pei_90_mean',
]
precip_features_with_lags = list(precip_features)
precip_features_with_lags.extend(
    f"{var}_minus_{lag}" for var in precip_features for lag in range(1, 5)
)

In [None]:
# Building blocks
base_vars = ['COP_DEM', 'lc_class']  # + ndvi_basic_core (currently disabled)
sar_indices = ['DpRVIVV', 'VHVVR', 'VVVHS']

# Combination A: raw SAR bands only
features_raw = [*base_vars, 'vv', 'vh']

# Combination B: SAR indices only
features_indices = [*base_vars, *sar_indices]

# Combination C: both, to see whether they complement each other
features_combined = [*base_vars, 'vv', 'vh', *sar_indices]

In [None]:
# EXPERIMENT 1: baselines
# SAR baseline: the structural columns as they are (same list object as
# base_structural, not a copy).
features_baseline_sar = base_structural

# Spectral baseline: static variables plus the current basic NDVI
features_baseline_spectral = ['COP_DEM', 'lc_class', "NDVI_basic_t"]

# EXPERIMENT 2: baseline + temperature
features_temp = [*base_structural, *ndvi_basic_core, *temp_features]
features_temp_plus_lags = [*base_structural, *ndvi_basic_core, *temp_features_with_lags]

# EXPERIMENT 3: baseline + water / precipitation
features_water = [*base_structural, *ndvi_basic_core, *precip_features]
features_water_plus_lags = [*base_structural, *ndvi_basic_core, *precip_features_with_lags]

# EXPERIMENT 4: full environmental set (everything together)
features_full = [*base_structural, *ndvi_basic_core, *temp_features, *precip_features]
features_full_with_lags = [
    *base_structural,
    *ndvi_basic_core,
    *temp_features_with_lags,
    *precip_features_with_lags,
]

In [None]:
# (experiment name, feature list) pairs to run, in this order.
# The commented-out entries are currently disabled.
experiments = [
    # ("Baseline SAR", features_baseline_sar),
    # ("Baseline Spectral", features_baseline_spectral),
    # ("Temp_Only", features_temp),
    # ("Temp_Only_with_Lags", features_temp_plus_lags),
    ("Water_Only", features_water),
    ("Water_Only_with_Lags", features_water_plus_lags),
    ("Full_Climate", features_full),
    ("Full_Climate_with_Lags", features_full_with_lags),
    ("SAR_Indices", features_indices),
    ("Combined_SAR", features_combined),
]

for name, feat_list in experiments:
    print(f"\nüöÄ Starte Training f√ºr Experiment: {name}")

    # One model per feature set, all trained on the basic target.
    # The returned model is not kept; it is saved to disk by the function.
    train_and_save_model(
        df_train_final,
        df_val_final,
        feat_list,
        "target_basic",
        f"Model_Basic_{name}",
    )

## Create test set 

In [None]:
def create_sampled_master_test_df(cubes, test_keys, features, n_per_lc=2000, n_lags=5, seed=None):
    """Build one combined test table from several cubes.

    For every cube in ``test_keys`` and every land-cover code in
    ``VEGETATION_CLASSES``, up to ``n_per_lc`` pixels are drawn at random
    without replacement. Each row pairs the feature values of one pixel at
    step ``t`` with the NDVI of the same pixel at step ``t + 1``.
    ``pixel_group`` ids are unique across all cubes and classes. Unlike the
    training builder, rows with a NaN target are kept.

    Args:
        cubes: Mapping from cube key to an opened dataset.
        test_keys: Keys of the cubes to sample from.
        features: Names of the dataset variables to extract as features.
        n_per_lc: Maximum number of pixels drawn per land-cover class and
            cube.
        n_lags: Number of lag columns (``<name>_minus_1`` ...
            ``<name>_minus_<n_lags>``) created for every feature and for
            ``NDVI_basic_t`` / ``NDVI_strict_t``.
        seed: Optional seed for the pixel sampling. ``None`` keeps drawing
            from NumPy's global random state.

    Returns:
        A DataFrame with one row per (pixel, timestep). ``lc_class`` is
        categorical with the fixed category list ``VEGETATION_CLASSES``.

    Raises:
        ValueError: If no pixel of the requested classes exists in any cube.
    """
    print(f"Erstelle Master-Test-Set aus Cubes: {test_keys}...")

    # Fall back to the global NumPy RNG when no seed is given
    rng = np.random if seed is None else np.random.RandomState(seed)

    all_samples = []
    global_pixel_offset = 0

    for key in test_keys:
        print(f"  Verarbeite Cube {key}...")
        ds_test = cubes[key]
        n_t = ds_test.sizes["time_sentinel_2_l2a"]
        if n_t < 2:
            # No (t, t + 1) pair can be formed from fewer than two timesteps
            continue

        # Prepare the DEM: collapse its own time dimension if it has one
        dem_full = ds_test["COP_DEM"]
        if "time_cop_dem_glo_30_dged_cog" in dem_full.dims:
            dem_full = dem_full.mean(dim="time_cop_dem_glo_30_dged_cog", skipna=True)

        # Read the land-cover layer once per cube instead of once per class.
        # NOTE(review): assumes the layer is 2-D with dims ordered (y, x), as
        # the column use of the argwhere result below implies — confirm.
        lc_map = ds_test.ESA_LC.isel(time_esa_worldcover=0).values

        # Offsets 0 .. n_t - 2 inside one pixel's block of the stacked axis
        step_offsets = np.arange(n_t - 1)

        for lc_val in VEGETATION_CLASSES:
            # Locate all pixels of this class and draw a random subset
            coords = np.argwhere(lc_map == lc_val)
            if len(coords) == 0:
                continue

            n_draw = min(len(coords), n_per_lc)
            idx = rng.choice(len(coords), n_draw, replace=False)
            selected_coords = coords[idx]

            # Vectorised (pointwise) indexing along a new "pixel_id" dimension
            y_idx = xr.DataArray(selected_coords[:, 0], dims="pixel_id")
            x_idx = xr.DataArray(selected_coords[:, 1], dims="pixel_id")

            pixel_bundle = ds_test.isel(y=y_idx, x=x_idx)
            stacked = pixel_bundle.stack(sample=("pixel_id", "time_sentinel_2_l2a"))

            # Positions of step t and step t + 1 on the stacked axis. Pixel p
            # occupies [p * n_t, (p + 1) * n_t).
            block_starts = np.arange(n_draw)[:, None] * n_t
            idx_t = (block_starts + step_offsets).ravel()
            idx_target = idx_t + 1

            # Unique ids: continue counting from the previous cube / class
            unique_pixel_ids = np.arange(global_pixel_offset, global_pixel_offset + n_draw, dtype=np.uint32)

            temp_dict = {f: stacked[f].isel(sample=idx_t).values.astype(np.float32) for f in features}

            temp_dict.update({
                "NDVI_basic_t": stacked["NDVI_basic"].isel(sample=idx_t).values.astype(np.float32),
                "target_basic": stacked["NDVI_basic"].isel(sample=idx_target).values.astype(np.float32),
                "NDVI_strict_t": stacked["NDVI_strict"].isel(sample=idx_t).values.astype(np.float32),
                "target_strict": stacked["NDVI_strict"].isel(sample=idx_target).values.astype(np.float32),
                "pixel_group": np.repeat(unique_pixel_ids, n_t - 1),
                "timestep": np.tile(np.arange(n_t - 1, dtype=np.uint16), n_draw),
                "lc_class": lc_val,
                "cube_origin": key,  # keeps track of the source cube
                "COP_DEM": np.repeat(dem_full.isel(y=y_idx, x=x_idx).values.astype(np.float32), n_t - 1)
            })

            all_samples.append(pd.DataFrame(temp_dict))
            global_pixel_offset += n_draw  # advance the offset for the next pass

            del stacked, pixel_bundle
            gc.collect()

    if not all_samples:
        raise ValueError(
            f"No samples extracted from cubes {list(test_keys)}: "
            "none of the requested land-cover classes is present."
        )

    # Final merge of all cubes
    df_master = pd.concat(all_samples, ignore_index=True)

    # Fixed category list: the integer code of a class is then identical in
    # every frame built by this notebook, even if a cube lacks some classes.
    df_master["lc_class"] = pd.Categorical(df_master["lc_class"], categories=VEGETATION_CLASSES)

    # Lag columns, shifted within each pixel (pixel_group is unique across
    # cubes, so no values leak between pixels). Collected first and attached
    # in a single concat to avoid fragmenting the frame.
    print("Berechne Lags f√ºr Master-Set...")
    lag_sources = list(features) + ["NDVI_basic_t", "NDVI_strict_t"]
    grouped = df_master.groupby("pixel_group", sort=False)
    lag_columns = {
        f"{col}_minus_{lag}": grouped[col].shift(lag)
        for col in lag_sources
        for lag in range(1, n_lags + 1)
    }
    if lag_columns:
        df_master = pd.concat([df_master, pd.DataFrame(lag_columns, index=df_master.index)], axis=1)

    return df_master

# --- Execution ---
# Hold-out cubes that make up the combined test set
test_keys = ["ds_5", "ds_10"]
df_test_master = create_sampled_master_test_df(cubes=cubes, test_keys=test_keys, features=features)

# Save the combined test set of both cubes as one parquet file
df_test_master.to_parquet("XGBoost/new_data/master_test_combined.parquet")