## Use the trained model to create probabilities and to predict

These zones were not used in training. Python 3.11.9 was in use in here. 

- First code makes predictions
- Taking the results to GeoTifs
- Last code makes the probabilities
- Taking the results to GeoTifs

It's faster to create the codes with AI.

### Claude code, predictions

In [None]:
import numpy as np
import pandas as pd
import zarr
import joblib
import os
import gc
import matplotlib.pyplot as plt
import multiprocessing
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score, f1_score, 
    confusion_matrix, cohen_kappa_score
)
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def generate_spatial_indices(spatial_shape):
    """Generate row and column indices for a given spatial shape."""
    total_pixels = spatial_shape[0] * spatial_shape[1]
    row_indices = np.array([i // spatial_shape[1] for i in range(total_pixels)])
    col_indices = np.array([i % spatial_shape[1] for i in range(total_pixels)])
    return row_indices, col_indices

def load_zone_data(zarr_file, zone, selected_features):
    """Load data for a specific zone with selected features."""
    try:
        root = zarr.open(zarr_file, mode='r')
        zone_data = root[zone]
        
        # Check if this zone has the target variable
        if 'label_3m' not in zone_data.keys():
            logger.warning(f"Zone {zone} does not have label_3m, skipping")
            return None
        
        # Create a dictionary to store data
        zone_dict = {'zone_id': []}
        
        # Get spatial indices
        if 'row_idx' in zone_data and 'col_idx' in zone_data:
            row_idx = zone_data['row_idx'][:]
            col_idx = zone_data['col_idx'][:]
        else:
            # Generate spatial indices if they don't exist
            spatial_shape = (5000, 5000)  # Adjust based on your actual data
            row_idx, col_idx = generate_spatial_indices(spatial_shape)
            
        # Load the target variable
        labels = zone_data['label_3m'][:]
        
        # Add zone_id (extracted from zone_name)
        try:
            zone_id = int(zone.split('_')[1]) - 1  # Convert to 0-indexed
            zone_dict['zone_id'] = [zone_id] * len(labels)
        except (IndexError, ValueError):
            logger.warning(f"Could not extract zone ID from {zone}")
            zone_dict['zone_id'] = [0] * len(labels)  # Default to 0
        
        # Add spatial indices
        zone_dict['row_idx'] = row_idx
        zone_dict['col_idx'] = col_idx
        
        # Add labels
        zone_dict['label_3m'] = labels
        
        # Add selected features
        for feature in selected_features:
            if feature in zone_data.keys():
                zone_dict[feature] = zone_data[feature][:]
            else:
                logger.warning(f"Feature {feature} not found in {zone}")
                zone_dict[feature] = np.zeros_like(labels)  # Default to zeros
        
        # Convert to DataFrame
        df = pd.DataFrame(zone_dict)
        logger.info(f"Loaded data from {zone}, shape: {df.shape}")
        
        return df
        
    except Exception as e:
        logger.error(f"Error loading data from {zone}: {e}")
        return None

def load_ground_truth(zones_to_predict):
    """
    Load ground truth labels for specified zones using parallelization.
    """
    zarr_file = "zones_data_2.zarr"  # Use your main zarr file path
    
    # Use multiprocessing to load data in parallel
    total_cpus = min(15, multiprocessing.cpu_count())
    zone_data_list = joblib.Parallel(n_jobs=total_cpus)(
        joblib.delayed(load_zone_data)(zarr_file, zone, ['label_3m']) 
        for zone in zones_to_predict
    )
    
    # Remove None values and extract labels
    ground_truth_dfs = [df for df in zone_data_list if df is not None]
    
    if not ground_truth_dfs:
        logger.warning("No ground truth data loaded")
        return np.array([])
    
    # Combine all data and return only the labels
    combined_df = pd.concat(ground_truth_dfs, ignore_index=True)
    return combined_df['label_3m'].values

def predict_for_batch(batch_idx, zones, rf_models, zarr_file, selected_features, results_dir):
    """Generate predictions for a batch of zones and save to zarr."""
    logger.info(f"Generating predictions for batch {batch_idx+1}")
    
    # Load data for this batch
    zone_data_list = joblib.Parallel(n_jobs=min(15, multiprocessing.cpu_count()))(
        joblib.delayed(load_zone_data)(zarr_file, zone, selected_features) 
        for zone in zones
    )
    
    # Remove None values
    zone_data_list = [df for df in zone_data_list if df is not None]
    
    if not zone_data_list:
        logger.warning(f"No valid data loaded for batch {batch_idx+1}")
        return False
    
    # Combine all zone data
    combined_df = pd.concat(zone_data_list, ignore_index=True)
    
    # Prepare feature data
    X = combined_df[selected_features].values
    
    # Make predictions using each model and average them
    predictions = []
    for model in rf_models:
        predictions.append(model.predict(X))
    
    # Average predictions from all models
    final_predictions = np.round(np.mean(predictions, axis=0)).astype(int)
    
    # Save predictions to zarr file
    pred_file = f"predicted_zones_batch_{batch_idx+1}.zarr"
    zarr_group = zarr.open(pred_file, mode="w")
    zarr_group.create_dataset("predictions", data=final_predictions)
    
    # Add metadata
    zarr_group.attrs["zones"] = zones
    zarr_group.attrs["prediction_date"] = str(pd.Timestamp.now())
    
    logger.info(f"Saved predictions for batch {batch_idx+1} to {pred_file}")
    return True

def process_batch(batch_idx, zones_to_predict, results_dir):
    """Process a single batch with parallelization."""
    logger.info(f"Processing batch {batch_idx+1}")
    
    # Load predicted data 
    pred_file = f"predicted_zones_batch_{batch_idx+1}.zarr"
    if not os.path.exists(pred_file):
        logger.warning(f"Prediction file {pred_file} not found. Skipping batch {batch_idx+1}.")
        return None
    
    zarr_group = zarr.open(pred_file, mode="r")
    pred_combined = zarr_group["predictions"][:]
    
    # Load ground truth for metrics
    y_test_batch = load_ground_truth(zones_to_predict)
    
    if len(y_test_batch) == 0:
        logger.warning(f"No ground truth data loaded for batch {batch_idx+1}. Skipping...")
        return None
    
    # Check that shapes match
    if y_test_batch.shape != pred_combined.flatten().shape:
        logger.warning(f"Shape mismatch between ground truth ({y_test_batch.shape}) and predictions ({pred_combined.flatten().shape}). Skipping batch {batch_idx+1}.")
        return None
    
    # Compute binary classification metrics (Water vs Background)
    binary_y_test = (y_test_batch > 0).astype(int)
    binary_pred = (pred_combined.flatten() > 0).astype(int)
    
    binary_metrics = {
        "kappa": cohen_kappa_score(binary_y_test, binary_pred),
        "accuracy": accuracy_score(binary_y_test, binary_pred),
        "recall": recall_score(binary_y_test, binary_pred),
        "precision": precision_score(binary_y_test, binary_pred),
        "f1_score": f1_score(binary_y_test, binary_pred),
    }
    
    # Multi-class classification metrics
    multi_metrics = {
        "accuracy": accuracy_score(y_test_batch, pred_combined.flatten()),
        "weighted_f1": f1_score(y_test_batch, pred_combined.flatten(), average="weighted"),
        "macro_f1": f1_score(y_test_batch, pred_combined.flatten(), average="macro"),
        "confusion_matrix": confusion_matrix(y_test_batch, pred_combined.flatten()),
    }
    
    multi_class_kappa = cohen_kappa_score(y_test_batch, pred_combined.flatten())
    
    # Save metrics for this batch
    metrics_file = os.path.join(results_dir, f"metrics_batch_{batch_idx+1}.joblib")
    joblib.dump({"binary": binary_metrics, "multi": multi_metrics}, metrics_file)
    
    logger.info(f"Metrics for batch {batch_idx+1} saved to {metrics_file}")
    
    # Generate and save plots for batch
    logger.info(f"Generating plots for batch {batch_idx+1}...")

    def plot_confusion_matrix(cm, class_names, filename):
        fig, ax = plt.subplots()
        ax.matshow(cm, cmap=plt.cm.Blues, alpha=0.7)

        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(x=j, y=i, s=cm[i, j], va="center", ha="center", fontsize=12)

        ax.set_xlabel("Predicted Labels")
        ax.set_ylabel("True Labels")
        ax.set_title("Confusion Matrix")
        ax.set_xticks(range(len(class_names)))
        ax.set_xticklabels(class_names)
        ax.set_yticks(range(len(class_names)))
        ax.set_yticklabels(class_names)

        plt.savefig(os.path.join(results_dir, filename))
        plt.close()

    # Ensure confusion matrix exists
    if "confusion_matrix" in multi_metrics:
        plot_confusion_matrix(
            multi_metrics["confusion_matrix"], 
            ["Background", "Stream", "Ditch"], 
            f"confusion_matrix_batch_{batch_idx+1}.png"
        )
    else:
        logger.warning("Confusion matrix missing from multi_metrics!")

    # Reshape and visualize the streams (1) and ditches (2)
    if pred_combined.shape[0] == 5000 * 5000:
        pred_combined = pred_combined.reshape(5000, 5000)
    else:
        raise ValueError(f"Unexpected data shape: {pred_combined.shape}")

    plt.figure(figsize=(10, 6))
    plt.imshow(pred_combined, cmap='tab20c', interpolation='nearest', alpha=0.7)
    plt.title(f"Predicted Streams (1) and Ditches (2) for Batch {batch_idx+1}")
    plt.colorbar(label='Predicted Class')
    plt.xlabel("Pixel X")
    plt.ylabel("Pixel Y")
    plt.savefig(os.path.join(results_dir, f"streams_and_ditches_batch_{batch_idx+1}.png"))
    plt.close()


    gc.collect()
    logger.info(f"Finished processing batch {batch_idx+1}.")

    return {
        "binary_metrics": binary_metrics,
        "multi_metrics": multi_metrics
    }

def mean_confidence_interval(data, confidence=0.95):
    """Calculate mean and 95% confidence interval for given data."""
    import scipy.stats as st
    
    mean = np.mean(data)
    # Calculate confidence interval
    se = st.sem(data)
    h = se * st.t.ppf((1 + confidence) / 2, len(data) - 1)
    return mean, mean - h, mean + h

def main():
    # Define paths and settings
    results_dir = "../02_Results"
    zarr_file = "zones_data_2.zarr"
    models_dir = os.path.join(results_dir, "models")
    os.makedirs(results_dir, exist_ok=True)
    
    # Define zone batches (using your existing definition)
    zone_batches = [
        ["zone_1", "zone_3", "zone_4", "zone_6"], 
        ["zone_7", "zone_8", "zone_10"]
    ]
    
    # Selected features for prediction
    # Update your selected_features to include all 11 features
    selected_features = [
        'col_idx', 'row_idx', 'impoundment_amplified', 'zone_id', 'skyview_gabor', 
        'impoundment_raw', 'conic_mean', 'hpmf_raw', 'skyview_raw', 'hpmf_f', 'slope_channels'
    ]
        
    # First load the models
    logger.info("Loading trained models...")
    model_paths = [os.path.join(models_dir, f) for f in os.listdir(models_dir) if f.endswith(".joblib")]
    rf_models = [joblib.load(model_path) for model_path in model_paths]
    logger.info(f"Loaded {len(rf_models)} models")
    
    # Then generate predictions
    logger.info("Generating predictions for all batches...")
    prediction_results = []
    for i, zones in enumerate(zone_batches):
        prediction_success = predict_for_batch(i, zones, rf_models, zarr_file, selected_features, results_dir)
        prediction_results.append(prediction_success)

    if not any(prediction_results):
        logger.error("Failed to generate predictions for any batch. Exiting.")
        return
    
    # Process batches in parallel
    total_cpus = min(15, multiprocessing.cpu_count())
    logger.info(f"Using {total_cpus} CPU cores for processing")
    
    # Process each batch in parallel using a pool
    results = joblib.Parallel(n_jobs=min(len(zone_batches), total_cpus))(
        joblib.delayed(process_batch)(i, zones, results_dir) 
        for i, zones in enumerate(zone_batches)
    )
    
    # Filter out None results
    results = [r for r in results if r is not None]
    
    if not results:
        logger.warning("No valid results to analyze.")
        return
    
    # Extract metrics for confidence intervals
    original_stats = [r["binary_metrics"]["f1_score"] for r in results]
    enhanced_stats = [r["multi_metrics"]["weighted_f1"] for r in results]
    
    # Calculate confidence intervals and plot comparison
    if original_stats and enhanced_stats:
        # Calculate confidence intervals
        orig_mean, orig_low, orig_high = mean_confidence_interval(original_stats)
        enh_mean, enh_low, enh_high = mean_confidence_interval(enhanced_stats)
        
        print(f"Original method performance: {orig_mean:.4f} (95% CI: {orig_low:.4f}-{orig_high:.4f})")
        print(f"Enhanced method performance: {enh_mean:.4f} (95% CI: {enh_low:.4f}-{enh_high:.4f})")
        
        # Plot comparison of metrics
        plt.figure(figsize=(10, 6))
        plt.bar(['Original', 'Enhanced'], [orig_mean, enh_mean], yerr=[[orig_mean-orig_low, enh_mean-enh_low], 
                                                                 [orig_high-orig_mean, enh_high-enh_mean]])
        plt.title('Performance Comparison with 95% Confidence Intervals')
        plt.ylabel('Performance Metric')
        plt.savefig(os.path.join(results_dir, 'performance_comparison.png'), dpi=300)
        plt.show()
    
        # Save overall metrics and confidence intervals
        overall_results = {
            "original_metrics": {
                "mean": orig_mean,
                "ci_low": orig_low,
                "ci_high": orig_high
            },
            "enhanced_metrics": {
                "mean": enh_mean,
                "ci_low": enh_low,
                "ci_high": enh_high
            }
        }
    
        joblib.dump(overall_results, os.path.join(results_dir, "overall_metrics.joblib"))
        logger.info("Analysis complete. Overall results saved.")

if __name__ == "__main__":
    main()

In [5]:
import zarr
results_batch1 = zarr.open('predicted_zones_batch_1.zarr', mode='r')
# Explore the data structure
print(results_batch1.tree())
# Access specific arrays
# example: data = results_batch1['your_array_name'][:]

/
 └── predictions (100000000,) int64


### Predictions to GeoTifs

In [6]:
import os
import logging
import numpy as np
import zarr
import rasterio
from rasterio.transform import from_bounds

# Configure logging
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Define zone coordinates
zone_boundaries = {
    "zone_1": {"upper_left": (377982, 6854660), "lower_right": (380482, 6852160)},
    "zone_2": {"upper_left": (377982, 6857160), "lower_right": (380482, 6854660)},
    "zone_3": {"upper_left": (380482, 6857160), "lower_right": (382982, 6854660)},
    "zone_4": {"upper_left": (375482, 6859660), "lower_right": (377982, 6857160)},
    "zone_5": {"upper_left": (377982, 6859660), "lower_right": (380482, 6857160)},
    "zone_6": {"upper_left": (380482, 6859660), "lower_right": (382982, 6857160)},
    "zone_7": {"upper_left": (375482, 6862159.999999999), "lower_right": (377982, 6859660)},
    "zone_8": {"upper_left": (377982, 6862159.999999999), "lower_right": (380482, 6859660)},
    "zone_9": {"upper_left": (380482, 6862159.999999999), "lower_right": (382982, 6859660)},
    "zone_10": {"upper_left": (372982, 6864660), "lower_right": (375482, 6862159.999999999)},
    "zone_11": {"upper_left": (375482, 6864660), "lower_right": (377982, 6862159.999999999)},
    "zone_12": {"upper_left": (377982, 6864660), "lower_right": (380482, 6862159.999999999)},
    "zone_13": {"upper_left": (370482, 6867160), "lower_right": (372982, 6864660)},
    "zone_14": {"upper_left": (372982, 6867160), "lower_right": (375482, 6864660)},
    "zone_15": {"upper_left": (375482, 6867160), "lower_right": (377982, 6864660)},
    "zone_16": {"upper_left": (377982, 6867160), "lower_right": (380482, 6864660)},
    "zone_17": {"upper_left": (370482, 6869660), "lower_right": (372982, 6867160)},
    "zone_18": {"upper_left": (372982, 6869660), "lower_right": (375482, 6867160)},
    "zone_19": {"upper_left": (375482, 6869660), "lower_right": (377982, 6867160)},
    "zone_20": {"upper_left": (372982, 6872160), "lower_right": (375482, 6869660)},
    "zone_21": {"upper_left": (375482, 6872160), "lower_right": (377982, 6869660)}
}

def export_zarr_to_geotiff_by_batch(zarr_file, output_path, batch_number):
    """
    Export predictions from a Zarr file to GeoTIFF format for zones in a specific batch.
    
    Parameters:
    -----------
    zarr_file : str
        Path to the zarr file containing the prediction results
    output_path : str
        Directory to save the GeoTIFF files
    batch_number : int
        The batch number (1 or 2) to process
    """
    # Define the zones in each batch
    zone_batches = [
        ["zone_1", "zone_3", "zone_4", "zone_6"], 
        ["zone_7", "zone_8", "zone_10"]
    ]
    
    # Select the zones for the requested batch
    if batch_number < 1 or batch_number > len(zone_batches):
        logger.error(f"Invalid batch number: {batch_number}. Should be between 1 and {len(zone_batches)}")
        return False
    
    target_zones = zone_batches[batch_number - 1]
    logger.info(f"Processing batch {batch_number} with zones: {target_zones}")
    
    # Ensure output directory exists
    os.makedirs(output_path, exist_ok=True)
    
    try:
        # Open Zarr file
        root = zarr.open(zarr_file, mode="r")
        
        # Check for predictions array
        if "predictions" not in root:
            logger.error("No 'predictions' array found in Zarr file")
            return False
            
        predictions = root["predictions"][:]
        logger.info(f"Loaded predictions array with shape {predictions.shape}")
        
        # Standard grid size for all zones (5000x5000)
        grid_width, grid_height = 5000, 5000
        
        # Calculate prediction indices for each zone
        # For simplicity, we'll assume predictions are stored in same order as zones in the batch
        total_pixels_per_zone = grid_width * grid_height
        
        for i, zone_name in enumerate(target_zones):
            try:
                # Get coordinates for this zone
                if zone_name not in zone_boundaries:
                    logger.error(f"No coordinates found for {zone_name}, skipping")
                    continue
                
                boundaries = zone_boundaries[zone_name]
                ul_x, ul_y = boundaries["upper_left"]
                lr_x, lr_y = boundaries["lower_right"]
                
                # Calculate the index range for this zone
                start_idx = i * total_pixels_per_zone
                end_idx = start_idx + total_pixels_per_zone
                
                # Extract predictions for this zone
                if end_idx <= len(predictions):
                    # Reshape to 2D grid
                    zone_predictions = predictions[start_idx:end_idx].reshape(grid_height, grid_width)
                    logger.info(f"Extracted predictions for {zone_name} with shape {zone_predictions.shape}")
                else:
                    logger.error(f"Not enough data for zone {zone_name}, skipping")
                    continue
                
                # Create transform
                transform = from_bounds(ul_x, lr_y, lr_x, ul_y, grid_width, grid_height)
                
                # Save classification map
                class_filename = f"{output_path}/{zone_name}_classification.tif"
                meta = {
                    'driver': 'GTiff',
                    'height': grid_height,
                    'width': grid_width,
                    'count': 1,
                    'dtype': str(zone_predictions.dtype),
                    'crs': 'EPSG:3067',  # Finnish ETRS-TM35FIN coordinate system
                    'transform': transform,
                    'nodata': 0
                }
                
                with rasterio.open(class_filename, 'w', **meta) as dst:
                    dst.write(zone_predictions, 1)
                
                logger.info(f"Saved classification map for {zone_name} to {class_filename}")
                
            except Exception as e:
                logger.error(f"Error processing zone {zone_name}: {e}")
                import traceback
                logger.error(traceback.format_exc())
        
        return True
    
    except Exception as e:
        logger.error(f"Error exporting from Zarr to GeoTIFF: {e}")
        import traceback
        logger.error(traceback.format_exc())
    
    return False

if __name__ == "__main__":
    # Path to your Zarr files
    zarr_file_batch1 = "predicted_zones_batch_1.zarr"
    zarr_file_batch2 = "predicted_zones_batch_2.zarr"
    
    # Path for output GeoTIFFs
    output_path = "../02_Results/geotiff_results"
    
    logger.info("Starting export from Zarr files")
    
    # Export zones from batch 1
    logger.info("Processing batch 1")
    success_batch1 = export_zarr_to_geotiff_by_batch(zarr_file_batch1, output_path, 1)
    
    # Export zones from batch 2
    logger.info("Processing batch 2")
    success_batch2 = export_zarr_to_geotiff_by_batch(zarr_file_batch2, output_path, 2)
    
    if success_batch1 and success_batch2:
        logger.info("All exports completed successfully")
    else:
        logger.warning("Some exports failed, check the logs for details")

2025-04-01 11:47:42,533 - __main__ - INFO - Starting export from Zarr files
2025-04-01 11:47:42,534 - __main__ - INFO - Processing batch 1
2025-04-01 11:47:42,534 - __main__ - INFO - Processing batch 1 with zones: ['zone_1', 'zone_3', 'zone_4', 'zone_6']
2025-04-01 11:47:43,250 - __main__ - INFO - Loaded predictions array with shape (100000000,)
2025-04-01 11:47:43,250 - __main__ - INFO - Extracted predictions for zone_1 with shape (5000, 5000)
2025-04-01 11:47:43,956 - __main__ - INFO - Saved classification map for zone_1 to ../02_Results/geotiff_results/zone_1_classification.tif
2025-04-01 11:47:43,957 - __main__ - INFO - Extracted predictions for zone_3 with shape (5000, 5000)
2025-04-01 11:47:44,379 - __main__ - INFO - Saved classification map for zone_3 to ../02_Results/geotiff_results/zone_3_classification.tif
2025-04-01 11:47:44,380 - __main__ - INFO - Extracted predictions for zone_4 with shape (5000, 5000)
2025-04-01 11:47:44,612 - __main__ - INFO - Saved classification map fo

## ChatGpt + Claude code, probabilities

In [7]:
import numpy as np
import pandas as pd
import zarr
import joblib
import os
import gc
import matplotlib.pyplot as plt
import multiprocessing
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score, f1_score, 
    confusion_matrix, cohen_kappa_score
)
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def generate_spatial_indices(spatial_shape):
    """Generate row and column indices for a given spatial shape."""
    total_pixels = spatial_shape[0] * spatial_shape[1]
    row_indices = np.array([i // spatial_shape[1] for i in range(total_pixels)])
    col_indices = np.array([i % spatial_shape[1] for i in range(total_pixels)])
    return row_indices, col_indices

def load_zone_data(zarr_file, zone, selected_features):
    """Load data for a specific zone with selected features."""
    try:
        root = zarr.open(zarr_file, mode='r')
        zone_data = root[zone]
        
        if 'label_3m' not in zone_data.keys():
            logger.warning(f"Zone {zone} does not have label_3m, skipping")
            return None
        
        zone_dict = {'zone_id': []}
        
        # Get spatial dimensions from the data if available
        if 'spatial_shape' in zone_data.attrs:
            spatial_shape = zone_data.attrs['spatial_shape']
            logger.info(f"Using spatial shape from attributes: {spatial_shape}")
        elif 'row_idx' in zone_data and 'col_idx' in zone_data:
            row_idx = zone_data['row_idx'][:]
            col_idx = zone_data['col_idx'][:]
            # Calculate approximate spatial shape based on max indices
            max_row = np.max(row_idx) + 1
            max_col = np.max(col_idx) + 1
            spatial_shape = (max_row, max_col)
            logger.info(f"Calculated spatial shape from indices: {spatial_shape}")
        else:
            spatial_shape = (5000, 5000)  # Default fallback
            logger.warning(f"No spatial information found for {zone}, using default shape: {spatial_shape}")
        
        # Store spatial shape for later use
        zone_dict['spatial_shape'] = spatial_shape
        
        if 'row_idx' in zone_data and 'col_idx' in zone_data:
            row_idx = zone_data['row_idx'][:]
            col_idx = zone_data['col_idx'][:]
        else:
            row_idx, col_idx = generate_spatial_indices(spatial_shape)
        
        labels = zone_data['label_3m'][:]
        
        try:
            zone_id = int(zone.split('_')[1]) - 1
            zone_dict['zone_id'] = [zone_id] * len(labels)
        except (IndexError, ValueError):
            logger.warning(f"Could not extract zone ID from {zone}")
            zone_dict['zone_id'] = [0] * len(labels)
        
        zone_dict['row_idx'] = row_idx
        zone_dict['col_idx'] = col_idx
        zone_dict['label_3m'] = labels
        
        for feature in selected_features:
            if feature in zone_data.keys():
                zone_dict[feature] = zone_data[feature][:]
            else:
                logger.warning(f"Feature {feature} not found in {zone}")
                zone_dict[feature] = np.zeros_like(labels)
        
        df = pd.DataFrame(zone_dict)
        df['spatial_shape'] = df['spatial_shape'].astype(str)  # Convert tuple to string for DataFrame
        logger.info(f"Loaded data from {zone}, shape: {df.shape}")
        
        return df
        
    except Exception as e:
        logger.error(f"Error loading data from {zone}: {e}")
        return None

def load_ground_truth(zones_to_predict):
    zarr_file = "zones_data_2.zarr"
    
    total_cpus = min(15, multiprocessing.cpu_count())
    zone_data_list = joblib.Parallel(n_jobs=total_cpus)(
        joblib.delayed(load_zone_data)(zarr_file, zone, ['label_3m']) 
        for zone in zones_to_predict
    )
    
    ground_truth_dfs = [df for df in zone_data_list if df is not None]
    
    if not ground_truth_dfs:
        logger.warning("No ground truth data loaded")
        return np.array([])

    combined_df = pd.concat(ground_truth_dfs, ignore_index=True)
    return combined_df['label_3m'].values

def predict_for_batch(batch_idx, zones, rf_models, zarr_file, selected_features, results_dir):
    logger.info(f"Generating probabilities for batch {batch_idx+1}")

    # Load data
    zone_data_list = joblib.Parallel(n_jobs=min(15, multiprocessing.cpu_count()))(
        joblib.delayed(load_zone_data)(zarr_file, zone, selected_features) 
        for zone in zones
    )
    
    zone_data_list = [df for df in zone_data_list if df is not None]

    if not zone_data_list:
        logger.warning(f"No valid data loaded for batch {batch_idx+1}")
        return False

    combined_df = pd.concat(zone_data_list, ignore_index=True)
    
    # Extract spatial shape from the first zone (assuming all zones have the same shape)
    # Convert string back to tuple if necessary
    first_spatial_shape = combined_df['spatial_shape'].iloc[0]
    if isinstance(first_spatial_shape, str):
        import ast
        spatial_shape = ast.literal_eval(first_spatial_shape)
    else:
        spatial_shape = first_spatial_shape
        
    logger.info(f"Using spatial shape for prediction: {spatial_shape}")
    
    # Remove spatial_shape from features for model input
    feature_cols = [f for f in selected_features if f != 'spatial_shape']
    X = combined_df[feature_cols].values

    logger.info(f"Input feature matrix shape: {X.shape}")

    # Generate predictions from each model
    all_probabilities = [model.predict_proba(X) for model in rf_models]

    fixed_probabilities = []
    for i, prob in enumerate(all_probabilities):
        if prob.ndim == 2:
            if prob.shape[1] == 3:
                fixed_probabilities.append(prob)  # Expected case (all 3 classes)
            elif prob.shape[1] == 2:
                logger.warning(f"Model {i+1} returned 2-class probabilities. Assuming classes 0 & 1.")
                # Assume missing class 2 (ditches) → Add a column of zeros
                prob_fixed = np.hstack([prob, np.zeros((prob.shape[0], 1))])
                fixed_probabilities.append(prob_fixed)
            elif prob.shape[1] == 1:
                logger.warning(f"Model {i+1} returned single-class probabilities. Assuming only class 2 (ditches).")
                # Assume this is predicting only ditches (class 2) → Add two zero columns for classes 0 & 1
                prob_fixed = np.hstack([np.zeros((prob.shape[0], 2)), prob])
                fixed_probabilities.append(prob_fixed)
            else:
                raise ValueError(f"Unexpected probability shape from model {i+1}: {prob.shape}")
        elif prob.ndim == 1:
            logger.warning(f"Model {i+1} returned a 1D probability array. Assuming class 2 only.")
            prob_fixed = np.hstack([np.zeros((prob.shape[0], 2)), prob.reshape(-1, 1)])
            fixed_probabilities.append(prob_fixed)
        else:
            raise ValueError(f"Model {i+1} returned an unexpected shape: {prob.shape}")

    # Ensure all probability arrays are the same length
    prob_lengths = [len(p) for p in fixed_probabilities]
    if len(set(prob_lengths)) > 1:
        raise ValueError(f"Inconsistent probability array lengths: {set(prob_lengths)}")

    all_probabilities = fixed_probabilities

    for i, prob in enumerate(all_probabilities):
        logger.info(f"Model {i+1} probability shape: {prob.shape}")

    # Averaging probabilities
    averaged_probabilities = np.mean(all_probabilities, axis=0)

    # Debugging: Print shape before slicing
    logger.info(f"Averaged probabilities shape: {averaged_probabilities.shape}")

    # Extract probabilities for streams (1) and ditches (2)
    y_prob_final = averaged_probabilities[:, 1:3]  # Take only class 1 and 2

    # Calculate the expected size based on actual data dimensions
    expected_size = spatial_shape[0] * spatial_shape[1] * 2  # height * width * 2 classes
    actual_size = y_prob_final.size
    logger.info(f"Expected size based on spatial shape: {expected_size}, Actual size: {actual_size}")

    # Check if we need to pad or truncate data to match expected size
    if actual_size != expected_size:
        logger.warning(f"Size mismatch. Attempting to adjust data to match spatial dimensions.")
        
        # If we have fewer samples than expected (missing data)
        if actual_size < expected_size:
            # Calculate how many samples we're missing
            pixels_expected = spatial_shape[0] * spatial_shape[1]
            pixels_actual = y_prob_final.shape[0]
            pixels_missing = pixels_expected - pixels_actual
            
            logger.warning(f"Missing {pixels_missing} pixels. Padding with zeros.")
            padding = np.zeros((pixels_missing, 2))
            y_prob_final = np.vstack([y_prob_final, padding])
        
        # If we have more samples than expected (extra data)
        elif actual_size > expected_size:
            logger.warning(f"Extra data detected. Truncating to fit spatial dimensions.")
            pixels_expected = spatial_shape[0] * spatial_shape[1]
            y_prob_final = y_prob_final[:pixels_expected, :]
    
    # Reshape to spatial dimensions
    try:
        final_probabilities = y_prob_final.reshape(spatial_shape[0], spatial_shape[1], 2).astype(np.float32)
    except ValueError as e:
        logger.error(f"Reshape failed: {e}")
        # Alternative: Save unreshaped data
        logger.info("Saving unreshaped probabilities instead")
        final_probabilities = y_prob_final.astype(np.float32)
        
    # Save probabilities
    prob_file = os.path.join(results_dir, f"probabilities_zones_batch_{batch_idx+1}.zarr")
    zarr_group = zarr.open(prob_file, mode="w")
    zarr_group.create_dataset("probabilities", data=final_probabilities, dtype=np.float32)
    zarr_group.attrs["zones"] = zones
    zarr_group.attrs["spatial_shape"] = spatial_shape
    zarr_group.attrs["prediction_date"] = str(pd.Timestamp.now())

    logger.info(f"Saved probabilities for batch {batch_idx+1} to {prob_file}")
    return True

def main():
    results_dir = "../02_Results"
    zarr_file = "zones_data_2.zarr"
    models_dir = os.path.join(results_dir, "models")
    os.makedirs(results_dir, exist_ok=True)
    
    zone_batches = [["zone_1", "zone_3", "zone_4", "zone_6"], ["zone_7", "zone_8", "zone_10"]]
    
    # Add spatial_shape to selected features to track the dimensions
    selected_features = [
        'col_idx', 'row_idx', 'impoundment_amplified', 'zone_id', 'skyview_gabor', 
        'impoundment_raw', 'conic_mean', 'hpmf_raw', 'skyview_raw', 'hpmf_f', 'slope_channels',
        'spatial_shape'
    ]
        
    logger.info("Loading trained models...")
    model_paths = [os.path.join(models_dir, f) for f in os.listdir(models_dir) if f.endswith(".joblib")]
    rf_models = [joblib.load(model_path) for model_path in model_paths]
    logger.info(f"Loaded {len(rf_models)} models")

    logger.info("Generating probabilities for all batches...")
    prediction_results = []
    for i, zones in enumerate(zone_batches):
        prediction_success = predict_for_batch(i, zones, rf_models, zarr_file, selected_features, results_dir)
        prediction_results.append(prediction_success)

    if not any(prediction_results):
        logger.error("Failed to generate probabilities for any batch. Exiting.")
        return
    
    logger.info("Processing complete. Results saved.")
    
if __name__ == "__main__":
    main()

2025-04-01 15:07:19,286 - __main__ - INFO - Loading trained models...
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
2025-04-01 15:07:20,566 - __main__ - INFO - Loaded 9 models
2025-04-01 15:07:20,567 - __main__ - INFO - Generating probabilities for all batches...
2025-04-01 15:07:20,567 - __main__ - INFO - Generating probabilities for batch 1
Feature zone_id not found in zone_1
Feature spatial_shape not found in zone_1
Feature zone_id not found in zone_6
Feature zone_id not found in zone_3
Feature zone_id not found in zone_4
Feature spatial_shape not found in zone_6
Feature spatial_shape not found in zone_3
Feature spatial_shape not found in zone_4
2025-04-01 15:09:20,043 - __main__ - INFO - Using spatial shape for prediction: 0
2025-04-01 15:09:32,584 - __main__ - INFO - Input feature matrix shape: (100000000, 11)


### Probabilities to GeoTifs

In [13]:
import os
import logging
import numpy as np
import zarr
import rasterio
from rasterio.transform import from_bounds

# Configure logging
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Define zone coordinates
zone_boundaries = {
    "zone_1": {"upper_left": (377982, 6854660), "lower_right": (380482, 6852160)},
    "zone_2": {"upper_left": (377982, 6857160), "lower_right": (380482, 6854660)},
    "zone_3": {"upper_left": (380482, 6857160), "lower_right": (382982, 6854660)},
    "zone_4": {"upper_left": (375482, 6859660), "lower_right": (377982, 6857160)},
    "zone_5": {"upper_left": (377982, 6859660), "lower_right": (380482, 6857160)},
    "zone_6": {"upper_left": (380482, 6859660), "lower_right": (382982, 6857160)},
    "zone_7": {"upper_left": (375482, 6862159.999999999), "lower_right": (377982, 6859660)},
    "zone_8": {"upper_left": (377982, 6862159.999999999), "lower_right": (380482, 6859660)},
    "zone_9": {"upper_left": (380482, 6862159.999999999), "lower_right": (382982, 6859660)},
    "zone_10": {"upper_left": (372982, 6864660), "lower_right": (375482, 6862159.999999999)},
    "zone_11": {"upper_left": (375482, 6864660), "lower_right": (377982, 6862159.999999999)},
    "zone_12": {"upper_left": (377982, 6864660), "lower_right": (380482, 6862159.999999999)},
    "zone_13": {"upper_left": (370482, 6867160), "lower_right": (372982, 6864660)},
    "zone_14": {"upper_left": (372982, 6867160), "lower_right": (375482, 6864660)},
    "zone_15": {"upper_left": (375482, 6867160), "lower_right": (377982, 6864660)},
    "zone_16": {"upper_left": (377982, 6867160), "lower_right": (380482, 6864660)},
    "zone_17": {"upper_left": (370482, 6869660), "lower_right": (372982, 6867160)},
    "zone_18": {"upper_left": (372982, 6869660), "lower_right": (375482, 6867160)},
    "zone_19": {"upper_left": (375482, 6869660), "lower_right": (377982, 6867160)},
    "zone_20": {"upper_left": (372982, 6872160), "lower_right": (375482, 6869660)},
    "zone_21": {"upper_left": (375482, 6872160), "lower_right": (377982, 6869660)}
}

def export_zarr_to_geotiff_by_batch(zarr_file, output_path, batch_number):
    """
    Export probabilities from a Zarr file to GeoTIFF format for zones in a specific batch.
    
    Parameters:
    -----------
    zarr_file : str
        Path to the zarr file containing the probabilities results
    output_path : str
        Directory to save the GeoTIFF files
    batch_number : int
        The batch number (1 or 2) to process
    """
    # Define the zones in each batch
    zone_batches = [
        ["zone_1", "zone_3", "zone_4", "zone_6"], 
        ["zone_7", "zone_8", "zone_10"]
    ]
    
    # Select the zones for the requested batch
    if batch_number < 1 or batch_number > len(zone_batches):
        logger.error(f"Invalid batch number: {batch_number}. Should be between 1 and {len(zone_batches)}")
        return False
    
    target_zones = zone_batches[batch_number - 1]
    logger.info(f"Processing batch {batch_number} with zones: {target_zones}")
    
    # Ensure output directory exists
    os.makedirs(output_path, exist_ok=True)
    
    try:
        # Open Zarr file
        root = zarr.open(zarr_file, mode="r")
        
        # Check for probabilities array
        if "probabilities" not in root:
            logger.error("No 'probabilities' array found in Zarr file")
            return False
            
        probabilities = root["probabilities"][:]
        logger.info(f"Loaded probabilities array with shape {probabilities.shape}")
        
        # Standard grid size for all zones (5000x5000)
        grid_width, grid_height = 5000, 5000
        
        # Calculate probabilities indices for each zone
        # For simplicity, we'll assume probabilities are stored in same order as zones in the batch
        total_pixels_per_zone = grid_width * grid_height
        
        for i, zone_name in enumerate(target_zones):
            try:
                # Get coordinates for this zone
                if zone_name not in zone_boundaries:
                    logger.error(f"No coordinates found for {zone_name}, skipping")
                    continue
                
                boundaries = zone_boundaries[zone_name]
                ul_x, ul_y = boundaries["upper_left"]
                lr_x, lr_y = boundaries["lower_right"]
                
                # Calculate the index range for this zone
                start_idx = i * total_pixels_per_zone
                end_idx = start_idx + total_pixels_per_zone
                
                # Extract probabilities for this zone
                if end_idx <= len(probabilities):
                    # Reshape to 2D grid
                    zone_probabilities = probabilities[start_idx:end_idx].reshape(grid_height, grid_width)
                    logger.info(f"Extracted probabilities for {zone_name} with shape {zone_probabilities.shape}")
                else:
                    logger.error(f"Not enough data for zone {zone_name}, skipping")
                    continue
                
                # Create transform
                transform = from_bounds(ul_x, lr_y, lr_x, ul_y, grid_width, grid_height)
                
                # Save classification map
                class_filename = f"{output_path}/{zone_name}_classification.tif"
                meta = {
                    'driver': 'GTiff',
                    'height': grid_height,
                    'width': grid_width,
                    'count': 1,
                    'dtype': str(zone_probabilities.dtype),
                    'crs': 'EPSG:3067',  # Finnish ETRS-TM35FIN coordinate system
                    'transform': transform,
                    'nodata': 0
                }
                
                with rasterio.open(class_filename, 'w', **meta) as dst:
                    dst.write(zone_probabilities, 1)
                
                logger.info(f"Saved classification map for {zone_name} to {class_filename}")
                
            except Exception as e:
                logger.error(f"Error processing zone {zone_name}: {e}")
                import traceback
                logger.error(traceback.format_exc())
        
        return True
    
    except Exception as e:
        logger.error(f"Error exporting from Zarr to GeoTIFF: {e}")
        import traceback
        logger.error(traceback.format_exc())
    
    return False

if __name__ == "__main__":
    # Path to your Zarr files
    zarr_file_batch1 = "probabilities_zones_batch_1.zarr"
    zarr_file_batch2 = "probabilities_zones_batch_2.zarr"
    
    # Path for output GeoTIFFs
    output_path = "../02_Results/geotiff_results"
    
    logger.info("Starting export from Zarr files")
    
    # Export zones from batch 1
    logger.info("Processing batch 1")
    success_batch1 = export_zarr_to_geotiff_by_batch(zarr_file_batch1, output_path, 1)
    
    # Export zones from batch 2
    logger.info("Processing batch 2")
    success_batch2 = export_zarr_to_geotiff_by_batch(zarr_file_batch2, output_path, 2)
    
    if success_batch1 and success_batch2:
        logger.info("All exports completed successfully")
    else:
        logger.warning("Some exports failed, check the logs for details")

2025-04-01 13:14:05,129 - __main__ - INFO - Starting export from Zarr files
2025-04-01 13:14:05,130 - __main__ - INFO - Processing batch 1
2025-04-01 13:14:05,131 - __main__ - INFO - Processing batch 1 with zones: ['zone_1', 'zone_3', 'zone_4', 'zone_6']
2025-04-01 13:14:05,135 - __main__ - INFO - Loaded probabilities array with shape (5000,)
2025-04-01 13:14:05,136 - __main__ - ERROR - Not enough data for zone zone_1, skipping
2025-04-01 13:14:05,136 - __main__ - ERROR - Not enough data for zone zone_3, skipping
2025-04-01 13:14:05,136 - __main__ - ERROR - Not enough data for zone zone_4, skipping
2025-04-01 13:14:05,137 - __main__ - ERROR - Not enough data for zone zone_6, skipping
2025-04-01 13:14:05,137 - __main__ - INFO - Processing batch 2
2025-04-01 13:14:05,137 - __main__ - INFO - Processing batch 2 with zones: ['zone_7', 'zone_8', 'zone_10']
2025-04-01 13:14:05,139 - __main__ - INFO - Loaded probabilities array with shape (5000,)
2025-04-01 13:14:05,140 - __main__ - ERROR - No

In [15]:
zone = "zone_1"  # Adjust this to check specific zones
zone_data = root[zone]
print(zone_data.keys())  # Check which features are available for this zone

KeysView(<zarr.hierarchy.Group '/zone_1' read-only>)
