In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from skimage.morphology import remove_small_objects, skeletonize
from skimage.measure import label
import pickle
import pandas as pd
from dask.distributed import Client, LocalCluster, wait
from dask.diagnostics import ProgressBar
import dask.config  
import dask.dataframe as dd
import dask.array as da
from sklearn.metrics import classification_report, confusion_matrix, cohen_kappa_score, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import gc
from functools import partial
import time
import zarr
import numcodecs
import h5py
import logging

from Functions.general_functions import create_balanced_dataset
from Functions.general_functions import create_balanced_mask2
from zarr_processing import load_data_parallel, process_zarr_chunk

More complex zarr saving test, to save all datatypes
- vei vain ~2 minuuttia Puhdissa!

### Zarr without coordinates

In [4]:
import zarr
import pickle
import numpy as np
import pandas as pd
import numcodecs
from datetime import datetime

def convert_pickle_to_zarr(pickle_paths, zarr_file):
    """
    Convert pickle files containing DataFrames with various feature arrays to Zarr format
    while preserving all data types and structure.
    """
    # Create a new Zarr group
    root = zarr.open_group(zarr_file, mode="w")
    
    for i, path in enumerate(pickle_paths):
        print(f"Processing zone {i+1} from {path}")
        
        with open(path, "rb") as f:
            data = pickle.load(f)
            
        # Create a group for each zone
        zone_group = root.create_group(f"zone_{i+1}")
        
        # Store the main features as separate arrays
        for col in data.columns:
            series = data[col]
            
            # Handle different data types appropriately
            if pd.api.types.is_numeric_dtype(series):
                # For numeric data, store directly with compression
                zone_group.create_dataset(
                    col,
                    data=series.values,
                    chunks=True,
                    compressor=numcodecs.Blosc(cname='zstd', clevel=5)
                )
            elif pd.api.types.is_object_dtype(series):
                # For object/mixed types, use specialized encoding
                encoded_data = numcodecs.JSON().encode(series.tolist())
                zone_group.create_dataset(
                    col,
                    data=encoded_data,
                    dtype=object,
                    object_codec=numcodecs.JSON()
                )
            elif pd.api.types.is_datetime64_any_dtype(series):
                # For datetime, store as int64 timestamps
                zone_group.create_dataset(
                    col,
                    data=series.astype(np.int64).values,
                    chunks=True,
                    compressor=numcodecs.Blosc(cname='zstd', clevel=5)
                )
            
            # Store dtype information for reconstruction
            zone_group.attrs[f"{col}_dtype"] = str(series.dtype)
        
        # Store DataFrame metadata
        zone_group.attrs['column_order'] = list(data.columns)
        zone_group.attrs['shape'] = data.shape
        zone_group.attrs['index_type'] = str(data.index.dtype)
        
def read_zarr_to_dataframe(zarr_file, zone_number):
    """
    Read a specific zone from Zarr file back into a DataFrame.
    """
    root = zarr.open_group(zarr_file, mode="r")
    zone_group = root[f"zone_{zone_number}"]
    
    # Initialize empty DataFrame
    data = {}
    
    # Get column order from metadata
    columns = zone_group.attrs['column_order']
    
    for col in columns:
        # Get original dtype
        dtype = zone_group.attrs[f"{col}_dtype"]
        
        # Read data based on dtype
        if "datetime" in dtype:
            # Convert int64 timestamps back to datetime
            data[col] = pd.to_datetime(zone_group[col][:])
        elif "object" in dtype:
            # Decode JSON-encoded object data
            data[col] = zone_group[col][:]
        else:
            # Regular numeric data
            data[col] = zone_group[col][:]
    
    # Reconstruct DataFrame with original column order
    df = pd.DataFrame(data, columns=columns)
    
    return df

def verify_conversion(original_pickle_path, zarr_file, zone_number):
    """
    Verify that the Zarr conversion preserved all data correctly.
    """
    # Load original pickle
    with open(original_pickle_path, "rb") as f:
        original_df = pickle.load(f)
    
    # Load from Zarr
    zarr_df = read_zarr_to_dataframe(zarr_file, zone_number)
    
    # Compare DataFrames
    comparison = original_df.equals(zarr_df)
    if not comparison:
        print("Differences found:")
        for col in original_df.columns:
            if not original_df[col].equals(zarr_df[col]):
                print(f"Column {col} differs:")
                print(f"Original dtype: {original_df[col].dtype}")
                print(f"Zarr dtype: {zarr_df[col].dtype}")
                print("First few values:")
                print("Original:", original_df[col].head())
                print("Zarr:", zarr_df[col].head())
    
    return comparison

# Example usage
if __name__ == "__main__":
    pickle_paths = [
        "../../01_Data/01_Raw/features/zone_2.pickle",
        "../../01_Data/01_Raw/features/zone_5.pickle",
        "../../01_Data/01_Raw/features/zone_9.pickle",
    ]
    zarr_file = "zones_data_2.zarr"
    
    # Convert pickles to Zarr
    convert_pickle_to_zarr(pickle_paths, zarr_file)
    
    # Verify each zone
    for i, path in enumerate(pickle_paths):
        print(f"\nVerifying zone {i+1}...")
        if verify_conversion(path, zarr_file, i+1):
            print(f"Zone {i+1} verified successfully!")
        else:
            print(f"Zone {i+1} verification failed!")

Processing zone 1 from ../../01_Data/01_Raw/features/zone_2.pickle
Processing zone 2 from ../../01_Data/01_Raw/features/zone_5.pickle
Processing zone 3 from ../../01_Data/01_Raw/features/zone_9.pickle

Verifying zone 1...
Zone 1 verified successfully!

Verifying zone 2...
Zone 2 verified successfully!

Verifying zone 3...
Zone 3 verified successfully!


### Coordinates to zarr

In [10]:
import numpy as np
import pandas as pd
import zarr
import numcodecs
import pickle
import re

# Define the zone boundaries (replace this with your actual data)
zone_boundaries = {
    0: {"upper_left": (372982.0, 6867160.0), "lower_right": (375482.0, 6864660.0)},
    1: {"upper_left": (377982.0, 6854660.0), "lower_right": (380482.0, 6852160.0)},
    2: {"upper_left": (372982.0, 6869660.0), "lower_right": (375482.0, 6867160.0)},
    3: {"upper_left": (380482.0, 6857160.0), "lower_right": (382982.0, 6854660.0)},
    4: {"upper_left": (372982.0, 6872160.0), "lower_right": (375482.0, 6869660.0)},
    5: {"upper_left": (380482.0, 6859660.0), "lower_right": (382982.0, 6857160.0)},
    6: {"upper_left": (380482.0, 6862160.0), "lower_right": (382982.0, 6859660.0)},
    7: {"upper_left": (372982.0, 6864660.0), "lower_right": (375482.0, 6862160.0)},
    8: {"upper_left": (375482.0, 6867160.0), "lower_right": (377982.0, 6864660.0)},
    9: {"upper_left": (375482.0, 6869660.0), "lower_right": (377982.0, 6867160.0)},
    10: {"upper_left": (375482.0, 6872160.0), "lower_right": (377982.0, 6869660.0)},
    11: {"upper_left": (375482.0, 6859660.0), "lower_right": (377982.0, 6857160.0)},
    12: {"upper_left": (375482.0, 6862160.0), "lower_right": (377982.0, 6859660.0)},
    13: {"upper_left": (375482.0, 6864660.0), "lower_right": (377982.0, 6862160.0)},
    14: {"upper_left": (377982.0, 6867160.0), "lower_right": (380482.0, 6864660.0)},
    15: {"upper_left": (377982.0, 6857160.0), "lower_right": (380482.0, 6854660.0)},
    16: {"upper_left": (377982.0, 6859660.0), "lower_right": (380482.0, 6857160.0)},
    17: {"upper_left": (377982.0, 6862160.0), "lower_right": (380482.0, 6859660.0)},
    18: {"upper_left": (377982.0, 6864660.0), "lower_right": (380482.0, 6862160.0)},
    19: {"upper_left": (370482.0, 6867160.0), "lower_right": (372982.0, 6864660.0)},
    20: {"upper_left": (370482.0, 6869660.0), "lower_right": (372982.0, 6867160.0)}
}

def convert_pickle_to_zarr(pickle_paths, zarr_file):
    """
    Convert pickle files containing DataFrames with various feature arrays to Zarr format
    while preserving all data types and structure, and adding spatial metadata.
    """
    # Create a new Zarr group
    root = zarr.open_group(zarr_file, mode="w")
    
    for path in pickle_paths:
        match = re.search(r'zone_(\d+)', path)
        if not match:
            raise ValueError(f"Could not extract zone number from {path}")

        zone_index = int(match.group(1))  # Extract actual zone number

        if zone_index not in zone_boundaries:
            raise ValueError(f"Zone {zone_index} not found in zone_boundaries!")

        print(f"Processing zone {zone_index} from {path}")

        # Load the pickle file
        with open(path, "rb") as f:
            data = pickle.load(f)

        if not isinstance(data, pd.DataFrame):
            raise ValueError(f"Expected a DataFrame in {path}, but got {type(data)}")

        # Retrieve zone boundaries
        ul_x, ul_y = zone_boundaries[zone_index]["upper_left"]
        lr_x, lr_y = zone_boundaries[zone_index]["lower_right"]

        # Calculate expected shape
        expected_cols = int((lr_x - ul_x) / 2.5)  # 2.5 meters per pixel (width)
        expected_rows = int((ul_y - lr_y) / 4.0)  # 4.0 meters per pixel (height)

        # Create a group for each zone
        zone_group = root.create_group(f"zone_{zone_index}")

        # Store metadata
        zone_group.attrs['boundary'] = zone_boundaries[zone_index]
        zone_group.attrs['expected_shape'] = (expected_rows, expected_cols)

        # Create spatial coordinates
        x_coords = np.linspace(ul_x, lr_x, expected_cols)
        y_coords = np.linspace(ul_y, lr_y, expected_rows)
        xx, yy = np.meshgrid(x_coords, y_coords)

        # Flatten to store as 1D arrays
        spatial_x = xx.flatten()
        spatial_y = yy.flatten()

        # Store spatial coordinates
        zone_group.create_dataset('x_coord', data=spatial_x, chunks=True)
        zone_group.create_dataset('y_coord', data=spatial_y, chunks=True)

        # Store feature data
        for col in data.columns:
            series = data[col]

            if pd.api.types.is_numeric_dtype(series):
                zone_group.create_dataset(
                    col,
                    data=series.values,
                    chunks=True,
                    compressor=numcodecs.Blosc(cname='zstd', clevel=5)
                )
            elif pd.api.types.is_object_dtype(series):
                encoded_data = np.array(series.tolist(), dtype=object)
                zone_group.create_dataset(
                    col,
                    data=encoded_data,
                    dtype=object,
                    object_codec=numcodecs.JSON()
                )
            elif pd.api.types.is_datetime64_any_dtype(series):
                zone_group.create_dataset(
                    col,
                    data=series.astype(np.int64).values,
                    chunks=True,
                    compressor=numcodecs.Blosc(cname='zstd', clevel=5)
                )

            # Store column dtype metadata
            zone_group.attrs[f"{col}_dtype"] = str(series.dtype)

        # Store overall metadata
        zone_group.attrs['column_order'] = list(data.columns)
        zone_group.attrs['shape'] = data.shape
        zone_group.attrs['index_type'] = str(data.index.dtype)

# Example usage
if __name__ == "__main__":
    pickle_paths = [
        "../../01_Data/01_Raw/features/zone_2.pickle",
        "../../01_Data/01_Raw/features/zone_5.pickle",
        "../../01_Data/01_Raw/features/zone_9.pickle",
    ]
    zarr_file = "zones_data_2.zarr"

    # Convert pickles to Zarr
    convert_pickle_to_zarr(pickle_paths, zarr_file)


Processing zone 2 from ../../01_Data/01_Raw/features/zone_2.pickle
Processing zone 5 from ../../01_Data/01_Raw/features/zone_5.pickle
Processing zone 9 from ../../01_Data/01_Raw/features/zone_9.pickle


In [14]:
import zarr

# Open the Zarr file
zarr_file = "zones_data_2.zarr"
root = zarr.open_group(zarr_file, mode="r")

# Example for zone_2
zone_index = 9  # Replace with the zone you're interested in

# Access the zone group
zone_group = root[f"zone_{zone_index}"]

# Access the stored coordinates
x_coords = zone_group['x_coord'][:]
y_coords = zone_group['y_coord'][:]

# Print the coordinates
print("X Coordinates:", x_coords)
print("Y Coordinates:", y_coords)


X Coordinates: [375482.        375484.5025025 375487.005005  ... 377976.994995
 377979.4974975 377982.       ]
Y Coordinates: [6869660. 6869660. 6869660. ... 6867160. 6867160. 6867160.]


In [11]:
import zarr

# Open the Zarr file in read mode
rf_zarr = zarr.open('zones_data_2.zarr', mode='r')

# List the available groups and datasets in the Zarr file
print(rf_zarr.tree())

/
 ├── zone_2
 │   ├── DEM_ditch_detection (25000000,) float64
 │   ├── DEM_ditch_detection_streams (25000000,) float64
 │   ├── conic_mean (25000000,) float64
 │   ├── hpmf_f (25000000,) float32
 │   ├── hpmf_f_visualisation (25000000,) bool
 │   ├── hpmf_mean_3 (25000000,) float64
 │   ├── hpmf_mean_4 (25000000,) float64
 │   ├── hpmf_mean_6 (25000000,) float64
 │   ├── hpmf_median_4 (25000000,) float64
 │   ├── hpmf_min_4 (25000000,) float64
 │   ├── hpmf_raw (25000000,) float64
 │   ├── hpmf_std_6 (25000000,) float64
 │   ├── impoundment_amplified (25000000,) float32
 │   ├── impoundment_max_6 (25000000,) float32
 │   ├── impoundment_mean_2 (25000000,) float32
 │   ├── impoundment_mean_3 (25000000,) float32
 │   ├── impoundment_mean_6 (25000000,) float32
 │   ├── impoundment_median_2 (25000000,) float32
 │   ├── impoundment_median_4 (25000000,) float32
 │   ├── impoundment_median_6 (25000000,) float32
 │   ├── impoundment_raw (25000000,) float32
 │   ├── impoundment_std_4 (25000000

In [15]:
def verify_zarr_structure(zarr_file):
    with zarr.open(zarr_file, mode='r') as root:
        print("Root groups:", list(root.group_keys()))
        
        # Check first group's structure
        first_group = next(iter(root.group_keys()))
        print(f"\nStructure of {first_group}:")
        group = root[first_group]
        print("Arrays:", list(group.array_keys()))
        
        # Print sample of data
        for array_name in group.array_keys():
            array = group[array_name]
            print(f"\n{array_name}:")
            print("Shape:", array.shape)
            print("Dtype:", array.dtype)
            print("First few values:", array[:5])

# Run verification
verify_zarr_structure("zones_data_2.zarr")

Root groups: ['zone_2', 'zone_5', 'zone_9']

Structure of zone_2:
Arrays: ['DEM_ditch_detection', 'DEM_ditch_detection_streams', 'conic_mean', 'hpmf_f', 'hpmf_f_visualisation', 'hpmf_mean_3', 'hpmf_mean_4', 'hpmf_mean_6', 'hpmf_median_4', 'hpmf_min_4', 'hpmf_raw', 'hpmf_std_6', 'impoundment_amplified', 'impoundment_max_6', 'impoundment_mean_2', 'impoundment_mean_3', 'impoundment_mean_6', 'impoundment_median_2', 'impoundment_median_4', 'impoundment_median_6', 'impoundment_raw', 'impoundment_std_4', 'impoundment_std_6', 'label_3m', 'skyview_gabor', 'skyview_max_2', 'skyview_max_4', 'skyview_max_6', 'skyview_median_2', 'skyview_median_6', 'skyview_min_6', 'skyview_raw', 'skyview_std_6', 'slope_channels', 'slope_median_6', 'slope_min_2', 'slope_min_4', 'slope_min_6', 'slope_raw', 'slope_std_4', 'slope_std_6', 'x_coord', 'y_coord']

DEM_ditch_detection:
Shape: (25000000,)
Dtype: float64
First few values: [0.10400391 0.10400391 0.10400391 0.1210022  0.1210022 ]

DEM_ditch_detection_streams:


2nd version

1st version
412 min

In [2]:
import zarr

# List the contents of the Zarr file
zarr_file = "resampled_zones_data.zarr"
store = zarr.open(zarr_file, mode="r")
print(store.tree())


/
 ├── zone_1
 │   ├── DEM_ditch_detection (24999297,) float64
 │   ├── DEM_ditch_detection_streams (24999297,) float64
 │   ├── conic_mean (24999297,) float64
 │   ├── hpmf_f (24999297,) float32
 │   ├── hpmf_f_visualisation (24999297,) bool
 │   ├── hpmf_mean_3 (24999297,) float64
 │   ├── hpmf_mean_4 (24999297,) float64
 │   ├── hpmf_mean_6 (24999297,) float64
 │   ├── hpmf_median_4 (24999297,) float64
 │   ├── hpmf_min_4 (24999297,) float64
 │   ├── hpmf_raw (24999297,) float64
 │   ├── hpmf_std_6 (24999297,) float64
 │   ├── impoundment_amplified (24999297,) float32
 │   ├── impoundment_max_6 (24999297,) float32
 │   ├── impoundment_mean_2 (24999297,) float32
 │   ├── impoundment_mean_3 (24999297,) float32
 │   ├── impoundment_mean_6 (24999297,) float32
 │   ├── impoundment_median_2 (24999297,) float32
 │   ├── impoundment_median_4 (24999297,) float32
 │   ├── impoundment_median_6 (24999297,) float32
 │   ├── impoundment_raw (24999297,) float32
 │   ├── impoundment_std_4 (24999297

In [3]:
import os

file_path = "resampled_zones_data.zarr"  # or "zones_data_2.zarr"
print(os.path.exists(file_path))


True
