In [1]:
# Needs hdf5 to run
# Instead of hf datasets lib alternatively use torchgeo 
# As of 02.11.24 failed to reach geo API, could attempt later
%pip install --upgrade torch; torchvision; datasets; h5py

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 3.0.1
    Uninstalling datasets-3.0.1:
      Successfully uninstalled datasets-3.0.1
Successfully installed datasets-3.1.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import List
import torch
import datasets
# from torchgeo.datasets import CaBuAr
from torch.utils.data import DataLoader
from torchvision import transforms
import h5py
import numpy as np

# BibTeX entry of the CaBuAr paper (metadata only; not referenced by the
# visible cells of this notebook).
_CITATION = """\
@article{cabuar,
  title={Ca{B}u{A}r: California {B}urned {A}reas dataset for delineation},
  author={Rege Cambrin, Daniele and Colomba, Luca and Garza, Paolo},
  journal={IEEE Geoscience and Remote Sensing Magazine},
  doi={10.1109/MGRS.2023.3292467},
  year={2023}
}
"""

# Short human-readable description of the dataset (metadata only).
_DESCRIPTION = """\
CaBuAr dataset contains images from Sentinel-2 satellites taken before and after a wildfire.
The ground truth masks are provided by the California Department of Forestry and Fire Protection and they are mapped on the images.
"""

# Dataset page on the Hugging Face Hub.
_HOMEPAGE = "https://huggingface.co/datasets/DarthReca/california_burned_areas"

# License identifier of the dataset (metadata only).
_LICENSE = "OPENRAIL"

# Local storage layout - adjust manually for your machine.
#   'root'      : base directory (defaults to the current working directory)
#   'cache_dir' : intended as a sub-directory of 'root'; the two values are
#                 joined when the Hugging Face cache location is configured
_URLS = {'root': os.curdir,'cache_dir':"/raw"}
# Number of samples per batch handed to the DataLoader.
_BATCH_SIZE = 16
# Number of DataLoader worker subprocesses.
_NUM_WORKERS = 1


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Set the Hugging Face datasets cache directory to <root>/<cache_dir>.
# Leading path separators are stripped from the sub-directory first:
# os.path.join discards every earlier component as soon as it meets an
# absolute one, so joining '.' with '/raw' would silently yield '/raw'
# (the filesystem root) instead of a folder below _URLS['root'].
_cache_subdir = _URLS['cache_dir'].lstrip('/\\')
# Store an absolute path so the location does not depend on later chdir calls.
os.environ['HF_DATASETS_CACHE'] = os.path.abspath(
    os.path.join(_URLS['root'], _cache_subdir)
)

# NOTE(review): `datasets` is already imported in an earlier cell and appears
# to read this variable at import time - confirm. If the cache does not move,
# set the variable before importing `datasets` or pass `cache_dir=` to
# `datasets.load_dataset`.

# You can verify by printing (optional)
print(f"HF_DATASETS_CACHE set to: {os.getenv('HF_DATASETS_CACHE')}")

HF_DATASETS_CACHE set to: D:\\python_course\\NN_Practice\\collab\\california_burned_areas\\raw


In [4]:
# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [5]:
# Load one split of the CaBuAr dataset from the Hugging Face Hub.
# Split names reported by the generation log of this cell: 0-4 and chabud.
# Documentation:
#   https://huggingface.co/datasets/DarthReca/california_burned_areas
#   https://torchgeo.readthedocs.io/en/stable/api/datasets.html#cabuar
# trust_remote_code=True lets the loader script shipped with the dataset
# repository run locally.
dataset = datasets.load_dataset(
    'DarthReca/california_burned_areas',
    name='pre-post-fire',
    split='chabud',
    trust_remote_code=True,
)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating 0 split: 78 examples [00:11,  6.53 examples/s]
Generating 1 split: 55 examples [00:09,  6.10 examples/s]
Generating 2 split: 69 examples [00:11,  5.77 examples/s]
Generating 3 split: 85 examples [00:13,  6.22 examples/s]
Generating 4 split: 69 examples [00:10,  6.33 examples/s]
Generating chabud split: 68 examples [00:19,  3.54 examples/s]


In [6]:
# Per-sample preprocessing pipeline (extend the list as needed)
_transform_steps = [
    # Samples arrive as lists; turn them into a numpy array first
    transforms.Lambda(np.array),
    # numpy array -> torch tensor
    transforms.ToTensor(),
    # Further steps such as normalization can be appended here, e.g.
    # transforms.Normalize(mean=[0.5], std=[0.5]),
]
transform = transforms.Compose(_transform_steps)

In [7]:
# Define a custom PyTorch Dataset to apply transformations
class CaBuArPyTorchDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset wrapping a Hugging Face CaBuAr split.

    The class derives from ``torch.utils.data.Dataset`` (it is consumed by a
    ``DataLoader``), not from a Hugging Face dataset *builder*.

    Samples are returned as CPU tensors. Moving them to an accelerator is left
    to the training loop: creating device tensors inside ``__getitem__`` does
    not work with DataLoader worker processes or with ``pin_memory=True``.

    Args:
        hf_dataset: Indexable collection of samples (e.g. a Hugging Face
            dataset) whose items are mappings with the keys ``'post_fire'``,
            ``'mask'`` and optionally ``'pre_fire'``.
        transform: Optional callable applied to every array of a sample
            (images and mask alike). When omitted, arrays are converted to
            tensors unchanged, i.e. without any axis reordering.
        load_prefire: Whether the returned sample contains ``'pre_fire'``.
    """

    def __init__(self, hf_dataset, transform=None, load_prefire=True):
        self.hf_dataset = hf_dataset
        self.transform = transform
        self.load_prefire = load_prefire

    def __len__(self):
        """Return the number of samples in the wrapped dataset."""
        return len(self.hf_dataset)

    def _prepare(self, array_like):
        """Apply the configured transform, or fall back to a plain tensor conversion."""
        if self.transform is not None:
            return self.transform(array_like)
        return torch.as_tensor(np.asarray(array_like))

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of CPU tensors.

        Keys are ``'post_fire'``, ``'pre_fire'`` (only when ``load_prefire``
        is true) and ``'mask'``. A leading singleton dimension of the
        transformed mask is squeezed away. With the transform used in this
        notebook the recorded batch shapes were [12, 512, 512] per image and
        [512, 512] per mask.
        """
        sample = self.hf_dataset[idx]
        post_fire_raw = sample['post_fire']

        item = {'post_fire': self._prepare(post_fire_raw)}

        if self.load_prefire:
            pre_fire_raw = sample.get('pre_fire')
            if pre_fire_raw is None:
                # No pre-fire acquisition available: substitute an all-zero
                # image with the same shape as the post-fire one.
                pre_fire_raw = np.zeros_like(np.asarray(post_fire_raw))
            item['pre_fire'] = self._prepare(pre_fire_raw)

        item['mask'] = self._prepare(sample['mask']).squeeze(0)
        return item

In [8]:
# Instantiate the custom dataset
pytorch_dataset = CaBuArPyTorchDataset(
    hf_dataset=dataset,
    transform=transform,
    load_prefire=True,  # Set to False if you want to load only post_fire data
)

# Create a DataLoader
dataloader = DataLoader(
    pytorch_dataset,
    batch_size=_BATCH_SIZE,      # Adjust batch size as needed
    shuffle=False,               # Fixed order for inspection; set True when training
    num_workers=_NUM_WORKERS,    # Number of subprocesses for data loading
    # Pinned host memory only helps host-to-GPU copies; requesting it on a
    # CPU-only machine has no benefit and makes PyTorch emit a warning.
    pin_memory=torch.cuda.is_available(),
)

In [9]:
# Example: pull the first batch out of the DataLoader and report its shapes
for batch_idx, batch in enumerate(dataloader):
    post_fire, mask = batch['post_fire'], batch['mask']
    pre_fire = batch.get('pre_fire')  # None when the dataset omits pre-fire images

    # At this point the tensors could be fed to a model, e.g.:
    # outputs = model(post_fire, pre_fire)
    # loss = criterion(outputs, mask)

    # For demonstration, only the batch shapes are reported
    report_lines = [
        f"Batch {batch_idx}:",
        f"  post_fire shape: {post_fire.shape}",
    ]
    if pre_fire is not None:
        report_lines.append(f"  pre_fire shape: {pre_fire.shape}")
    report_lines.append(f"  mask shape: {mask.shape}")
    print("\n".join(report_lines))

    # One batch is enough for the demonstration
    break

Batch 0:
  post_fire shape: torch.Size([16, 12, 512, 512])
  pre_fire shape: torch.Size([16, 12, 512, 512])
  mask shape: torch.Size([16, 512, 512])


# TODO
 - Create a class-ratio plot for the output mask

 - **(Optional)** Show the density of classes on the mask (how noisy the data is)

 - Extract the spectral distribution of random images

 - Mean brightness, distribution from center of inputs

 - How **correlated** are adjacent 2D tensors in the pre-fire and post-fire images? Could the dataset be ordered by **time** if needed?

In [10]:
# (Optional) Collect loss and accuracy vectors over the training epochs

# Example training loop structure
def train_model(dataloader, model, criterion, optimizer, num_epochs=10, device=None):
    """Train ``model`` and collect the per-epoch loss and pixel accuracy.

    Args:
        dataloader: Iterable of batches. Each batch is a mapping with the
            tensors ``'post_fire'`` and ``'mask'`` and optionally
            ``'pre_fire'``.
        model: ``torch.nn.Module`` called as ``model(post_fire, pre_fire)``
            when the batch has pre-fire data, else ``model(post_fire)``. Its
            output is treated as per-class scores along dimension 1.
        criterion: Callable ``criterion(outputs, mask)`` returning a scalar
            loss tensor. The mask dtype it expects is the caller's concern.
        optimizer: Optimizer that updates the parameters of ``model``.
        num_epochs: Number of passes over ``dataloader``.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU for a parameter-free model), so
            the function does not rely on a notebook-level global.

    Returns:
        Tuple ``(loss_vector, accuracy_vector)`` holding one entry per epoch:
        the sample-weighted mean loss and the accuracy in percent.

    Raises:
        ValueError: If ``dataloader`` yields no samples during an epoch.
    """
    if device is None:
        first_param = next(iter(model.parameters()), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    loss_vector = []
    accuracy_vector = []

    for epoch in range(1, num_epochs + 1):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        num_samples = 0

        for batch in dataloader:
            # Move data to device
            post_fire = batch['post_fire'].to(device)
            mask = batch['mask'].to(device)
            pre_fire = batch.get('pre_fire')
            if pre_fire is not None:
                pre_fire = pre_fire.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(post_fire, pre_fire) if pre_fire is not None else model(post_fire)
            loss = criterion(outputs, mask)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # Statistics: weight the batch loss by its size so that a smaller
            # final batch does not skew the epoch mean.
            batch_size = post_fire.size(0)
            running_loss += loss.item() * batch_size
            num_samples += batch_size
            # Outputs are assumed to be logits; the predicted class is the argmax
            predicted = outputs.detach().argmax(dim=1)
            correct += (predicted == mask).sum().item()
            total += mask.numel()

        if num_samples == 0 or total == 0:
            raise ValueError("dataloader yielded no samples; cannot compute epoch statistics")

        # Normalize by the samples actually processed instead of the length of
        # an unrelated global dataset object.
        epoch_loss = running_loss / num_samples
        epoch_acc = 100. * correct / total
        loss_vector.append(epoch_loss)
        accuracy_vector.append(epoch_acc)

        print(f"Epoch {epoch}/{num_epochs}: Loss={epoch_loss:.4f}, Accuracy={epoch_acc:.2f}%")

    print("Training complete.")
    print("Accuracy vector:", accuracy_vector)
    return loss_vector, accuracy_vector