# **Download and Explore Satellite Imagery Data for Floods**

In [None]:
import random, os, gc, psutil, glob, re, multiprocessing # mem management, parallel processing & file handling
import numpy as np
import pandas as pd
from time import time
from tqdm.notebook import tqdm # Progress bars
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed
# Geospatial libraries
import rasterio
from rasterio.plot import show
import geopandas as gpd
from shapely.geometry import Point, Polygon
import folium
from pyproj import CRS #python interface to PROJ (cartographic projections and coordinate transformations lib)
# Image Processing
from skimage import exposure, filters, morphology
import cv2

In [None]:
import warnings 
warnings.filterwarnings('ignore')

## Sen1Floods11

This data's citation is:
> Bonafilia, D., Tellman, B., Anderson, T., Issenberg, E. 2020. Sen1Floods11: a georeferenced dataset to train and test deep learning flood algorithms for Sentinel-1. The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops, 2020, pp. 210-211.

[The paper is available via open access](http://openaccess.thecvf.com/content_CVPRW_2020/html/w11/Bonafilia_Sen1Floods11_A_Georeferenced_Dataset_to_Train_and_Test_Deep_Learning_CVPRW_2020_paper.html)

The GCS bucket is split into subfolders containing data, checkpoints, training/testing splits and a [STAC](https://stacspec.org) compliant catalogue.

For additional details on how the data is structured and what the metadata represents, see the [GitHub repo](https://github.com/cloudtostreet/Sen1Floods11/tree/master).

In [None]:
# Local copy of the Sen1Floods11 v1.1 download. The absolute path is machine-specific;
# adjust it when running the notebook elsewhere.
path_data1 = Path('/home/ba_ch/nbs/gis-exploration/data/downloads') / 'v1.1'

In [None]:
!ls -l /home/ba_ch/nbs/gis-exploration/data/downloads/v1.1/

total 10256
-rw-r--r-- 1 ba_ch ba_ch    14092 Oct 15  2020 Sen1Floods11_Metadata.geojson
drwxr-xr-x 6 ba_ch ba_ch     4096 Mar 10 16:03 catalog
-rw-r--r-- 1 ba_ch ba_ch 10467733 Oct 15  2020 catalog.zip
drwxr-xr-x 2 ba_ch ba_ch     4096 Mar 10 16:31 checkpoints
drwxr-xr-x 4 ba_ch ba_ch     4096 Mar 10 18:16 data
drwxr-xr-x 4 ba_ch ba_ch     4096 Mar 10 18:19 splits


Creating functions to explore the dataset structure. I'll work on ensuring that the function runs as expected before refactoring and chucking it into a class.

In [None]:
def explore_data(root_dir):
    """
    Explore the structure of the Sen1Floods11 dataset, handling multiple levels of folders
    and subfolders.

    Walks ``root_dir`` recursively with ``os.walk`` and collects file counts, sizes,
    per-extension statistics and, for ``.tif``/``.tiff`` files, raster metadata read
    through ``rasterio``.

    Parameters
    ----------
    root_dir : str or path-like
        Directory to scan; converted with ``str()`` before use.

    Returns
    -------
    dict
        An empty dict if ``root_dir`` does not exist (an error line is printed).
        Otherwise a dict with the keys:

        - ``'total_size_gb'``: total size of all files, rounded to 2 decimals.
        - ``'num_files'``: number of files whose size could be read.
        - ``'file_types'``: ``{ext: {'count', 'size_mb'}}`` keyed by lower-cased
          extension; sizes rounded to 2 decimals.
        - ``'folders'``: flat ``{relative_path: {'num_files', 'size_mb', 'file_types'}}``;
          the top-level directory is stored under the key ``'root'``.
        - ``'subfolders_structure'``: nested ``{name: {'files', 'size_mb', 'subfolders'}}``
          tree mirroring the directory layout. Files located directly in ``root_dir``
          are not counted in this tree, and its sizes are not rounded.
        - ``'tiff_stats'``: ``'dimensions'`` (metadata of up to 10 sample TIFFs),
          ``'bands_count'`` and ``'crs_types'`` (counts over all readable TIFFs).

    Notes
    -----
    Files whose size cannot be read and TIFFs that cannot be opened are reported
    with ``print`` and skipped; they do not abort the scan.
    """
    root_dir = str(root_dir)
    if not os.path.exists(root_dir):
        print(f"Error: Path {root_dir} does not exist")
        return {}

    max_tiff_samples = 10  # Only keep detailed metadata for the first few TIFFs

    dataset_info = {
        'total_size_gb': 0,
        'num_files' : 0,
        'file_types': {},
        'folders': {},
        'subfolders_structure': {},
        'tiff_stats': {
            'dimensions': [],
            'bands_count': {},
            'crs_types': {}
        },
    }
    tiff_stats = dataset_info['tiff_stats']

    # Walk through directories and files
    for dirpath, dirnames, filenames in os.walk(root_dir):
        rel_path = os.path.relpath(dirpath, root_dir)
        # Decide whether this is the top-level directory *before* renaming it, so a real
        # subfolder that happens to be called 'root' is still added to the nested tree.
        # (In the flat 'folders' dict such a subfolder still shares the 'root' key.)
        is_root = rel_path == '.'
        if is_root:
            rel_path = 'root'

        # Locate (creating as needed) this directory's node in the nested folder tree
        current_folder = None
        if not is_root:
            current_level = dataset_info['subfolders_structure']
            for part in rel_path.split(os.sep):
                current_folder = current_level.setdefault(
                    part, {'files': 0, 'size_mb': 0, 'subfolders': {}}
                )
                current_level = current_folder['subfolders']

        # Add folder info
        folder_info = {
            'num_files': len(filenames),
            'size_mb': 0,
            'file_types': {},
        }
        dataset_info['folders'][rel_path] = folder_info

        # Process files
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            try:
                file_size = os.path.getsize(filepath) / (1024*1024) # result in MBs
            except (FileNotFoundError, PermissionError) as e:
                print(f"Error accessing {filepath}: {e}")
                continue

            # Update dataset info
            dataset_info['num_files'] += 1
            dataset_info['total_size_gb'] += file_size / 1024
            folder_info['size_mb'] += file_size

            # Update the nested structure of folders
            if current_folder is not None:
                current_folder['files'] += 1
                current_folder['size_mb'] += file_size

            # Track file types
            file_ext = os.path.splitext(filename)[1].lower()
            type_info = dataset_info['file_types'].setdefault(file_ext, {'count': 0, 'size_mb': 0})
            type_info['count'] += 1
            type_info['size_mb'] += file_size

            # Track file types per folder
            folder_types = folder_info['file_types']
            folder_types[file_ext] = folder_types.get(file_ext, 0) + 1

            # Get info on TIFF files (limited sample)
            if file_ext not in ('.tif', '.tiff'):
                continue

            # Keep the try block limited to the raster read itself, so that mistakes in
            # the bookkeeping below surface as real errors instead of being reported as
            # unreadable files.
            try:
                with rasterio.open(filepath) as src:
                    width = src.width
                    height = src.height
                    band_count = src.count
                    crs_str = str(src.crs)
                    transform_str = str(src.transform)
            except Exception as e:
                print(f"Error reading {filepath}: {e}")
                continue

            if len(tiff_stats['dimensions']) < max_tiff_samples: # Sample up to 10
                tiff_stats['dimensions'].append({
                    'file': os.path.relpath(filepath, root_dir),
                    'width': width,
                    'height': height,
                    'count': band_count,
                    'crs': crs_str,
                    'transform': transform_str,
                    'size_mb': file_size
                })

            # Track band counts
            bands = tiff_stats['bands_count']
            bands[band_count] = bands.get(band_count, 0) + 1

            # Track CRS types
            crs_types = tiff_stats['crs_types']
            crs_types[crs_str] = crs_types.get(crs_str, 0) + 1

    # Round the total size, folder sizes and file type sizes
    dataset_info['total_size_gb'] = round(dataset_info['total_size_gb'], 2)
    for folder in dataset_info['folders']:
        dataset_info['folders'][folder]['size_mb'] = round(dataset_info['folders'][folder]['size_mb'], 2)
    for file_type in dataset_info['file_types']:
        dataset_info['file_types'][file_type]['size_mb'] = round(dataset_info['file_types'][file_type]['size_mb'], 2)

    return dataset_info

In [None]:
#explore_data(path_data1) # exceptions

Run a simplified version of the function, with debug prints, to check that the directory walk itself works.

In [None]:
def simple_check(root_dir):
    """
    Walk ``root_dir`` and print a short report of what is found.

    For every directory visited, prints its path and file count, plus the first
    file name when the directory is not empty. Finishes with the overall total.

    Returns ``True`` when at least one file exists anywhere under ``root_dir``,
    ``False`` when there are none or when the path does not exist.
    """
    root_dir = str(root_dir)
    print(f"Checking directory: {root_dir}")

    if os.path.exists(root_dir):
        total = 0
        for folder, _subdirs, names in os.walk(root_dir):
            n_here = len(names)
            total = total + n_here
            print(f"Found directory: {folder} with {n_here} files")
            if names:
                print(f"Sample file: {names[0]}")

        print(f"Total files found: {total}")
        return total != 0

    print(f"Path doesn't exist: {root_dir}")
    return False

In [None]:
# Quick sanity check on the download directory: prints each directory found
# and returns whether any files exist under it.
simple_check(path_data1)

Checking directory: /home/ba_ch/nbs/gis-exploration/data/downloads/v1.1
Found directory: /home/ba_ch/nbs/gis-exploration/data/downloads/v1.1 with 2 files
Sample file: Sen1Floods11_Metadata.geojson
Found directory: /home/ba_ch/nbs/gis-exploration/data/downloads/v1.1/splits with 0 files
Found directory: /home/ba_ch/nbs/gis-exploration/data/downloads/v1.1/splits/flood_handlabeled with 4 files
Sample file: flood_valid_data.csv
Found directory: /home/ba_ch/nbs/gis-exploration/data/downloads/v1.1/splits/perm_water with 4 files
Sample file: permanent_water_data.csv
Found directory: /home/ba_ch/nbs/gis-exploration/data/downloads/v1.1/catalog with 1 files
Sample file: catalog.json
Found directory: /home/ba_ch/nbs/gis-exploration/data/downloads/v1.1/catalog/sen1floods11_hand_labeled_source with 1 files
Sample file: collection.json
Found directory: /home/ba_ch/nbs/gis-exploration/data/downloads/v1.1/catalog/sen1floods11_hand_labeled_source/Spain_6537196 with 1 files
Sample file: Spain_6537196.jso

True