<a href="https://colab.research.google.com/github/aborbala/tree-canopy/blob/main/check_sliced_images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive under /content/drive; the data directories used by the
# cells below are all given as paths beneath this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


Sliced images

In [None]:
import os
import re
from collections import defaultdict

# Directory scanned for the source .jp2 files.
SOURCE_DIR = r"/content/drive/MyDrive/masterthesis/data/2020_DOP_all"
# Directory scanned for the corresponding .tif files.
TARGET_DIR = r"/content/drive/MyDrive/masterthesis/data/DOP_all_sliced"

# Captures the coordinate snippet of a source filename as group 1,
# e.g. "410_5804" from "dop20rgbi_33_410_5804_2_be_F_2025.jp2".
COORDINATE_PATTERN = re.compile(
    r'^dop\d+rgbi_\d+_(\d+_\d+)_.*\.[jJ][pP]2$',
    flags=re.IGNORECASE,
)

def extract_and_count_files(source_dir: str, target_dir: str, pattern=None):
    """
    Extract unique coordinate snippets from the .jp2 files in ``source_dir``
    and print, for each snippet, how many .tif files in ``target_dir``
    contain it.

    Args:
        source_dir: Directory whose .jp2 filenames are matched against
            ``pattern``; group 1 of each match is the coordinate snippet.
        target_dir: Directory whose .tif filenames are searched for each
            snippet (case-insensitively).
        pattern: Optional compiled regex used to extract the snippet.
            Defaults to the notebook-level ``COORDINATE_PATTERN``.

    Returns:
        None. All results are printed. If either directory does not exist an
        error line is printed and the function returns early.

    A snippet only counts as found in a target filename when it is not
    directly preceded or followed by another digit, so "370_5808" does not
    match "1370_5808" or "370_58080".
    """
    if pattern is None:
        # Looked up at call time so the notebook constant remains the default.
        pattern = COORDINATE_PATTERN

    # --- Step 1: Extract Coordinate Snippets ---
    print(f"--- 1. Extracting unique coordinate snippets from {source_dir} ---")

    if not os.path.isdir(source_dir):
        print(f"Error: Source directory not found at {source_dir}")
        return

    unique_snippets = set()
    for filename in os.listdir(source_dir):
        # Cheap pre-filter: only .jp2 files are considered source tiles.
        if not filename.lower().endswith('.jp2'):
            continue

        match = pattern.match(filename)
        if not match:
            continue

        # Group 1 contains the required coordinate snippet.
        snippet = match.group(1)
        if snippet not in unique_snippets:
            unique_snippets.add(snippet)
            print(f"Extracted: {snippet}")

    num_snippets = len(unique_snippets)
    print("----------------------------------------------------------------")
    print(f"Found {num_snippets} unique coordinate snippets.")
    print("----------------------------------------------------------------\n")


    # --- Step 2: Count Files in Target Folder for Each Snippet ---
    print(f"--- 2. Counting corresponding .tif files in {target_dir} ---")

    if not os.path.isdir(target_dir):
        print(f"Error: Target directory not found at {target_dir}")
        return

    # List the directory once; names are lowercased for case-insensitive matching.
    target_files = [f.lower() for f in os.listdir(target_dir) if f.lower().endswith('.tif')]

    file_counts = {}
    for snippet in sorted(unique_snippets):
        # Digit boundaries on both sides prevent a snippet from matching
        # inside a longer number (a plain substring test would over-count).
        bounded = re.compile(r'(?<!\d)' + re.escape(snippet.lower()) + r'(?!\d)')
        count = sum(1 for target_filename in target_files if bounded.search(target_filename))

        file_counts[snippet] = count
        print(f"  {snippet}: Found {count} matching .tif files.")


    # --- Step 3: Output Results ---
    print("\n--- Final Results ---")
    print(f"{'Coordinate':<15} | {'File Count':<10}")
    print("-" * 27)

    for snippet, count in file_counts.items():
        print(f"{snippet:<15} | {count:<10}")

    print("\nOperation complete.")

# Run the sliced-image check with the directories configured above.
if __name__ == "__main__":
    extract_and_count_files(source_dir=SOURCE_DIR, target_dir=TARGET_DIR)

--- 1. Extracting unique coordinate snippets from /content/drive/MyDrive/masterthesis/data/DOP_all ---
Extracted: 368_5808
Extracted: 370_5806
Extracted: 370_5808
Extracted: 370_5810
Extracted: 370_5812
Extracted: 370_5814
Extracted: 372_5804
Extracted: 372_5806
Extracted: 372_5808
Extracted: 372_5810
Extracted: 372_5812
Extracted: 372_5814
Extracted: 372_5816
Extracted: 372_5820
Extracted: 372_5822
Extracted: 372_5824
Extracted: 372_5826
Extracted: 372_5828
Extracted: 374_5806
Extracted: 374_5808
Extracted: 374_5810
Extracted: 374_5812
Extracted: 374_5814
Extracted: 374_5816
Extracted: 374_5818
Extracted: 374_5820
Extracted: 374_5822
Extracted: 374_5824
Extracted: 374_5826
Extracted: 374_5828
Extracted: 376_5806
Extracted: 376_5808
Extracted: 376_5810
Extracted: 376_5812
Extracted: 376_5814
Extracted: 376_5816
Extracted: 376_5818
Extracted: 376_5820
Extracted: 376_5822
Extracted: 376_5824
Extracted: 376_5826
Extracted: 376_5828
Extracted: 378_5808
Extracted: 378_5810
Extracted: 378_58

Predictions (window)

In [None]:
import os
import re
from collections import defaultdict

# Directory with the source orthophotos (Google Drive path).
# NOTE: SOURCE_DIR and COORDINATE_PATTERN are also assigned in the earlier
# "Sliced images" cell; running this cell rebinds both names.
SOURCE_DIR = r"/content/drive/MyDrive/masterthesis/data/2020_DOP_all"

# Directory with the prediction files (corrected Google Drive path).
PREDICTIONS_DIR = r"/content/drive/MyDrive/masterthesis/data/2020_DOP_all_predictions/preds_no_struct_veg_mask_all_crowns_dice_loss_experiment_lr_0.0025_L0.3_D0.7"

# Captures the coordinate snippet (group 1) of a SOURCE filename; accepts both
# the "dop<N>rgbi_<N>_" and the "truedop<N>rgb_" filename prefixes.
COORDINATE_PATTERN = re.compile(
    r'^(?:dop\d+rgbi_\d+|truedop\d+rgb)_(\d+_\d+)_.*\.[jJ][pP]2$',
    flags=re.IGNORECASE,
)

# Template for the substring looked up in prediction filenames; "{}" is
# replaced by a coordinate snippet.
PRED_SEARCH_PATTERN = "_{}_"
# Only files ending with this extension are treated as prediction files.
PRED_FILE_EXTENSION = ".geojson"


In [None]:
def get_source_coordinates(source_dir, pattern):
    """
    Collect the unique coordinate strings found in a directory listing.

    Every filename in ``source_dir`` is matched against the compiled regex
    ``pattern``; group 1 of each successful match is collected.

    Returns:
        A set of coordinate strings (possibly empty), or None when the
        directory does not exist or reading it raises an exception.
    """
    print(f"\n[Step 1] Scanning Source Directory: {source_dir}")

    if not os.path.exists(source_dir):
        print(f"ERROR: Source directory not found: {source_dir}")
        print("Please make sure your Google Drive is mounted and the path is correct.")
        return None  # signal failure to the caller

    try:
        hits = (pattern.match(entry) for entry in os.listdir(source_dir))
        source_coordinates = {hit.group(1) for hit in hits if hit}
    except Exception as e:
        print(f"ERROR reading source directory: {e}")
        return None  # signal failure to the caller

    if source_coordinates:
        print(f"Found {len(source_coordinates)} unique coordinate patterns.")
    else:
        print("WARNING: Found 0 matching source files. Check `SOURCE_DIR` and `COORDINATE_PATTERN`.")

    return source_coordinates

def get_prediction_filenames(predictions_dir, extension):
    """
    List the files in ``predictions_dir`` whose name ends with ``extension``.

    A progress line is printed after every 10000 directory entries scanned.

    Returns:
        A list of matching filenames (names only, in listing order), or None
        when the directory does not exist or reading it raises an exception.
    """
    print(f"\n[Step 2] Scanning Predictions Directory: {predictions_dir}")

    if not os.path.exists(predictions_dir):
        print(f"ERROR: Predictions directory not found: {predictions_dir}")
        print("Please make sure the path is correct.")
        return None  # signal failure to the caller

    matching = []
    try:
        for seen, entry in enumerate(os.listdir(predictions_dir), start=1):
            if entry.endswith(extension):
                matching.append(entry)

            # Periodic progress output for very large directories.
            if seen % 10000 == 0:
                print(f"  ... scanned {seen} files ...")
    except Exception as e:
        print(f"ERROR reading predictions directory: {e}")
        return None  # signal failure to the caller

    print(f"Found {len(matching)} total prediction files ('{extension}').")
    return matching

def count_matches(source_coordinates, prediction_filenames, search_pattern_template):
    """
    Tally, per source coordinate, the prediction filenames that contain it.

    Each coordinate is turned into a search key with
    ``search_pattern_template.format(coord)``. A filename is credited to the
    first coordinate (in iteration order) whose key occurs in it, and to no
    other.

    Returns:
        A ``defaultdict(int)`` mapping coordinate -> number of files, or None
        when either input is None.
    """
    if prediction_filenames is None or source_coordinates is None:
        print("ERROR: Cannot count matches due to previous errors.")
        return None

    print(f"\n[Step 3] Counting matches...")
    tally = defaultdict(int)  # unseen coordinates read as 0

    # Build every search key once, up front.
    key_by_coord = {}
    for coord in source_coordinates:
        key_by_coord[coord] = search_pattern_template.format(coord)

    not_found = object()
    for filename in prediction_filenames:
        # Only the first matching coordinate is credited: a prediction file
        # is assumed to belong to exactly one coordinate.
        owner = next(
            (coord for coord, key in key_by_coord.items() if key in filename),
            not_found,
        )
        if owner is not not_found:
            tally[owner] += 1

    print("Counting complete.")
    return tally

def report_results(source_coordinates, prediction_counts):
    """
    Print a summary of how many prediction files were found per coordinate.

    Args:
        source_coordinates: Iterable of coordinate strings; every one of them
            appears in the report, including those without predictions.
        prediction_counts: Mapping of coordinate -> number of prediction
            files. Coordinates absent from the mapping are reported as 0.
            Any mapping with ``.get`` works (plain dict or defaultdict), and
            the mapping is not modified.

    Returns:
        None. If either argument is None an error line is printed instead of
        the report.
    """
    if source_coordinates is None or prediction_counts is None:
        print("ERROR: Cannot generate report due to previous errors.")
        return

    print("\n--- Results Summary ---")

    missing_coords = []
    found_coords = []

    # Walk the source coordinates (not the counts) so that coordinates with
    # zero predictions are reported too.
    for coord in sorted(source_coordinates):
        # .get() avoids a KeyError on plain dicts and avoids inserting zero
        # entries into a defaultdict as a side effect of reporting.
        count = prediction_counts.get(coord, 0)
        if count == 0:
            missing_coords.append(coord)
        else:
            found_coords.append((coord, count))

    print(f"Total unique coordinates from source: {len(source_coordinates)}")
    print(f"Coordinates with one or more predictions: {len(found_coords)}")
    print(f"Coordinates with ZERO predictions: {len(missing_coords)}")

    if missing_coords:
        print("\n--- Coordinates with 0 Predictions ---")
        for coord in missing_coords:
            print(f"  {coord}")
    else:
        print("\nExcellent! All source coordinates have at least one prediction file.")

    if found_coords:
        print("\n--- Counts for Found Predictions ---")
        for coord, count in found_coords:
            print(f"  {coord}: {count} file(s)")

def check_processed_data():
    """
    Run the whole check end to end: scan the source directory, scan the
    predictions directory, count matches, and print the report.

    Uses the notebook-level SOURCE_DIR, PREDICTIONS_DIR, COORDINATE_PATTERN,
    PRED_SEARCH_PATTERN and PRED_FILE_EXTENSION. Stops with a message as soon
    as one step returns None.
    """
    print(
        f"Starting data check...",
        f"Source Dir: {SOURCE_DIR}",
        f"Predictions Dir: {PREDICTIONS_DIR}",
        sep="\n",
    )

    # Step 1: coordinates present in the source directory.
    source_coords = get_source_coordinates(SOURCE_DIR, COORDINATE_PATTERN)
    if source_coords is None:
        print("Halting due to error in Step 1.")
        return

    # Step 2: prediction filenames.
    prediction_names = get_prediction_filenames(PREDICTIONS_DIR, PRED_FILE_EXTENSION)
    if prediction_names is None:
        print("Halting due to error in Step 2.")
        return

    # Step 3: per-coordinate tallies.
    tallies = count_matches(source_coords, prediction_names, PRED_SEARCH_PATTERN)
    if tallies is None:
        print("Halting due to error in Step 3.")
        return

    # Step 4: printed summary.
    report_results(source_coords, tallies)

    print("\n--- Data check complete. ---")

In [None]:
# Run Step 2 on its own first (the step that failed before): list the prediction files.
pred_files = get_prediction_filenames(
    predictions_dir=PREDICTIONS_DIR,
    extension=PRED_FILE_EXTENSION,
)


[Step 2] Scanning Predictions Directory: /content/drive/MyDrive/masterthesis/data/2020_DOP_all_predictions/preds_no_struct_veg_mask_all_crowns_dice_loss_experiment_lr_0.0025_L0.3_D0.7
  ... scanned 10000 files ...
  ... scanned 20000 files ...
  ... scanned 30000 files ...
  ... scanned 40000 files ...
  ... scanned 50000 files ...
  ... scanned 60000 files ...
  ... scanned 70000 files ...
  ... scanned 80000 files ...
  ... scanned 90000 files ...
  ... scanned 100000 files ...
Found 103726 total prediction files ('.geojson').


In [None]:
# 2c. Once the prediction listing has succeeded, collect the source coordinates (Step 1).
coords = get_source_coordinates(
    source_dir=SOURCE_DIR,
    pattern=COORDINATE_PATTERN,
)




[Step 1] Scanning Source Directory: /content/drive/MyDrive/masterthesis/data/2020_DOP_all
Found 319 unique coordinate patterns.


In [None]:
# 2d. Finish with the two in-memory steps (fast): count the matches, then print the report.
counts = count_matches(
    source_coordinates=coords,
    prediction_filenames=pred_files,
    search_pattern_template=PRED_SEARCH_PATTERN,
)
report_results(source_coordinates=coords, prediction_counts=counts)


[Step 3] Counting matches...
Counting complete.

--- Results Summary ---
Total unique coordinates from source: 319
Coordinates with one or more predictions: 168
Coordinates with ZERO predictions: 151

--- Coordinates with 0 Predictions ---
  390_5802
  390_5804
  390_5806
  390_5808
  390_5810
  390_5816
  390_5818
  390_5820
  390_5822
  390_5824
  390_5826
  390_5828
  390_5830
  390_5832
  390_5834
  392_5810
  392_5812
  392_5814
  392_5816
  392_5818
  392_5820
  392_5822
  392_5824
  392_5826
  392_5828
  392_5830
  392_5832
  392_5834
  394_5798
  394_5800
  394_5802
  394_5804
  394_5806
  394_5808
  394_5810
  394_5812
  394_5814
  394_5816
  394_5818
  394_5820
  394_5822
  394_5824
  394_5826
  394_5828
  394_5830
  394_5832
  394_5834
  394_5836
  396_5798
  396_5800
  396_5802
  396_5804
  396_5806
  396_5808
  396_5810
  396_5812
  396_5814
  396_5816
  396_5818
  396_5820
  396_5822
  396_5824
  396_5826
  396_5828
  396_5830
  396_5832
  396_5834
  396_5836
  398_5798
