<a href="https://colab.research.google.com/github/aborbala/tree-canopy/blob/main/calculate_priors_from_yolo_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the dataset paths used below
# (under /content/drive/MyDrive/...) are readable. Requires the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import numpy as np
from tqdm import tqdm
import pandas as pd
from collections import defaultdict


In [None]:
# Directory holding the training-split YOLO label files (.txt) scanned by calculate_priors.
label_dir = "/content/drive/MyDrive/masterthesis/data/386_5818/yolo_dataset_no_structures_veg_mask/labels/train"

# Class IDs to treat as foreground.
# NOTE(review): the visible call to calculate_priors further down passes its own
# class list rather than this variable — confirm which one is intended.
foreground_classes = [1]

def polygon_area(x, y):
    """
    Compute the area enclosed by a polygon with the Shoelace formula.

    Args:
        x (array-like): x-coordinates of the vertices, listed in order around
            the polygon.
        y (array-like): y-coordinates of the vertices, in the same order as `x`.

    Returns:
        float: The non-negative polygon area. Taking the absolute value makes
        the result independent of the vertex winding direction.
    """
    # Pair every vertex with its predecessor; the first vertex wraps to the last.
    prev_x = np.roll(x, 1)
    prev_y = np.roll(y, 1)

    # Shoelace: half the absolute difference of the two cross-product sums.
    twice_signed_area = np.dot(x, prev_y) - np.dot(y, prev_x)
    return 0.5 * np.abs(twice_signed_area)

def calculate_priors(label_path, fg_classes):
    """
    Estimate background/foreground area priors from YOLO segmentation labels.

    Every ``.txt`` file in ``label_path`` is read line by line. Each non-empty
    line is parsed as ``<class_id> x1 y1 x2 y2 ...``. For lines whose class ID
    is in ``fg_classes`` the polygon area is computed with ``polygon_area`` and
    accumulated. The foreground prior is the accumulated area divided by the
    number of label files; the background prior is its complement.

    Notes:
        * Coordinates are assumed to be normalized to [0, 1], so that each
          image has area 1.0 (the usual YOLO convention) — verify for your
          dataset.
        * Areas are summed per polygon, so overlapping polygons are counted
          more than once.
        * The denominator is the number of label files found; images that have
          no label file do not contribute to it.
        * Malformed lines (non-integer class ID, non-numeric coordinate, or an
          odd number of coordinate values) are reported and skipped instead of
          aborting the whole scan.

    Args:
        label_path (str): Directory containing the YOLO ``.txt`` label files.
        fg_classes (iterable of int): Class IDs to treat as foreground.

    Returns:
        tuple[float, float]: ``(pi_background, pi_foreground)``.

    Raises:
        FileNotFoundError: If ``label_path`` contains no ``.txt`` files.
    """
    label_files = [f for f in os.listdir(label_path) if f.endswith('.txt')]
    if not label_files:
        raise FileNotFoundError(f"No .txt label files found in '{label_path}'. Please check the path.")

    # Guaranteed to be >= 1 here because of the check above.
    num_images = len(label_files)
    fg_ids = set(fg_classes)
    total_foreground_area_normalized = 0.0

    print(f"Found {num_images} label files. Analyzing polygon areas...")

    for label_file in tqdm(label_files, desc="Processing labels"):
        with open(os.path.join(label_path, label_file), 'r') as f:
            for line_no, line in enumerate(f, start=1):
                parts = line.split()
                if not parts:
                    continue

                try:
                    class_id = int(parts[0])
                except ValueError:
                    print(f"Warning: Skipping line {line_no} in {label_file}: invalid class ID '{parts[0]}'")
                    continue

                # Only polygons of the requested foreground classes contribute.
                if class_id not in fg_ids:
                    continue

                try:
                    coords = np.array([float(value) for value in parts[1:]])
                except ValueError:
                    print(f"Warning: Skipping line {line_no} in {label_file}: non-numeric coordinate")
                    continue

                # Coordinates come as x/y pairs; an odd count cannot be split
                # into equally long x and y arrays.
                if coords.size % 2 != 0:
                    print(f"Warning: Skipping line {line_no} in {label_file}: odd number of coordinate values")
                    continue

                # Even positions hold x values, odd positions hold y values.
                total_foreground_area_normalized += polygon_area(coords[0::2], coords[1::2])

    # Each image is taken to have a normalized area of 1.0, so the total area
    # equals the number of images.
    pi_foreground = total_foreground_area_normalized / num_images
    pi_background = 1.0 - pi_foreground

    return pi_background, pi_foreground


Found 536 label files. Analyzing polygon areas...


Processing labels: 100%|██████████| 536/536 [01:11<00:00,  7.48it/s]


--- Calculation Complete ---
Foreground Classes specified: [1]
Foreground (Tree) Prior (pi_1): 0.130932
Background Prior (pi_0): 0.869068

>>> IMPORTANT: Update these values in your loss.py file! <<<
Example update in v8SegmentationLoss.__init__:
    background_prior = 0.869068
    foreground_prior = 0.130932





In [None]:
# Class IDs counted as foreground for this run; bound once so the call and the
# printed report always agree.
fg_class_ids = [0, 1]
pi_background, pi_foreground = calculate_priors(label_dir, fg_class_ids)

# Assemble the report line by line, then emit it with a single print call.
report_lines = [
    "\n--- Calculation Complete ---",
    f"Foreground Classes specified: {fg_class_ids}",
    f"Foreground (Tree) Prior (pi_1): {pi_foreground:.6f}",
    f"Background Prior (pi_0): {pi_background:.6f}",
    "\n>>> IMPORTANT: Update these values in your loss.py file! <<<",
    "Example update in v8SegmentationLoss.__init__:",
    f"    background_prior = {pi_background:.6f}",
    f"    foreground_prior = {pi_foreground:.6f}",
]
print("\n".join(report_lines))


Found 536 label files. Analyzing polygon areas...


Processing labels: 100%|██████████| 536/536 [00:05<00:00, 94.54it/s] 


--- Calculation Complete ---
Foreground Classes specified: [0, 1]
Foreground (Tree) Prior (pi_1): 0.225960
Background Prior (pi_0): 0.774040

>>> IMPORTANT: Update these values in your loss.py file! <<<
Example update in v8SegmentationLoss.__init__:
    background_prior = 0.774040
    foreground_prior = 0.225960





In [None]:
def analyze_dataset_counts(base_label_dir: str) -> pd.DataFrame:
    """
    Count polygons per class in the train and val label directories of a YOLO dataset.

    Every non-empty line of every ``.txt`` file counts as one polygon. The first
    token of the line is parsed as the integer class ID; lines whose first token
    is not an integer are reported with a warning and are still included in
    'Total Polygons' but in no class column.

    Args:
        base_label_dir (str): The path to the base 'labels' directory, which should
                              contain 'train' and 'val' subdirectories. A missing
                              subdirectory is reported and skipped.

    Returns:
        pandas.DataFrame: One row per split plus a 'Grand Total' row. Columns are
                          'Total Polygons' followed by one 'Class <id> Polygons'
                          column per class, ordered by numeric class ID. An empty
                          DataFrame is returned when nothing was counted.
    """
    splits_to_process = ['train', 'val']

    # defaultdict lets any class ID be counted without registering it first.
    counts = {split: defaultdict(int) for split in splits_to_process}

    for split in splits_to_process:
        split_path = os.path.join(base_label_dir, split)

        if not os.path.isdir(split_path):
            print(f"Warning: Directory not found for split '{split}': {split_path}")
            continue

        label_files = [f for f in os.listdir(split_path) if f.endswith('.txt')]
        print(f"\nProcessing '{split}' split with {len(label_files)} images...")

        for label_file in tqdm(label_files, desc=f"Analyzing {split} labels"):
            with open(os.path.join(split_path, label_file), 'r') as f:
                for line in f:
                    parts = line.split()
                    if not parts:
                        continue

                    # Each line in a YOLO label file represents one polygon/object.
                    counts[split]['Total Polygons'] += 1

                    # `parts` is non-empty here, so only the int conversion can fail.
                    try:
                        class_id = int(parts[0])
                    except ValueError:
                        print(f"Warning: Could not parse line in {label_file}: {line.strip()}")
                        continue

                    counts[split][f'Class {class_id} Polygons'] += 1

    # Rows are splits, columns are counters; a class absent from a split becomes 0.
    summary_df = pd.DataFrame.from_dict(counts, orient='index').fillna(0).astype(int)

    if summary_df.empty:
        print("\nNo data processed. Cannot create summary.")
        return pd.DataFrame()

    summary_df.loc['Grand Total'] = summary_df.sum()

    # Put the total first, then the class columns ordered by numeric class ID.
    # A plain string sort would place 'Class 10 Polygons' before 'Class 2 Polygons'.
    class_cols = sorted(
        (col for col in summary_df.columns if col.startswith('Class ')),
        key=lambda col: int(col.split()[1]),
    )
    other_cols = [col for col in summary_df.columns if not col.startswith('Class ')]
    summary_df = summary_df[other_cols + class_cols]

    return summary_df


# Root 'labels' directory; analyze_dataset_counts looks for the 'train' and
# 'val' subdirectories inside it.
base_label_dir = "/content/drive/MyDrive/masterthesis/data/386_5818/yolo_dataset_no_structures_veg_mask/labels"

# Count polygons per split and per class.
dataset_summary = analyze_dataset_counts(base_label_dir)

# Emit the header and the complete table as plain text in one call.
print("\n\n--- Dataset Polygon Count Summary ---", dataset_summary.to_string(), sep="\n")


Processing 'train' split with 536 images...


Analyzing train labels: 100%|██████████| 536/536 [00:12<00:00, 41.63it/s] 



Processing 'val' split with 135 images...


Analyzing val labels: 100%|██████████| 135/135 [00:02<00:00, 61.62it/s] 



--- Dataset Polygon Count Summary ---
             Total Polygons  Class 0 Polygons  Class 1 Polygons
train                  7641              4032              3609
val                    1993              1092               901
Grand Total            9634              5124              4510



