# clean_dataset.ipynb

This Jupyter workbook cleans a YOLOv8 dataset by removing bounding boxes from label (*.txt) files where:
* a bounding box classed as zero, low, medium or high [0 .. 3] overlaps another bounding box with class [0..3] with an intersection over union value (IOU) greater than a specified threshold (IOU_THRESHOLD)
* a bounding box classed as vcut which does not overlap another bounding box

Data is expected to be in one or more folders containing paired image files and labels files such as IMG_20221115_114221.jpg and IMG_20221115_114221.txt.

The *.txt files contains labels in the standard YOLOv8 format (class, x_center, y_center, width, height). For example:

```
1 0.212500 0.859722 0.219792 0.280556
0 0.541146 0.808796 0.167708 0.225000
0 0.856771 0.861574 0.212500 0.236111
0 0.035417 0.716204 0.069792 0.397222
5 0.532813 0.761111 0.018750 0.035185
4 0.069531 0.669444 0.020313 0.490741
```

<p style="color:red">WARNING: Label files (*.txt) will be modified. It is highly recommended to backup data before running this code.</p>

In [12]:
import pandas as pd
import numpy as np
import glob

In [13]:
IOU_THRESHOLD = 0.5
IMAGE_PATH =  '/home/aubrey/Desktop/Guam07-training-set/datasets/Guam07v3/images/002/*.jpg'

In [14]:
class Rectangle:
    
    def __init__(self, x, y, w, h):
        '''Create rectangle with center at (x, y) width w, and height h'''
        # Coordinates for YOLO formatted bounding box (xc, yc, w, h; normalized over 0, 1)
        self.x = float(x)
        self.y = float(y)
        self.w = float(w)
        self.h = float(h)
        # Coordinates for XYXY formatted bounding box (x1, y1, x2, y2; normalized over 0, 1)
        self.x1 = self.x - self.w / 2
        self.x2 = self.x + self.w / 2
        self.y1 = self.y - self.h / 2
        self.y2 = self.y + self.h / 2
        
    def intersection_over_union(self, other):
        '''Usage: iou = rect1.intersection_over_union(rect2)'''
        assert self.x1 < self.x2
        assert self.y1 < self.y2
        assert other.x1 < other.x2
        assert other.y1 < other.y2
        
        # calc coordinates of the intersection rectangle
        x1 = max(self.x1, other.x1)
        y1 = max(self.y1, other.y1)
        x2 = min(self.x2, other.x2)
        y2 = min(self.y2, other.y2)
        
        if x2 < x1 or y2 < y1:
            return 0.0   # rectangles do not overlap
        
        intersection_area = (x2 - x1) * (y2 - y1)
        self.area = (self.x2 - self.x1) * (self.y2 - self.y1)
        other.area = (other.x2 - other.x1) * (other.y2 - other.y1)
        iou = intersection_area / float(self.area + other.area - intersection_area)
        assert iou >= 0.0
        assert iou <= 1.0
        return iou
    
# rect1 = Rectangle(0.5, 0.5, 0.1, 0.1)
# rect2 = Rectangle(0.51, 0.51, 0.11, 0.11)
# iou = rect1.intersection_over_union(rect2)
# print(f'{iou=}')

In [15]:
def get_image_list():
    return glob.glob(IMAGE_PATH, recursive=True)

# get_image_list()


In [16]:
def create_df(image_path):
    labels_path = image_path.replace('.jpg', '.txt')
    df = pd.read_csv(labels_path, sep=' ', names=['cls', 'x', 'y', 'w', 'h'])
    return df

In [17]:
def create_iou_array(df):
    n = len(df)
    iou_array = np.zeros((n, n))

    for i1, r1 in df.iterrows(): 
        rect1 = Rectangle(r1.x, r1.y, r1.w, r1.h)
        for i2, r2 in df.iterrows():
            if i2 > i1:   # populate only values which are above the diagonal
                rect2 = Rectangle(r2.x, r2.y, r2.w, r2.h)
                iou = rect1.intersection_over_union(rect2)
                # print(f'{i1=} {i2=} {iou=}')
                iou_array[i1, i2] = iou
    return iou_array

# iou_array = create_iou_array(df)                      
# with np.printoptions(precision=3):
#     print(iou_array)

In [18]:
def create_above_iou_threshold_array(iou_array):
    return iou_array > IOU_THRESHOLD

# above_iou_threshold_array = create_above_iou_threshold_array(iou_array)
# with np.printoptions():
#     print(above_iou_threshold_array)

In [19]:
# MAIN

image_list = get_image_list()
for image_path in image_list:
    df = create_df(image_path)
    iou_array = create_iou_array(df)
    above_iou_threshold_array = create_above_iou_threshold_array(iou_array)
 
    df1 = df.copy()
    n = above_iou_threshold_array.shape[0]

    # Drop overlapping bounding boxes for classes 0..3 where IOU_THRESHOLD is exceeded
    for i in range(n):
        for j in range(n):
            if above_iou_threshold_array[i, j]:
                class0 = int(df.iloc[i].cls)
                class1 = int(df.iloc[j].cls)
                if class0 in [0,1,2,3] and class1 in [0,1,2,3]:
                    df1.drop(index=j, inplace=True)
                    
    # Drop records for class 5 (vcut) which do not overlap other bounding boxes
    # i.e. a vcut not found within a coconut palm object
    
    # get row indices for class 5 (vcut) bounding boxes
    vcut_indices = df.query('cls==5').index.values.tolist()

    for i in vcut_indices:
        if sum(iou_array[:,i]) == 0:
            df1.drop(index=i, inplace=True)
 
    # if any records were dropped, overwrite the labels file (*.txt)    
    if df1.shape[0] < df.shape[0]:
        label_path = image_path.replace('.jpg', '.txt')
        df1.to_csv(label_path, sep=' ', header=False, index=False)
        print(f'{label_path} updated')
        
print('FINISHED')

/home/aubrey/Desktop/Guam07-training-set/datasets/Guam07v3/images/002/IMG_20221115_113054.txt updated
FINISHED
/home/aubrey/Desktop/Guam07-training-set/datasets/Guam07v3/images/002/IMG_20221117_115456.txt updated
FINISHED
/home/aubrey/Desktop/Guam07-training-set/datasets/Guam07v3/images/002/IMG_20221116_113821.txt updated
FINISHED
/home/aubrey/Desktop/Guam07-training-set/datasets/Guam07v3/images/002/IMG_20221128_135145.txt updated
FINISHED
/home/aubrey/Desktop/Guam07-training-set/datasets/Guam07v3/images/002/IMG_20221116_113941.txt updated
FINISHED
FINISHED
FINISHED
/home/aubrey/Desktop/Guam07-training-set/datasets/Guam07v3/images/002/IMG_20221116_124532.txt updated
FINISHED
/home/aubrey/Desktop/Guam07-training-set/datasets/Guam07v3/images/002/IMG_20221117_125853.txt updated
FINISHED
/home/aubrey/Desktop/Guam07-training-set/datasets/Guam07v3/images/002/IMG_20221117_100732.txt updated
FINISHED
FINISHED
/home/aubrey/Desktop/Guam07-training-set/datasets/Guam07v3/images/002/IMG_20221115_12