# create_new_dataset.ipynb

This notebook implements my workflow for fine tuning a YOLOv8 object detection model which
detects coconut rhinoceros beetle damage in coconut palms.

# Installation

Clone the repo
```
git clone https://github.com/aubreymoore/CRB-Damage-Dataset-Improvement
```

Move to the new folder
```
cd CRB-Damage-Dataset-Improvement
```

Create a virtual environment
```
python3 -m venv .venv
```

Activate the new virtual environment
```
source venv/bin/activate
```

Install required python modules
```
pip install -r code/requirements.txt
```

Create a .gitignore file and add .venv to the list of files and folders to be ignored.
Adding a virtual environmant to a repository is bad practice.
```
echo ".venv" >> .gitignore
```

# References

https://pybit.es/articles/a-better-place-to-put-your-python-virtual-environments/

[Image Deduplication](https://github.com/voxel51/fiftyone-examples/blob/master/examples/image_deduplication.ipynb)

[CVAT <> FiftyOne: Data-Centric Machine Learning with Two Open Source Tools](https://www.cvat.ai/post/data-centric)

[FiftyOne - Ultralytics Integration](https://docs.voxel51.com/integrations/ultralytics.html)

[Finding Detection Mistakes with FiftyOne](https://docs.voxel51.com/tutorials/detection_mistakes.html)

[Fine-tune YOLOv8 models for custom use cases with the help of FiftyOne](https://docs.voxel51.com/tutorials/yolov8.html)

[FiftyOne Brain](https://docs.voxel51.com/brain.html)

[Tracking Datasets in FiftyOne](https://voxel51.com/blog/tracking-datasets-in-fiftyone/)

In [3]:
import os
import shutil
import glob
import fiftyone as fo
import fiftyone.brain as fob
import fiftyone.zoo as foz
from fiftyone import ViewField as F
import logging
import sys
from icecream import ic
from datetime import datetime
import numpy as np
from numpy.linalg import norm
from ultralytics import YOLO
import glob

In [4]:
def add_timestamp_field():
    dataset.add_sample_field("timestamp", fo.DateTimeField)

    for sample in dataset:
        timestamp_str = os.path.basename(sample.filepath)[4:-4]
        dt = datetime.strptime(timestamp_str, '%Y%m%d_%H%M%S')
        # ic(timestamp_str, dt)
        sample['timestamp'] = dt
        sample.save()
    
    # Create view  
    view = dataset.sort_by(F'timestamp')
    dataset.save_view('sorted_by_timestamp', view, overwrite=True)

In [5]:
def update_requirements_file():
    os.system('pip list --format=freeze > requirements.txt')

# update_requirements_file()

In [6]:
def create_new_dataset(ORIGINAL_DS_PATH, NEW_DS_PATH):
    """ 
    """
    os.mkdir(NEW_DS_PATH)
    os.mkdir(f'{NEW_DS_PATH}/images')
    os.mkdir(f'{NEW_DS_PATH}/images/train')
    os.mkdir(f'{NEW_DS_PATH}/images/val')
    os.mkdir(f'{NEW_DS_PATH}/labels')
    os.mkdir(f'{NEW_DS_PATH}/labels/train')
    os.mkdir(f'{NEW_DS_PATH}/labels/val')
    
    for filepath in glob.glob(f'{ORIGINAL_DS_PATH}/train/*.jpg'):
        shutil.copy2(filepath, f'{NEW_DS_PATH}/images/train')
    for filepath in glob.glob(f'{ORIGINAL_DS_PATH}/train/*.txt'):
        shutil.copy2(filepath, f'{NEW_DS_PATH}/labels/train')
    for filepath in glob.glob(f'{ORIGINAL_DS_PATH}/val/*.jpg'):
        shutil.copy2(filepath, f'{NEW_DS_PATH}/images/val')
    for filepath in glob.glob(f'{ORIGINAL_DS_PATH}/val/*.txt'):
        shutil.copy2(filepath, f'{NEW_DS_PATH}/labels/val')
        
    s = f'path: {NEW_DS_PATH} \n'
    s += 'train: ./images/train/ \n'
    s += 'val: ./images/val/ \n'
    s += 'names: \n'
    s += '  0: live \n'
    s += '  1: dead \n'
    s += '  2: vcut \n'
    with open(f'{NEW_DS_PATH}/dataset.yaml', 'w') as f:
        f.write(s)

In [7]:
def yolo2fiftyone(FO_DATASET_NAME, dataset_dir, splits=["train", "val"]):
    """ 
    Imports a dataset in YOLO5 format into FiftyOne, using tags to mark the samples in each split 
    """ 
    dataset = fo.Dataset(name, persistent=True)
    for split in splits:
        dataset.add_dir(
            dataset_dir=dataset_dir,
            dataset_type=fo.types.YOLOv5Dataset,
            split=split,
            tags=split,
    )
    return dataset

In [8]:
def add_embeddings_field():
    """ 
    """ 
    model = foz.load_zoo_model("mobilenet-v2-imagenet-torch")
    dataset.compute_embeddings(model=model, embeddings_field='embeddings')

In [9]:
def cosine_similarity(a, b):
    return np.dot(a,b)/(norm(a)*norm(b))
 
# a = np.array([2,1,2,3,2,9])
# b = np.array([3,4,2,4,5,5])
# cosine_similarity(a, b)

In [10]:
def add_similarity_with_prev_img_field():
    """ 
    """
    view = dataset.load_saved_view('sorted_by_timestamp')
    # thresh = 0.92
    first_sample = True
    for sample in view:
        if first_sample:
            current_embeddings = sample.embeddings
            similarity = 0.0
            first_sample = False
        else:
            previous_embeddings = current_embeddings
            current_embeddings = sample.embeddings
            similarity = cosine_similarity(previous_embeddings, current_embeddings)
        sample['similarity_with_prev_img'] = similarity
        # if similarity > thresh:
        #     sample.tags.append(f'similarity>{thresh}')
        # else:
        #     sample.tags.append('similarity OK') 
        sample.save()

In [11]:
def add_predictions_field():
    """ 
    """
    # Load YOLOv8 model
    # from ultralytics import YOLO
    model = YOLO(ORIGINAL_MODEL_PATH)
    dataset.apply_model(model, label_field="yolov8")
    
# add_predictions_field()

In [12]:
def add_mistakenness_field():
    """ 
    Adds mistakenness, possible_missing and possible_spurious fields.
    See docs at https://docs.voxel51.com/brain.html#label-mistakes for details.
    """
    fob.compute_mistakenness(dataset, "yolov8", label_field="ground_truth")  
    
# add_mistakenness_field() 

In [13]:
def add_field(fieldname, func):
    """ 
    This utility function checks for existence of a field in a dataset.
    If the field does not exist it is added by running func.
    """
    if dataset.get_field(fieldname):
        logger.info(f'"{fieldname}" field already exists')
    else:
        logger.info(f'Adding "{fieldname}" field')
        func()

# def add_new_field():
#     """ 
#     Code for adding a field named 'new' should be inserted in this function.
#     """
#     pass
    
# add_field('new', add_new_field)

In [14]:
def create_autocorrelated_images_view(threshold, delete=False):
    """ 
    """
    dataset = fo.load_dataset(FO_DATASET_NAME)
    sorted_by_timestamp_view = dataset.load_saved_view('sorted_by_timestamp')
    view = sorted_by_timestamp_view.match(
        F('similarity_with_prev_img') > threshold)
    dataset.save_view("autocorrelated_images_view", view, overwrite=True)
    count = view.count()
    
    if delete:
        dataset.delete_samples(view) 
        dataset.save()
     
    return count
  
# create_autocorrelated_images_view(0.98, True)

In [15]:
def count_ground_truth_bbs(dataset):
    total_detections = 0
    for sample in dataset:
        total_detections += len(sample.ground_truth.detections)
    return total_detections

# count_ground_truth_bbs()

In [16]:
def create_bb_touching_edge_view(delete=False):
    """ 
    https://docs.voxel51.com/recipes/remove_duplicate_annos.html
    """
    dataset = fo.load_dataset(FO_DATASET_NAME)
    view = dataset.filter_labels('ground_truth', 
        (F('bounding_box')[0] <= 0) | # left
        (F('bounding_box')[1] <= 0) | # top
        ((F('bounding_box')[0] + F('bounding_box')[3]) >= 1) # right
    )
    dataset.save_view('bb_touching_edge', view, overwrite=True) 
    count = view.count()
           
    if delete:
        dataset.delete_labels(view)
    dataset.save()
            
    return  count

# create_bb_touching_edge_view()

In [17]:
def remove_unannotated_images(yolo5_dataset_path: str) -> int:
    """ 
    Removes unannoted images from a YOLO5 data set
    Arguments:
        yolo5_dataset_path -- absolute path for the YOLO5 dataset
    Returns:
        count -- number of image (*.jpg) and annotation file pairs removed
    """ 
    search_str = f'{yolo5_dataset_path}/**/*.txt'
    txt_paths = glob.glob(search_str, recursive=True)
    count = 0
    for txt_path in txt_paths:
        if os.path.getsize(txt_path) == 0:
            img_path = txt_path.replace('labels', 'images').replace('.txt', '.jpg')
            os.remove(txt_path)
            os.remove(img_path)
            count += 1
    return count

# remove_unannotated_images(
#     yolo5_dataset_path='/home/aubrey/myexport')

In [18]:
def export_51_to_YOLO(dataset_name: str, 
                      export_dir: str, 
                      remove_unannotated: bool) -> int:
    """
    Export a dataset from 51 format to YOLO5 format.
    Optionally, unannotated images will be removed from the export_dir.
    
    Arguments:
    dataset_name -- a saved (persistent) 51 dataset
    export_dir -- absolute destination path for the YOLO5 dataset
    remove_unannoted -- if True, unannoted images are removed from the new YOLO5 dataset

    Reference https://docs.voxel51.com/user_guide/export_datasets.html#yolov5dataset
    """
    label_field = "ground_truth"

    # The splits to export
    splits = ["train", "val"]

    # All splits must use the same classes list
    classes = ["live", "dead", "vcut"]

    # The dataset or view to export
    # We assume the dataset uses sample tags to encode the splits to export
    dataset_or_view = fo.load_dataset(dataset_name)

    # Export the splits
    for split in splits:
        split_view = dataset_or_view.match_tags(split)
        split_view.export(
            export_dir=export_dir,
            dataset_type=fo.types.YOLOv5Dataset,
            label_field=label_field,
            split=split,
            classes=classes,
        )
        
    # Remove unannotated images (optional)
    images_removed = 0
    if remove_unannotated:
        images_removed = remove_unannotated_images(
            yolo5_dataset_path=export_dir)
    return images_removed     

# export_51_to_YOLO(
#     dataset_name='Guam07v3', 
#     export_dir='/home/aubrey/myexport', 
#     remove_unannotated=True)

In [19]:
def train_model():

    model = YOLO('/home/aubrey/label-studio-ml-backend/runs/detect/newt/weights/best.pt')
    results = model.train(
        resume = True,
        imgsz=1920,
        rect=True,
        # data= '/home/aubrey/myexport/dataset.yaml',
        epochs=5,
        batch=-1,
        patience=5,
        name='newt'
    )

# train_model()
 
# train model
# !yolo \
# task=detect \
# mode=train \
# model= /home/aubrey/Desktop/Guam07-training-set/datasets/3class-no-symlinks/runs/detect/train5/weights/best.pt \
# imgsz=1920 \
# data= /home/aubrey/myexport/dataset.yaml \
# epochs=1000 \
# batch=-1 \
# patience=50 \
# name=dataset3_yolov8n

In [20]:
def launch_cvat(anno_key_suffix: str, view) -> str:
    """ 
    Saves a FiftyOne view in CVAT and launches the CVAT annotator
    
    Arguments:
    anno_key_suffix - string     
    view - the view to be imported into CVAT
    
    Result:
    
    anno_key - a unique string in the form of myview-2024-11-27-16:57
    """
    timestamp = datetime.strftime(datetime.now(), '%Y%m%d%H%M')
    anno_key = f'{anno_key_suffix}_{timestamp}'
    view.annotate(
        anno_key= anno_key,
        label_field="ground_truth", 
        launch_editor=True
    )
    return anno_key
    
# random_dozen_view = dataset.take(12)
# launch_cvat('random_dozen', random_dozen_view)


In [21]:
def configure_logger(LOGFILE):
    """
    Configure logger to send messages to notebook and LOGFILE
    """
    logging.root.handlers = []
    logging.basicConfig(
        level=logging.INFO, 
        format='%(asctime)s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        handlers=[
            logging.FileHandler(filename=LOGFILE),
            logging.StreamHandler(sys.stdout)
        ]
    )
    logger = logging.getLogger()
    return logger

```
if NEW_DS_PATH exists:
    continue
else:
    create_new_dataset(ORIGINAL_DS_PATH, NEW_DS_PATH)

if FO_DATASET_NAME exists:
    continue
else:
    yolo2fiftyone(name=FO_DATASET_NAME, dataset_dir=NEW_DS_PATH)
    
dataset = fo.load_dataset(FO_DATASET_NAME)

# Add sample fields if they don't already exist
add_field('timestamp', add_timestamp_field)
add_field('embeddings', add_embeddings_field)
add_field('similarity_with_prev_img', add_similarity_with_prev_img_field)
add_field('yolov8', add_predictions_field)
add_field('mistakenness', add_mistakenness_field)

if 'bb_touching_edge' in dataset.list_saved_views():
    continue
else:
    create_bb_touching_edge_view(DELETE_BBS_TOUCHING_EDGES)

if 'autocorrelated_images_view' in dataset.list_saved_views():
    continue
else:
    create_autocorrelated_images_view(AUTOCORRELATED_IMAGES_THRESHOLD, DELETE_AUTOCORRELATED_IMAGES)

if RETRAIN_MODEL:
    export_51_to_YOLO()
    train_model()
else:
    continue

if LAUNCH_51:
    dataset = fo.load_dataset(FO_DATASET_NAME)
    session = fo.launch_app(dataset, auto=False)
```

In [22]:
# MAIN

# Start of constants #############################################################################

# path to dataset in new YOLO format 
ORIGINAL_DS_PATH = '/home/aubrey/Desktop/Guam07-training-set/datasets/3class-no-symlinks'

# path to latest weights file
ORIGINAL_MODEL_PATH = f'{ORIGINAL_DS_PATH}/runs/detect/train5/weights/best.pt'

# path to dataset in YOLOv5 format
NEW_DS_PATH = '/home/aubrey/datasets/Guam07v3'

# name of FiftyOne dataset
FO_DATASET_NAME = 'Guam07v3'

# file name for log file saved in the same folder as this notebook
LOGFILE = 'create_new_dataset.log'

# Arguments for create_autocorrelated_images_view function.
AUTOCORRELATED_IMAGES_THRESHOLD = 0.98
DELETE_AUTOCORRELATED_IMAGES = True

# Argument for create_autocorrelated_images_view function
DELETE_BBS_TOUCHING_EDGES = True

# Option to retrain model. Usually FALSE.
RETRAIN_MODEL = False

# Option to launch FiftyOne in browser at end of workflow. Usually True.
LAUNCH_51 = True

# End of constants ########################################################################

#configure logger
logger = configure_logger(LOGFILE)

logger.info(globals())

# update requirements.txt
logger.info('Updating "requirements.txt"')
update_requirements_file()

# wrangle dataset into YOLOv5 format
if os.path.exists(NEW_DS_PATH):
    logger.info(f'"{NEW_DS_PATH}" already exists in YOLOv5 format')
else:
    logger.info(f'creating dataset "{NEW_DS_PATH}" in YOLOv5 format')
    create_new_dataset(ORIGINAL_DS_PATH, NEW_DS_PATH)

# Create new FiftyOne dataset
if FO_DATASET_NAME in fo.list_datasets():
    logger.info(f'FiftyOne dataset "{FO_DATASET_NAME}" already exists') 
else:
    logger.info(f'Creating FiftyOne dataset "{FO_DATASET_NAME}"')
    dataset = yolo2fiftyone(name=FO_DATASET_NAME, dataset_dir=NEW_DS_PATH)
    
# Load dataset
logger.info(f'Loading FiftyOne dataset "{FO_DATASET_NAME}"')
dataset = fo.load_dataset(FO_DATASET_NAME)
logger.info(f'    Ground truth bounding boxes: {count_ground_truth_bbs(dataset)}')

# Add fields if they don't already exist
add_field('timestamp', add_timestamp_field)
add_field('embeddings', add_embeddings_field)
add_field('similarity_with_prev_img', add_similarity_with_prev_img_field)
add_field('yolov8', add_predictions_field)
add_field('mistakenness', add_mistakenness_field)

# Find bounding boxes touching left, top or right edges of images
if 'bb_touching_edge' in dataset.list_saved_views():
    logger.info('"bb_touching_edge_view" already exists')
else:
    logger.info('Creating "bb_touching_edge_view"')
    if DELETE_BBS_TOUCHING_EDGES:
        logger.info('    "DELETE_BBS_TOUCHING_EDGES" is True; bbs will be deleted')
    else:
        logger.info('    "DELETE_BBS_TOUCHING_EDGES" is False; bbs will not be deleted')
    bb_touching_edge_count = create_bb_touching_edge_view(DELETE_BBS_TOUCHING_EDGES)
    logger.info(f'    {bb_touching_edge_count} ground truth bounding boxes touching image edges were found')

# Find autocorrelated images
if 'autocorrelated_images_view' in dataset.list_saved_views():
    logger.info('"autocorrelated_images_view" already exists')
else:
    logger.info('Creating "autocorrelated_images_view"')
    if DELETE_BBS_TOUCHING_EDGES:
        logger.info('    "DELETE_AUTOCORRELATED_IMAGES" is True; samples will be deleted')
    else:
        logger.info('    "DELETE_AUTOCORRELATED_IMAGES" is False; bbs will not be deleted')
    autocorrelated_image_count = create_autocorrelated_images_view(
        threshold=AUTOCORRELATED_IMAGES_THRESHOLD, delete=DELETE_AUTOCORRELATED_IMAGES)
    logger.info(f'    With a threshold of {AUTOCORRELATED_IMAGES_THRESHOLD}, {autocorrelated_image_count} autocorrelated images were found')

if RETRAIN_MODEL:
    export_51_to_YOLO(
        dataset_name='Guam07v3', 
        export_dir='/home/aubrey/myexport', 
        remove_unannotated=True)
    train_model()

if LAUNCH_51:
    
    # Reload dataset
    logger.info(f'Loading FiftyOne dataset "{FO_DATASET_NAME}"')
    dataset = fo.load_dataset(FO_DATASET_NAME)
    logger.info(f'    Ground truth bounding boxes: {count_ground_truth_bbs(dataset)}')

    # Launch FiftyOne app in browser
    logger.info(f'Launching FifyOne app in browser')
    session = fo.launch_app(dataset, auto=False)
    logger.info(session)

logger.info('FINISHED')

2024-11-27 17:41:07 {'__name__': '__main__', '__doc__': 'Automatically created module for IPython interactive environment', '__package__': None, '__loader__': None, '__spec__': None, '__builtin__': <module 'builtins' (built-in)>, '__builtins__': <module 'builtins' (built-in)>, '_ih': ['', 'import os\nimport shutil\nimport glob\nimport fiftyone as fo\nimport fiftyone.brain as fob\nimport fiftyone.zoo as foz\nfrom fiftyone import ViewField as F\nimport logging\nimport sys\nfrom icecream import ic\nfrom datetime import datetime\nimport numpy as np\nfrom numpy.linalg import norm\nfrom ultralytics import YOLO\nimport glob\nimport mysecrets\n# import ipywidgets as widgets\n# from IPython.display import display', 'import os\nimport shutil\nimport glob\nimport fiftyone as fo\nimport fiftyone.brain as fob\nimport fiftyone.zoo as foz\nfrom fiftyone import ViewField as F\nimport logging\nimport sys\nfrom icecream import ic\nfrom datetime import datetime\nimport numpy as np\nfrom numpy.linalg impo

In [23]:
# dataset.info['notes'] = 'Here is another note about this dataset.' 
# dataset.save()
# dataset.info