# Data Prepration for YOLOv5



In [1]:
# imports
import shutil
import json
from random import random
import numpy as np
import os

The YOLOv5 docs recommend a split 80% training, 10% validation and 10% test. Considering the decent size of dataset we will have plenty of examples in both datasets

In [2]:
TRAIN_SPLIT = 0.8
TEST_SPLIT = 0.1
VAL_SPLIT = 0.1
DATASET_PATH = "Implementation/dataset/"

Loading merged annotations

In [3]:
annotations = json.load(open("full_annotations.json"))

To speed up the process we will be dividing the dataset based on a uniform random distribution, where probability of ending up in trianing, validation, and test is 80%, 10% and 10% respectively. Considering the large size of dataset we choose this to speed up the combined dataset creation.

In [4]:
train = 0
test = 0
val = 0
for i in range(5000):
    s = random()
    if s <= 0.8:
        train += 1
    elif s <= 0.9:
        test += 1
    else:
        val += 1

        
train, test, val

(4038, 483, 479)

In [5]:
annotations["annotations"][0], annotations["images"][0]

({'iscrowd': 0,
  'image_id': 1,
  'bbox': [877.0, 119.0, 23.0, 8.0],
  'segmentation': [],
  'category_id': 2,
  'id': 1,
  'area': 184},
 {'height': 720,
  'width': 1280,
  'id': 1,
  'file_name': 'KILI_COCO/batch_1/000001.jpg'})

Original annotation : \[x_min, y_min, width, height\]  
Updated annotation (YOLO): \[cx, cy, width, height\] (norm)

In [4]:
def yoloBbox(bbox, imageW, imageH):
    cx = (bbox[0] + bbox[2]/2)/imageW
    cy = (bbox[1] + bbox[3]/2)/imageH
    
    return [cx, cy, bbox[2]/imageW, bbox[3]/imageH]

In [5]:
def getSplit():
    s = random()
    if s <= TRAIN_SPLIT:
        return "train"
    elif s <= TRAIN_SPLIT + VAL_SPLIT:
        return "val"
    else:
        return "test"

In [7]:
def writeBbox(bboxes, filePath):
    with open(filePath, "w") as f:
        text = ""
        for bbox in bboxes:
            text += f"{bbox[0]} "
            text += " ".join([str(c) for c in bbox[1]])
            text += "\n"
        f.write(text)

In [9]:
annotations["annotations"][2]

{'iscrowd': 0,
 'image_id': 2,
 'bbox': [988.0, 493.0, 34.0, 22.0],
 'segmentation': [],
 'category_id': 1,
 'id': 3,
 'area': 748}

In [17]:
# initialize directories

# clear all contents
!rm -rf Implementation/dataset/

'rm' is not recognized as an internal or external command,
operable program or batch file.


In [8]:
# create dirs
%mkdir "Implementation/dataset/images"
%mkdir "Implementation/dataset/labels"

%mkdir "Implementation/dataset/images/train"
%mkdir "Implementation/dataset/images/val"
%mkdir "Implementation/dataset/images/test"

%mkdir "Implementation/dataset/labels/train"
%mkdir "Implementation/dataset/labels/val"
%mkdir "Implementation/dataset/labels/test"

with open("Implementation/dataset/dataset.yaml", "w") as f:
    f.write("""# train, val and test data
train: ../dataset/images/train/
val: ../dataset/images/val/
test: ../dataset/images/test/

# number of classes
nc: 4

# class names
names: ["Plastic bag", "Plastic bottle", "Plastic other", "Non plastic waste"]""")

In [9]:
img_idx = 0
annotation_idx = 0

# number of images
N_IMAGES = len(annotations["images"])
# N_IMAGES = 100

# looping over every image
while img_idx < N_IMAGES:
    # get the image
    img_data = annotations["images"][img_idx]
    
    # get annotations
    img_annotations = []
    while (annotation_idx < len(annotations["annotations"]) 
           and annotations["annotations"][annotation_idx]["image_id"] == img_data["id"]):
        bbox = annotations["annotations"][annotation_idx]["bbox"]
        if len(bbox) == 4:
            img_annotations += [(annotations["annotations"][annotation_idx]["category_id"],
                                yoloBbox(bbox, img_data["width"], img_data["height"]))]
        
        annotation_idx += 1
    
    # copying them into files    
    split = getSplit()
    
    if os.path.exists(img_data["file_name"]):
        # copying image
        shutil.copy(img_data["file_name"],
                   os.path.join(DATASET_PATH, "images", split, f"{img_idx}.jpg"))

        # creating labels
        writeBbox(img_annotations, os.path.join(DATASET_PATH,
                                               "labels",
                                               split,
                                               f"{img_idx}.txt"))
    
    print(f"Image: {img_idx}", end="\r")
    img_idx += 1
    

Image: 5758

Data Split done

## Experiments

Picking categroies:
1. Plastic all, no other waste (Binary classification) 
1. Plastic bottles, plastic bags

Changing just the labels folders

### Method 1

creating directories for labesl

In [None]:
%mkdir "Implementation/dataset/labels_bi"

%mkdir "Implementation/dataset/labels_bi/test"
%mkdir "Implementation/dataset/labels_bi/train"
%mkdir "Implementation/dataset/labels_bi/val"

In [19]:
src = "Implementation/dataset/labels/"
dest = "Implementation/dataset/labels_bi/"
i = 0
counter = {"0": 0, "1": 0, "2": 0, "3": 0}

for folder in ["train", "test", "val"]:
    for file in os.listdir(src + f"{folder}/"):
        with open(src + f"{folder}/{file}") as f:
            text = f.readlines()
            new_text = ""
            for line in text:
                line = line.split()
                counter[line[0]] += 1
                if line[0] in "012":
                    line[0] = "0"
                    new_text += " ".join(line)
                    new_text += "\n"
            with open(dest + f"{folder}/{file}", "w") as f2:
                f2.write(new_text)
            i += 1
            print(f"File {i}", end="\r")

File 5735

In [20]:
counter

{'0': 1508, '1': 7929, '2': 3835, '3': 1911}

In [23]:
1508+7929+3835

13272

In [22]:
src = "Implementation/dataset/labels_bi/"
dest = "Implementation/dataset/labels_bi/"
i = 0
count = {"0": 0, "1": 0, "2": 0, "3": 0}

for folder in ["train", "test", "val"]:
    for file in os.listdir(src + f"{folder}/"):
        with open(src + f"{folder}/{file}") as f:
            text = f.readlines()
            for line in text:
                line = line.split()
                count[line[0]] += 1
            
            i += 1
            print(f"File {i}", end="\r")
            
count

File 5735

{'0': 13272, '1': 0, '2': 0, '3': 0}

## Method 2: Undersampling for bottles

In [None]:
src = "Implementation/dataset/labels/"
i = 0
counter = {"0": 0, "1": 0, "2": 0, "3": 0}

for folder in ["train", "test", "val"]:
    for file in os.listdir(src + f"{folder}/"):
        with open(src + f"{folder}/{file}") as f:
            text = f.readlines()
            new_text = ""
            for line in text:
                line = line.split()
                counter[line[0]] += 1
            i += 1
            print(f"File {i}", end="\r")

## Method 3

In [24]:
%mkdir "Implementation/dataset/labels_b2"

%mkdir "Implementation/dataset/labels_b2/test"
%mkdir "Implementation/dataset/labels_b2/train"
%mkdir "Implementation/dataset/labels_b2/val"

In [25]:
src = "Implementation/dataset/labels/"
dest = "Implementation/dataset/labels_b2/"
i = 0
counter = {"0": 0, "1": 0, "2": 0, "3": 0}

for folder in ["train", "test", "val"]:
    for file in os.listdir(src + f"{folder}/"):
        with open(src + f"{folder}/{file}") as f:
            text = f.readlines()
            new_text = ""
            for line in text:
                line = line.split()
                counter[line[0]] += 1
                if line[0] in "01":
                    line[0] = "0"
                    new_text += " ".join(line)
                    new_text += "\n"
            with open(dest + f"{folder}/{file}", "w") as f2:
                f2.write(new_text)
            i += 1
            print(f"File {i}", end="\r")

File 5735

# Reloading model
https://stackoverflow.com/questions/70167811/how-to-load-custom-model-in-pytorch

`model = torch.hub.load('ultralytics/yolov5', 'custom', path='yolov5/runs/train/exp/weights/last.pt', force_reload=True) `