In [None]:
#please: pip install -r requirements.txt before running
#if this import still fails saying module not found _lzma please uninstall python 3.10.8 from pyenv, run: brew install xz, reinstall python version 3.10.8
#note: the above error is not found with conda installations of python, only pyenv has this problem

import torchvision

In [2]:
%%capture

import os
import json
import random
import shutil
import pybboxes as pbx

import torch
from IPython.display import Image, clear_output

In [None]:
#helper functions to handle data

def getFile(root_path):
    file_list = list()
    for file in os.listdir(root_path):
        if ('.' in file) == False and file != "coco":
            file_list.append(file)
    return file_list


def getAllJson(path):
    file_list = list()
    for file in os.listdir(path):
        if file.split('.')[1] != "vott":
            file_list.append(path + '/' + file)
    return file_list


def read_json(path: str):
    with open(path, 'r') as f:
        data = json.load(f)
    return data


def getImg(path):
    img_list = list()
    for img_file in os.listdir(path):
        if img_file != "annotations":
            img_list.append(img_file)
    return img_list

In [None]:
#function to convert the original kaggle dataset to coco format

def convert2coco(root_path,coco_path,split_rate=0.1):
    os.makedirs(coco_path + "annotations/")
    os.makedirs(coco_path + "train/")
    os.makedirs(coco_path + "val/")
    
    all_class_file = getFile(root_path) 
    coco_train_annotations = dict(images=list(), annotations=list(), categories=list())  
    coco_val_annotations = dict(images=list(), annotations=list(), categories=list())  
    coco_annotations = [coco_train_annotations, coco_val_annotations]  
    img_idx = 0
    bbox_idx = 0
    label_idx = 0
    
    
    for idx, label in enumerate(all_class_file):
        category = dict(id=idx, supercategory="Arthropod", name=label)
        coco_annotations[0]["categories"].append(category)
        coco_annotations[1]["categories"].append(category)

    for each_class in all_class_file:
        print("start to convert "+each_class+'\n')
        root_img = root_path + each_class + '/' 
        all_annotations = getAllJson(root_path + each_class + "/annotations")  
        for json_file in all_annotations: 
            data = read_json(json_file)
            flag = 1 if random.random() < split_rate else 0  
            shutil.copyfile(root_img + data["asset"]["name"],
                        coco_path + ("train/" if flag == 0 else "val/") + data["asset"]["name"])
        
            img = dict(file_name=data["asset"]["name"],
                       height=data["asset"]["size"]["height"],
                       width=data["asset"]["size"]["width"],
                       id=img_idx)
            coco_annotations[flag]["images"].append(img)

            for region in data["regions"]:
                bbox = region["boundingBox"]
                anno = dict(image_id=img_idx, segmentation=[[]], area=240, iscrowd=0,
                            bbox=[bbox["left"], bbox["top"], bbox["width"], bbox["height"]],
                            id=bbox_idx, category_id=label_idx)
                bbox_idx += 1
                coco_annotations[flag]["annotations"].append(anno)

            img_idx += 1
        label_idx += 1

    with open(coco_path+"annotations/train.json", "w") as f:
        json.dump(coco_annotations[0], f)
    with open(coco_path + "annotations/val.json", "w") as f:
        json.dump(coco_annotations[1], f)
    print("finish...")

In [None]:
# %%capture
# #Uncomment and run this cell to set up Kaggle API token, download the dataset, unzip it and convert it into coco format(only if coco.zip is not downloaded)


# !mkdir ~/.kaggle
# !touch kaggle.json
# api_token = {"username":"bishakhdutta","key":"bb5c64ace271719d673b785e520d4fa8"}

# with open('kaggle.json', 'w') as file:
#     json.dump(api_token, file)

# !mv kaggle.json ~/.kaggle/kaggle.json
# !chmod 600 ~/.kaggle/kaggle.json


# !kaggle datasets download -d mistag/arthropod-taxonomy-orders-object-detection-dataset
# !unzip arthropod-taxonomy-orders-object-detection-dataset.zip


# !mkdir("coco/")
# convert2coco("ArTaxOr/","coco/",0.3)

In [None]:
# %%capture
# #Uncomment and run this cell to unzip the coco.zip file downloaded from drive(only if already not unzipped manually)


# !unzip coco.zip

In [None]:
json_file = "coco/annotations/train.json"
with open(json_file) as datafile:
  data_train = json.load(datafile)

json_file = "coco/annotations/val.json"
with open(json_file) as datafile:
  data_val= json.load(datafile)


In [None]:
#verify whether converted into coco properly

print(data_train['images'][0])
print(data_train['categories'][0])
print(data_train['annotations'][0])

In [None]:
#verify num_classes = 7 or else class names will be just numbers

num_classes = len(data_train['categories'])
print(num_classes)

In [None]:
print(len(data_train['images']))
print(len(data_val['images']))

In [None]:
#makes folders for storing data in yolo(darknet) format

!mkdir data
!mkdir data/labels
!mkdir data/labels/train data/labels/val

In [None]:
#function to convert the coco data into yolo(darknet) format

def coco2yolo(json_file_path, yolo_annotations_path):
    
    with open(json_file_path) as datafile:
        data = json.load(datafile)

    for image in data["images"]:
        image_name = image['file_name'].split(".")[0]
        txt_file_path = os.path.join(yolo_annotations_path, image_name+".txt")
        image_list = []
        for annotation in data['annotations']:
            if annotation['image_id'] == image['id']:
                size = [image['width'], image['height']]
                yolobbox = pbx.convert_bbox(annotation['bbox'], from_type="coco", to_type="yolo", image_size=size)
                category = annotation['category_id']
                image_list.append((category, yolobbox))
        input = []
        for output in image_list:
            string_line = "{} {} {} {} {}".format(output[0], output[1][0], output[1][1], output[1][2], output[1][3])
            input.append(string_line)
        with open(txt_file_path, 'w') as fp:
            for line in input:
                fp.write(line)
                fp.write('\n')


In [None]:
#convert coco to yolo(darknet)

coco2yolo('coco/annotations/train.json','data/labels/train/')
coco2yolo('coco/annotations/val.json','data/labels/val/')

shutil.move('coco/images','data')

In [3]:
shutil.move('data/images','coco')

'coco/images'

In [None]:
#make sure git is installed and .gitconfig file is set up to clone the yolov5 repo

!git clone https://github.com/ultralytics/yolov5 

In [None]:
cd yolov5

In [None]:
%%capture
pip install -r requirements.txt

In [None]:
#line magic function that creates and writes into files

from IPython.core.magic import register_line_cell_magic

@register_line_cell_magic
def writetemplate(line, cell):
    with open(line, 'w') as f:
        f.write(cell.format(**globals()))

In [None]:
#make sure there are 7 classes or else names of classes will be just numbers

classes = ['Coleoptera','Diptera','Hymenoptera','Hemiptera','Lepidoptera','Araneae','Odonata']
if(num_classes!=7):
    classes = list(range(num_classes))
    classes = [str(x) for x in classes]
print(classes)

In [None]:
%%writetemplate data/data_config.yaml

train: ../data/images/train/
val: ../data/images/val/

nc: {num_classes}

names: {classes}

In [None]:
%cat models/yolov5n.yaml

In [None]:
%%writetemplate models/model_config.yaml
# YOLOv5 🚀 by Ultralytics, GPL-3.0 license

# Parameters
nc: {num_classes} # number of classes
depth_multiple: 0.33  # model depth multiple
width_multiple: 0.25  # layer channel multiple
anchors:
  - [10,13, 16,30, 33,23]  # P3/8
  - [30,61, 62,45, 59,119]  # P4/16
  - [116,90, 156,198, 373,326]  # P5/32

# YOLOv5 v6.0 backbone
backbone:
  # [from, number, module, args]
  [[-1, 1, Conv, [64, 6, 2, 2]],  # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, C3, [128]],
   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
   [-1, 6, C3, [256]],
   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
   [-1, 9, C3, [512]],
   [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
   [-1, 3, C3, [1024]],
   [-1, 1, SPPF, [1024, 5]],  # 9
  ]

# YOLOv5 v6.0 head
head:
  [[-1, 1, Conv, [512, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
   [-1, 3, C3, [512, False]],  # 13

   [-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
   [-1, 3, C3, [256, False]],  # 17 (P3/8-small)

   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 14], 1, Concat, [1]],  # cat head P4
   [-1, 3, C3, [512, False]],  # 20 (P4/16-medium)

   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 10], 1, Concat, [1]],  # cat head P5
   [-1, 3, C3, [1024, False]],  # 23 (P5/32-large)

   [[17, 20, 23], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
  ]



In [None]:
%%time
#trains the model on our dataset, adjust batch size and epoch based on available hardware

!python train.py --batch 8 --epochs 100 --data data/data_config.yaml --cfg models/model_config.yaml --weights '' --project 'results' --name 'custom_yolov5s_results' 