# YOLO Step 2

## Import custom images and annotations

In [57]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import boto3
import io
import pandas as pd
import json
#!pip3 install s3fs
import s3fs

# Name of the S3 bucket that the cells below read from and write to.
my_bucket = "autoarki-ground-truth-labeling"

# Low-level S3 client and one object listing of the bucket; the listing result
# is only stored in `response` here.
client = boto3.client("s3")
response = client.list_objects_v2(Bucket=my_bucket)

# A boto3 session created with default settings.
session = boto3.Session()

## Gather list of annotated and non-annotated files

In [3]:
# Build the list of all annotated image files from the annotation CSV,
# which is read straight from its s3:// URL.
bucket = "autoarki-ground-truth-labeling"
dest = "bounding_box/ground_truth_annots/yolo-bbox-train"
file_key = "annot.csv"
annot_file = f"s3://{bucket}/{dest}/{file_key}"
df_ann = pd.read_csv(annot_file)

# One entry per distinct value of the `img_file` column.
annotated_img_list = list(df_ann["img_file"].unique())

In [4]:
# list of all files that were reviewed in annotation job
import s3fs

mani_path = "s3://autoarki-ground-truth-labeling/bounding_box/ground_truth_annots/yolo-bbox-train/manifests/output.manifest"
job_name = 'yolo-bbox-train'

# File names (last path component of "source-ref") of every manifest record
# that carries a key named after the labeling job.
reviewed_img_list = []

fs = s3fs.S3FileSystem(anon=False)
with fs.open(mani_path, 'rb') as fin:
    # The manifest is parsed as one JSON document per line. Iterate the file
    # lazily instead of materialising every line with readlines().
    for line in fin:
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline) rather than letting
            # json.loads raise on them.
            continue

        record = json.loads(line)

        # NOTE(review): records without a `job_name` key are skipped;
        # presumably those are images that produced no label entry in the
        # job output — confirm against the manifest before dropping the check.
        if job_name in record:
            image_file_path = record["source-ref"]
            reviewed_img_list.append(image_file_path.split("/")[-1])
            

In [5]:
# Report how many files were reviewed and how many files have annotations.
for label, file_list in (
    ("Number of reviewed files: ", reviewed_img_list),
    ("Number of annotated files: ", annotated_img_list),
):
    print(label, len(file_list))

Number of reviewed files:  4206
Number of annotated files:  140


## Class Balance

In [6]:
# Remove all annotated files from the reviewed list.
# Membership is tested against a set so each lookup is constant-time instead of
# a linear scan of annotated_img_list; the order of reviewed_img_list is kept.
annotated_img_set = set(annotated_img_list)
non_annot_review_list = [x for x in reviewed_img_list if x not in annotated_img_set]
print("Number of non-annotated but reviewed files: ", len(non_annot_review_list))

Number of non-annotated but reviewed files:  4066


In [7]:
# Select non-annotated files amounting to 125% of the number of annotated files.

import random

seed = 123
non_annotated_ratio = 1.25  # non-annotated files to keep per annotated file

# Shuffle a copy so that re-running this cell always yields the same selection:
# shuffling non_annot_review_list in place would reshuffle an already-shuffled
# list on every re-run and silently change which files are picked.
shuffled_non_annot = list(non_annot_review_list)
random.Random(seed).shuffle(shuffled_non_annot)

# Never ask for more files than are actually available, so the printed count
# always matches the length of the selected list.
num_non_annotated = min(
    round(len(annotated_img_list) * non_annotated_ratio),
    len(shuffled_non_annot),
)

non_ann_to_use_list = shuffled_non_annot[:num_non_annotated]
print("Number of non-annotated but reviewed files to be used in train + test: ", num_non_annotated)

Number of non-annotated but reviewed files to be used in train + test:  175


## Split into Train and Test

Split the annotated and non-annotated files into train and test sets separately, then combine the corresponding splits.

In [8]:
def split_train_test(full_list, train_pct=0.8):
    """Split a list into a leading training part and a trailing test part.

    The cut index is ``int(train_pct * len(full_list))``, i.e. the fractional
    part is truncated. Nothing is shuffled: the first items go to train and
    the remaining items go to test, in their original order.

    Args:
        full_list: List of file names to split.
        train_pct: Fraction of the items assigned to the training part.

    Returns:
        A ``(train_filenames, test_filenames)`` tuple of new lists.
    """
    cut = int(train_pct * len(full_list))
    return full_list[:cut], full_list[cut:]


In [9]:
# Split the annotated and the non-annotated files independently, then join the
# matching parts so that both sets contain both kinds of file.
ann_train, ann_test = split_train_test(annotated_img_list)  # annotated list
non_ann_train, non_ann_test = split_train_test(non_ann_to_use_list)

train_files = ann_train + non_ann_train
test_files = ann_test + non_ann_test

print("Number of Training Files", len(train_files))
print("Number of Test Files", len(test_files))

Number of Training Files 252
Number of Test Files 63


# Create a subdirectory with copies of the files
Note: Delete originals once proof of concept is complete.


In [62]:
# Key prefixes that receive the copied training / test files.
train_dir = "training_data/"
test_dir = "test_data/"

# Key prefixes of the source images and of their annotation .txt files.
image_file_location = "bounding_box/images/"
ann_text_file_location = (
    "bounding_box/ground_truth_annots/yolo-bbox-train/yolo_friendly_format/"
)

In [107]:
# Resource-level S3 handle used by copy_files_for_YOLO for copies, existence
# checks and uploads.
s3_resource = boto3.resource('s3')
import botocore
import os
# NOTE(review): shutil is not referenced in the visible cells — confirm it is still needed.
import shutil

def _s3_object_exists(bucket_name, key):
    '''Return True if the object exists, False on a 404; re-raise any other S3 error.'''
    try:
        s3_resource.Object(bucket_name, key).load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            # The object does not exist.
            return False
        # Something else has gone wrong.
        raise
    return True


def copy_files_for_YOLO(list_file_names, new_dir):
    '''Copy each image and its annotation .txt file under ``new_dir`` in ``my_bucket``.

    For every image name, the image is copied from ``image_file_location`` and
    the matching ``.txt`` file is copied from ``ann_text_file_location``. If
    the ``.txt`` file does not exist, an empty one is created at the
    destination instead.

    Args:
        list_file_names: Iterable of image file names (without key prefix).
        new_dir: Destination key prefix, e.g. ``"training_data/"``.

    Raises:
        botocore.exceptions.ClientError: If the annotation existence check
            fails with anything other than a 404.
    '''
    for image_name in list_file_names:
        # Swap only the extension for ".txt". A plain replace('.jpg', '.txt')
        # leaves names without a lowercase ".jpg" unchanged, in which case the
        # "annotation" key equals the image key and the empty placeholder
        # would overwrite the image that was just copied.
        txt_name = os.path.splitext(image_name)[0] + '.txt'

        # identify old and new keys
        image_old_key = image_file_location + image_name
        image_new_key = new_dir + image_name
        txt_old_key = ann_text_file_location + txt_name
        txt_new_key = new_dir + txt_name

        # copy over the image
        s3_resource.meta.client.copy(
            {'Bucket': my_bucket, 'Key': image_old_key},
            my_bucket,
            image_new_key,
        )

        # now repeat for the txt file
        if _s3_object_exists(my_bucket, txt_old_key):
            s3_resource.meta.client.copy(
                {'Bucket': my_bucket, 'Key': txt_old_key},
                my_bucket,
                txt_new_key,
            )
        else:
            # No annotation file: write a zero-byte object directly, so no
            # temporary local file is created (or left behind if the upload
            # fails part-way).
            s3_resource.Object(my_bucket, txt_new_key).put(Body=b'')
            
            

            

In [None]:
# Populate the training prefix first, then the test prefix.
for file_names, target_dir in ((train_files, train_dir), (test_files, test_dir)):
    copy_files_for_YOLO(file_names, target_dir)


# Learning

Use the smallest YOLO model, `yolov5s.pt`, to start.

In [None]:
#!pip install torch
#!pip install torchvision 
#!conda install pytorch torchvision -c pytorch


In [86]:
# change directory
import os
os.chdir(r"/root/AutoArki/yolov5/yolov5")

# Run YOLOv5 training starting from the yolov5s.pt weights.
# NOTE(review): the --data value was previously an unquoted path containing
# spaces (with a stray "AutoArki/kineret - " prefix), so the shell split it into
# several arguments. The quoted absolute path below is assumed to be the
# intended dataset YAML — confirm.
! python train.py \
    --data "/root/AutoArki/kineret - AWS experiments/custom_dataset.yaml" \
    --epochs 30 \
    --project custom_yolov5 \
    --bbox_interval 1 \
    --save_period 1 \
    --weights yolov5s.pt

Traceback (most recent call last):
  File "train.py", line 26, in <module>
    import torch
ModuleNotFoundError: No module named 'torch'


In [None]:
def _load_kangaroo_split(is_train):
    """Create a KangarooDataset, load one split from disk and prepare it."""
    dataset = KangarooDataset()
    dataset.load_dataset(dataset_dir='./kangaroo-transfer-learning/kangaroo', is_train=is_train)
    dataset.prepare()
    return dataset


# Train
train_dataset = _load_kangaroo_split(True)

# Validation
validation_dataset = _load_kangaroo_split(False)

In [None]:
class KangarooConfig(mrcnn.config.Config):
    # Configuration subclass of mrcnn.config.Config that overrides the
    # attributes below.
    # NOTE(review): `mrcnn` is not imported in the visible cells of this
    # notebook — confirm it is imported before this cell runs.
    NAME = "kangaroo_cfg"

    GPU_COUNT = 1
    IMAGES_PER_GPU = 1
    
    NUM_CLASSES = 2  # presumably background + one object class — TODO confirm
    
    LEARNING_RATE = 0.001

    STEPS_PER_EPOCH = 131