# Import custom images and annotations

In [29]:
import io
import json
import os
import random
import xml.etree.ElementTree

import boto3
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# S3 bucket whose objects are listed by this cell.
my_bucket = 'autoarki-ground-truth-labeling'
client = boto3.client('s3')
# Pass `my_bucket`: the name `bucket` is only assigned in a later cell, so
# referencing it here raised NameError when the notebook ran top to bottom.
response = client.list_objects_v2(Bucket=my_bucket)


## Gather list of annotated and non-annotated files

In [27]:
# Read the annotation CSV from S3 and collect the distinct image file names
# found in its `img_file` column.
bucket = 'autoarki-ground-truth-labeling'
dest = "bounding_box/ground_truth_annots/yolo-bbox-train"
file_key = "annot.csv"
annot_file = f's3://{bucket}/{dest}/{file_key}'
df_ann = pd.read_csv(annot_file)

annotated_img_list = [*df_ann.img_file.unique()]

In [45]:
# Build the list of image file names recorded in the annotation job's output
# manifest (each line of the manifest is parsed as one JSON record).
import s3fs

job_name = 'yolo-bbox-train'
mani_path = "s3://autoarki-ground-truth-labeling/bounding_box/ground_truth_annots/yolo-bbox-train/manifests/output.manifest"

reviewed_img_list = []

fs = s3fs.S3FileSystem(anon=False)
with fs.open(mani_path, 'rb') as fin:
    annot_list = []

    for line in fin.readlines():
        record = json.loads(line)
        # Skip records that carry no key named after the job.
        # TODO: the original author questioned whether this filter is necessary.
        if job_name not in record.keys():
            continue
        image_file_path = record["source-ref"]
        # Keep only the final path component (the bare file name).
        image_file_name = image_file_path.rsplit("/", 1)[-1]
        reviewed_img_list.append(image_file_name)
            

In [49]:
# Print the sizes of the reviewed and annotated file lists.
for label, file_names in (
    ("Number of reviewed files: ", reviewed_img_list),
    ("Number of annotated files: ", annotated_img_list),
):
    print(label, len(file_names))

Number of reviewed files:  4206
Number of annotated files:  140


## Class Balance

In [50]:
# Keep only the reviewed files that do not appear in the annotated list.
# Membership is tested against a set so the filter is a single pass over the
# reviewed files instead of rescanning the annotated list for every file.
annotated_img_set = set(annotated_img_list)
non_annot_review_list = [x for x in reviewed_img_list if x not in annotated_img_set]
print("Number of non-annotated but reviewed files: ", len(non_annot_review_list))

Number of non-annotated but reviewed files:  4066


In [55]:
# Pick a random subset of the non-annotated files whose size is 125% of the
# number of annotated files.
NON_ANNOTATED_RATIO = 1.25
seed = 123

num_non_annotated = round(len(annotated_img_list) * NON_ANNOTATED_RATIO)

# Shuffle a copy rather than the source list: shuffling in place made this cell
# non-idempotent, because every re-run reshuffled an already-shuffled list and
# the fixed seed no longer reproduced the same selection.
shuffled_non_annot = list(non_annot_review_list)
random.Random(seed).shuffle(shuffled_non_annot)

# Slicing tolerates a request larger than the list (it just returns everything).
non_ann_to_use_list = shuffled_non_annot[:num_non_annotated]

## Split into Train and Validation

In [84]:



# 80/20 split of the image list into training and validation file names.
# NOTE(review): `list_imgs` is not assigned in any cell visible above — confirm
# where it is built, otherwise this cell fails on a fresh kernel.
train_pct = .8

# Index of the first validation item (truncated toward zero).
split_1 = int(len(list_imgs) * train_pct)

train_filenames, val_filenames = list_imgs[:split_1], list_imgs[split_1:]


In [95]:
# Set-up for creating subdirectories and moving files:
# a boto3 S3 resource handle and the character count of the "_filenames" suffix.
s3_resource = boto3.resource('s3')
len_filenames = len("_filenames")

def move_to_new_directories(list_files, file_subdir = "train",
                            bucket_name = "label-test3", dry_run = True,
                            resource = None):
    """Print, and optionally perform, the move of files into a split folder.

    Each file is mapped from the bucket root to
    ``<file_subdir>/raw_images/<file_name>`` and the resulting ``s3://`` URL is
    printed. With the default ``dry_run=True`` nothing else happens, which
    matches the original behaviour of this helper.

    Args:
        list_files: iterable of object keys (file names) at the bucket root.
        file_subdir: destination prefix, e.g. ``"train"`` or ``"val"``.
        bucket_name: bucket holding both the source and the destination keys.
        dry_run: when true (default) only print the destination URLs.
        resource: boto3 S3 resource used when ``dry_run`` is false; falls back
            to the module-level ``s3_resource``.

    Returns:
        None.
    """
    if not dry_run and resource is None:
        # Resolved lazily so a dry run never needs the global handle.
        resource = s3_resource

    for file_name in list_files:
        new_key = file_subdir + '/raw_images/' + file_name
        new_dir = "s3://" + bucket_name + "/" + new_key
        print(new_dir)
        if dry_run:
            continue
        # Object keys and CopySource take bucket/key pairs, not "s3://" URLs.
        resource.Object(bucket_name, new_key).copy_from(
            CopySource={"Bucket": bucket_name, "Key": file_name}
        )
        # Remove the source only after the copy call returned without raising.
        resource.Object(bucket_name, file_name).delete()

# Run the helper for the training split; the validation call stays disabled.
move_to_new_directories(list_files=train_filenames, file_subdir="train")
# move_to_new_directories(val_filenames, "val")
# Delete the former object A
# client.Object(bucket, "path/to/your/object_A.txt").delete()

s3://label-test3/train/raw_images/0029-0147.png
s3://label-test3/train/raw_images/0157-1224.png
s3://label-test3/train/raw_images/0018-0053.png
s3://label-test3/train/raw_images/0029-0128.png
s3://label-test3/train/raw_images/0010-0044.png
s3://label-test3/train/raw_images/0022-0045.png
s3://label-test3/train/raw_images/0010-0053.png
s3://label-test3/train/raw_images/0011-0043.png
s3://label-test3/train/raw_images/0029-0144.png
s3://label-test3/train/raw_images/0006-0053.png
s3://label-test3/train/raw_images/0060-0074.png
s3://label-test3/train/raw_images/0018-0045.png
s3://label-test3/train/raw_images/0025-0044.png
s3://label-test3/train/raw_images/0193-0025.png
s3://label-test3/train/raw_images/0010-0045.png
s3://label-test3/train/raw_images/0025-0043.png
s3://label-test3/train/raw_images/0141-0009.png
s3://label-test3/train/raw_images/0193-0028.png
s3://label-test3/train/raw_images/0011-0053.png
s3://label-test3/train/raw_images/0141-0014.png
s3://label-test3/train/raw_images/0025-0

In [75]:
# NOTE(review): no `import mrcnn` is visible in this notebook — make sure
# `mrcnn.utils` is imported before this cell runs.
class KangarooDataset(mrcnn.utils.Dataset):
    """Single-class dataset backed by image files and XML box annotations.

    Images are read from ``<dataset_dir>/images/`` and each one is paired with
    ``<dataset_dir>/annots/<image_id>.xml``. Image ids are the numeric file
    stems; ids below ``TRAIN_ID_CUTOFF`` form the training split and the rest
    form the validation split.
    """

    # Name registered through ``add_class`` and looked up again in
    # ``load_mask``; both must use the same string or the lookup fails.
    # NOTE(review): spelling kept as-is; confirm whether "Overlapping" was meant.
    CLASS_NAME = "Overlappng Text"
    # Numeric image ids strictly below this value belong to the training split.
    TRAIN_ID_CUTOFF = 150

    def load_dataset(self, dataset_dir, is_train=True):
        """Register the class and the images of the requested split.

        Args:
            dataset_dir: directory containing ``images/`` and ``annots/``.
            is_train: truthy to keep ids below ``TRAIN_ID_CUTOFF``, falsy to
                keep ids at or above it.

        Raises:
            ValueError: if a file stem in ``images/`` is not an integer.
        """
        self.add_class("dataset", 1, self.CLASS_NAME)

        images_dir = dataset_dir + '/images/'
        annotations_dir = dataset_dir + '/annots/'

        # Sorted so the registration order does not depend on the filesystem.
        for filename in sorted(os.listdir(images_dir)):
            # splitext copes with extensions of any length; the previous
            # `[:-4]` slice assumed exactly three characters after the dot.
            image_id = os.path.splitext(filename)[0]

            in_train_split = int(image_id) < self.TRAIN_ID_CUTOFF
            if in_train_split != bool(is_train):
                continue

            img_path = images_dir + filename
            ann_path = annotations_dir + image_id + '.xml'

            self.add_image('dataset', image_id=image_id, path=img_path, annotation=ann_path)

    def load_mask(self, image_id):
        """Build one rectangular mask per annotated bounding box.

        Args:
            image_id: index into ``self.image_info``.

        Returns:
            A tuple ``(masks, class_ids)``: ``masks`` is a uint8 array of shape
            ``(height, width, n_boxes)`` set to 1 inside each box, and
            ``class_ids`` is an int32 array with one entry per box.
        """
        info = self.image_info[image_id]
        path = info['annotation']
        boxes, w, h = self.extract_boxes(path)
        masks = np.zeros([h, w, len(boxes)], dtype='uint8')

        class_ids = list()
        for i, (xmin, ymin, xmax, ymax) in enumerate(boxes):
            # Rows are indexed by y and columns by x.
            masks[ymin:ymax, xmin:xmax, i] = 1
            # Look up the name registered in load_dataset (the previous
            # hard-coded 'kangaroo' was never registered and raised ValueError).
            # `class_names` is expected to come from the base class — TODO confirm
            # it is populated by `prepare()`.
            class_ids.append(self.class_names.index(self.CLASS_NAME))
        return masks, np.asarray(class_ids, dtype='int32')

    # A helper method to extract the bounding boxes from the annotation file
    def extract_boxes(self, filename):
        """Parse an XML annotation file.

        Args:
            filename: path of the XML file.

        Returns:
            A tuple ``(boxes, width, height)`` where ``boxes`` is a list of
            ``[xmin, ymin, xmax, ymax]`` integer lists, one per ``bndbox``
            element, and ``width``/``height`` are read from ``size/width`` and
            ``size/height``.
        """
        tree = xml.etree.ElementTree.parse(filename)

        root = tree.getroot()

        boxes = list()
        for box in root.findall('.//bndbox'):
            xmin = int(box.find('xmin').text)
            ymin = int(box.find('ymin').text)
            xmax = int(box.find('xmax').text)
            ymax = int(box.find('ymax').text)
            coors = [xmin, ymin, xmax, ymax]
            boxes.append(coors)

        width = int(root.find('.//size/width').text)
        height = int(root.find('.//size/height').text)
        return boxes, width, height

In [None]:
def _build_dataset(is_train):
    """Create, load and prepare a KangarooDataset for one split."""
    dataset = KangarooDataset()
    dataset.load_dataset(dataset_dir='./kangaroo-transfer-learning/kangaroo', is_train=is_train)
    dataset.prepare()
    return dataset


# Training split first, then the validation split.
train_dataset = _build_dataset(is_train=True)
validation_dataset = _build_dataset(is_train=False)

In [None]:
class KangarooConfig(mrcnn.config.Config):
    """Configuration values that override ``mrcnn.config.Config`` attributes."""
    NAME = "kangaroo_cfg"

    # presumably the effective batch size is GPU_COUNT * IMAGES_PER_GPU —
    # verify against the mrcnn Config base class
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1
    
    # presumably background plus the single class registered by
    # KangarooDataset — TODO confirm
    NUM_CLASSES = 2
    
    LEARNING_RATE = 0.001

    # NOTE(review): magic number; looks tied to the size of the training
    # split — confirm before reusing with a different dataset
    STEPS_PER_EPOCH = 131