In [2]:
# Install the notebook's dependencies with %pip so they land in the running
# kernel's environment (!pip can resolve to a different interpreter).
%pip install --upgrade pip setuptools wheel
# pandas / tqdm are pinned to the versions this notebook was last run with.
%pip install pandas==1.1.5
%pip install tqdm==4.64.0
%pip install fiftyone==0.15.1
%pip install fiftyone-brain==0.8.2
%pip install fiftyone-db==0.3.0

Collecting pip
  Using cached pip-21.3.1-py3-none-any.whl (1.7 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.0.1
    Uninstalling pip-21.0.1:
      Successfully uninstalled pip-21.0.1
Successfully installed pip-21.3.1
Collecting pandas
  Using cached pandas-1.1.5-cp36-cp36m-manylinux1_x86_64.whl (9.5 MB)
Collecting pytz>=2017.2
  Using cached pytz-2022.1-py2.py3-none-any.whl (503 kB)
Collecting numpy>=1.15.4
  Using cached numpy-1.19.5-cp36-cp36m-manylinux2010_x86_64.whl (14.8 MB)
Installing collected packages: pytz, numpy, pandas
Successfully installed numpy-1.19.5 pandas-1.1.5 pytz-2022.1
Collecting tqdm
  Using cached tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
Collecting importlib-resources
  Using cached importlib_resources-5.4.0-py3-none-any.whl (28 kB)
Collecting zipp>=3.1.0
  Using cached zipp-3.6.0-py3-none-any.whl (5.3 kB)
Installing collected packages: zipp, importlib-resources, tqdm
Successfully installed importlib-r

In [10]:
import os
import shutil
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import yaml
import fiftyone as fo
import fiftyone.zoo as foz
from yaml.loader import SafeLoader

# Read the user's settings from user_trg_config.yaml in the working directory.
# Two keys are used: 'trg_class' (target class name(s)) and 'user_path'.
with open(os.path.join(os.getcwd(),'user_trg_config.yaml')) as f:
    # safe_load uses the same SafeLoader as before: plain Python objects only.
    data = yaml.safe_load(f)

trg_class = data['trg_class']
user_path = data['user_path']

# A single class may be written as a bare string in the YAML file. The rest of
# the notebook indexes, counts and iterates trg_class as a list of names, so
# normalise it here instead of treating the string as a sequence of characters.
if isinstance(trg_class, str):
    trg_class = [trg_class]

In [4]:
# Show the target classes that were read from the config file.
print(trg_class)

['Box', 'Vehicle registration plate']


In [5]:
### folder locations used by the download / conversion steps

## the notebook is expected to be run from the project folder
main_path = os.getcwd()

## the converted dataset is written to <project folder>/custom_data
OUTPUT_PATH = os.path.join(main_path, "custom_data")

## Open Images V6 files are read from <user_path>/fiftyone/open-images-v6
DATA_PATH = os.path.join(user_path, "fiftyone/open-images-v6")

In [33]:
## define data processing function
def process_data(data, class_dict, data_type="train"):
    """Write YOLO label files for one split and copy the matching images.

    For every row of ``data`` one text line per bounding box,
    ``<class id> <x_center> <y_center> <width> <height>``, is appended to
    ``OUTPUT_PATH/labels/<data_type>/<image_id>.txt`` and the image is copied
    from ``DATA_PATH/<data_type>/data`` to ``OUTPUT_PATH/images/<data_type>``.

    Args:
        data: DataFrame with the columns ``image_id``, ``LabelName`` and
            ``bboxes``. Each ``bboxes`` entry is a list of boxes indexed as
            ``[x_min, x_max, y_min, y_max]``.
        class_dict: Maps a ``LabelName`` to a dict whose ``'new_id'`` value is
            the class id written to the label file.
        data_type: Name of the split sub-folder, e.g. ``"train"``.

    Raises:
        KeyError: If a row's ``LabelName`` is missing from ``class_dict``.
        FileNotFoundError: If the source image of a row does not exist.

    Note:
        Labels are appended, so an image that appears in several rows (one per
        class) ends up with all of its boxes in a single file. Coordinates are
        not rescaled here; presumably they are already normalised to [0, 1] as
        YOLO expects -- confirm against the caller.
    """
    label_dir = os.path.join(OUTPUT_PATH, f"labels/{data_type}")
    image_dir = os.path.join(OUTPUT_PATH, f"images/{data_type}")
    source_dir = os.path.join(DATA_PATH, f"{data_type}/data")

    # Do not rely on the caller having created the output tree beforehand.
    os.makedirs(label_dir, exist_ok=True)
    os.makedirs(image_dir, exist_ok=True)

    copied = set()  # images already copied during this call
    for _, row in tqdm(data.iterrows(), total=len(data)):
        image_name = row["image_id"]
        label = class_dict[row['LabelName']]['new_id']

        yolo_rows = [
            [
                label,
                round((bbox[1] + bbox[0]) / 2, 6),  # x centre
                round((bbox[3] + bbox[2]) / 2, 6),  # y centre
                round(bbox[1] - bbox[0], 6),        # width
                round(bbox[3] - bbox[2], 6),        # height
            ]
            for bbox in row["bboxes"]
        ]
        # reshape keeps the array 2-D with five columns even when a row has no
        # boxes, which np.savetxt needs to match the five column formats.
        yolo_data = np.array(yolo_rows, dtype=float).reshape(-1, 5)

        # Append mode creates the file on first use and extends it when the
        # same image shows up again with another class.
        with open(os.path.join(label_dir, f"{image_name}.txt"), "a") as f:
            np.savetxt(f, yolo_data, fmt=["%d","%f","%f","%f","%f"])

        # One copy per image is enough, however many rows refer to it.
        if image_name not in copied:
            shutil.copyfile(
                os.path.join(source_dir, f"{image_name}.jpg"),
                os.path.join(image_dir, f"{image_name}.jpg")
            )
            copied.add(image_name)

In [34]:
if __name__ == "__main__":

    # Work on a list of class names even if the config supplied a bare string.
    target_names = [trg_class] if isinstance(trg_class, str) else list(trg_class)

    ## download images
    # Only the download side effect is needed here; the files are read back
    # from DATA_PATH further down.
    for split in ['train','validation']:
        dataset = foz.load_zoo_dataset(
            "open-images-v6",
            split=split,
            label_types="detections",
            # Pass the class names themselves: wrapping an existing list in
            # another list would hand fiftyone a nested list, not class names.
            classes=target_names,
            only_matching=True,
            # max_samples=500,
        )

    ### create custom data folder with below tree structure

    # custom_data
    #   images
    #       train
    #       validation
    #   labels
    #       train
    #       validation

    custom_data_path = os.path.join(main_path, "custom_data")
    path_list = [custom_data_path]
    for fldr in ['images','labels']:
        path_list.append(os.path.join(custom_data_path, fldr))
        for data_type in ['train','validation']:
            path_list.append(os.path.join(custom_data_path, f"{fldr}/{data_type}"))

    print(f"path_list:{path_list}")
    ## start from an empty custom_data folder: label files are written in
    ## append mode, so leftovers from an earlier run would be extended
    if os.path.exists(custom_data_path):
        shutil.rmtree(custom_data_path)
    for path in path_list:
        print(f"path:{path}")
        os.makedirs(path, exist_ok=True)

    ## get class names
    ## classes.csv is read as two unnamed columns: label id ('path') and name
    class_file = os.path.join(DATA_PATH, "train/metadata/classes.csv")
    class_dict = (
        pd.read_csv(class_file, names=['path','class'])
        .reset_index()
        .rename(columns={'index':'class_id'})
        .set_index('path')
        .to_dict('index')
    )

    ## map each target label id to its position in the target class list.
    ## Names are compared case-insensitively so that every detection row kept
    ## below is guaranteed to have an entry here when process_data looks it up.
    target_index = {}
    for idx, name in enumerate(target_names):
        target_index.setdefault(str(name).lower(), idx)  # first occurrence wins

    trg_class_dict = {}
    for key, val in class_dict.items():
        new_id = target_index.get(str(val['class']).lower())
        if new_id is not None:
            trg_class_dict[key] = {
                'new_id': new_id,
                'class': val['class']
            }

    for data_type in ["train","validation"]:

        detection_file = os.path.join(DATA_PATH,
                                      f"{data_type}/labels/detections.csv")
        ## get relevant image ids (file name up to the first dot)
        imagefile_path = os.path.join(DATA_PATH,
                                      f"{data_type}/data")
        image_ids = [
            f.split('.', 1)[0]
            for f in os.listdir(imagefile_path)
            if os.path.isfile(os.path.join(imagefile_path, f))
        ]

        df_detects = pd.read_csv(detection_file)
        df_detects = df_detects.rename(columns={'ImageID':'image_id'})

        ## keep detections of downloaded images that belong to a target class
        keep = (
            df_detects['image_id'].isin(image_ids)
            & df_detects['LabelName'].isin(list(trg_class_dict))
        )
        df_rel_detects = df_detects.loc[keep].reset_index(drop=True)

        ## one [XMin, XMax, YMin, YMax] list per detection, then one row per
        ## (image, class) holding all of that pair's boxes
        df_rel_detects['bboxes'] = (
            df_rel_detects[['XMin','XMax','YMin','YMax']].values.tolist()
        )
        df_rel_detects = (
            df_rel_detects.groupby(by=['image_id','LabelName'])['bboxes']
            .apply(list)
            .reset_index()
        )

        ### moving images and labels to custom_data path
        ### edit labels to fit yolo format
        process_data(df_rel_detects, trg_class_dict, data_type=data_type)

    ## dump dict object into yaml file (once, after both splits are done)
    custom = {
        'train': "custom_data/images/train",
        'val': "custom_data/images/validation",
        'nc': len(target_names),
        'names': target_names
    }

    # yaml.dump returns None when given a stream, so its result is not kept
    # (assigning it to `data` would overwrite the loaded config).
    with open('custom.yaml', 'w') as f:
        yaml.dump(custom, f)

path_list:['/root/openimages-custom-yolov5/custom_data', '/root/openimages-custom-yolov5/custom_data/images', '/root/openimages-custom-yolov5/custom_data/images/train', '/root/openimages-custom-yolov5/custom_data/images/validation', '/root/openimages-custom-yolov5/custom_data/labels', '/root/openimages-custom-yolov5/custom_data/labels/train', '/root/openimages-custom-yolov5/custom_data/labels/validation']
path:/root/openimages-custom-yolov5/custom_data
path:/root/openimages-custom-yolov5/custom_data/images
path:/root/openimages-custom-yolov5/custom_data/images/train
path:/root/openimages-custom-yolov5/custom_data/images/validation
path:/root/openimages-custom-yolov5/custom_data/labels
path:/root/openimages-custom-yolov5/custom_data/labels/train
path:/root/openimages-custom-yolov5/custom_data/labels/validation


100%|██████████| 7580/7580 [05:31<00:00, 22.89it/s]
100%|██████████| 801/801 [00:34<00:00, 23.38it/s]
