# Converting the Drainage Crossings Dataset to COCO Format

In [24]:
import os
import pandas as pd
import numpy as np
import shutil
import json
import xml.etree.ElementTree as ET

In [2]:
# Root directory of the raw dataset. Change this to your local dataset path;
# every input path in the cells below is built from it.
# NOTE: output_directory (set in the restructuring section) is a separate
# hard-coded path and must be changed independently.
dataset_path = "/home/denys/dl-gpu/Data_share"

In [3]:
# Sub-directories of dataset_path to process (presumably U.S. state
# abbreviations — confirm). Each name is also used as the filename prefix
# when the image and annotation files are renamed below.
directories = ['CA', 'IL', 'ND', 'NE']

## Renaming files, creating a data catalog, and splitting the dataset

In [14]:
# Prefix every image file with the name of the directory it came from
# (e.g. "682.tif" in CA/CA_data becomes "CA_682.tif") so names stay unique
# once the regions are merged.
# NOTE: this cell is not idempotent -- running it twice prefixes the files twice.
for region in directories:
    image_dir = os.path.join(dataset_path, region, f'{region}_data')
    for original_name in os.listdir(image_dir):
        source = os.path.join(image_dir, original_name)
        target = os.path.join(image_dir, f"{region}_{original_name}")
        os.rename(source, target)

In [15]:
# CA and IL ship their annotations in a plain "annotations" folder; rename it
# to "<region>_annotations" so every region follows the same layout.
for region in ('CA', 'IL'):
    region_root = os.path.join(dataset_path, region)
    os.rename(os.path.join(region_root, 'annotations'),
              os.path.join(region_root, f'{region}_annotations'))

In [16]:
# Prefix every annotation file with its directory name so that it matches the
# renamed image it belongs to (e.g. "682.xml" becomes "CA_682.xml").
# NOTE: this cell is not idempotent -- running it twice prefixes the files twice.
for region in directories:
    annotation_dir = os.path.join(dataset_path, region, f'{region}_annotations')
    for original_name in os.listdir(annotation_dir):
        source = os.path.join(annotation_dir, original_name)
        target = os.path.join(annotation_dir, f"{region}_{original_name}")
        os.rename(source, target)

In [4]:
# Collect one catalog record per .tif image: its name, its full path, and the
# name/path of the .xml annotation that shares its stem.
# NOTE(review): os.listdir returns entries in arbitrary order, so the row order
# (and therefore the seeded random split below) may differ between machines.
file_list = []
for region in directories:
    image_dir = os.path.join(dataset_path, region, f'{region}_data')
    annotation_dir = os.path.join(dataset_path, region, f'{region}_annotations')
    tif_names = [name for name in os.listdir(image_dir) if name.endswith('.tif')]
    file_list.extend(
        {
            'filename': name,
            'filepath': os.path.join(image_dir, name),
            # name[:-4] strips the ".tif" extension
            'annpath': os.path.join(annotation_dir, name[:-4] + '.xml'),
            'ann': name[:-4] + '.xml',
        }
        for name in tif_names
    )

In [5]:
# Turn the catalog records into a DataFrame (one row per .tif image, with the
# columns filename, filepath, annpath, ann) and preview the first rows.
df = pd.DataFrame(file_list)
df.head()

Unnamed: 0,filename,filepath,annpath,ann
0,CA_682.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_68...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_682.xml
1,CA_1810.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_18...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_1810.xml
2,CA_1671.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_16...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_1671.xml
3,CA_1866.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_18...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_1866.xml
4,CA_969.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_96...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_969.xml


In [6]:
# randomly split dataset
# Seed NumPy's global RNG so the draws made by assign_usage are repeatable
# for a given row order.
np.random.seed(0)
def assign_usage(file):
    """Randomly pick the dataset split for one catalog row.

    Draws a single uniform number from NumPy's global RNG and maps it to
    'train' (draw < 0.7), 'validate' (0.7 <= draw < 0.9) or 'test' (otherwise).

    Args:
        file: The row being labelled. It is not inspected; the parameter only
            exists so the function can be passed to ``DataFrame.apply``.

    Returns:
        One of the strings 'train', 'validate' or 'test'.
    """
    draw = np.random.rand()
    # Cumulative upper bounds, checked in ascending order.
    for upper_bound, split_name in ((0.7, 'train'), (0.9, 'validate')):
        if draw < upper_bound:
            return split_name
    return 'test'

# Label each row with its split by applying assign_usage row by row (axis=1),
# then preview the result.
df['usage'] = df.apply(assign_usage, axis=1)
df.head()

Unnamed: 0,filename,filepath,annpath,ann,usage
0,CA_682.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_68...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_682.xml,train
1,CA_1810.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_18...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_1810.xml,validate
2,CA_1671.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_16...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_1671.xml,train
3,CA_1866.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_18...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_1866.xml,train
4,CA_969.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_96...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_969.xml,train


## Restructuring the data directory to conform to COCO specifications

In [19]:
# Root of the restructured dataset; one sub-directory is created per split.
output_directory = '/home/denys/dl-gpu/processed-data'

os.makedirs(output_directory, exist_ok=True)
# Work from inside the output directory so the split folders can be created
# by relative name.
os.chdir(output_directory)
for split_name in ('train', 'validate', 'test'):
    os.makedirs(split_name, exist_ok=True)

# Copy (the originals are left in place) every image into the folder named
# by its 'usage' value.
for image_name, source_path, split_name in zip(df['filename'], df['filepath'], df['usage']):
    shutil.copy(source_path, os.path.join(output_directory, split_name, image_name))

## Iterating through annotation files to create the COCO json for each directory

In [62]:
# change to annotation directory
# Create <output_directory>/annotations if needed and make it the working
# directory; the per-split JSON files are later written by relative name.
annotation_directory = os.path.join(output_directory, 'annotations')
os.makedirs(annotation_directory, exist_ok=True)
os.chdir(annotation_directory)

In [None]:
# function to load in annotation xml as a dictionary
def xml_to_dict(element):
    """Recursively convert an ElementTree element into plain Python data.

    Args:
        element: An ``xml.etree.ElementTree.Element``.

    Returns:
        ``element.text`` (a string or None) when the element has no children.
        Otherwise a dict keyed by child tag; a tag that occurs once maps to
        that child's converted value, and a tag that occurs several times maps
        to a list of the converted values in document order. Attributes and
        the text of elements that have children are not included.
    """
    children = list(element)
    if not children:
        return element.text

    collected = {}
    for node in children:
        value = xml_to_dict(node)
        tag = node.tag
        if tag not in collected:
            collected[tag] = value
            continue
        existing = collected[tag]
        if isinstance(existing, list):
            existing.append(value)
        else:
            # second occurrence of this tag: promote the single value to a list
            collected[tag] = [existing, value]
    return collected

In [61]:
# Build one COCO-format JSON file per split ('train', 'test', 'validate') from
# the Pascal VOC-style XML annotations listed in df, and write it to
# '<usage>.json' in the current working directory.
def _voc_objects_to_coco(root, image_id, first_ann_id):
    """Convert the <object> boxes of one annotation tree to COCO annotations.

    Args:
        root: Root element of a parsed annotation XML; each <object> child is
            expected to hold bndbox/xmin, ymin, xmax and ymax as integer text.
        image_id: Id of the image these boxes belong to.
        first_ann_id: Id given to the first box; later boxes count up from it.

    Returns:
        A list of COCO annotation dicts, one per <object>, in document order.
    """
    converted = []
    for offset, obj in enumerate(root.findall('object')):
        xmin = int(obj.find('bndbox/xmin').text)
        ymin = int(obj.find('bndbox/ymin').text)
        xmax = int(obj.find('bndbox/xmax').text)
        ymax = int(obj.find('bndbox/ymax').text)

        width = xmax - xmin
        height = ymax - ymin
        converted.append({'id': first_ann_id + offset,
                          'image_id': image_id,
                          'category_id': 0,
                          # COCO boxes are [x, y, width, height] measured from
                          # the TOP-LEFT corner, i.e. (xmin, ymin) -- not ymax.
                          'bbox': [xmin, ymin, width, height],
                          'area': width*height,
                          'segmentation': [],
                          'iscrowd': 0})
    return converted


# for each given directory:
for usage in ['train', 'test', 'validate']:
    # static header sections of the COCO json for this split
    info = {'year': 2024,
            'version': 1.0,
            'description': f'Data to {usage} drainage culvert detection task',
            'contributor': 'Godwin, Denys',
            'url': 'NA',
            'date_created':None}
    licenses = [{'id':1,
                'url':'https://creativecommons.org/publicdomain/zero/1.0/',
                'name':'Public Domain'}]
    categories = [{'id':0,
                   'name':'Drainage Culvert',
                   'supercategory':'none'}]
    images = []
    annotations = []
    # Annotation ids must be unique within a file, so the counter runs across
    # all images of the split instead of restarting per image. It starts at 1
    # because some COCO evaluation tooling treats a zero id as "no match".
    next_ann_id = 1
    # get a dataframe of only files for a given usage
    usage_df = df[df['usage'] == usage]
    for index, row in usage_df.iterrows():
        # NOTE(review): the COCO spec declares integer ids; the filename stem is
        # kept as the image id here -- confirm downstream tools accept strings.
        image_id = row['filename'][:-4]
        # add image information to json
        # NOTE(review): every image is recorded as 800x800 -- confirm that this
        # holds for the whole dataset.
        images.append({'id': image_id,
                       'license':1,
                       'file_name':row['filename'],
                       'height':800,
                       'width':800,
                       'date_captured':'none'})
        # parse the annotation xml and convert each bounding box
        root = ET.parse(row['annpath']).getroot()
        new_annotations = _voc_objects_to_coco(root, image_id, next_ann_id)
        annotations.extend(new_annotations)
        next_ann_id += len(new_annotations)

    # Assemble and serialise once per split, after all rows are processed. This
    # also yields a valid (empty) file when a split contains no images.
    usage_json = {'info': info,
                  'licenses': licenses,
                  'categories': categories,
                  'images': images,
                  'annotations': annotations}
    with open(f'{usage}.json', 'w') as json_file:
        json.dump(usage_json, json_file, indent=4)