# Converting the Drainage Crossings Dataset to COCO Format

In [1]:
import os
import pandas as pd
import numpy as np
import shutil
import json
import xml.etree.ElementTree as ET
import rasterio
from tqdm import tqdm

In [6]:
# change to dataset path, all else can then be run without modifying
dataset_path = "/workspace/Data_share/Data_share"

In [7]:
directories = ['CA', 'IL', 'NE']

## 1. Renaming files, creating a data catalog, and splitting the dataset

In [8]:
# rename all image files to reflect their physical location
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_data')
    for file in os.listdir(dirpath):
        rename = f"{d}_{file}"
        new_path = os.path.join(dirpath, rename)
        os.rename(os.path.join(dirpath, file), new_path)

In [9]:
# rename annotation directories for consistency
os.rename(os.path.join(dataset_path, 'CA', 'annotations'), os.path.join(dataset_path, 'CA', 'CA_annotations'))
os.rename(os.path.join(dataset_path, 'IL', 'annotations'), os.path.join(dataset_path, 'IL', 'IL_annotations'))

In [10]:
# rename all annotation files to match their corresponding image
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_annotations')
    for file in os.listdir(dirpath):
        rename = f"{d}_{file}"
        new_path = os.path.join(dirpath, rename)
        os.rename(os.path.join(dirpath, file), new_path)

In [11]:
# create a dataframe with files, annotations, and corresponding paths
file_list = []
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_data')
    for file in os.listdir(dirpath):
        if file.endswith('.tif'):
            annotation = file[:-4]+'.xml'
            file_list.append({
                'filename': file, 
                'filepath':os.path.join(dirpath, file),
                'annpath': os.path.join(dataset_path, d, f'{d}_annotations', annotation),
                'ann': annotation
            })

In [12]:
df = pd.DataFrame(file_list)
df.head()

Unnamed: 0,filename,filepath,annpath,ann
0,CA_14.tif,/workspace/Data_share/Data_share/CA/CA_data/CA...,/workspace/Data_share/Data_share/CA/CA_annotat...,CA_14.xml
1,CA_1058.tif,/workspace/Data_share/Data_share/CA/CA_data/CA...,/workspace/Data_share/Data_share/CA/CA_annotat...,CA_1058.xml
2,CA_2004.tif,/workspace/Data_share/Data_share/CA/CA_data/CA...,/workspace/Data_share/Data_share/CA/CA_annotat...,CA_2004.xml
3,CA_467.tif,/workspace/Data_share/Data_share/CA/CA_data/CA...,/workspace/Data_share/Data_share/CA/CA_annotat...,CA_467.xml
4,CA_1564.tif,/workspace/Data_share/Data_share/CA/CA_data/CA...,/workspace/Data_share/Data_share/CA/CA_annotat...,CA_1564.xml


## Ensure that image chips do not contain invalid values

In [13]:
df['valid'] = True
for index, row in tqdm(df.iterrows(), total=len(df)):
    with rasterio.open(row['filepath'], 'r') as src:
        data = src.read()
    # this operation catches invalid values that are too high or too low compared to the rest of the DEM
    df.at[index, 'valid'] = False if (np.max(data) - np.min(data)) > 10000 else True
    if (np.max(data) - np.min(data)) > 10000:
        print (np.min(data), np.max(data))

filtered = len(df[df['valid']==False])

print(f'{filtered} out of {len(df)} chips filtered out due to invalid pixel values')

 66%|█████████████████████████▊             | 3579/5400 [01:00<00:26, 67.50it/s]

0.0 3.402823e+38


 68%|██████████████████████████▋            | 3691/5400 [01:01<00:27, 62.58it/s]

0.0 3.402823e+38


 70%|███████████████████████████            | 3755/5400 [01:02<00:24, 68.13it/s]

0.0 3.402823e+38


 83%|████████████████████████████████▌      | 4507/5400 [01:14<00:13, 68.57it/s]

0.0 3.402823e+38


100%|███████████████████████████████████████| 5400/5400 [01:27<00:00, 61.59it/s]

4 out of 5400 chips filtered out due to invalid pixel values





In [14]:
df = df[df['valid']==True]

## Randomly split dataset

In [15]:
# randomly split dataset
np.random.seed(0)
def assign_usage(file):
    n = np.random.rand()
    if n < 0.7:
        return 'train'
    elif n < 0.9:
        return 'validate'
    else:
        return 'test'

df['usage'] = df.apply(assign_usage, axis=1)
df.head()

Unnamed: 0,filename,filepath,annpath,ann,valid,usage
0,CA_14.tif,/workspace/Data_share/Data_share/CA/CA_data/CA...,/workspace/Data_share/Data_share/CA/CA_annotat...,CA_14.xml,True,train
1,CA_1058.tif,/workspace/Data_share/Data_share/CA/CA_data/CA...,/workspace/Data_share/Data_share/CA/CA_annotat...,CA_1058.xml,True,validate
2,CA_2004.tif,/workspace/Data_share/Data_share/CA/CA_data/CA...,/workspace/Data_share/Data_share/CA/CA_annotat...,CA_2004.xml,True,train
3,CA_467.tif,/workspace/Data_share/Data_share/CA/CA_data/CA...,/workspace/Data_share/Data_share/CA/CA_annotat...,CA_467.xml,True,train
4,CA_1564.tif,/workspace/Data_share/Data_share/CA/CA_data/CA...,/workspace/Data_share/Data_share/CA/CA_annotat...,CA_1564.xml,True,train


## Restructuring the data directory to conform to COCO specifications

In [16]:
# create training, testing, and validation directories
output_directory = '/workspace/processed_data'

os.makedirs(output_directory, exist_ok=True)
os.chdir(output_directory)
os.makedirs('train', exist_ok=True)
os.makedirs('validate', exist_ok=True)
os.makedirs('test', exist_ok=True)

In [17]:
# use shutil to move files into directories based on the usage column
for index, row in tqdm(df.iterrows(), total=len(df)):
    filepath = row['filepath']
    usage = row['usage']
    new_path = os.path.join(output_directory, usage, f'{index}.tif')
    shutil.copy(filepath, new_path)

100%|██████████████████████████████████████| 5396/5396 [00:28<00:00, 187.32it/s]


## Iterating through annotation files to create the COCO json for each directory

In [18]:
# change to annotation directory
annotation_directory = os.path.join(output_directory, 'annotations')
os.makedirs(annotation_directory, exist_ok=True)
os.chdir(annotation_directory)

In [18]:
# for each given directory:
ann_number = 0
for usage in ['train', 'test', 'validate']:
    # initialize COCO json for the directory
    usage_json = {}
    info = {'year': 2024,
            'version': 1.0,
            'description': f'Data to {usage} drainage culvert detection task',
            'contributor': 'none',
            'url': 'none',
            'date_created':None}
    licenses = [{'id':1,
                'url':'https://creativecommons.org/publicdomain/zero/1.0/',
                'name':'Public Domain'}]
    categories = [{'id':1,
                   'name':'Drainage Culvert',
                   'supercategory':'none'}]
    images = []
    annotations = []
    # get a dataframe of only files for a given usage
    usage_df = df[df['usage'] == usage]
    for index, row in tqdm(usage_df.iterrows(), total=len(usage_df)):
        # add image information to json
        image_dict = {'id':index,
                      'license':1,
                      'file_name':f'{index}.tif',
                      'height':800,
                      'width':800,
                      'date_captured':'none'}
        images.append(image_dict)
        # load in annotation xml to dict
        tree = ET.parse(row['annpath'])
        root = tree.getroot()

        
        # extract bounding box from dictionary and append to json as annotation
        for bbox in root.findall('object'):
            xmin = int(bbox.find('bndbox/xmin').text)
            ymin = int(bbox.find('bndbox/ymin').text)
            xmax = int(bbox.find('bndbox/xmax').text)
            ymax = int(bbox.find('bndbox/ymax').text)
            
            width = xmax - xmin
            height = ymax - ymin
            annotation_dict = {'id': ann_number,
                          'image_id':index,
                          'category_id': 1,
                          'bbox': [xmin, ymin, width, height],
                          'area': width*height,
                          'segmentation':[],
                          'iscrowd':0}
            annotations.append(annotation_dict)
            ann_number += 1
        usage_json['info'] = info
        usage_json['licenses'] = licenses
        usage_json['categories'] = categories
        usage_json['images'] = images
        usage_json['annotations'] = annotations
        
        json_data = json.dumps(usage_json, indent=4)

        
    with open(f'{usage}.json', 'w') as json_file:
        json_file.write(json_data)

100%|███████████████████████████████████████| 3800/3800 [03:51<00:00, 16.40it/s]
100%|████████████████████████████████████████| 547/547 [00:05<00:00, 108.23it/s]
100%|███████████████████████████████████████| 1049/1049 [00:19<00:00, 55.21it/s]
