# Converting the Drainage Crossings Dataset to COCO Format

In [4]:
import os
import pandas as pd
import numpy as np
import shutil
import json
import xml.etree.ElementTree as ET
import rasterio
from tqdm import tqdm

In [5]:
# change to dataset path, all else can then be run without modifying
dataset_path = "/workspace/Data_share/"

In [6]:
directories = ['CA', 'IL', 'NE']

## 1. Renaming files, creating a data catalog, and splitting the dataset

In [7]:
# rename all image files to reflect their physical location
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_data')
    for file in os.listdir(dirpath):
        rename = f"{d}_{file}"
        new_path = os.path.join(dirpath, rename)
        os.rename(os.path.join(dirpath, file), new_path)

In [8]:
# rename annotation directories for consistency
os.rename(os.path.join(dataset_path, 'CA', 'annotations'), os.path.join(dataset_path, 'CA', 'CA_annotations'))
os.rename(os.path.join(dataset_path, 'IL', 'annotations'), os.path.join(dataset_path, 'IL', 'IL_annotations'))

In [9]:
# rename all annotation files to match their corresponding image
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_annotations')
    for file in os.listdir(dirpath):
        rename = f"{d}_{file}"
        new_path = os.path.join(dirpath, rename)
        os.rename(os.path.join(dirpath, file), new_path)

In [10]:
# create a dataframe with files, annotations, and corresponding paths
file_list = []
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_data')
    for file in os.listdir(dirpath):
        if file.endswith('.tif'):
            annotation = file[:-4]+'.xml'
            file_list.append({
                'filename': file, 
                'filepath':os.path.join(dirpath, file),
                'annpath': os.path.join(dataset_path, d, f'{d}_annotations', annotation),
                'ann': annotation
            })

In [11]:
df = pd.DataFrame(file_list)
df.head()

Unnamed: 0,filename,filepath,annpath,ann
0,CA_612.tif,/workspace/Data_share/CA/CA_data/CA_612.tif,/workspace/Data_share/CA/CA_annotations/CA_612...,CA_612.xml
1,CA_341.tif,/workspace/Data_share/CA/CA_data/CA_341.tif,/workspace/Data_share/CA/CA_annotations/CA_341...,CA_341.xml
2,CA_1878.tif,/workspace/Data_share/CA/CA_data/CA_1878.tif,/workspace/Data_share/CA/CA_annotations/CA_187...,CA_1878.xml
3,CA_1504.tif,/workspace/Data_share/CA/CA_data/CA_1504.tif,/workspace/Data_share/CA/CA_annotations/CA_150...,CA_1504.xml
4,CA_2214.tif,/workspace/Data_share/CA/CA_data/CA_2214.tif,/workspace/Data_share/CA/CA_annotations/CA_221...,CA_2214.xml


## Ensure that image chips do not contain invalid values

In [12]:
df['valid'] = True
for index, row in tqdm(df.iterrows(), total=len(df)):
    with rasterio.open(row['filepath'], 'r') as src:
        data = src.read()
    # this operation catches invalid values that are too high or too low compared to the rest of the DEM
    df.at[index, 'valid'] = False if (np.max(data) - np.min(data)) > 10000 else True
    if (np.max(data) - np.min(data)) > 10000:
        print (np.min(data), np.max(data))

filtered = len(df[df['valid']==False])

print(f'{filtered} out of {len(df)} chips filtered out due to invalid pixel values')

 80%|████████████████████████████       | 4336/5400 [01:35<00:21, 48.59it/s]

-3.402823e+38 479.9973


 83%|█████████████████████████████      | 4487/5400 [01:38<00:18, 48.35it/s]

-3.402823e+38 458.95322


 84%|█████████████████████████████▍     | 4543/5400 [01:39<00:17, 48.01it/s]

-3.402823e+38 475.28494


 98%|██████████████████████████████████▏| 5279/5400 [01:55<00:02, 47.71it/s]

-3.402823e+38 478.6593


100%|███████████████████████████████████| 5400/5400 [01:57<00:00, 45.92it/s]

4 out of 5400 chips filtered out due to invalid pixel values





In [13]:
df = df[df['valid']==True]

## Randomly split dataset

In [14]:
# randomly split dataset
np.random.seed(0)
def assign_usage(file):
    n = np.random.rand()
    if n < 0.7:
        return 'train'
    elif n < 0.9:
        return 'validate'
    else:
        return 'test'

df['usage'] = df.apply(assign_usage, axis=1)
df.head()

Unnamed: 0,filename,filepath,annpath,ann,valid,usage
0,CA_612.tif,/workspace/Data_share/CA/CA_data/CA_612.tif,/workspace/Data_share/CA/CA_annotations/CA_612...,CA_612.xml,True,train
1,CA_341.tif,/workspace/Data_share/CA/CA_data/CA_341.tif,/workspace/Data_share/CA/CA_annotations/CA_341...,CA_341.xml,True,validate
2,CA_1878.tif,/workspace/Data_share/CA/CA_data/CA_1878.tif,/workspace/Data_share/CA/CA_annotations/CA_187...,CA_1878.xml,True,train
3,CA_1504.tif,/workspace/Data_share/CA/CA_data/CA_1504.tif,/workspace/Data_share/CA/CA_annotations/CA_150...,CA_1504.xml,True,train
4,CA_2214.tif,/workspace/Data_share/CA/CA_data/CA_2214.tif,/workspace/Data_share/CA/CA_annotations/CA_221...,CA_2214.xml,True,train


## Restructuring the data directory to conform to COCO specifications

In [15]:
# create training, testing, and validation directories
output_directory = '/workspace/processed_data/initial_data'

os.makedirs(output_directory, exist_ok=True)
os.chdir(output_directory)
os.makedirs('train', exist_ok=True)
os.makedirs('validate', exist_ok=True)
os.makedirs('test', exist_ok=True)

In [16]:
# use shutil to move files into directories based on the usage column
for index, row in tqdm(df.iterrows(), total=len(df)):
    filepath = row['filepath']
    usage = row['usage']
    new_path = os.path.join(output_directory, usage, f'{index}.tif')
    shutil.copy(filepath, new_path)

100%|██████████████████████████████████| 5396/5396 [00:07<00:00, 719.86it/s]


## Iterating through annotation files to create the COCO json for each directory

In [17]:
# change to annotation directory
annotation_directory = os.path.join(output_directory, 'annotations')
os.makedirs(annotation_directory, exist_ok=True)
os.chdir(annotation_directory)

In [19]:
# define function to get true centroid coordinates given an input bounding box
def get_true_centroid(xmin, ymin, xmax, ymax):

    # get true centroid x coordinate
    if xmin == 0:
        cx = xmax - 50
    else:
        cx = xmin + 50

    # get true centroid y coordinate
    if ymin == 0:
        cy = ymax - 50
    else:
        cy = ymin + 50

    return (cx, cy)

In [27]:
def create_annotation_json(box_size):
    # for each given directory:    
    ann_number = 0
    for usage in ['train','validate','test']:
        # initialize COCO json for the directory
        usage_json = {}
        info = {'year': 2024,
                'version': 1.0,
                'description': f'Data to {usage} drainage culvert detection task with annotation bounding boxes of size {box_size}',
                'contributor': 'none',
                'url': 'none',
                'date_created':None}
        licenses = [{'id':1,
                    'url':'https://creativecommons.org/publicdomain/zero/1.0/',
                    'name':'Public Domain'}]
        categories = [{'id':1,
                       'name':'Drainage Culvert',
                       'supercategory':'none'}]
        images = []
        annotations = []
        # get a dataframe of only files for a given usage
        usage_df = df[df['usage'] == usage]
        for index, row in tqdm(usage_df.iterrows(), total=len(usage_df)):
            # add image information to json
            image_dict = {'id':index,
                          'license':1,
                          'file_name':f'{index}.tif',
                          'height':800,
                          'width':800,
                          'date_captured':'none'}
            images.append(image_dict)
            # load in annotation xml to dict
            tree = ET.parse(row['annpath'])
            root = tree.getroot()
    
            
            # extract bounding box from dictionary and append to json as annotation
            for bbox in root.findall('object'):
                xmin = int(bbox.find('bndbox/xmin').text)
                ymin = int(bbox.find('bndbox/ymin').text)
                xmax = int(bbox.find('bndbox/xmax').text)
                ymax = int(bbox.find('bndbox/ymax').text)
                cx, cy = get_true_centroid(xmin, ymin, xmax, ymax)
                new_xmin = max(cx - (0.5 * box_size), 0)
                new_ymin = max(cy - (0.5 * box_size), 0)
                new_xmax = min(cx + (0.5 * box_size), 800)
                new_ymax = min(cy + (0.5 * box_size), 800)
                width = new_xmax - new_xmin
                height = new_ymax - new_ymin

                annotation_dict = {
                    'id': ann_number,
                    'image_id':index,
                    'category_id': 1,
                    'bbox': [new_xmin, new_ymin, width, height],
                    'area': width*height,
                    'segmentation':[],
                    'iscrowd':0,
                    }
                annotations.append(annotation_dict)
                ann_number += 1
            usage_json['info'] = info
            usage_json['licenses'] = licenses
            usage_json['categories'] = categories
            usage_json['images'] = images
            usage_json['annotations'] = annotations
            
            json_data = json.dumps(usage_json, indent=4)
    
            
        with open(f'{usage}.json', 'w') as json_file:
            json_file.write(json_data)

In [28]:
create_annotation_json(100)

100%|███████████████████████████████████| 3800/3800 [04:31<00:00, 14.01it/s]
100%|███████████████████████████████████| 1049/1049 [00:21<00:00, 49.00it/s]
100%|█████████████████████████████████████| 547/547 [00:06<00:00, 89.38it/s]


## Perform steps to create transfer dataset

In [23]:
directories = ['ND']

In [24]:
# rename all image files to reflect their physical location
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_data')
    for file in os.listdir(dirpath):
        rename = f"{d}_{file}"
        new_path = os.path.join(dirpath, rename)
        os.rename(os.path.join(dirpath, file), new_path)

In [25]:
# rename all annotation files to match their corresponding image
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_annotations')
    for file in os.listdir(dirpath):
        rename = f"{d}_{file}"
        new_path = os.path.join(dirpath, rename)
        os.rename(os.path.join(dirpath, file), new_path)

In [26]:
# create a dataframe with files, annotations, and corresponding paths
file_list = []
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_data')
    for file in os.listdir(dirpath):
        if file.endswith('.tif'):
            annotation = file[:-4]+'.xml'
            file_list.append({
                'filename': file, 
                'filepath':os.path.join(dirpath, file),
                'annpath': os.path.join(dataset_path, d, f'{d}_annotations', annotation),
                'ann': annotation
            })

In [27]:
df = pd.DataFrame(file_list)
df.head()

Unnamed: 0,filename,filepath,annpath,ann
0,ND_237.tif,/workspace/Data_share/Data_share/ND/ND_data/ND...,/workspace/Data_share/Data_share/ND/ND_annotat...,ND_237.xml
1,ND_456.tif,/workspace/Data_share/Data_share/ND/ND_data/ND...,/workspace/Data_share/Data_share/ND/ND_annotat...,ND_456.xml
2,ND_589.tif,/workspace/Data_share/Data_share/ND/ND_data/ND...,/workspace/Data_share/Data_share/ND/ND_annotat...,ND_589.xml
3,ND_141.tif,/workspace/Data_share/Data_share/ND/ND_data/ND...,/workspace/Data_share/Data_share/ND/ND_annotat...,ND_141.xml
4,ND_364.tif,/workspace/Data_share/Data_share/ND/ND_data/ND...,/workspace/Data_share/Data_share/ND/ND_annotat...,ND_364.xml


## Ensure that image chips do not contain invalid values

In [28]:
df['valid'] = True
for index, row in tqdm(df.iterrows(), total=len(df)):
    with rasterio.open(row['filepath'], 'r') as src:
        data = src.read()
    # this operation catches invalid values that are too high or too low compared to the rest of the DEM
    df.at[index, 'valid'] = False if (np.max(data) - np.min(data)) > 10000 else True
    if (np.max(data) - np.min(data)) > 10000:
        print (np.min(data), np.max(data))

filtered = len(df[df['valid']==False])

print(f'{filtered} out of {len(df)} chips filtered out due to invalid pixel values')

  3%|█▏                                        | 18/612 [00:00<00:10, 58.50it/s]

0.0 3.402823e+38
0.0 3.402823e+38
0.0 3.402823e+38


 18%|███████▏                                 | 108/612 [00:01<00:08, 59.46it/s]

0.0 3.402823e+38


 20%|████████▎                                | 125/612 [00:02<00:13, 37.26it/s]

0.0 3.402823e+38


 32%|████████████▉                            | 194/612 [00:03<00:06, 59.78it/s]

0.0 3.402823e+38


 35%|██████████████▍                          | 215/612 [00:03<00:06, 60.93it/s]

0.0 3.402823e+38
0.0 3.402823e+38


100%|█████████████████████████████████████████| 612/612 [00:10<00:00, 58.18it/s]

8 out of 612 chips filtered out due to invalid pixel values





In [29]:
df = df[df['valid']==True]

In [30]:
# create training, testing, and validation directories
output_directory = '/workspace/transfer_data'

os.makedirs(output_directory, exist_ok=True)
os.chdir(output_directory)
os.makedirs('test', exist_ok=True)

In [32]:
df['usage'] = 'test'
# use shutil to move files into directories based on the usage column
for index, row in tqdm(df.iterrows(), total=len(df)):
    filepath = row['filepath']
    usage = row['usage']
    new_path = os.path.join(output_directory, usage, f'{index}.tif')
    shutil.copy(filepath, new_path)

100%|████████████████████████████████████████| 604/604 [00:01<00:00, 377.96it/s]


## Iterating through annotation files to create the COCO json for each directory

In [33]:
# change to annotation directory
annotation_directory = os.path.join(output_directory, 'annotations')
os.makedirs(annotation_directory, exist_ok=True)
os.chdir(annotation_directory)

In [None]:
create_annotation_json(100)