# Converting the Drainage Crossings Dataset to COCO Format

In [2]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.4/345.4 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tzdata, pandas
Successfully installed pandas-2.2.2 tzdata-2024.1
[0m

In [3]:
import os
import pandas as pd
import numpy as np
import shutil
import json
import xml.etree.ElementTree as ET

In [4]:
# change to dataset path, all else can then be run without modifying
dataset_path = "/workspace/Data_share"

In [5]:
directories = ['CA', 'IL', 'ND', 'NE']

## 1. Renaming files, creating a data catalog, and splitting the dataset

In [6]:
# rename all image files to reflect their physical location
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_data')
    for file in os.listdir(dirpath):
        rename = f"{d}_{file}"
        new_path = os.path.join(dirpath, rename)
        os.rename(os.path.join(dirpath, file), new_path)

In [7]:
# rename annotation directories for consistency
os.rename(os.path.join(dataset_path, 'CA', 'annotations'), os.path.join(dataset_path, 'CA', 'CA_annotations'))
os.rename(os.path.join(dataset_path, 'IL', 'annotations'), os.path.join(dataset_path, 'IL', 'IL_annotations'))

In [8]:
# rename all annotation files to match their corresponding image
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_annotations')
    for file in os.listdir(dirpath):
        rename = f"{d}_{file}"
        new_path = os.path.join(dirpath, rename)
        os.rename(os.path.join(dirpath, file), new_path)

In [9]:
# create a dataframe with files, annotations, and corresponding paths
file_list = []
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_data')
    for file in os.listdir(dirpath):
        if file.endswith('.tif'):
            annotation = file[:-4]+'.xml'
            file_list.append({
                'filename': file, 
                'filepath':os.path.join(dirpath, file),
                'annpath': os.path.join(dataset_path, d, f'{d}_annotations', annotation),
                'ann': annotation
            })

In [10]:
df = pd.DataFrame(file_list)
df.head()

Unnamed: 0,filename,filepath,annpath,ann
0,CA_682.tif,/workspace/Data_share/CA/CA_data/CA_682.tif,/workspace/Data_share/CA/CA_annotations/CA_682...,CA_682.xml
1,CA_1810.tif,/workspace/Data_share/CA/CA_data/CA_1810.tif,/workspace/Data_share/CA/CA_annotations/CA_181...,CA_1810.xml
2,CA_1671.tif,/workspace/Data_share/CA/CA_data/CA_1671.tif,/workspace/Data_share/CA/CA_annotations/CA_167...,CA_1671.xml
3,CA_1866.tif,/workspace/Data_share/CA/CA_data/CA_1866.tif,/workspace/Data_share/CA/CA_annotations/CA_186...,CA_1866.xml
4,CA_969.tif,/workspace/Data_share/CA/CA_data/CA_969.tif,/workspace/Data_share/CA/CA_annotations/CA_969...,CA_969.xml


In [11]:
# randomly split dataset
np.random.seed(0)
def assign_usage(file):
    n = np.random.rand()
    if n < 0.7:
        return 'train'
    elif n < 0.9:
        return 'validate'
    else:
        return 'test'

df['usage'] = df.apply(assign_usage, axis=1)
df.head()

Unnamed: 0,filename,filepath,annpath,ann,usage
0,CA_682.tif,/workspace/Data_share/CA/CA_data/CA_682.tif,/workspace/Data_share/CA/CA_annotations/CA_682...,CA_682.xml,train
1,CA_1810.tif,/workspace/Data_share/CA/CA_data/CA_1810.tif,/workspace/Data_share/CA/CA_annotations/CA_181...,CA_1810.xml,validate
2,CA_1671.tif,/workspace/Data_share/CA/CA_data/CA_1671.tif,/workspace/Data_share/CA/CA_annotations/CA_167...,CA_1671.xml,train
3,CA_1866.tif,/workspace/Data_share/CA/CA_data/CA_1866.tif,/workspace/Data_share/CA/CA_annotations/CA_186...,CA_1866.xml,train
4,CA_969.tif,/workspace/Data_share/CA/CA_data/CA_969.tif,/workspace/Data_share/CA/CA_annotations/CA_969...,CA_969.xml,train


## Restructuring the data directory to conform to COCO specifications

In [13]:
# create training, testing, and validation directories
output_directory = '/workspace/processed_data'

os.makedirs(output_directory, exist_ok=True)
os.chdir(output_directory)
os.makedirs('train', exist_ok=True)
os.makedirs('validate', exist_ok=True)
os.makedirs('test', exist_ok=True)

# use shutil to move files into directories based on the usage column
for index, row in df.iterrows():
    filepath = row['filepath']
    usage = row['usage']
    new_path = os.path.join(output_directory, usage, f'{index}.tif')
    shutil.copy(filepath, new_path)

## Iterating through annotation files to create the COCO json for each directory

In [14]:
# change to annotation directory
annotation_directory = os.path.join(output_directory, 'annotations')
os.makedirs(annotation_directory, exist_ok=True)
os.chdir(annotation_directory)

In [15]:
# for each given directory:
for usage in ['train', 'test', 'validate']:
    # initialize COCO json for the directory
    usage_json = {}
    info = {'year': 2024,
            'version': 1.0,
            'description': f'Data to {usage} drainage culvert detection task',
            'contributor': 'none',
            'url': 'none',
            'date_created':None}
    licenses = [{'id':1,
                'url':'https://creativecommons.org/publicdomain/zero/1.0/',
                'name':'Public Domain'}]
    categories = [{'id':0,
                   'name':'Drainage Culvert',
                   'supercategory':'none'}]
    images = []
    annotations = []
    # get a dataframe of only files for a given usage
    usage_df = df[df['usage'] == usage]
    for index, row in usage_df.iterrows():
        # add image information to json
        image_dict = {'id':index,
                      'license':1,
                      'file_name':f'{index}.tif',
                      'height':800,
                      'width':800,
                      'date_captured':'none'}
        images.append(image_dict)
        # load in annotation xml to dict
        tree = ET.parse(row['annpath'])
        root = tree.getroot()

        ann_number = 0
        # extract bounding box from dictionary and append to json as annotation
        for bbox in root.findall('object'):
            xmin = int(bbox.find('bndbox/xmin').text)
            ymin = int(bbox.find('bndbox/ymin').text)
            xmax = int(bbox.find('bndbox/xmax').text)
            ymax = int(bbox.find('bndbox/ymax').text)
            
            width = xmax - xmin
            height = ymax - ymin
            annotation_dict = {'id': ann_number,
                          'image_id':f'{index}.tif',
                          'category_id': 0,
                          'bbox': [xmin, ymax, width, height],
                          'area': width*height,
                          'segmentation':[],
                          'iscrowd':0}
            annotations.append(annotation_dict)
            ann_number += 1
        usage_json['info'] = info
        usage_json['licenses'] = licenses
        usage_json['categories'] = categories
        usage_json['images'] = images
        usage_json['annotations'] = annotations
        
        json_data = json.dumps(usage_json, indent=4)

        
    with open(f'{usage}.json', 'w') as json_file:
        json_file.write(json_data)