# Converting the Drainage Crossings Dataset to COCO Format

In [1]:
import os
import pandas as pd
import numpy as np
import shutil
import json
import xml.etree.ElementTree as ET
import rasterio
from tqdm import tqdm

In [2]:
# Root directory of the raw dataset. The cells below expect one sub-directory per
# region inside it (e.g. CA/CA_data for image chips, CA/annotations for the XML files).
# change to dataset path, all else can then be run without modifying
dataset_path = "/workspace/Data_share/"

In [3]:
# region sub-directories of dataset_path that make up the initial (train/validate/test) dataset
directories = ['CA', 'IL', 'NE']

## 1. Renaming files, creating a data catalog, and splitting the dataset

### Renaming files for consistency across sub-directories and so that images from different regions have distinct names - only perform these steps once

In [4]:
# rename all image files to reflect their physical location
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_data')
    prefix = f"{d}_"
    for file in os.listdir(dirpath):
        # skip files that already carry the region prefix, so that re-running
        # this cell does not produce names such as "CA_CA_612.tif"
        if file.startswith(prefix):
            continue
        new_path = os.path.join(dirpath, prefix + file)
        os.rename(os.path.join(dirpath, file), new_path)

In [5]:
# rename annotation directories for consistency (<region>/annotations -> <region>/<region>_annotations)
for region in ['CA', 'IL']:
    old_dir = os.path.join(dataset_path, region, 'annotations')
    new_dir = os.path.join(dataset_path, region, f'{region}_annotations')
    # if the target already exists the rename was done on a previous run; skipping
    # keeps the cell re-runnable, while a missing source with no target still raises
    if not os.path.exists(new_dir):
        os.rename(old_dir, new_dir)

In [6]:
# rename all annotation files to match their corresponding image
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_annotations')
    prefix = f"{d}_"
    for file in os.listdir(dirpath):
        # skip files that already carry the region prefix, so that re-running
        # this cell does not produce names such as "CA_CA_612.xml"
        if file.startswith(prefix):
            continue
        new_path = os.path.join(dirpath, prefix + file)
        os.rename(os.path.join(dirpath, file), new_path)

### Creating the data catalog

In [7]:
# Collect one catalog record per .tif chip, pairing each image with the
# annotation XML that shares its base name.
# NOTE: os.listdir returns entries in arbitrary order, so the record order (and
# therefore the dataframe index later used to name the copied chips) follows
# whatever order the filesystem reports.
file_list = []
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_data')
    ann_dir = os.path.join(dataset_path, d, f'{d}_annotations')
    for file in os.listdir(dirpath):
        if not file.endswith('.tif'):
            continue
        # swap the 4-character ".tif" suffix for ".xml"
        annotation = file[:-4] + '.xml'
        record = dict(
            filename=file,
            filepath=os.path.join(dirpath, file),
            annpath=os.path.join(ann_dir, annotation),
            ann=annotation,
        )
        file_list.append(record)

In [8]:
# one row per image chip; columns: filename, filepath, annpath, ann
df = pd.DataFrame(file_list)
# preview the first rows of the catalog
df.head()

Unnamed: 0,filename,filepath,annpath,ann
0,CA_612.tif,/workspace/Data_share/CA/CA_data/CA_612.tif,/workspace/Data_share/CA/CA_annotations/CA_612...,CA_612.xml
1,CA_341.tif,/workspace/Data_share/CA/CA_data/CA_341.tif,/workspace/Data_share/CA/CA_annotations/CA_341...,CA_341.xml
2,CA_1878.tif,/workspace/Data_share/CA/CA_data/CA_1878.tif,/workspace/Data_share/CA/CA_annotations/CA_187...,CA_1878.xml
3,CA_1504.tif,/workspace/Data_share/CA/CA_data/CA_1504.tif,/workspace/Data_share/CA/CA_annotations/CA_150...,CA_1504.xml
4,CA_2214.tif,/workspace/Data_share/CA/CA_data/CA_2214.tif,/workspace/Data_share/CA/CA_annotations/CA_221...,CA_2214.xml


### Ensure that image chips do not contain invalid values

In [9]:
# Largest (max - min) value range a chip may have and still be treated as valid.
MAX_VALID_RANGE = 10000

valid_flags = []
for index, row in tqdm(df.iterrows(), total=len(df)):
    with rasterio.open(row['filepath'], 'r') as src:
        data = src.read()
    # compute the extremes once per chip; both the flag and the log line use them
    low, high = np.min(data), np.max(data)
    # this operation catches invalid values that are too high or too low compared to the rest of the DEM
    out_of_range = bool((high - low) > MAX_VALID_RANGE)
    valid_flags.append(not out_of_range)
    if out_of_range:
        print (low, high)

# rows were visited in dataframe order, so the flags line up positionally
df['valid'] = valid_flags
filtered = valid_flags.count(False)

print(f'{filtered} out of {len(df)} chips filtered out due to invalid pixel values')

 80%|████████████████████████████       | 4335/5400 [01:13<00:16, 63.63it/s]

0.0 3.402823e+38


 83%|█████████████████████████████      | 4489/5400 [01:16<00:14, 63.46it/s]

0.0 3.402823e+38


 84%|█████████████████████████████▍     | 4545/5400 [01:17<00:13, 63.22it/s]

0.0 3.402823e+38


 98%|██████████████████████████████████▏| 5280/5400 [01:29<00:01, 63.01it/s]

0.0 3.402823e+38


100%|███████████████████████████████████| 5400/5400 [01:30<00:00, 59.38it/s]

4 out of 5400 chips filtered out due to invalid pixel values





In [10]:
# keep only valid chips; the original index labels are preserved (not reset).
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning
df = df.loc[df['valid']].copy()

### Randomly split dataset

In [11]:
# randomly split dataset (fixed seed so the draws are repeatable)
np.random.seed(0)
def assign_usage(file):
    """Return a random split label for one row.

    Draws a uniform number in [0, 1) from NumPy's global random state and maps
    it to 'train' (below 0.7), 'validate' (0.7 up to, but not including, 0.9)
    or 'test' (0.9 and above). The ``file`` argument is not used.
    """
    draw = np.random.rand()
    if draw >= 0.9:
        return 'test'
    if draw >= 0.7:
        return 'validate'
    return 'train'

# assign_usage is called once per row (axis=1) and ignores the row it receives,
# so every chip gets its own random draw and split label
df['usage'] = df.apply(assign_usage, axis=1)
df.head()

Unnamed: 0,filename,filepath,annpath,ann,valid,usage
0,CA_612.tif,/workspace/Data_share/CA/CA_data/CA_612.tif,/workspace/Data_share/CA/CA_annotations/CA_612...,CA_612.xml,True,train
1,CA_341.tif,/workspace/Data_share/CA/CA_data/CA_341.tif,/workspace/Data_share/CA/CA_annotations/CA_341...,CA_341.xml,True,validate
2,CA_1878.tif,/workspace/Data_share/CA/CA_data/CA_1878.tif,/workspace/Data_share/CA/CA_annotations/CA_187...,CA_1878.xml,True,train
3,CA_1504.tif,/workspace/Data_share/CA/CA_data/CA_1504.tif,/workspace/Data_share/CA/CA_annotations/CA_150...,CA_1504.xml,True,train
4,CA_2214.tif,/workspace/Data_share/CA/CA_data/CA_2214.tif,/workspace/Data_share/CA/CA_annotations/CA_221...,CA_2214.xml,True,train


## 2. Create new data directory to conform to COCO specifications and generate COCO annotation json files

In [12]:
# create the output root with one sub-directory per split (train / validate / test)
output_directory = '/workspace/processed_data_v7/initial_data'

os.makedirs(output_directory, exist_ok=True)
# the split directories are created relative to the output root
os.chdir(output_directory)
for split_name in ('train', 'validate', 'test'):
    os.makedirs(split_name, exist_ok=True)

In [13]:
# use shutil to copy (the originals stay in place) each chip into the directory
# named by its usage column; the copy is named after the row's dataframe index
for index, row in tqdm(df.iterrows(), total=len(df)):
    destination = os.path.join(output_directory, row['usage'], f'{index}.tif')
    shutil.copy(row['filepath'], destination)

100%|██████████████████████████████████| 5396/5396 [00:07<00:00, 739.57it/s]


### Iterating through annotation files to create the COCO json for each directory

Because validation and testing use a center crop, we filter out bounding boxes that fall outside the center-cropped portion of the image.
This avoids counting these exterior bounding boxes against model performance when calling the COCO evaluator, which cannot ignore objects outside the center-crop zone.
We generate train, test, and validate JSONs for each cropping experiment. The training JSONs contain the same images and boxes for every crop size, because no boxes are filtered for training.

In [22]:
# change to annotation directory
# (generate_json opens its output files with relative paths, so they are
# written into whatever the current working directory is)
annotation_directory = os.path.join(output_directory, 'annotations')
os.makedirs(annotation_directory, exist_ok=True)
os.chdir(annotation_directory)

In [23]:
def generate_json(crop, usages=('train', 'test', 'validate'), image_size=800, centroid_offset=50):
    """Write one COCO-format annotation JSON per dataset split for a crop size.

    For each split in ``usages``, the rows of the global ``df`` whose ``usage``
    column equals that split become COCO ``images`` entries (id and file name
    derived from the dataframe index), and every ``object`` element in the
    row's annotation XML (``annpath``) becomes a COCO ``annotations`` entry.
    The result is written to ``{usage}_{crop}.json`` in the current working
    directory.

    For every split except ``'train'``, a box is dropped when it lies entirely
    within ``(image_size - crop) / 2 + centroid_offset`` pixels of one image
    edge. Training boxes are never filtered.

    Parameters
    ----------
    crop : int
        Side length, in pixels, of the center crop used at evaluation time.
    usages : sequence of str, optional
        Splits to export, in order.
    image_size : int, optional
        Height and width recorded for every image and used for the crop margin.
    centroid_offset : int, optional
        Extra margin added to the crop border. Presumably half the side of the
        annotated boxes, which would make the test act on box centroids
        (TODO confirm against the annotation files).

    Raises
    ------
    Whatever ``xml.etree.ElementTree.parse`` raises for a missing or malformed
    annotation file; ``AttributeError`` if an object lacks a ``bndbox`` field.
    """
    # Annotation ids start at 1, not 0: pycocotools' COCOeval stores the id of the
    # matched ground-truth box in an array where 0 means "no match", so a box with
    # id 0 could never be credited as a true positive. Ids keep increasing across
    # splits, as before, so they stay unique within one call.
    ann_number = 1
    # width of the border strip outside of which boxes are excluded
    limit = ((image_size - crop) / 2) + centroid_offset

    for usage in usages:
        info = {'year': 2024,
                'version': 1.0,
                'description': f'Data to {usage} drainage culvert detection task',
                'contributor': 'none',
                'url': 'none',
                'date_created': None}
        licenses = [{'id': 1,
                     'url': 'https://creativecommons.org/publicdomain/zero/1.0/',
                     'name': 'Public Domain'}]
        categories = [{'id': 1,
                       'name': 'Drainage Culvert',
                       'supercategory': 'none'}]
        images = []
        annotations = []
        # training boxes are kept regardless of where they fall
        filter_boxes = usage != 'train'

        # get a dataframe of only files for a given usage
        usage_df = df[df['usage'] == usage]
        for index, row in tqdm(usage_df.iterrows(), total=len(usage_df)):
            # add image information to json
            images.append({'id': index,
                           'license': 1,
                           'file_name': f'{index}.tif',
                           'height': image_size,
                           'width': image_size,
                           'date_captured': 'none'})

            # load the annotation xml for this image
            root = ET.parse(row['annpath']).getroot()

            # extract each bounding box and append it to the json as an annotation
            for bbox in root.findall('object'):
                xmin = int(bbox.find('bndbox/xmin').text)
                ymin = int(bbox.find('bndbox/ymin').text)
                xmax = int(bbox.find('bndbox/xmax').text)
                ymax = int(bbox.find('bndbox/ymax').text)

                outside_crop = (xmax <= limit
                                or xmin >= image_size - limit
                                or ymax <= limit
                                or ymin >= image_size - limit)
                if filter_boxes and outside_crop:
                    continue

                width = xmax - xmin
                height = ymax - ymin
                # COCO bbox layout is [x, y, width, height]
                annotations.append({'id': ann_number,
                                    'image_id': index,
                                    'category_id': 1,
                                    'bbox': [xmin, ymin, width, height],
                                    'area': width * height,
                                    'segmentation': [],
                                    'iscrowd': 0})
                ann_number += 1

        usage_json = {'info': info,
                      'licenses': licenses,
                      'categories': categories,
                      'images': images,
                      'annotations': annotations}

        with open(f'{usage}_{crop}.json', 'w') as json_file:
            json.dump(usage_json, json_file, indent=4)

In [32]:
# write one set of split JSONs per center-crop size (in pixels) used in the cropping experiments
for crop in [256, 400, 600, 800]:
    generate_json(crop)

100%|█████████████████████████████████| 3800/3800 [00:00<00:00, 7013.45it/s]
100%|███████████████████████████████████| 547/547 [00:00<00:00, 7939.03it/s]
100%|█████████████████████████████████| 1049/1049 [00:00<00:00, 7937.97it/s]
100%|█████████████████████████████████| 3800/3800 [00:00<00:00, 7133.23it/s]
100%|███████████████████████████████████| 547/547 [00:00<00:00, 7935.43it/s]
100%|█████████████████████████████████| 1049/1049 [00:00<00:00, 7899.53it/s]
100%|█████████████████████████████████| 3800/3800 [00:00<00:00, 7913.95it/s]
100%|███████████████████████████████████| 547/547 [00:00<00:00, 7814.16it/s]
100%|█████████████████████████████████| 1049/1049 [00:00<00:00, 7772.43it/s]
100%|█████████████████████████████████| 3800/3800 [00:00<00:00, 7844.75it/s]
100%|███████████████████████████████████| 547/547 [00:00<00:00, 4796.63it/s]
100%|█████████████████████████████████| 1049/1049 [00:00<00:00, 7721.62it/s]


## Perform steps to create transfer dataset

In [3]:
# from here on only the ND region is processed; it is used entirely as a test set
directories = ['ND']

### Rename files, following previous conventions - only run this subsection once

In [4]:
# rename all image files to reflect their physical location
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_data')
    prefix = f"{d}_"
    for file in os.listdir(dirpath):
        # skip files that already carry the region prefix, so that re-running
        # this cell does not produce names such as "ND_ND_299.tif"
        if file.startswith(prefix):
            continue
        new_path = os.path.join(dirpath, prefix + file)
        os.rename(os.path.join(dirpath, file), new_path)

In [5]:
# rename all annotation files to match their corresponding image
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_annotations')
    prefix = f"{d}_"
    for file in os.listdir(dirpath):
        # skip files that already carry the region prefix, so that re-running
        # this cell does not produce names such as "ND_ND_299.xml"
        if file.startswith(prefix):
            continue
        new_path = os.path.join(dirpath, prefix + file)
        os.rename(os.path.join(dirpath, file), new_path)

### Creating the transfer learning dataframe

In [6]:
# Collect one catalog record per .tif chip, pairing each image with the
# annotation XML that shares its base name.
# NOTE: os.listdir returns entries in arbitrary order, so the record order (and
# therefore the dataframe index later used to name the copied chips) follows
# whatever order the filesystem reports.
file_list = []
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_data')
    ann_dir = os.path.join(dataset_path, d, f'{d}_annotations')
    for file in os.listdir(dirpath):
        if not file.endswith('.tif'):
            continue
        # swap the 4-character ".tif" suffix for ".xml"
        annotation = file[:-4] + '.xml'
        record = dict(
            filename=file,
            filepath=os.path.join(dirpath, file),
            annpath=os.path.join(ann_dir, annotation),
            ann=annotation,
        )
        file_list.append(record)

In [7]:
# one row per image chip; columns: filename, filepath, annpath, ann
# (this rebinds df, replacing the catalog built for the initial dataset)
df = pd.DataFrame(file_list)
# preview the first rows of the catalog
df.head()

Unnamed: 0,filename,filepath,annpath,ann
0,ND_299.tif,/workspace/Data_share/ND/ND_data/ND_299.tif,/workspace/Data_share/ND/ND_annotations/ND_299...,ND_299.xml
1,ND_227.tif,/workspace/Data_share/ND/ND_data/ND_227.tif,/workspace/Data_share/ND/ND_annotations/ND_227...,ND_227.xml
2,ND_457.tif,/workspace/Data_share/ND/ND_data/ND_457.tif,/workspace/Data_share/ND/ND_annotations/ND_457...,ND_457.xml
3,ND_505.tif,/workspace/Data_share/ND/ND_data/ND_505.tif,/workspace/Data_share/ND/ND_annotations/ND_505...,ND_505.xml
4,ND_516.tif,/workspace/Data_share/ND/ND_data/ND_516.tif,/workspace/Data_share/ND/ND_annotations/ND_516...,ND_516.xml


### Ensure that image chips do not contain invalid values

In [8]:
# Largest (max - min) value range a chip may have and still be treated as valid.
MAX_VALID_RANGE = 10000

valid_flags = []
for index, row in tqdm(df.iterrows(), total=len(df)):
    with rasterio.open(row['filepath'], 'r') as src:
        data = src.read()
    # compute the extremes once per chip; both the flag and the log line use them
    low, high = np.min(data), np.max(data)
    # this operation catches invalid values that are too high or too low compared to the rest of the DEM
    out_of_range = bool((high - low) > MAX_VALID_RANGE)
    valid_flags.append(not out_of_range)
    if out_of_range:
        print (low, high)

# rows were visited in dataframe order, so the flags line up positionally
df['valid'] = valid_flags
filtered = valid_flags.count(False)

print(f'{filtered} out of {len(df)} chips filtered out due to invalid pixel values')

 29%|██████████▊                          | 179/612 [00:03<00:07, 60.08it/s]

0.0 3.402823e+38


 44%|████████████████▍                    | 272/612 [00:04<00:05, 57.45it/s]

0.0 3.402823e+38


 50%|██████████████████▎                  | 303/612 [00:05<00:05, 58.50it/s]

0.0 3.402823e+38


 54%|███████████████████▊                 | 328/612 [00:05<00:04, 59.38it/s]

0.0 3.402823e+38
0.0 3.402823e+38


 65%|███████████████████████▉             | 395/612 [00:06<00:03, 58.25it/s]

0.0 3.402823e+38


 77%|████████████████████████████▍        | 470/612 [00:08<00:02, 60.68it/s]

0.0 3.402823e+38
0.0 3.402823e+38


100%|█████████████████████████████████████| 612/612 [00:10<00:00, 57.56it/s]

8 out of 612 chips filtered out due to invalid pixel values





In [9]:
# keep only valid chips; the original index labels are preserved (not reset).
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning
df = df.loc[df['valid']].copy()

In [24]:
# create the output root for the transfer dataset and its 'test' sub-directory
# (the annotations directory itself is created in a later cell)
output_directory = '/workspace/processed_data_v7/transfer_data'

os.makedirs(output_directory, exist_ok=True)
# 'test' below is created relative to the output root
os.chdir(output_directory)
os.makedirs('test', exist_ok=True)

In [11]:
# the whole transfer dataset is used for testing, so no random split is needed
df['usage'] = 'test'

# use shutil to copy (the originals stay in place) each chip into the directory
# named by its usage column; the copy is named after the row's dataframe index
for index, row in tqdm(df.iterrows(), total=len(df)):
    destination = os.path.join(output_directory, row['usage'], f'{index}.tif')
    shutil.copy(row['filepath'], destination)

100%|████████████████████████████████████| 604/604 [00:00<00:00, 639.12it/s]


### Iterating through annotation files to create the COCO json for each directory

Once again, for each crop size used in the experiments, we filter out bounding boxes that fall outside the region of the image that is kept by the center crop at test time.

In [12]:
# change to annotation directory
# (generate_json opens its output files with relative paths, so they are
# written into whatever the current working directory is)
annotation_directory = os.path.join(output_directory, 'annotations')
os.makedirs(annotation_directory, exist_ok=True)
os.chdir(annotation_directory)

In [20]:
def generate_json(crop, usages=('test',), image_size=800, centroid_offset=50):
    """Write one COCO-format annotation JSON per dataset split for a crop size.

    This definition replaces any ``generate_json`` defined earlier in the
    kernel; by default it exports only the ``'test'`` split.

    For each split in ``usages``, the rows of the global ``df`` whose ``usage``
    column equals that split become COCO ``images`` entries (id and file name
    derived from the dataframe index), and every ``object`` element in the
    row's annotation XML (``annpath``) becomes a COCO ``annotations`` entry.
    The result is written to ``{usage}_{crop}.json`` in the current working
    directory.

    For every split except ``'train'``, a box is dropped when it lies entirely
    within ``(image_size - crop) / 2 + centroid_offset`` pixels of one image
    edge. Training boxes are never filtered.

    Parameters
    ----------
    crop : int
        Side length, in pixels, of the center crop used at evaluation time.
    usages : sequence of str, optional
        Splits to export, in order.
    image_size : int, optional
        Height and width recorded for every image and used for the crop margin.
    centroid_offset : int, optional
        Extra margin added to the crop border. Presumably half the side of the
        annotated boxes, which would make the test act on box centroids
        (TODO confirm against the annotation files).

    Raises
    ------
    Whatever ``xml.etree.ElementTree.parse`` raises for a missing or malformed
    annotation file; ``AttributeError`` if an object lacks a ``bndbox`` field.
    """
    # Annotation ids start at 1, not 0: pycocotools' COCOeval stores the id of the
    # matched ground-truth box in an array where 0 means "no match", so a box with
    # id 0 could never be credited as a true positive. Ids keep increasing across
    # splits, so they stay unique within one call.
    ann_number = 1
    # width of the border strip outside of which boxes are excluded
    limit = ((image_size - crop) / 2) + centroid_offset

    for usage in usages:
        info = {'year': 2024,
                'version': 1.0,
                'description': f'Data to {usage} drainage culvert detection task',
                'contributor': 'none',
                'url': 'none',
                'date_created': None}
        licenses = [{'id': 1,
                     'url': 'https://creativecommons.org/publicdomain/zero/1.0/',
                     'name': 'Public Domain'}]
        categories = [{'id': 1,
                       'name': 'Drainage Culvert',
                       'supercategory': 'none'}]
        images = []
        annotations = []
        # training boxes are kept regardless of where they fall
        filter_boxes = usage != 'train'

        # get a dataframe of only files for a given usage
        usage_df = df[df['usage'] == usage]
        for index, row in tqdm(usage_df.iterrows(), total=len(usage_df)):
            # add image information to json
            images.append({'id': index,
                           'license': 1,
                           'file_name': f'{index}.tif',
                           'height': image_size,
                           'width': image_size,
                           'date_captured': 'none'})

            # load the annotation xml for this image
            root = ET.parse(row['annpath']).getroot()

            # extract each bounding box and append it to the json as an annotation
            for bbox in root.findall('object'):
                xmin = int(bbox.find('bndbox/xmin').text)
                ymin = int(bbox.find('bndbox/ymin').text)
                xmax = int(bbox.find('bndbox/xmax').text)
                ymax = int(bbox.find('bndbox/ymax').text)

                outside_crop = (xmax <= limit
                                or xmin >= image_size - limit
                                or ymax <= limit
                                or ymin >= image_size - limit)
                if filter_boxes and outside_crop:
                    continue

                width = xmax - xmin
                height = ymax - ymin
                # COCO bbox layout is [x, y, width, height]
                annotations.append({'id': ann_number,
                                    'image_id': index,
                                    'category_id': 1,
                                    'bbox': [xmin, ymin, width, height],
                                    'area': width * height,
                                    'segmentation': [],
                                    'iscrowd': 0})
                ann_number += 1

        usage_json = {'info': info,
                      'licenses': licenses,
                      'categories': categories,
                      'images': images,
                      'annotations': annotations}

        with open(f'{usage}_{crop}.json', 'w') as json_file:
            json.dump(usage_json, json_file, indent=4)

In [21]:
# write one test JSON per center-crop size (in pixels) used in the cropping experiments
for crop in [256, 400, 600, 800]:
    generate_json(crop)

100%|███████████████████████████████████| 604/604 [00:00<00:00, 5143.66it/s]
100%|███████████████████████████████████| 604/604 [00:00<00:00, 8687.94it/s]
100%|███████████████████████████████████| 604/604 [00:00<00:00, 9114.28it/s]
100%|███████████████████████████████████| 604/604 [00:00<00:00, 9034.23it/s]
