# Converting the Drainage Crossings Dataset to COCO Format

In [29]:
import os
import pandas as pd
import numpy as np
import shutil

In [2]:
# change to dataset path, all else can then be run without modifying
dataset_path = "/home/denys/dl-gpu/Data_share"

In [8]:
directories = ['CA', 'IL', 'ND', 'NE']

## 1. Renaming files, creating a data catalog, and splitting the dataset

In [14]:
# rename all image files to reflect their physical location
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_data')
    for file in os.listdir(dirpath):
        rename = f"{d}_{file}"
        new_path = os.path.join(dirpath, rename)
        os.rename(os.path.join(dirpath, file), new_path)

In [15]:
# rename annotation directories for consistency
os.rename(os.path.join(dataset_path, 'CA', 'annotations'), os.path.join(dataset_path, 'CA', 'CA_annotations'))
os.rename(os.path.join(dataset_path, 'IL', 'annotations'), os.path.join(dataset_path, 'IL', 'IL_annotations'))

In [16]:
# rename all annotation files to match their corresponding image
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_annotations')
    for file in os.listdir(dirpath):
        rename = f"{d}_{file}"
        new_path = os.path.join(dirpath, rename)
        os.rename(os.path.join(dirpath, file), new_path)

In [37]:
# create a dataframe with files, annotations, and corresponding paths
file_list = []
for d in directories:
    dirpath = os.path.join(dataset_path, d, f'{d}_data')
    for file in os.listdir(dirpath):
        if file.endswith('.tif'):
            annotation = file[:-4]+'.xml'
            file_list.append({
                'filename': file, 
                'filepath':os.path.join(dirpath, file),
                'annpath': os.path.join(dataset_path, d, f'{d}_annotations', annotation),
                'ann': annotation
            })

In [38]:
df = pd.DataFrame(file_list)
df.head()

Unnamed: 0,filename,filepath,annpath,ann
0,CA_682.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_68...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_682.xml
1,CA_1810.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_18...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_1810.xml
2,CA_1671.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_16...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_1671.xml
3,CA_1866.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_18...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_1866.xml
4,CA_969.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_96...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_969.xml


In [39]:
# randomly split dataset
np.random.seed(0)
def assign_usage(file):
    n = np.random.rand()
    if n < 0.7:
        return 'train'
    elif n < 0.9:
        return 'validate'
    else:
        return 'test'

df['usage'] = df.apply(assign_usage, axis=1)
df.head()

Unnamed: 0,filename,filepath,annpath,ann,usage
0,CA_682.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_68...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_682.xml,train
1,CA_1810.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_18...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_1810.xml,validate
2,CA_1671.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_16...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_1671.xml,train
3,CA_1866.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_18...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_1866.xml,train
4,CA_969.tif,/home/denys/dl-gpu/Data_share/CA/CA_data/CA_96...,/home/denys/dl-gpu/Data_share/CA/CA_annotation...,CA_969.xml,train


## Restructuring the data directory to conform to COCO specifications

In [None]:
# create training, testing, and validation directories

# use shutil to move files into directories based on the usage column

## Iterating through annotation files to create the COCO json for each directory

In [None]:
# for each given directory:

# initialize COCO json for the directory

# read in all xml files in a given directory

# extract annotations and append to json