# xView Vehicle Object Detection Data Prep

This notebook prepares data for training an object detection model on the xView dataset.

Set an S3 URI that you have write access to here:

In [10]:
# Root S3 URI used as the prefix for the image URIs built later in this notebook.
# Replace it with a location you have write access to.
base_dir = "s3://raster-vision-example"

In [2]:
import os
import json
import rastervision as rv
import boto3
import botocore
import rasterio
import random
import copy
from shapely.geometry import (Polygon, shape)

Root temporary directory cannot be used: None. Using root: /opt/data/tmp/
Temporary directory is: /opt/data/tmp/tmp9yiypnow


In [3]:
# Local path of the xView training labels; the file is fetched from S3 only
# when it is not already present at this path.
label_path = '/opt/data/xview/labels/xView_train.geojson'
if not os.path.exists(label_path):
    # IPython shell escape: $label_path is expanded from the Python variable.
    !aws s3 cp s3://raster-vision-xview-example/xView_train.geojson $label_path

download: s3://raster-vision-xview-example/xView_train.geojson to ../../data/xview/labels/xView_train.geojson


### Filter out non-vehicles

In [4]:
# Parse the xView GeoJSON labels into a dict.
# Reset first so a failed load cannot leave labels from an earlier run in scope.
label_js = None
# Decode explicitly as UTF-8 instead of relying on the locale default, which
# can be ASCII in minimal container environments.
with open(label_path, encoding='utf-8') as label_file:
    label_js = json.load(label_file)

In [5]:
# xView `type_id` values that this notebook treats as vehicles.
# Each row is one contiguous run of ids; the gaps between rows are deliberate.
vehicle_types = [
    17, 18, 19, 20, 21,
    23, 24, 25, 26, 27, 28, 29,
    32,
    53, 54, 55, 56, 57,
    59, 60, 61, 62, 63, 64, 65, 66,
]

In [6]:
# Keep only features whose type_id is a vehicle type, tagging each kept
# feature with a single 'vehicle' class name, then replace the feature list.
vehicle_features = []
for feature in label_js['features']:
    properties = feature['properties']
    if properties['type_id'] not in vehicle_types:
        continue
    properties['class_name'] = 'vehicle'
    vehicle_features.append(feature)
label_js['features'] = vehicle_features

### Find training and test images

In [7]:
# Tally the number of label features per image, keyed by image_id.
image_to_vehicle_counts = {}
for feature in label_js['features']:
    image_id = feature['properties']['image_id']
    image_to_vehicle_counts[image_id] = image_to_vehicle_counts.get(image_id, 0) + 1

In [8]:
# Use the 10% of images that contain the most vehicles for the experiment.
experiment_fraction = 0.1
experiment_image_count = round(len(image_to_vehicle_counts) * experiment_fraction)

# Ascending by vehicle count, so the images with the most vehicles are last.
sorted_images_and_counts = sorted(image_to_vehicle_counts.items(), key=lambda x: x[1])

# Slice from an explicit start index: `[-0:]` would return the *whole* list
# when the count rounds down to zero, instead of selecting nothing.
first_selected = len(sorted_images_and_counts) - experiment_image_count
selected_images_and_counts = sorted_images_and_counts[first_selected:]

In [9]:
# Split the selected images into training and test sets.
ratio = 0.8  # fraction of the selected images used for training
training_sample_size = round(ratio * experiment_image_count)

# Draw the training indices from a dedicated, seeded generator so the split is
# identical on every run and the global `random` state is left untouched.
split_seed = 42
train_sample = random.Random(split_seed).sample(
    range(experiment_image_count), training_sample_size)

train_images = []
test_images = []

In [11]:
# Assign every selected image to exactly one split.
# The indices in `train_sample` are drawn from range(experiment_image_count),
# so the loop must cover that same range; stopping at training_sample_size
# would leave the tail of the selection in neither split.
train_indices = set(train_sample)  # constant-time membership checks

# Rebuild both lists here so re-running this cell does not append duplicates.
train_images = []
test_images = []
for i in range(experiment_image_count):
    img = selected_images_and_counts[i][0]
    # Both splits point into the same 'train_images' prefix under base_dir.
    img_uri = os.path.join(base_dir, 'train_images', img)
    if i in train_indices:
        train_images.append(img_uri)
    else:
        test_images.append(img_uri)

### Get individual geojson for each tiff

In [14]:
# Local directory that receives the per-image GeoJSON label files.
processed_labels_dir = '/opt/data/xview/processed_labels/'

In [19]:
def subset_labels(tiff_list, processed_labels_dir):
    """Write one GeoJSON label file per image in ``tiff_list``.

    For each image URI, the features of the notebook-level ``label_js`` whose
    ``properties['image_id']`` equals the URI's basename are written to
    ``<processed_labels_dir>/<basename without extension>.geojson``. All other
    top-level keys of ``label_js`` are carried over unchanged. An image with
    no matching features still gets a file, with an empty feature list.

    Args:
        tiff_list: iterable of image URIs or paths; only the basename is used.
        processed_labels_dir: output directory, created if it does not exist.

    Returns:
        None. The results are the files written to disk.
    """
    tiff_list = list(tiff_list)

    # Group features by image in a single pass instead of rescanning the
    # complete feature list once per image.
    features_by_image = {os.path.basename(uri): [] for uri in tiff_list}
    for feature in label_js['features']:
        image_id = feature['properties']['image_id']
        if image_id in features_by_image:
            features_by_image[image_id].append(feature)

    if processed_labels_dir:
        os.makedirs(processed_labels_dir, exist_ok=True)

    # Iterate the list that was passed in, not a notebook-level list, so the
    # function works for any split.
    for tiff_uri in tiff_list:
        tiff_basename = os.path.basename(tiff_uri)
        # A shallow copy is enough: only the 'features' entry is swapped and
        # the result is serialised immediately, so label_js is never mutated.
        labels_subset = dict(label_js)
        labels_subset['features'] = features_by_image[tiff_basename]

        stem = os.path.splitext(tiff_basename)[0]
        out_path = os.path.join(processed_labels_dir, '{}.geojson'.format(stem))
        with open(out_path, 'w') as out_file:
            json.dump(labels_subset, out_file, indent=4)
            

In [None]:
# Run the per-image label export once for each split.
for tiff_list in (train_images, test_images):
    subset_labels(tiff_list, processed_labels_dir)

In [None]:
# IPython shell escape: list the contents of the output directory as a quick check.
!ls $processed_labels_dir