## Applying Deep Learning Techniques to Predict Simulium Damnosum Habitat Locations
<hr>
### Data
- 0.6m panchromatic and 2.4m 8-band satellite data from WorldView-2
- ~30 data points of river locations and Simulium Damnosum larvae counts

### Approach
Having few labeled data points and an abundance of unlabeled data makes this problem suitable for a semi-supervised approach. We will first build a variational autoencoder to extract features, which we can then feed into a supervised model trained on the labels.

### 1: Filter river coordinates
Unlabeled river coordinates are highly correlated and overlap. Decorrelating them requires filtering out redundant points and ensuring the selected points are well spread out. A simple way to do this is to overlay a grid on the satellite data and select at most one point from each grid cell.

In [49]:
"""
Read river points
"""
from osgeo import ogr

river_points_file = 'data/uganda_river_points/uganda_river_points.shp'

# ogr.Open may return None instead of raising when the file cannot be
# opened; fail here with a clear message rather than on the next attribute
# access.
point_data = ogr.Open(river_points_file)
if point_data is None:
    raise IOError('Could not open river points file: ' + river_points_file)

layer = point_data.GetLayer(0)

# Collect one {'lat': ..., 'long': ...} dict per point feature.
# Iterating the layer directly visits every feature without assuming that
# feature ids are exactly 0..GetFeatureCount()-1.
points = []
for feature in layer:
    geometry = feature.GetGeometryRef()
    if geometry is None:
        # A feature without geometry has no coordinate to record.
        continue
    # NOTE(review): Y is stored as latitude and X as longitude, which assumes
    # the shapefile uses a geographic coordinate system -- confirm.
    points.append(dict(lat=geometry.GetY(), long=geometry.GetX()))
    
    
"""
Read satellite data
"""

# Import from the osgeo package (as the river-points code already does for
# ogr); the bare top-level `gdal` / `osr` modules are deprecated shims.
from osgeo import gdal
from osgeo import osr
import numpy as np

sat_file = 'data/sat/06/06m.tif'

# gdal.Open may return None instead of raising when the file cannot be read.
tif = gdal.Open(sat_file)
if tif is None:
    raise IOError('Could not open satellite image: ' + sat_file)

# Raster size in pixels.
width = tif.RasterXSize
height = tif.RasterYSize

# Affine coefficients mapping pixel indices to image-CRS coordinates.
geo_transform = tif.GetGeoTransform()

# Coordinate system the raster is stored in.
img_space = osr.SpatialReference()
img_space.ImportFromWkt(tif.GetProjectionRef())

# Geographic coordinate system of the input points.
geo_space = osr.SpatialReference()
geo_space.SetWellKnownGeogCS('WGS84')

# GDAL 3+ honours the authority axis order, which for WGS84 is
# (latitude, longitude). The points are later passed as (long, lat), so pin
# the traditional x/y order where the option exists; older GDAL versions
# already behave this way and do not define the constant.
if hasattr(osr, 'OAMS_TRADITIONAL_GIS_ORDER'):
    geo_space.SetAxisMappingStrategy(osr.OAMS_TRADITIONAL_GIS_ORDER)
    img_space.SetAxisMappingStrategy(osr.OAMS_TRADITIONAL_GIS_ORDER)

# Used to convert from geocoordinate space to image space
transform = osr.CoordinateTransformation(geo_space, img_space)

"""
Convert points to image space
"""

# Points closer than this many pixels to any image edge are discarded.
edge_margin = 50

image_points = []
print(geo_transform[0], geo_transform[3])
for point in points:
    # TransformPoint returns (x, y, z); only the planar part is needed.
    map_x, map_y = transform.TransformPoint(point['long'], point['lat'])[:2]

    # Invert the geotransform using its origin and pixel-size terms only.
    # NOTE(review): the rotation terms geo_transform[2] and geo_transform[4]
    # are ignored, which assumes a north-up raster -- confirm.
    col = int((map_x - geo_transform[0]) / geo_transform[1])  # x pixel
    row = int((map_y - geo_transform[3]) / geo_transform[5])  # y pixel

    # check bounds
    inside_x = edge_margin <= col < width - edge_margin
    inside_y = edge_margin <= row < height - edge_margin
    if not (inside_x and inside_y):
        continue

    # `np.int` was only an alias for the builtin int and has been removed
    # from NumPy; the builtin gives the same dtype.
    image_points.append(np.array([col, row], dtype=int))

"""
Filter points based on a grid
"""

# Side length, in pixels, of one square grid cell.
grid_scale = 50

# Maps (cell_x, cell_y) -> the first point that fell into that cell.
# A tuple of plain ints is used as the key instead of the printed form of a
# NumPy array, and no reference is made to the removed `np.int` alias.
cell_to_point = {}
for point in image_points:
    cell = tuple(int(c) for c in point // grid_scale)
    # setdefault keeps the first point seen for a cell and ignores the rest.
    cell_to_point.setdefault(cell, point)

# One row per occupied cell, in first-seen order. The reshape keeps the
# result two-dimensional (0 x 2) even when no point survived.
filtered_points = np.array(list(cell_to_point.values()), dtype=int).reshape(-1, 2)

"""
Randomize points to further decorrelate
"""

# Reorder the rows of filtered_points in place, using NumPy's global
# random state.
np.random.shuffle(filtered_points)

"""
Store filtered points as csv
"""

# Destination of the shuffled pixel coordinates.
out_file = 'data/filtered_points.csv'

# One point per line, formatted as comma-separated integers.
np.savetxt(
    out_file,
    filtered_points,
    fmt='%i',
    delimiter=',',
)

423348.0 371402.0


### 2: Extract site images
Use unlabeled pixel coordinates as the center point for extracting image patches

In [50]:
import os

# Import from the osgeo package; the bare top-level `gdal` / `osr` modules
# are deprecated shims.
from osgeo import gdal
from osgeo import osr
import numpy as np
# NOTE(review): scipy.misc.toimage was deprecated in SciPy 1.0 and removed in
# 1.2, so this import fails on current SciPy. The documented replacement is
# Pillow's Image.fromarray; it is not switched here because that would add a
# package this file does not import.
from scipy.misc import toimage

sat_file = 'data/sat/06/06m.tif'
point_file = 'data/filtered_points.csv'
site_dir = 'data/sites'

# Each site image is a square patch centred on a river point.
patch_size = 100
half_patch = patch_size // 2

# gdal.Open may return None instead of raising when the file cannot be read.
tif = gdal.Open(sat_file)
if tif is None:
    raise IOError('Could not open satellite image: ' + sat_file)

# Look the three bands up once instead of once per patch.
rgb_bands = [tif.GetRasterBand(band) for band in range(1, 4)]

# Saving fails if the output directory is missing, so create it up front.
os.makedirs(site_dir, exist_ok=True)

# dtype=int keeps pixel coordinates integral (loadtxt defaults to float);
# ndmin=2 keeps a file holding a single point as one row rather than a
# flat array, so the loop always sees (x, y) pairs.
points = np.loadtxt(point_file, delimiter=',', dtype=int, ndmin=2)
for i, point in enumerate(points):
    # Top-left pixel of the patch, as plain Python ints for GDAL.
    left = int(point[0]) - half_patch
    top = int(point[1]) - half_patch

    pixels = np.zeros((patch_size, patch_size, 3), dtype=np.uint8)
    for channel, raster in enumerate(rgb_bands):
        # NOTE(review): if the raster stores more than 8 bits per sample this
        # assignment silently wraps values into uint8 -- confirm the source
        # depth or rescale first.
        pixels[:, :, channel] = raster.ReadAsArray(left, top, patch_size, patch_size)

    img = toimage(pixels)
    img.save('{0}/{1}.png'.format(site_dir, i))