## Demonstration: Reading and reanalyzing high resolution data on AWS

In this notebook, you will see how to:

- Read and write data from an EC2 instance to/from an AWS S3 bucket
- Use python's `rasterio`, `geopandas`, and `shapely` to subset and retile PlanetScope basemap imagery
- Calculate NDVI using the retiled data

## Preparation
### Packages

In [1]:
import cloudtools as ct  # from https://github.com/agroimpacts/cloudtools.git
import re
import os
import errno
from subprocess import run

import rasterio
from rasterio.merge import merge
from rasterio import fill
from rasterio.plot import show
from rasterio.io import MemoryFile
from rasterio.warp import reproject, Resampling

from shapely.geometry import box
import geopandas as gpd
from geopandas.tools import sjoin

import affine

import numpy as np
import pandas as pd

import tempfile

### Functions

We'll need the following functions to do the image retiling and reprojection further down.

In [None]:
def get_tempfile_name(file_name = 'mosaic.tif'):
    """Reserve a uniquely named file in the system temporary directory.

    Parameters
    ----------
    file_name : str
        Base name appended, after an underscore, to a random prefix.

    Returns
    -------
    str
        Path of the form ``<tmpdir>/<random>_<file_name>``. The (empty) file
        is created so the name cannot be claimed by another process; the
        caller is expected to overwrite it and remove it when done.
    """
    # mkstemp is the public, race-free way to obtain a unique temporary
    # name; it replaces the private tempfile._get_candidate_names() helper.
    fd, file_path = tempfile.mkstemp(prefix="", suffix="_" + file_name)
    # Only the path is needed; release the OS-level handle immediately.
    os.close(fd)
    return file_path

def dst_transform(poly, res = 0.005 / 200):
    """
    Create transform from boundaries of tiles
    
    Parameters
    ----------
    poly : GeoDataFrame
        Polygon(s) containing dimensions of interest. Must have a
        'geometry' column.
    res : float
        Resolution desired for output transform, in the units of the
        polygon's coordinate system
    
    Returns
    -------
    An Affine transform whose origin is the upper-left corner (minx, maxy)
    of the overall extent of `poly`, with square pixels of size `res`

    """
    # total_bounds is (minx, miny, maxx, maxy) over every row, so the origin
    # is correct even when `poly` holds more than one geometry.
    minx, _, _, maxy = poly['geometry'].total_bounds
    return affine.Affine(res, 0, minx, 0, -res, maxy)


def reproject_retile_image(
    src_images, dst_transform, dst_width, dst_height, nbands, 
    dst_crs, fileout, dst_dtype = np.int16, inmemory = True, cleanup=True):
    """Merge (if several), reproject and retile imagery onto a target grid.

    Takes an input image or a list of images, mosaics them when a list is
    given, and warps the result so that it aligns with the extent and
    resolution defined by an affine transform and output dimensions.

    Parameters
    ----------
    src_images : str or list of str
        File path or list of file paths to input image(s). A list of images
        will be merged first.
    dst_transform : affine.Affine
        Affine transformation object defining the georeferencing of the
        output image
    dst_width : int
        The pixel width of the output image
    dst_height : int
        The pixel height of the output image
    nbands : int
        Number of bands to read from the input and write to the output
    dst_crs : str
        Code for output CRS, e.g "EPSG:4326"
    fileout : str
        Output file path and name for the output image
    dst_dtype : type
        Numpy data type the reprojected values are rounded and cast to
        before writing (default is int16)
    inmemory : bool
        If a mosaic should be made in memory or not. Default is True.
        If set to False the mosaic is written to a temporary file in the
        system temporary directory and read back from there.
    cleanup : bool
        Whether to remove the temporary mosaic (if made) or not

    Returns
    -------
    None
        The retiled image is written to `fileout` as a side effect.

    Raises
    ------
    ValueError
        If `src_images` is an empty list or tuple.
    """

    def reproject_retile(src, nbands, dst_height, dst_width, fileout, dst_dtype):
        """Warp each band of the open dataset `src` and write `fileout`."""
        # Start from the source metadata and override the grid definition.
        # NOTE(review): the output profile keeps the source "dtype" from
        # src.meta while the array is cast to dst_dtype before writing;
        # confirm whether "dtype" should be updated as well.
        kwargs = src.meta.copy()
        kwargs.update({
            "width": dst_width,
            "height": dst_height,
            "count": nbands,
            "crs": dst_crs,
            "transform": dst_transform,
        })
        # Float canvas so cubic resampling is not truncated band by band
        dst_canvas = np.zeros((nbands, dst_height, dst_width))
        for i in range(1, nbands + 1):
            dst_canvas[i-1,] = reproject(
                source = rasterio.band(src, i),
                destination = dst_canvas[i-1,],
                src_transform = src.transform,
                src_crs = src.crs,
                dst_transform = dst_transform,
                dst_crs = dst_crs,
                resampling = Resampling.cubic
            )[0]
        with rasterio.open(fileout, "w", **kwargs) as dst:
            dst.write(np.rint(dst_canvas).astype(dst_dtype))

    # mosaic if list
    if isinstance(src_images, (list, tuple)):
        if not src_images:
            raise ValueError("src_images is empty; nothing to mosaic")
        print('Mosaicking {} images'.format(len(src_images)))

        images_to_mosaic = []
        try:
            for image in src_images:
                images_to_mosaic.append(rasterio.open(image))

            # perform mosaic
            mosaic, out_trans = merge(images_to_mosaic)

            # Base the mosaic metadata on the last input image
            out_meta = images_to_mosaic[-1].meta.copy()
        finally:
            # The mosaic array is in memory now; release every source handle
            for opened in images_to_mosaic:
                opened.close()

        out_meta.update({
            "height": mosaic.shape[1],
            "width": mosaic.shape[2],
            "transform": out_trans,
        })

        if inmemory:
            print('Mosaicking in memory')
            with MemoryFile() as memfile:
                with memfile.open(**out_meta) as dst:
                    dst.write(mosaic)

                print('Reprojecting, retiling {}'.format(os.path.basename(fileout)))
                # Re-open the in-memory mosaic for reading so that the
                # mosaic itself (not one of the input images) is warped.
                with memfile.open() as mosaic_src:
                    reproject_retile(mosaic_src, nbands, dst_height,
                                     dst_width, fileout, dst_dtype)
        else:
            temp_mosaic = get_tempfile_name('mosaic.tif')
            print('Creating temporary mosaick {}'.format(temp_mosaic))
            try:
                with rasterio.open(temp_mosaic, "w", **out_meta) as dst:
                    dst.write(mosaic)

                print('Reprojecting, retiling {}'.format(os.path.basename(fileout)))
                with rasterio.open(temp_mosaic, "r") as src:
                    reproject_retile(src, nbands, dst_height, dst_width,
                                     fileout, dst_dtype)
            finally:
                # Remove the temporary mosaic even if warping failed
                if cleanup and os.path.exists(temp_mosaic):
                    print('Removing temporary mosaick {}'.format(temp_mosaic))
                    os.remove(temp_mosaic)

    else:
        print('Retiling from single image')
        print('Reprojecting, retiling {}'.format(os.path.basename(fileout)))
        with rasterio.open(src_images, "r") as src:
            reproject_retile(src, nbands, dst_height, dst_width, fileout,
                             dst_dtype)

    print('Retiling and reprojecting of {} complete!'.format(fileout))

### Directory for local outputs

In [None]:
# Project root is two directory levels above the current working directory
proj_dir = os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd())))
image_dir = os.path.join(proj_dir, 'projects/images')

# Create the output directory (and any parents); an existing directory is
# not an error, but a non-directory at that path still raises.
os.makedirs(image_dir, exist_ok=True)

### Datasets

#### Images

Create credentials needed to read PlanetScope imagery stored on an S3 bucket called `activemapper`. Use those credentials to create a list of PlanetScope images stored in the bucket. 

In [None]:
# S3 resource handle created through cloudtools (no named profile)
s3resource = ct.aws_resource('s3', profile=None)
# Keys in bucket 'activemapper' under 'ecaas_2021/nicfi'; the last argument
# presumably filters on the '.tif' suffix - confirm against cloudtools
keys = ct.list_objects(s3resource, 'activemapper', 'ecaas_2021/nicfi', '.tif')
# Keep only keys whose name contains '2021-11'
image_list = [key for key in keys if re.search(r'2021-11', key)]

# Print the first ten image names
for image in image_list[0:10]:
    print(os.path.basename(image))

#### Areas of interest

Our area of interest with these data is in Ghana, where we have a tiling grid that we use to prepare imagery for land cover mapping using a deep learning model. We are going to use this grid to retile the PlanetScope imagery, which has its own grid, reprojecting it to another coordinate system and resampling it to a slightly finer resolution.  

We first need to load in the grid, which is stored as a geojson, also on S3. 

In [None]:
# Tiling grid (GeoJSON), read directly from its S3 URL
tilefile = "s3://activemapper/ecaas_2021/geoms/ghana_ejura_tain_buf179.geojson"
tile_polys = gpd.read_file(tilefile)
# Cast the tile ID and row/column columns to explicit types
tile_polys = tile_polys.astype(
    {"tile": "str", "tile_col": "int", "tile_row": "int"}
)

And then plot it to look at it

In [None]:
# Draw only the outlines of the grid tiles
tile_polys.boundary.plot(figsize=(25,10))

#### Overlap between images and AOI tiles

The first thing we want to do is figure out where the image data are in relation to our tile grid. To do that, we need to get the extent of each PlanetScope tile, and turn those extents into a set of polygons.  

The first thing we need to do is convert the list of file paths we collected for imagery into a set of paths that can be used to read in images using rasterio. 

In [None]:
# Prefix each S3 key with the bucket's /vsis3/ path
s3path = "/vsis3/activemapper"
image_paths = ["{}/{}".format(s3path, image) for image in image_list]
# Show the first ten paths
for path in image_paths[0:10]:
    print(path)

Note the `/vsis3/` at the front of the path. This is the construction used by `gdal` to read data from an S3 bucket (`rasterio` is a package based on `gdal`). 

We are now going to read through that list of images and collect their bounding boxes, turn them into a geometry, and then combine them into a `GeoDataFrame`. 

This code block creates a list of polygons. 

In [None]:
# One single-row GeoDataFrame (file name + bounding-box polygon) per image
image_bounds = []
for image in image_paths:
    # Open each image only long enough to read its bounds, so that dataset
    # handles are not left open. `img` stays bound after the loop (closed)
    # because a later cell reads `img.crs` from it.
    with rasterio.open(image) as img:
        geom = box(*img.bounds)
    pol = gpd.GeoDataFrame({"file":os.path.basename(image),"geometry":[geom]})
    image_bounds.append(pol)

And this combines them into a `GeoDataFrame`

In [None]:
# Stack the per-image frames into one GeoDataFrame. The CRS is taken from
# `img`, the last dataset opened in the preceding loop.
nicfi_tile_polys = gpd.GeoDataFrame(
    pd.concat(image_bounds, ignore_index=True), 
    crs=img.crs
)
nicfi_tile_polys

We'll do a little more to extract the row and column ID from the file name. 

In [None]:
# def sub_function(variable, position):
#     out = re.sub("planet*.*mosaic_|.tif", "", variable).split("-")[position]
#     return out

# nicfi_tile_polys = nicfi_tile_polys.assign(
#     row = lambda df: df['file'].map(lambda file: sub_function(file, 0))
# ).assign(
#     col = lambda df: df['file'].map(lambda file: sub_function(file, 1))
# )[['row', 'col', 'file', 'geometry']]

def sub_function(variable):
    """Strip the mosaic prefix and the .tif extension from a file name.

    Removes everything from "planet" through "mosaic_" and a trailing
    ".tif", leaving the remainder of the name (used as the tile ID).

    Parameters
    ----------
    variable : str
        Image file name

    Returns
    -------
    str
        The file name with the prefix and extension removed
    """
    # Raw string with an escaped, end-anchored dot: only a literal ".tif"
    # at the end of the name is removed, not any character followed by
    # "tif" elsewhere in the name.
    return re.sub(r"planet.*mosaic_|\.tif$", "", variable)

# Derive the tile ID from each file name, then keep the columns in the
# order tile, file, geometry
nicfi_tile_polys = nicfi_tile_polys.assign(
    tile=nicfi_tile_polys['file'].map(sub_function)
)[['tile', 'file', 'geometry']]

nicfi_tile_polys

Let's have a look at the image boundaries (blue). More or less the same shape as our tiling grid. 

In [None]:
# Draw only the outlines of the image footprints
nicfi_tile_polys.boundary.plot(figsize=(25,10))

#### Transform AOI tiles

However, the two sets of data are in different coordinate systems, as we can see:

In [None]:
# Display the CRS of the image footprints
nicfi_tile_polys.crs

In [None]:
# Display the CRS of the tiling grid
tile_polys.crs

So we have to reproject the tiling grid into mercator:

In [None]:
# Reproject the tiling grid into the CRS of the image footprints
tile_polys_merc = tile_polys.to_crs(nicfi_tile_polys.crs)

And now we can compare the two together:

In [3]:
# Image footprints first (default colour), then the reprojected grid in red
# on the same axes
ax1 = nicfi_tile_polys.boundary.plot(figsize=(25,10))
tile_polys_merc.boundary.plot(ax = ax1, color='red')

The image boundaries (blue) are much larger than the tile grid (red). 

## Select image subset

Now let's select one of the red tiles and then use that to:

1. Pull out a subset of the PlanetScope data corresponding to the tile boundaries
2. Reproject that imagery and resample it
3. Store it in a new bucket for later analysis

You will see above that the two grids are not exactly aligned, so one of the red tiles might actually overlap several PlanetScope images. So we will:

4. Select a specific tile
5. See which PlanetScope images it intersects

In [None]:
# index
tile = tile_polys_merc.iloc[[499]]
tiles_int = sjoin(tile, nicfi_tile_polys, how='left')
tiles_int

We use a spatial join to identify intersecting NICFI tiles (identified with the columns index_right, tile_right, and file). These correspond to our selected tile (red below), which is tile565867 (tile_left). It overlaps four NICFI images (grey). 

In [None]:
# Footprints whose file name appears in the spatial-join result
nicfi_tiles_int = nicfi_tile_polys[nicfi_tile_polys['file'].isin(tiles_int['file'])]

# All footprints as outlines, matched footprints filled grey, selected tile
# outlined in red
ax1 = nicfi_tile_polys.boundary.plot(figsize=(25,10))
nicfi_tiles_int.plot(ax = ax1, color='grey')
tile.boundary.plot(ax = ax1, color='red')

## Retile and resample images

Read image(s) to subset into a list

In [5]:
# A single matching image is passed on as a plain path string; several
# matching images are passed on as a list of paths (to be mosaicked).
if len(nicfi_tiles_int['file']) <= 1:
    image_list = '/vsis3/activemapper/ecaas_2021/nicfi/{}'.format(
        nicfi_tiles_int['file'].values[0]
    )
else:
    image_list = [
        '/vsis3/activemapper/ecaas_2021/nicfi/{}'.format(file)
        for file in nicfi_tiles_int['file']
    ]
image_list

### Output paths

Set up output paths for image and COG version of image (the final one we want)

In [None]:
# Local output path, named after the selected tile's ID
dst_img = '{}/{}_2021-11_buf179.tif'.format(
    image_dir, 
    tile['tile'].values.flatten()[0]
)
# Swap only the trailing ".tif" extension; the escaped, anchored pattern
# cannot match elsewhere in the path.
dst_cog = re.sub(r'\.tif$', '_cog.tif', dst_img)

### Define the output extent and projection

By using the GCS version of the tile polygon

In [None]:
# Look up the selected tile in the original (un-reprojected) grid by tile ID
poly = tile_polys[tile_polys['tile'].isin(tile['tile'])]
# Affine transform anchored on that polygon, at the default resolution
transform = dst_transform(poly)

### Retile and reproject

In [None]:
# Output grid: 2358 x 2358 pixels, 4 bands, EPSG:4326. inmemory=False
# stages the mosaic in a temporary file on disk.
reproject_retile_image(image_list, transform, 2358, 2358, 4, "EPSG:4326", 
                       dst_img, inmemory=False)

## Create Cloud-Optimized Geotiff

To optimize the image for use in cloud storage

First create the COG.

In [None]:
# Run the rio-cogeo command line tool on the retiled image
cmd = ['rio', 'cogeo', 'create', '-b', '1,2,3,4', dst_img, dst_cog]
p = run(cmd, capture_output=True)
# Show the last line the tool wrote to stderr. splitlines() avoids the
# IndexError the fixed [-2] index raised when stderr was empty.
msg = p.stderr.decode().splitlines()
msg[-1] if msg else ''

Then check that it is valid

In [8]:
# Read bands 3, 2, 1 and close the dataset once the pixels are in memory
with rasterio.open(dst_cog) as cog_src:
    cog = cog_src.read([3,2,1])
    cog_transform = cog_src.transform
# Min-max scale to the 0-1 range for display
cog_norm = (cog - cog.min()) / (cog.max() - cog.min())

ax1 = poly.boundary.plot(figsize=(25,10))
# Pass the transform so the array is drawn in map coordinates, on the same
# extent as the tile boundary, instead of in pixel coordinates.
show(cog_norm, ax=ax1, transform=cog_transform)
# poly.boundary.plot(ax = ax1, color='red'

## Move to S3

We will write this to a separate S3 bucket, `s3://geog287387`. You will specify your own folder names underneath that. 

In [9]:
# Top-level folder and sub-folder under which the file will be stored
myusername = 'testuser'
foldername = 'nicfi'

bucket = 'geog287387'  # don't change this

We'll use these to create an AWS S3 client and the S3 key (path) to move the file to.

In [None]:
# S3 client created through cloudtools (no named profile)
s3_client = ct.aws_client('s3', profile=None)
# Destination key: <username>/<folder>/<COG file name>
s3_file = '{}/{}/{}'.format(myusername, foldername, os.path.basename(dst_cog))
s3_file

And upload to S3

In [None]:
# Stream the local COG to the bucket under the key built above
with open(dst_cog, "rb") as f:
    s3_client.upload_fileobj(f, bucket, s3_file)

### Check image on S3

Now let's check it on S3, both by looking at the console and by querying the bucket from here

In [None]:
# List what is stored under <username>/<folder> in the destination bucket
s3resource = ct.aws_resource('s3', profile=None)
keys = ct.list_objects(s3resource, bucket, myusername + "/" + foldername)
# image_list = [key for key in keys if re.search(r'2021-11', key)]
keys

That shows you which images are on your bucket.

Now let's read the image in directly from the bucket and plot it here.

In [None]:
# /vsis3/ path of the uploaded COG
s3cog_path = '/vsis3/{}/{}'.format(bucket, s3_file)

ax1 = poly.boundary.plot(figsize=(25,10))
# Keep the dataset open only while it is being drawn, then close it
with rasterio.open(s3cog_path) as s3cog:
    show(s3cog, ax = ax1)

Now that we have our image on S3, we can remove it from our local directory. Let's first see what is in it:

In [None]:
# Show the current contents of the local output directory
os.listdir(image_dir)

And we can remove the files we created by name

In [None]:
# Delete the retiled image and its COG copy from local disk
for f in [dst_img, dst_cog]:
    os.remove(f)

# List the directory again after the removal
os.listdir(image_dir)

## Assignment

Now that you have seen all the steps for selecting and retiling an image, try this again, but with a new tile selected from our tile grid. Go back up to the [Select Image Subset](#Select-image-subset) section, and choose a new tile ID, and work back down to here. 