# ML4Floods - Query Function Development

Authors: Beth Curran (Mitchell) and Christopher Rudolph

Supervised by Cormac Purcell and Gonzalo Mateo-Garcia for Trillium Technologies. 

Original codebase by Gonzalo Mateo-Garcia https://github.com/spaceml-org/ml4floods/blob/0f8fdc4b6b891ed249f52fda19ee23c50462e237/ml4floods/data/ee_download.py

Aim: To adapt and extend the code from ee_download.py in order to query available Earth Engine data for a given area of interest (AOI). This addresses deliverable **D1.1**.

**D1.1 Workflow to easily define event parameters (time-ranges, AOIs), query available satellite data and visualise satellite metadata.**

## Packages Required

In [None]:
pip install git+https://github.com/spaceml-org/ml4floods#egg=ml4floods

In [4]:
import traceback
import warnings

from io import StringIO
import ee
import time
import os
from glob import glob # not sure what this was used for -chris
from typing import Optional, Callable, List, Tuple
from shapely.geometry import mapping, Polygon
import numpy as np
import geopandas as gpd
import pandas as pd
import fsspec
from datetime import datetime, timezone
import math

from ml4floods.data import ee_download, utils

## Setting up Search Parameters

### Area of Interest (AOI) Boundaries

In [9]:
#Reading in the AOI 

# For the purposes of this workflow we assume the AOI is given as a file that is readable by geopandas.
def read_aoi(my_gpd_aoi:str):
    """Load an AOI file and return its merged Earth Engine geometry.

    Args:
        my_gpd_aoi: Location of a vector file readable by
            ``geopandas.read_file``. The file must have an ``aoi_code``
            column.

    Returns:
        A tuple ``(total_bounds, aoi_codes, n_polygons)``:

        * ``total_bounds`` - ``ee.Geometry`` built from the geometry of the
          whole feature collection.
        * ``aoi_codes`` - the ``aoi_code`` column of the file.
        * ``n_polygons`` - number of top-level entries in the geometry's
          ``coordinates`` list.
          NOTE(review): this is the polygon count when the merged geometry
          is a MultiPolygon; confirm the behaviour wanted for a file that
          holds a single polygon.

    Raises:
        KeyError: If the file has no ``aoi_code`` column.
    """
    # Imported here because json is not among the imports visible at the top
    # of this notebook.
    import json

    my_aoi = gpd.read_file(my_gpd_aoi)
    aoi_codes = my_aoi['aoi_code']

    # Convert the AOI to GeoJSON text and parse it back into plain Python
    # containers, which is what ee.FeatureCollection is given below.
    js = json.loads(my_aoi.to_json())

    total_bounds = ee.Geometry(ee.FeatureCollection(js).geometry())

    # getInfo() fetches the geometry description from Earth Engine.
    total_bounds_coordinates = total_bounds.getInfo()['coordinates']
    n_polygons = len(total_bounds_coordinates)

    print('Number of polygons in AOI:', n_polygons)

    return total_bounds, aoi_codes, n_polygons


### Desired Date Range

In [20]:
# Specifying the date range of interest as a variable to be passed on.
# The years are also returned as separate values for year-long data (e.g. permanent water).

# User to input the start and end dates, each with format 'dd/mm/yy', separated by a comma.

# Placeholder values: replace both strings with real dates before calling get_datetime,
# which parses them with the '%d/%m/%y' format.
date_range = ('dd/mm/yy', 'dd/mm/yy')

def get_datetime(date_range: tuple):
    """Parse a (start, end) pair of 'dd/mm/yy' strings.

    Args:
        date_range: Sequence whose first two items are date strings in
            ``'%d/%m/%y'`` format (start date, then end date).

    Returns:
        A tuple ``(datetime_start, datetime_end, start_year, end_year)``:
        the two parsed ``datetime`` objects followed by their years.

    Raises:
        ValueError: If either string does not match ``'%d/%m/%y'``.
    """
    date_format = '%d/%m/%y'

    # Parse the start date first, then the end date.
    parsed_start, parsed_end = (
        datetime.strptime(date_range[position], date_format)
        for position in (0, 1)
    )

    return parsed_start, parsed_end, parsed_start.year, parsed_end.year

### Boundaries for each Polygon in AOI

In [24]:
#returns a list of the bounds of the polygons, with a parameter for how many polygons you want to use. 
#The idea is that if you want all of them you can input the number printed from the 'read_aoi' function call.

def get_polygon_bounds(total_bounds, aoi_codes, n_polygons):
    """Build one Earth Engine polygon per AOI, limited to the first n_polygons.

    Args:
        total_bounds: Object whose ``getInfo()`` returns a dict with a
            ``'coordinates'`` list (the ``ee.Geometry`` from ``read_aoi``).
        aoi_codes: Sliceable collection of AOI codes.
        n_polygons: How many leading polygons / codes to keep.

    Returns:
        A tuple ``(polygon_bounds, polygon_aoi_codes)``: the accumulated
        ``ee.Geometry.Polygon`` objects (an empty list if nothing was
        selected) and ``aoi_codes[0:n_polygons]``.
    """
    # Fetch the coordinate description of the whole AOI from Earth Engine.
    coordinate_sets = total_bounds.getInfo()['coordinates']

    # Keep only the codes that belong to the selected polygons.
    selected_codes = aoi_codes[0:n_polygons]

    # The selection is capped at n_polygons because of processing times.
    geometries = []
    for ring_set in coordinate_sets[0:n_polygons]:
        geometries = np.append(geometries, ee.Geometry.Polygon(ring_set))

    return geometries, selected_codes

## Initial Queries

### Reading in Data on Available Images

In [6]:
#returns arrays of imageCollections for each source (landsat and s2)

def get_imageCollections(polygon_bounds: np.ndarray, dates: tuple):
    """Query the Landsat and S2 image collections for every polygon.

    Args:
        polygon_bounds: The geometries/polygons extracted from the AOI, one
            entry per polygon.
        dates: Indexable whose items 0 and 1 are passed to the collection
            helpers as ``date_start`` and ``date_end``.

    Returns:
        A tuple ``(landsat_imgs, s2_imgs)`` holding, for each polygon in
        order, the result of ``ee_download.get_landsat_collection`` and
        ``ee_download.get_s2_collection`` respectively.
    """
    fetch_landsat = ee_download.get_landsat_collection
    fetch_s2 = ee_download.get_s2_collection

    landsat_imgs, s2_imgs = [], []

    for aoi_geometry in polygon_bounds:
        # Both sources are queried with exactly the same arguments.
        query = {
            'date_start': dates[0],
            'date_end': dates[1],
            'bounds': aoi_geometry,
        }
        landsat_imgs = np.append(landsat_imgs, fetch_landsat(**query))
        s2_imgs = np.append(s2_imgs, fetch_s2(**query))

    return landsat_imgs, s2_imgs

### Extracting Relevant Properties from Data and Formatting into a Dataframe

In [None]:
def _bounds_coordinates(bounds):
    """Return the 'coordinates' entry describing one polygon's boundary."""
    # Objects exposing getInfo() (such as the ee.Geometry instances built by
    # get_polygon_bounds) are resolved to a plain dict first; anything else is
    # assumed to already be a GeoJSON-style mapping.
    if hasattr(bounds, 'getInfo'):
        bounds = bounds.getInfo()
    return bounds['coordinates']


def _source_dataframe(imgs, polygon_bounds, polygon_aoi_codes, source, cloud_key):
    """Build the per-image metadata table for a single satellite source."""
    ids = []
    cloud_cover = []
    valid_pixels = []
    aoi_codes = []
    geometries = []

    for i, collection in enumerate(imgs):  # one collection per polygon
        # getInfo() is a request to Earth Engine, so it is issued once per
        # collection and the result reused for every image.
        features = collection.getInfo()['features']
        if not features:
            continue

        # Polygon coordinates are a list of rings: the exterior ring first,
        # followed by any holes.
        rings = _bounds_coordinates(polygon_bounds[i])
        aoi_polygon = Polygon(rings[0], rings[1:])
        aoi_code = polygon_aoi_codes[i]

        for feature in features:
            properties = feature.get('properties')
            ids.append(feature.get('id'))
            cloud_cover.append(properties[cloud_key])
            valid_pixels.append(properties['valids'])
            aoi_codes.append(aoi_code)
            geometries.append(aoi_polygon)

    return pd.DataFrame({
        'id': ids,
        '%cloud_cvr': cloud_cover,
        'source': [source] * len(ids),
        'aoi_code': aoi_codes,
        '%valid_pxl': valid_pixels,
        'geometry': geometries,
    })


def properties_dataframe(landsat_imgs: ee.ImageCollection, s2_imgs: ee.ImageCollection, polygon_bounds, polygon_aoi_codes):
    """Collect image metadata for every polygon into one GeoDataFrame.

    Args:
        landsat_imgs: Indexable sequence with one Landsat image collection
            per polygon (as returned by ``get_imageCollections``). Each item
            must provide ``getInfo()`` returning a dict with a ``'features'``
            list.
        s2_imgs: Same as ``landsat_imgs`` for the S2 collections.
        polygon_bounds: Per-polygon boundaries, index-aligned with the image
            sequences. Each item is either a mapping with a
            ``'coordinates'`` entry or an object whose ``getInfo()`` returns
            such a mapping.
        polygon_aoi_codes: AOI code of each polygon, looked up with the same
            integer index as the image sequences.

    Returns:
        A ``geopandas.GeoDataFrame`` in ``EPSG:4326`` with one row per image
        (Landsat rows first, then S2) and the columns ``id``,
        ``%cloud_cvr``, ``source``, ``aoi_code``, ``%valid_pxl`` and
        ``geometry``. The geometry of a row is the polygon of the AOI the
        image was queried for. Each source keeps its own 0-based index, so
        index values repeat across the two sources.

    Raises:
        KeyError: If an image lacks ``CLOUD_COVER`` (Landsat),
            ``CLOUDY_PIXEL_PERCENTAGE`` (S2) or ``valids`` in its properties.
    """
    # --------- Landsat ---------
    landsat_df = _source_dataframe(
        landsat_imgs, polygon_bounds, polygon_aoi_codes,
        'landsat', 'CLOUD_COVER',
    )
    print('Number of landsat images:', len(landsat_df))

    # --------- S2 ---------
    s2_df = _source_dataframe(
        s2_imgs, polygon_bounds, polygon_aoi_codes,
        's2', 'CLOUDY_PIXEL_PERCENTAGE',
    )
    print('Number of s2 images:', len(s2_df))

    # Combine both sources; the geometry column already holds shapely
    # polygons, so it can be registered together with the CRS in one step.
    combined = pd.concat([landsat_df, s2_df])
    all_df = gpd.GeoDataFrame(combined, geometry='geometry', crs='EPSG:4326')

    print('Total number of images:', len(all_df))

    return all_df

## Appendix 1: Previous versions

In [None]:
#first attempt at a function to produce a dataframe of properties. This did work but we found that it was inefficient for looping over multiple polygons. 

def extract_properties(landsat_imgs, s2_imgs, init_bounds, aoi_code):
    """Tabulate image metadata of one AOI for both satellite sources.

    Args:
        landsat_imgs: Object whose ``getInfo()`` returns a dict with a
            ``'features'`` list describing the Landsat images.
        s2_imgs: Same as ``landsat_imgs`` for the S2 images.
        init_bounds: Mapping with a ``'coordinates'`` entry; that value is
            stored unchanged in every row.
        aoi_code: Code of the AOI, stored unchanged in every row.

    Returns:
        A ``pandas.DataFrame`` with object-typed columns ``id``, ``source``,
        ``cloud_cover``, ``percentage_valid_pixels``, ``coordinates`` and
        ``aoi_code``; Landsat rows come first, followed by the S2 rows.

    Raises:
        KeyError: If an image lacks ``CLOUD_COVER`` (Landsat),
            ``CLOUD_COVERAGE_ASSESSMENT`` (S2) or ``valids`` in its
            properties.
    """
    coordinates = init_bounds['coordinates']

    landsat_features = landsat_imgs.getInfo()['features']
    s2_features = s2_imgs.getInfo()['features']

    print('Landsat images:', len(landsat_features))
    print('s2 images:', len(s2_features))

    columns = ['id', 'source', 'cloud_cover', 'percentage_valid_pixels', 'coordinates', 'aoi_code']

    # Rows are gathered first and the frame is built once at the end.
    # Writing cells through chained indexing (df[col][i] = value) is not
    # guaranteed to modify the frame.
    rows = []
    for source, features, cloud_key in (
        ('landsat', landsat_features, 'CLOUD_COVER'),
        ('s2', s2_features, 'CLOUD_COVERAGE_ASSESSMENT'),
    ):
        for feature in features:
            properties = feature.get('properties')
            rows.append({
                'id': feature.get('id'),
                'source': source,
                'cloud_cover': properties[cloud_key],
                'percentage_valid_pixels': properties['valids'],
                'coordinates': coordinates,
                'aoi_code': aoi_code,
            })

    # dtype=object keeps every column generic, matching a frame that is
    # allocated empty and filled cell by cell.
    return pd.DataFrame(rows, index=np.arange(0, len(rows)), columns=columns, dtype=object)