In [1]:
from importlib import reload

In [2]:
import re
import math
from pprint import pprint
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
import tfr2human.parser as tfp
import tfr2human.utils as utils
import gcs_helpers.save as gsave

---

In [4]:
# Pixel size handed to utils.image_profile when georeferencing exported tiles
# (presumably metres per pixel -- TODO confirm against utils.image_profile).
RESOLUTION = 10
# Tile edge length in pixels; passed to the parser as dims=[SIZE, SIZE].
SIZE = 192
# Progress logging: when NOISY is truthy, run() prints a line for every
# NOISE_REDUCER-th element of a batch.
NOISY = True
NOISE_REDUCER = 10

In [5]:
# Version tag baked into the destination folder name.
VERSION = 1

# Source: point SRC_FOLDER at the TFRecords of interest.
SRC_FOLDER = 'global-high-water'
TFR_SELECTOR = f'gs://surface-water-public/gee-exports/TFR/{SRC_FOLDER}/*.tfrecord.gz'

# Destination: change the bucket/folder to redirect where exports are written.
GCS_BUCKET = 'surface-water-public'
GCS_FOLDER = f'data/{SRC_FOLDER}/v{VERSION}'

In [6]:
# Expand the wildcard selector into the concrete list of TFRecord paths.
TFR_LIST = tf.io.gfile.glob(TFR_SELECTOR)
# Cell output: how many files matched the selector.
len(TFR_LIST)

125

---

In [7]:
# Band holding the water classes counted in process_water.
WATER_BANDS = ['gsw']
# Sentinel-1 bands read together as one float32 image in process_s1.
S1_BANDS = ['VV', 'VH', 'angle', 'VV_mean', 'VH_mean']
# Full band list given to the parser: S1 bands first, then the water band.
BANDS = [*S1_BANDS, *WATER_BANDS]

# Per-record scalar properties and their TensorFlow dtypes; passed to the
# parser as `specs`. Every property parsed by `parser.data(element)` ends up
# as a column of the exported CSV rows (see run()).
FEATURE_PROPS={
    # identifiers -- lon_id / lat_id are used to build the exported image name
    'aoi': tf.string,
    'block_id': tf.string,
    'group_id': tf.string,
    'lon_id': tf.string,
    'lat_id': tf.string,
    # water statistics carried by the record itself (separate from the
    # *_pixel_count columns that process_water computes from the image)
    'water_score': tf.float32,
    'water': tf.float32,
    'no_data': tf.float32,
    'not_water': tf.float32,
    # georeferencing -- crs is forwarded to utils.image_profile
    'crs': tf.string,
    'utm': tf.string,
    'nsh': tf.string,
    'ewh': tf.string,
    # acquisition date -- year / month are cast to int for the image name
    'year': tf.float32,
    'month': tf.float32,
    'date': tf.string,
    # tile position -- forwarded to utils.image_profile
    'lon': tf.float32,
    'lat': tf.float32,
    # biome / ecoregion attributes
    'biome_name': tf.string,
    'biome_num': tf.float32,
    'eco_id': tf.float32,
    'eco_name': tf.string,
    'nnh': tf.float32,
    'nnh_name': tf.string,
}

In [8]:
# Maps each water-band class value (0, 1, 2) to the CSV column that stores
# the number of pixels of that class.
WATER_COLUMNS = dict(enumerate((
    'no_data_pixel_count',
    'not_water_pixel_count',
    'water_pixel_count',
)))


def process_water(parser,element):
    """Read the water band of one record and summarise its class counts.

    Args:
        parser: object exposing `image(element, bands=..., dtype=...)`.
        element: a single record from the parser's dataset.

    Returns:
        (water, props): the uint8 water image and a dict holding one pixel
        count per WATER_COLUMNS entry plus `water_ratio`.
    """
    water = parser.image(element, bands=WATER_BANDS, dtype=np.uint8)
    classes, tallies = np.unique(water, return_counts=True)
    tally_by_class = dict(zip(classes, tallies))

    # Classes that do not occur in the tile get an explicit zero count.
    props = {}
    for class_value in range(3):
        props[WATER_COLUMNS[class_value]] = tally_by_class.get(class_value, 0)

    # water_ratio is water : not_water (not a fraction of the whole tile);
    # it falls back to 1 when the tile has no not-water pixels.
    not_water_count = props['not_water_pixel_count']
    props['water_ratio'] = (
        props['water_pixel_count'] / not_water_count if not_water_count else 1
    )
    return water, props


def process_s1(parser,element):
    """Read the Sentinel-1 bands of one record and count unusable pixels.

    Args:
        parser: object exposing `image(element, bands=..., dtype=...)`.
        element: a single record from the parser's dataset.

    Returns:
        (s1, props): the float32 S1 image and a dict with
        `s1_na_pixel_count` (NaN values across all bands) and
        `s1_zero_pixel_count` (pixels where the product of the first two
        entries of the image is exactly 0).
    """
    s1 = parser.image(element, bands=S1_BANDS, dtype=np.float32)
    # NOTE(review): s1[0] * s1[1] assumes a band-first layout, i.e. VV * VH
    # given the order of S1_BANDS -- confirm against parser.image.
    first_two_product = s1[0] * s1[1]
    props = {
        's1_na_pixel_count': np.count_nonzero(np.isnan(s1)),
        # Named to match the column the inspection cells read (DATA_COLS and
        # the MAX_S1_ZERO filter); the previous placeholder key '-' meant
        # that column was never written to the CSV.
        's1_zero_pixel_count': np.count_nonzero(first_two_product == 0),
    }
    return s1, props


def image_name(lon_id,lat_id,year,month):
    """Build the GeoTIFF file name for a tile and acquisition month.

    `year` and `month` are truncated to int; the month is zero-padded to two
    digits, giving `tile_<lon_id>_<lat_id>-<YYYY><MM>.tif`.
    """
    return f'tile_{lon_id}_{lat_id}-{int(year)}{int(month):02d}.tif'

DEFAULT_BATCH_SIZE=100

def run(parser,take=40,skip=0,batch_size=DEFAULT_BATCH_SIZE):
    """Export the parser's dataset as GeoTIFFs plus one CSV per batch.

    For every element: parse its properties, extract the water and S1 images
    (adding their pixel-count stats to the properties), and save both images
    to GCS under `{GCS_FOLDER}/GSW` and `{GCS_FOLDER}/S1`. After each batch
    the collected properties are written to
    `{GCS_FOLDER}/CSV/EXPORTS_BATCH-<n>.csv`.

    Args:
        parser: parser exposing `.dataset`, `.data(element)` and
            `.image(element, ...)`.
        take: maximum number of elements to process after skipping; any
            falsy value (None, 0) processes everything.
        skip: number of leading elements to skip (used to resume a run).
        batch_size: number of elements per batch / CSV. The reported batch
            number is offset by `skip/batch_size` so numbering continues
            from an earlier run.

    Failures are best-effort: an exception inside a batch is printed together
    with the element index and properties being processed, the CSV of that
    batch is not written, and the run continues with the next batch.
    """
    parsed_data=parser.dataset.skip(skip)
    if take:
        parsed_data=parsed_data.take(take)
    for batch_index, batch in utils.get_batches(parsed_data,batch_size=batch_size):
        print('\n'*2)
        print('='*75)
        # Shift by the number of skipped batches and make the label 1-based
        # (presumably get_batches counts from 0 -- TODO confirm).
        batch_index=batch_index+int(skip/batch_size)+1
        print('BATCH:',batch_index)
        print('='*75)
        rows=[]
        # Reset for every batch: the failure report below reads both names,
        # and without this it would raise NameError (first batch) or show
        # stale values from the previous batch when the error happens before
        # the first element is parsed.
        i=None
        props=None
        try:
            for i,element in enumerate(batch):
                # Clear first so a failure inside parser.data is not reported
                # with the previous element's properties.
                props=None
                props=parser.data(element)
                if NOISY and (not (i%NOISE_REDUCER)):
                    print(f'\t- {i} [{props["aoi"]}, {props["date"]}]...')
                water, water_props=process_water(parser,element)
                s1, s1_props=process_s1(parser,element)
                props.update(water_props)
                props.update(s1_props)
                rows.append(props)
                # export images
                lon=props['lon']
                lat=props['lat']
                crs=props['crs']
                name=image_name(
                    lon_id=props['lon_id'],
                    lat_id=props['lat_id'],
                    year=props['year'],
                    month=props['month'])
                gsave.image(
                  s1,
                  name,
                  utils.image_profile(lon,lat,crs,RESOLUTION,s1),
                  folder=f'{GCS_FOLDER}/S1',
                  bucket=GCS_BUCKET)
                gsave.image(
                  water,
                  name,
                  utils.image_profile(lon,lat,crs,RESOLUTION,water),
                  folder=f'{GCS_FOLDER}/GSW',
                  bucket=GCS_BUCKET)
            # One CSV per batch, written only if every element succeeded.
            df=pd.DataFrame(rows)
            gcs_path=gsave.csv(
              df,
              f'EXPORTS_BATCH-{batch_index}.csv',
              folder=f'{GCS_FOLDER}/CSV',
              bucket=GCS_BUCKET)
            print('-'*75)
            print(gcs_path)

        except Exception as e:
            # Deliberately broad: report the failure and move on to the next
            # batch rather than aborting a long-running export.
            print('\n'*2)
            print("FAILURE:")
            print(str(e))
            print()
            print(i, batch)
            print()
            pprint(props)
            print('\n'*2)

In [None]:
# Pick up any edits made to the tfr2human.parser module since it was imported.
tfp=reload(tfp)

""" interuppted after batch 4:
TAKE=None
SKIP=0
BATCH_SIZE=500
"""

# Parameters for this run: TAKE=None processes every element (run() treats a
# falsy `take` as "no limit"), SKIP=0 starts from the first element.
TAKE=None
BATCH_SIZE=500
SKIP=0

# Example of resuming after 4 completed batches: skip 4*BATCH_SIZE elements
# (run() offsets the printed batch number by SKIP/BATCH_SIZE).
# TAKE=None
# BATCH_SIZE=5
# SKIP=4*BATCH_SIZE

# Echo the run configuration so it is recorded in the cell output.
print('\n'*2)
print('-'*100)
print('TAKE:',TAKE,'BATCH_SIZE:',BATCH_SIZE)
print('-'*100)
print()
pprint(BANDS)
print()
pprint(FEATURE_PROPS)
print()
print('-'*100)
print('\n'*2)


# Build the parser over every matched TFRecord: scalar properties come from
# FEATURE_PROPS, image bands from BANDS, each tile being SIZE x SIZE.
parser=tfp.TFRParser(
    TFR_LIST,
    specs=FEATURE_PROPS,
    band_specs=BANDS,
    dims=[SIZE,SIZE])


# Long-running: writes images and CSVs to GCS batch by batch.
run(parser,take=TAKE,skip=SKIP,batch_size=BATCH_SIZE)





----------------------------------------------------------------------------------------------------
TAKE: None BATCH_SIZE: 500
----------------------------------------------------------------------------------------------------

['VV', 'VH', 'angle', 'VV_mean', 'VH_mean', 'gsw']

{'aoi': tf.string,
 'biome_name': tf.string,
 'biome_num': tf.float32,
 'block_id': tf.string,
 'crs': tf.string,
 'date': tf.string,
 'eco_id': tf.float32,
 'eco_name': tf.string,
 'ewh': tf.string,
 'group_id': tf.string,
 'lat': tf.float32,
 'lat_id': tf.string,
 'lon': tf.float32,
 'lon_id': tf.string,
 'month': tf.float32,
 'nnh': tf.float32,
 'nnh_name': tf.string,
 'no_data': tf.float32,
 'not_water': tf.float32,
 'nsh': tf.string,
 'utm': tf.string,
 'water': tf.float32,
 'water_score': tf.float32,
 'year': tf.float32}

----------------------------------------------------------------------------------------------------






BATCH: 1
	- 0 [africa_central, 2015-01-01]...
	- 10 [africa_central, 2015-

---

### INSPECT MASTER DATASET

In [None]:
# Public HTTPS location of the per-batch CSVs; `{}` takes the batch number.
URL_TMPL = (
    'https://storage.googleapis.com/surface-water-public/'
    'data/global-high-water/v1/CSV/EXPORTS_BATCH-{}.csv'
)

In [None]:
# Load the CSVs of batches 1..34 and stack them into one master frame.
# The per-file row index is kept as-is, so index labels repeat across batches.
df = pd.concat([
    pd.read_csv(URL_TMPL.format(batch_number))
    for batch_number in range(1, 35)
])

In [None]:
# Numeric columns summarised below, grouped per class: the value carried by
# the record followed by the count(s) computed from the exported image.
DATA_COLS = [
    'water', 'water_pixel_count', 'water_ratio',
    'not_water', 'not_water_pixel_count',
    'no_data', 'no_data_pixel_count',
    's1_zero_pixel_count',
]

In [None]:
# Summary statistics for the selected data columns (cell output).
df.loc[:, DATA_COLS].describe()

---

In [None]:
# Pixels per tile (192 matches the SIZE used for the export) and the largest
# tolerated pixel counts: 15% no-data pixels, 5% zero-valued S1 pixels.
NB_PIXELS = 192 * 192
MAX_NO_DATA = 0.15 * NB_PIXELS
MAX_S1_ZERO = 0.05 * NB_PIXELS

In [None]:
# Keep only tiles whose no-data pixel count is strictly below the limit,
# then show how many rows remain.
df = df[df['no_data_pixel_count'] < MAX_NO_DATA]
len(df)

In [None]:
# Keep only tiles whose zero-valued S1 pixel count is strictly below the
# limit, then show how many rows remain.
df = df[df['s1_zero_pixel_count'] < MAX_S1_ZERO]
len(df)

---

In [None]:
# Shapes of the two extremes of water_ratio. Note the ratio is
# water_pixel_count / not_water_pixel_count, so it is not capped at 1.
(
    df[df['water_ratio'] > 0.95].shape,
    df[df['water_ratio'] < 0.001].shape,
)

---