In [1]:
# Count the .tif files one folder level below the jrc and sentinel_1 directories.
!ls /DATA/data/v1/jrc/*/*.tif | wc -l
!ls /DATA/data/v1/sentinel_1/*/*.tif | wc -l

5701
5701


---

In [2]:
import re
from pathlib import Path
import pandas as pd
import numpy as np
import image_kit.io as io
import mproc

---

In [3]:
def to_float_str(value,sep='d',precision=2):
    """Convert a coordinate token such as ``'29d37'`` to a rounded decimal string.

    Args:
        value: token to convert; it is passed through ``str()`` first.
        sep: literal substring that stands in for the decimal point.
        precision: number of decimal places handed to ``round``.

    Returns:
        ``str`` of the rounded float, e.g. ``'29.37'``.

    Raises:
        ValueError: if the text is not a valid float after the substitution.
    """
    # str.replace treats `sep` literally; a regex substitution would
    # misinterpret separators such as '.' or '|'.
    value=float(str(value).replace(sep,'.'))
    return str(round(value,precision))


def lonlat_strs(path,precision=2):
    """Extract the longitude and latitude strings encoded in a file name.

    The file name (the text after the last '/') is split on '_'. The third
    token is the longitude. The latitude is the last token with its final
    '-'-delimited segment dropped; the remaining segments are re-joined with
    '-' so that a leading minus sign is kept.

    Args:
        path: path string; only the part after the last '/' is used.
        precision: decimal places forwarded to ``to_float_str``.

    Returns:
        ``(lon, lat)`` tuple of strings as produced by ``to_float_str``.
    """
    parts=path.split('/')[-1].split('_')
    lon=parts[2]
    # parts[-1] came out of a '_' split, so it holds no '_' and can be split on '-' directly.
    lat="-".join(parts[-1].split('-')[:-1])
    # Forward `precision`; it was previously accepted but ignored.
    return to_float_str(lon,precision=precision), to_float_str(lat,precision=precision)


def block_id(path):
    """Return ``'block_<lon>_<lat>'`` using the text before the decimal point of each coordinate.

    The coordinates come from ``lonlat_strs(path)``; everything from the first
    '.' onwards is discarded (string truncation, not numeric rounding).
    """
    lon,lat=(coord.split('.')[0] for coord in lonlat_strs(path))
    return f'block_{lon}_{lat}'


def group_id(path):
    """Return ``'group_<lon>_<lat>'`` with at most two decimal digits per coordinate.

    Each coordinate string from ``lonlat_strs(path)`` is split on '.'; the
    integer part is kept whole and the fractional part is cut to its first
    two characters.
    """
    lon,lat=(coord.split('.') for coord in lonlat_strs(path))
    return f'group_{lon[0]}.{lon[1][:2]}_{lat[0]}.{lat[1][:2]}'

In [4]:
# DSET='https://storage.googleapis.com/surface-water-public/data/v1/datasets/surface-water.africa.master.csv'
# _df=pd.read_csv(DSET)
# r=_df.sample().iloc[0]
# path=r.gsw_path
# print((r.block_id,r.group_id)==(block_id(path), group_id(path)))
# path,r.block_id,r.group_id,block_id(path), group_id(path)

---

In [5]:
SIZE=512

def win_quad(i,win_size,size=SIZE):
    """Return the ``(x, y, width, height)`` tuple of the i-th square window.

    ``int(size/win_size)`` windows fit along one side. The index advances
    along y first: consecutive indices share the same x offset until a full
    column of windows has been produced.

    Args:
        i: window index.
        win_size: side length of the window.
        size: side length of the area being tiled (defaults to ``SIZE``).
    """
    per_side=int(size/win_size)
    col,row=divmod(i,per_side)
    return (col*win_size,row*win_size,win_size,win_size)

def get_windows(win_size,size=SIZE):
    """List every ``win_quad`` tuple for a ``size`` x ``size`` area.

    ``int(size/win_size)`` windows fit per side, so the list has that number
    squared entries, ordered by window index.
    """
    nb_windows=int(size/win_size)**2
    windows=[]
    for idx in range(nb_windows):
        windows.append(win_quad(idx,win_size=win_size,size=size))
    return windows

---

In [6]:
# Per-dataset cap on how many .tif paths are processed (passed as `limit` to
# proc_datasets, which slices paths[:limit]); None processes every path.
LIMIT=None

In [7]:
# Sub-folders of JRC_DIR that proc_datasets scans by default.
DSETS=['africa','africa_mtn','africa_small']
# Bucket name that replaces the DATA_DIR prefix when gs:// paths are built in get_row_data.
BUCKET='surface-water-public'
# Local root directory of the data.
DATA_DIR='/DATA'
# Folder names; get_row_data swaps /JRC_FOLDER/ for /S1_FOLDER/ to derive s1_path from gsw_path.
JRC_FOLDER='jrc'
S1_FOLDER='sentinel_1'
# Local directory holding one sub-folder per dataset.
JRC_DIR=f'{DATA_DIR}/data/v1/{JRC_FOLDER}'
# Class names; the list position is the pixel value that hist() looks up.
BANDS=['no_data','not_water','water']
# check_jrc_window keeps an image/window only if MIN_WATER <= water fraction < MAX_WATER ...
MAX_WATER=0.6
MIN_WATER=0.005
# ... and its no_data fraction is below MAX_NODATA.
MAX_NODATA=0.3
# NOTE(review): not referenced in the visible code; the window cells set WSIZE by hand.
WIN_SIZES=[128,256,192]

In [8]:
# Column selection/order for the whole-image dataframe and CSV.
COLS=[
    'gsw_path',
    's1_path',
    'block_id',
    'group_id',
    'water',
    'not_water',
    'no_data',
    'dataset']

# Column selection/order for the per-window dataframes and CSVs:
# same as COLS plus 'window_index' and 'window'.
WCOLS=[
    'gsw_path',
    's1_path',
    'block_id',
    'group_id',
    'window_index',
    'window',
    'water',
    'not_water',
    'no_data',
    'dataset']

In [9]:
def hist(values,counts,nb_pixels):
    """Map each name in ``BANDS`` to its share of ``nb_pixels``.

    ``values`` and ``counts`` are parallel sequences (pixel value, number of
    occurrences). The position of a name in ``BANDS`` is the pixel value it
    stands for; values that do not occur get a share of 0.
    """
    count_by_value=dict(zip(values,counts))
    fractions={}
    for idx,band in enumerate(BANDS):
        fractions[band]=count_by_value.get(idx,0)/nb_pixels
    return fractions


def check_jrc_window(im,window=None):
    """Return the class-fraction dict of an image (or window) if it passes the filters.

    Args:
        im: array indexed as ``im[:, y, x]``.
        window: optional ``(x, y, w, h)`` tuple; when given, only that region
            is examined, otherwise the full extent of the last two axes.

    Returns:
        The ``hist`` dict when all of the following hold, otherwise ``{}``:
        more than one distinct pixel value is present, the 'no_data' fraction
        is below ``MAX_NODATA``, and ``MIN_WATER <= water < MAX_WATER``.
        Fractions are computed relative to ``w*h`` pixels.
    """
    if window:
        x,y,width,height=window
        im=im[:,y:y+height,x:x+width]
    else:
        height,width=im.shape[1:]
    values,counts=np.unique(im,return_counts=True)
    # A single distinct value (or none) carries no usable signal.
    if len(values)<=1:
        return {}
    fractions=hist(values,counts,width*height)
    if not (fractions['no_data']<MAX_NODATA):
        return {}
    water=fractions['water']
    if (water>=MIN_WATER) and (water<MAX_WATER):
        return fractions
    return {}



def get_row_data(im,path,window_index,window):
    """Build one output row for an image (or one of its windows).

    Starts from the dict returned by ``check_jrc_window``. If that dict is
    empty it is returned unchanged. Otherwise the gs:// paths and the
    block/group ids are added, plus 'window_index' and 'window' when a
    window was supplied.
    """
    row=check_jrc_window(im,window)
    if not row:
        return row
    # Local DATA_DIR prefix -> bucket URL; the Sentinel-1 path mirrors the JRC one.
    gsw_path=re.sub(f'^{DATA_DIR}',f'gs://{BUCKET}',str(path))
    row.update(
        gsw_path=gsw_path,
        s1_path=re.sub(f'/{JRC_FOLDER}/',f'/{S1_FOLDER}/',gsw_path),
        block_id=block_id(gsw_path),
        group_id=group_id(gsw_path))
    if window:
        row.update(window_index=window_index,window=window)
    return row


def proc_jrc(path,windows=None):
    """Read one image and return its row dict(s).

    Without ``windows`` the result is a one-element list for the whole image;
    with ``windows`` it holds one entry per window, in order. Entries are
    empty dicts where ``get_row_data`` rejected the image/window.
    """
    im=io.read(path,return_profile=False)
    if not windows:
        return [get_row_data(im,path,False,False)]
    return [get_row_data(im,path,idx,win) for idx,win in enumerate(windows)]

In [10]:
def proc_paths(paths,windows=None):
    """Run ``proc_jrc`` over ``paths`` and return the non-empty rows as one flat list."""
    return [row for p in paths for row in proc_jrc(p,windows) if row]

In [11]:
def proc_datasets(windows,datasets=DSETS,limit=None):
    """Build one dataframe of accepted rows across several dataset folders.

    Args:
        windows: list of ``(x, y, w, h)`` windows handed to ``proc_paths``,
            or ``None`` to evaluate whole images.
        datasets: names of the sub-folders of ``JRC_DIR`` to scan for ``*.tif``.
        limit: if set, only the first ``limit`` paths of each dataset are processed.

    Returns:
        The per-dataset dataframes concatenated with ``pd.concat``; each row
        carries its folder name in the 'dataset' column.
    """
    dfs=[]
    for d in datasets:
        # Sort the glob result: directory listing order is filesystem-dependent,
        # which would make `limit` pick a different subset (and the CSV rows a
        # different order) from run to run.
        paths=sorted(Path(f'{JRC_DIR}/{d}').glob('*.tif'))
        df=pd.DataFrame(proc_paths(paths[:limit],windows=windows))
        df['dataset']=d
        # Reports the number of files found (before `limit`) => rows kept.
        print(f'{d}: {len(paths)} => {df.shape[0]}')
        dfs.append(df)
    return pd.concat(dfs)

---

In [12]:
# Whole-image pass: windows=None, so each .tif contributes at most one row.
%time df=proc_datasets(None,limit=LIMIT)[COLS]
path='surface-water.africa.csv'
# index=False: the dataframe index is not written to the CSV.
df.to_csv(path,index=False)
print('-'*100)
print(path,df.shape[0])
print('-'*100)
# Show three random rows as a spot check.
df.sample(3)

africa: 3393 => 2662
africa_mtn: 2126 => 1711
africa_small: 182 => 144
CPU times: user 37.4 s, sys: 4.21 s, total: 41.6 s
Wall time: 41.6 s
----------------------------------------------------------------------------------------------------
surface-water.africa.csv 4517
----------------------------------------------------------------------------------------------------


Unnamed: 0,gsw_path,s1_path,block_id,group_id,water,not_water,no_data,dataset
1268,gs://surface-water-public/data/v1/jrc/africa_m...,gs://surface-water-public/data/v1/sentinel_1/a...,block_29_-30,group_29.37_-30.23,0.040672,0.940563,0.018764,africa_mtn
1083,gs://surface-water-public/data/v1/jrc/africa_m...,gs://surface-water-public/data/v1/sentinel_1/a...,block_29_-28,group_29.51_-28.72,0.0075,0.922077,0.070423,africa_mtn
346,gs://surface-water-public/data/v1/jrc/africa/G...,gs://surface-water-public/data/v1/sentinel_1/a...,block_32_-24,group_32.66_-24.1,0.039181,0.960762,5.7e-05,africa


---

In [13]:
# 128-pixel windows: with the default size of 512, get_windows yields int(512/128)**2 = 16 windows per image.
WSIZE=128
%time df=proc_datasets(get_windows(WSIZE),limit=LIMIT)[WCOLS]
path=f'surface-water.africa.win{WSIZE}.csv'
# index=False: the dataframe index is not written to the CSV.
df.to_csv(path,index=False)
print('-'*100)
print(path,df.shape[0])
print('-'*100)
# Show three random rows as a spot check.
df.sample(3)

africa: 3393 => 14752
africa_mtn: 2126 => 9235
africa_small: 182 => 775
CPU times: user 36.1 s, sys: 4.4 s, total: 40.5 s
Wall time: 40.5 s
----------------------------------------------------------------------------------------------------
surface-water.africa.win128.csv 24762
----------------------------------------------------------------------------------------------------


Unnamed: 0,gsw_path,s1_path,block_id,group_id,window_index,window,water,not_water,no_data,dataset
424,gs://surface-water-public/data/v1/jrc/africa/G...,gs://surface-water-public/data/v1/sentinel_1/a...,block_1_33,group_1.61_33.33,13,"(384, 128, 128, 128)",0.008301,0.97876,0.012939,africa
65,gs://surface-water-public/data/v1/jrc/africa/G...,gs://surface-water-public/data/v1/sentinel_1/a...,block_26_-27,group_26.46_-27.21,11,"(256, 384, 128, 128)",0.082458,0.914429,0.003113,africa
6297,gs://surface-water-public/data/v1/jrc/africa_m...,gs://surface-water-public/data/v1/sentinel_1/a...,block_29_-28,group_29.56_-28.9,7,"(128, 384, 128, 128)",0.022949,0.976135,0.000916,africa_mtn


In [14]:
WSIZE=256
%time df=proc_datasets(get_windows(WSIZE),limit=LIMIT)[WCOLS]
path=f'surface-water.africa.win{WSIZE}.csv'
df.to_csv(path,index=False)
print('-'*100)
print(path,df.shape[0])
print('-'*100)
df.sample(3)
df.sample(3)

africa: 3393 => 6198
africa_mtn: 2126 => 3923
africa_small: 182 => 337
CPU times: user 35.3 s, sys: 4.08 s, total: 39.4 s
Wall time: 39.4 s
----------------------------------------------------------------------------------------------------
surface-water.africa.win256.csv 10458
----------------------------------------------------------------------------------------------------


Unnamed: 0,gsw_path,s1_path,block_id,group_id,window_index,window,water,not_water,no_data,dataset
4249,gs://surface-water-public/data/v1/jrc/africa/G...,gs://surface-water-public/data/v1/sentinel_1/a...,block_31_-23,group_31.87_-23.98,2,"(256, 0, 256, 256)",0.032852,0.962296,0.004852,africa
87,gs://surface-water-public/data/v1/jrc/africa_m...,gs://surface-water-public/data/v1/sentinel_1/a...,block_28_-30,group_28.05_-30.1,2,"(256, 0, 256, 256)",0.038147,0.959793,0.00206,africa_mtn
925,gs://surface-water-public/data/v1/jrc/africa/G...,gs://surface-water-public/data/v1/sentinel_1/a...,block_-6_32,group_-6.34_32.1,0,"(0, 0, 256, 256)",0.093338,0.906662,0.0,africa


In [15]:
# 192-pixel windows: int(512/192) is 2, so get_windows yields 4 windows per image.
# NOTE(review): these windows start at offsets 0 and 192, so they span pixels 0-383
# on each axis; the remaining 128 pixels of a 512-pixel side are not sampled.
WSIZE=192
%time df=proc_datasets(get_windows(WSIZE),limit=LIMIT)[WCOLS]
path=f'surface-water.africa.win{WSIZE}.csv'
# index=False: the dataframe index is not written to the CSV.
df.to_csv(path,index=False)
print('-'*100)
print(path,df.shape[0])
print('-'*100)
# Show three random rows as a spot check.
df.sample(3)

africa: 3393 => 5208
africa_mtn: 2126 => 3210
africa_small: 182 => 290
CPU times: user 26.1 s, sys: 4.1 s, total: 30.2 s
Wall time: 30.2 s
----------------------------------------------------------------------------------------------------
surface-water.africa.win192.csv 8708
----------------------------------------------------------------------------------------------------


Unnamed: 0,gsw_path,s1_path,block_id,group_id,window_index,window,water,not_water,no_data,dataset
5171,gs://surface-water-public/data/v1/jrc/africa/G...,gs://surface-water-public/data/v1/sentinel_1/a...,block_30_7,group_30.58_7.62,2,"(192, 0, 192, 192)",0.459635,0.358426,0.181939,africa
63,gs://surface-water-public/data/v1/jrc/africa_s...,gs://surface-water-public/data/v1/sentinel_1/a...,block_20_-34,group_20.42_-34.06,1,"(0, 192, 192, 192)",0.015191,0.979953,0.004856,africa_small
4517,gs://surface-water-public/data/v1/jrc/africa/G...,gs://surface-water-public/data/v1/sentinel_1/a...,block_27_-22,group_27.55_-22.59,2,"(192, 0, 192, 192)",0.377903,0.620307,0.00179,africa


---

---

##### GCS UPLOAD/CLEANUP

---

---

In [16]:
# !gsutil mv gs://surface-water-public/data/v1/datasets gs://surface-water-public/data/v1/gee_output_datasets

In [17]:
# Upload every CSV in the working directory to the bucket's datasets folder.
# NOTE(review): `*.csv` is not limited to the files written above; confirm no stray CSVs are present.
!gsutil cp *.csv gs://surface-water-public/data/v1/datasets

Copying file://surface-water.africa.csv [Content-Type=text/csv]...
Copying file://surface-water.africa.win128.csv [Content-Type=text/csv]...       
Copying file://surface-water.africa.win192.csv [Content-Type=text/csv]...       
Copying file://surface-water.africa.win256.csv [Content-Type=text/csv]...       
- [4 files][ 14.4 MiB/ 14.4 MiB]                                                
Operation completed over 4 objects/14.4 MiB.                                     
