In [1]:
!ls /DATA/data/v1/jrc/*/*.tif | wc -l
!ls /DATA/data/v1/sentinel_1/*/*.tif | wc -l

5701
5701


---

In [2]:
import re
from pathlib import Path
import pandas as pd
import numpy as np
import image_kit.io as io
import mproc

---

In [3]:
def to_float_str(value,sep='d',precision=2):
    value=float(re.sub(sep,'.',str(value)))
    return str(round(value,precision))


def lonlat_strs(path,precision=2):
    parts=path.split('/')[-1].split('_')
    lon=parts[2]
    lat="-".join(parts[-1].split('_')[-1].split('-')[:-1])
    return to_float_str(lon), to_float_str(lat)


def block_id(path):
    lon,lat=lonlat_strs(path)
    lon=lon.split('.')[0]    
    lat=lat.split('.')[0]
    return f'block_{lon}_{lat}'


def group_id(path):
    lon,lat=lonlat_strs(path)
    lon=lon.split('.')    
    lat=lat.split('.')
    return f'group_{lon[0]}.{lon[1][:2]}_{lat[0]}.{lat[1][:2]}'

In [4]:
# DSET='https://storage.googleapis.com/surface-water-public/data/v1/datasets/surface-water.africa.master.csv'
# _df=pd.read_csv(DSET)
# r=_df.sample().iloc[0]
# path=r.gsw_path
# print((r.block_id,r.group_id)==(block_id(path), group_id(path)))
# path,r.block_id,r.group_id,block_id(path), group_id(path)

---

In [5]:
SIZE=512

def win_quad(i,win_size,size=SIZE):
    nb_cols=int(size/win_size)
    col=i//nb_cols    
    row=i-col*nb_cols
    return (col*win_size,row*win_size,win_size,win_size)

def get_windows(win_size,size=SIZE):
    return [win_quad(i,win_size=win_size,size=size) for i in range(int(size/win_size)**2)]

---

In [6]:
LIMIT=None

In [7]:
DSETS=['africa','africa_mtn','africa_small']
BUCKET='surface-water-public'
DATA_DIR='/DATA'
JRC_FOLDER='jrc'
S1_FOLDER='sentinel_1'
JRC_DIR=f'{DATA_DIR}/data/v1/{JRC_FOLDER}'
BANDS=['no_data','not_water','water']
MAX_WATER=0.6
MIN_WATER=0.005
MAX_NODATA=0.3
WIN_SIZES=[128,256,192]

In [8]:
COLS=[
    'gsw_path',
    's1_path',
    'block_id',
    'group_id',
    'water',
    'not_water',
    'no_data',
    'dataset']

WCOLS=[
    'gsw_path',
    's1_path',
    'block_id',
    'group_id',
    'window_index',
    'window',
    'water',
    'not_water',
    'no_data',
    'dataset']

In [9]:
def hist(values,counts,nb_pixels):
    _hist={ v: c for v,c in zip(values,counts)}
    return { c: _hist.get(i,0)/nb_pixels for i,c in enumerate(BANDS) }


def check_jrc_window(im,window=None):
    if window:
        x,y,w,h=window
        im=im[:,y:y+h,x:x+w]
    else:
        h,w=im.shape[1:]
    values,counts=np.unique(im,return_counts=True)
    out={}
    if len(values)>1:
        h=hist(values,counts,w*h)
        if h['no_data']<MAX_NODATA:
            w=h['water']
            if (w>=MIN_WATER) & (w<MAX_WATER):
                out=h
    return out   



def get_row_data(im,path,window_index,window):
    row=check_jrc_window(im,window)
    if row:
        row['gsw_path']=re.sub(f'^{DATA_DIR}',f'gs://{BUCKET}',str(path))
        s1_path=re.sub('/GSW_','/S1_',row['gsw_path'])
        row['s1_path']=re.sub(f'/{JRC_FOLDER}/',f'/{S1_FOLDER}/',s1_path)
        row['block_id']=block_id(row['gsw_path'])
        row['group_id']=group_id(row['gsw_path'])
        if window:
            row['window_index']=window_index
            row['window']=window
    return row


def proc_jrc(path,windows=None):
    im=io.read(path,return_profile=False)
    rows=[]
    if windows:
        return [get_row_data(im,path,i,w) for i,w in enumerate(windows)]
    else:
        return [get_row_data(im,path,False,False)]

In [10]:
def proc_paths(paths,windows=None):
    rows=[]
    for p in paths:
        rows+=proc_jrc(p,windows)
    return [r for r in rows if r]

In [11]:
def proc_datasets(windows,datasets=DSETS,limit=None):
    dfs=[]
    for d in datasets:
        paths=list(Path(f'{JRC_DIR}/{d}').glob('*.tif'))
        df=pd.DataFrame(proc_paths(paths[:limit],windows=windows))
        df['dataset']=d
        print(f'{d}: {len(paths)} => {df.shape[0]}')
        dfs.append(df)
    return pd.concat(dfs)  

---

In [12]:
%time df=proc_datasets(None,limit=LIMIT)[COLS]
path='surface-water.africa.csv'
df.to_csv(path,index=False)
print('-'*100)
print(path,df.shape[0])
print('-'*100)
df.sample(3)

africa: 3393 => 2662
africa_mtn: 2126 => 1711
africa_small: 182 => 144
CPU times: user 35.5 s, sys: 4.17 s, total: 39.7 s
Wall time: 53.3 s
----------------------------------------------------------------------------------------------------
surface-water.africa.csv 4517
----------------------------------------------------------------------------------------------------


Unnamed: 0,gsw_path,s1_path,block_id,group_id,water,not_water,no_data,dataset
1817,gs://surface-water-public/data/v1/jrc/africa/G...,gs://surface-water-public/data/v1/sentinel_1/a...,block_35_5,group_35.94_5.48,0.012051,0.98698,0.000969,africa
1235,gs://surface-water-public/data/v1/jrc/africa_m...,gs://surface-water-public/data/v1/sentinel_1/a...,block_30_-29,group_30.13_-29.55,0.025043,0.97459,0.000366,africa_mtn
839,gs://surface-water-public/data/v1/jrc/africa_m...,gs://surface-water-public/data/v1/sentinel_1/a...,block_31_-29,group_31.65_-29.06,0.13419,0.722355,0.143456,africa_mtn


---

In [13]:
WSIZE=128
%time df=proc_datasets(get_windows(WSIZE),limit=LIMIT)[WCOLS]
path=f'surface-water.africa.win{WSIZE}.csv'
df.to_csv(path,index=False)
print('-'*100)
print(path,df.shape[0])
print('-'*100)
df.sample(3)

africa: 3393 => 14752
africa_mtn: 2126 => 9235
africa_small: 182 => 775
CPU times: user 34.1 s, sys: 3.95 s, total: 38 s
Wall time: 38 s
----------------------------------------------------------------------------------------------------
surface-water.africa.win128.csv 24762
----------------------------------------------------------------------------------------------------


Unnamed: 0,gsw_path,s1_path,block_id,group_id,window_index,window,water,not_water,no_data,dataset
2332,gs://surface-water-public/data/v1/jrc/africa_m...,gs://surface-water-public/data/v1/sentinel_1/a...,block_26_-32,group_26.85_-32.31,13,"(384, 128, 128, 128)",0.075623,0.91571,0.008667,africa_mtn
1857,gs://surface-water-public/data/v1/jrc/africa/G...,gs://surface-water-public/data/v1/sentinel_1/a...,block_29_-2,group_29.03_-2.26,5,"(128, 128, 128, 128)",0.150696,0.843079,0.006226,africa
2458,gs://surface-water-public/data/v1/jrc/africa_m...,gs://surface-water-public/data/v1/sentinel_1/a...,block_31_-29,group_31.57_-29.14,14,"(384, 256, 128, 128)",0.029968,0.96759,0.002441,africa_mtn


In [14]:
WSIZE=256
%time df=proc_datasets(get_windows(WSIZE),limit=LIMIT)[WCOLS]
path=f'surface-water.africa.win{WSIZE}.csv'
df.to_csv(path,index=False)
print('-'*100)
print(path,df.shape[0])
print('-'*100)
df.sample(3)
df.sample(3)

africa: 3393 => 6198
africa_mtn: 2126 => 3923
africa_small: 182 => 337
CPU times: user 32.9 s, sys: 3.94 s, total: 36.8 s
Wall time: 36.8 s
----------------------------------------------------------------------------------------------------
surface-water.africa.win256.csv 10458
----------------------------------------------------------------------------------------------------


Unnamed: 0,gsw_path,s1_path,block_id,group_id,window_index,window,water,not_water,no_data,dataset
2671,gs://surface-water-public/data/v1/jrc/africa_m...,gs://surface-water-public/data/v1/sentinel_1/a...,block_28_-32,group_28.93_-32.2,2,"(256, 0, 256, 256)",0.012085,0.98172,0.006195,africa_mtn
2964,gs://surface-water-public/data/v1/jrc/africa/G...,gs://surface-water-public/data/v1/sentinel_1/a...,block_16_0,group_16.65_0.53,3,"(256, 256, 256, 256)",0.185989,0.80687,0.007141,africa
4638,gs://surface-water-public/data/v1/jrc/africa/G...,gs://surface-water-public/data/v1/sentinel_1/a...,block_18_3,group_18.1_3.63,0,"(0, 0, 256, 256)",0.04451,0.95549,0.0,africa


In [15]:
WSIZE=192
%time df=proc_datasets(get_windows(WSIZE),limit=LIMIT)[WCOLS]
path=f'surface-water.africa.win{WSIZE}.csv'
df.to_csv(path,index=False)
print('-'*100)
print(path,df.shape[0])
print('-'*100)
df.sample(3)

africa: 3393 => 5208
africa_mtn: 2126 => 3210
africa_small: 182 => 290
CPU times: user 24.5 s, sys: 4.12 s, total: 28.6 s
Wall time: 28.6 s
----------------------------------------------------------------------------------------------------
surface-water.africa.win192.csv 8708
----------------------------------------------------------------------------------------------------


Unnamed: 0,gsw_path,s1_path,block_id,group_id,window_index,window,water,not_water,no_data,dataset
3965,gs://surface-water-public/data/v1/jrc/africa/G...,gs://surface-water-public/data/v1/sentinel_1/a...,block_22_-34,group_22.67_-34.0,2,"(192, 0, 192, 192)",0.347466,0.652534,0.0,africa
2011,gs://surface-water-public/data/v1/jrc/africa/G...,gs://surface-water-public/data/v1/sentinel_1/a...,block_32_-27,group_32.06_-27.43,2,"(192, 0, 192, 192)",0.339871,0.660129,0.0,africa
1735,gs://surface-water-public/data/v1/jrc/africa_m...,gs://surface-water-public/data/v1/sentinel_1/a...,block_29_-31,group_29.22_-31.36,3,"(192, 192, 192, 192)",0.043945,0.956055,0.0,africa_mtn


---

---

##### EXAMPLE

---

---

In [16]:
df.sample().iloc[0]

gsw_path        gs://surface-water-public/data/v1/jrc/africa_m...
s1_path         gs://surface-water-public/data/v1/sentinel_1/a...
block_id                                             block_25_-33
group_id                                       group_25.74_-33.75
window_index                                                    3
window                                       (192, 192, 192, 192)
water                                                   0.0662435
not_water                                                0.929986
no_data                                                0.00377062
dataset                                                africa_mtn
Name: 2284, dtype: object

In [19]:
df.s1_path.iloc[0]

'gs://surface-water-public/data/v1/sentinel_1/africa/S1_lon_32d1133595477_lat_-23d9087153653-201505.tif'

In [20]:
!ls /DATA/data/v1/sentinel_1/africa/S1_lon_32d1133595477_lat_-23d9087153653-201505.tif

/DATA/data/v1/sentinel_1/africa/S1_lon_32d1133595477_lat_-23d9087153653-201505.tif


---

---

##### GCS UPLOAD/CLEANUP

---

---

In [17]:
# !gsutil mv gs://surface-water-public/data/v1/datasets gs://surface-water-public/data/v1/gee_output_datasets

In [18]:
!gsutil cp *.csv gs://surface-water-public/data/v1/datasets

Copying file://surface-water.africa.csv [Content-Type=text/csv]...
Copying file://surface-water.africa.split.csv [Content-Type=text/csv]...        
Copying file://surface-water.africa.win128.csv [Content-Type=text/csv]...       
Copying file://surface-water.africa.win128.split.csv [Content-Type=text/csv]... 
\ [4 files][ 16.3 MiB/ 16.3 MiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://surface-water.africa.win192.csv [Content-Type=text/csv]...
Copying file://surface-water.africa.win192.split.csv [Content-Type=text/csv]... 
Copying file://surface-water.africa.win256.csv [Content-Type=text/csv]...       
Copying file://surface-water.africa.win256.split.csv [Content-Type=text/csv]... 
/ [8 files][ 27.2 MiB/ 27.2 MiB]        