In [None]:
# Download the three HDF5 files (train / val / test splits, judging by the file
# names) into the current directory; -q silences wget's progress output.
# All three are later concatenated and used together as training data.
!wget -q  https://share.phys.ethz.ch/~pf/albecker/abc/09072022_1154_train.h5
!wget -q  https://share.phys.ethz.ch/~pf/albecker/abc/09072022_1154_val.h5
!wget -q https://share.phys.ethz.ch/~pf/albecker/abc/09072022_1154_test.h5

Then copy `TestFiles.zip` and `UniqueID-SentinelPair.csv` into the current working directory.


Only these five competition-provided files are used: the three `.h5` files downloaded above, `TestFiles.zip`, and `UniqueID-SentinelPair.csv`.

In [None]:
# Extract the competition archive into the ./TestFiles directory (-d sets the target).
!unzip TestFiles.zip -d TestFiles

Archive:  TestFiles.zip
  inflating: TestFiles/canopy_height_test.h5  
  inflating: TestFiles/cloud_test.h5  
  inflating: TestFiles/image_date_test.h5  
  inflating: TestFiles/image_name_test.h5  
  inflating: TestFiles/images_test.h5  
  inflating: TestFiles/lat_test.h5   
  inflating: TestFiles/lon_test.h5   
  inflating: TestFiles/scl_test.h5   
  inflating: TestFiles/shot_number_test.h5  
  inflating: TestFiles/standard_deviation_test.h5  
  inflating: TestFiles/x_topleft_test.h5  
  inflating: TestFiles/y_topleft_test.h5  


In [None]:
import h5py
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

# Band names used as column labels; position i is paired with image channel i
# wherever the bands are enumerated.
bands = ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B11', 'B12']
# Dataset names read from the HDF5 files ('agbd' is the regression target).
keys = ['agbd', 'cloud', 'images', 'lat', 'lon', 'scl']

# Central 3x3 window of each 15x15 patch; the leading slice keeps every sample.
slice_center = np.s_[:, 6:9, 6:9]


def process_arr(dict_arr, slice_center=slice_center, verbose=True):
    """Flatten the central window of every patch into one feature row.

    Parameters
    ----------
    dict_arr : dict
        Must provide the arrays 'cloud', 'images', 'lat', 'lon' and 'scl',
        each indexed as [sample, row, col, channel].
    slice_center : tuple of slice
        Index applied to every array to cut out the window of interest.
    verbose : bool
        When true, print the shape of the result and of each feature group.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per sample whose columns are, in order:
        mean cloud value, flattened image pixels, mean latitude,
        mean longitude and flattened scl pixels. With the default window
        and 12-channel images this gives 120 columns.
    """
    def _window(name):
        # Cut the central window out of the named array.
        return dict_arr[name][slice_center]

    n_samples = dict_arr['images'].shape[0]

    feature_groups = [
        _window('cloud').astype(np.float32).mean(axis=(1, 2)),
        _window('images').astype(np.float32).reshape((n_samples, -1)),
        _window('lat').mean(axis=(1, 2)),
        _window('lon').mean(axis=(1, 2)),
        _window('scl').astype(np.float32).reshape((n_samples, -1)),
    ]
    arr = np.concatenate(feature_groups, axis=1)

    if verbose:
        print(arr.shape, *(group.shape for group in feature_groups))
    return arr

# The downloaded train / val / test splits are all pooled into one training set.
list_fn_train = ["09072022_1154_train.h5", "09072022_1154_val.h5", "09072022_1154_test.h5"]
dict_list_train = {key: [] for key in keys}
for fn_train in list_fn_train:
    # The context manager closes the HDF5 handle as soon as the datasets have
    # been copied into memory (np.array makes an independent copy).
    with h5py.File(fn_train, 'r') as h5:
        for key in keys:
            dict_list_train[key].append(np.array(h5[key]))

# Concatenate the per-file pieces along the sample axis.
dict_arr_train = {}
print('training data:')
for key in keys:
    dict_arr_train[key] = np.concatenate(dict_list_train[key], axis=0)
    print(key, dict_arr_train[key].shape, dict_arr_train[key].dtype)

n_train = len(dict_arr_train['agbd'])

# clip unrealistic large agbd: cap the target at its 99th percentile
upper_agbd = np.percentile(dict_arr_train['agbd'], 99) # 301.2308270263677
dict_arr_train['agbd'] = np.clip(dict_arr_train['agbd'], None, upper_agbd)

# Feature matrix: one row per training sample.
arr_train = process_arr(dict_arr_train)

print('test data:')
dict_arr_test = {}
# keys[1:] skips 'agbd': only the feature arrays are read for the test set.
for key in keys[1:]:
    # Close each HDF5 file right after its dataset has been copied to memory.
    with h5py.File(f'TestFiles/{key}_test.h5', 'r') as h5_test:
        dict_arr_test[key] = np.array(h5_test[key])
    print(key, dict_arr_test[key].shape, dict_arr_test[key].dtype)

# Feature matrix: one row per test patch.
arr_test = process_arr(dict_arr_test)

# Table with an 'ID' column per submission row and an 'S2_idx' column that
# indexes into the test patches.
df_idx = pd.read_csv('UniqueID-SentinelPair.csv')

# Wang et al 2019, ISPRS
# https://www.sciencedirect.com/science/article/pii/S0924271618303046
# Constant factor applied to the averaged tree output.
# NOTE(review): the rationale is presumably in the paper cited above - confirm.
ratio = 2

# Three shallow decision trees that differ only in their random seed.
tree_seeds = (4572, 2275, 697)
regs = [
    DecisionTreeRegressor(random_state=seed, max_depth=3, max_features=5)
    for seed in tree_seeds
]

preds = []
for reg in regs:
    reg.fit(arr_train, dict_arr_train['agbd'])
    # Predict once per test patch, then expand to one value per submission row.
    preds.append(reg.predict(arr_test)[df_idx['S2_idx']])

# Ensemble mean across the trees, scaled by the constant factor.
pred = np.mean(np.stack(preds, axis=0), axis=0) * ratio

df_pred = df_idx[['ID']].assign(pred=pred)
# Not written here; the refined predictions are saved to pred.csv further down.
# df_pred.to_csv('pred.csv', index=False)




training data:
agbd (35400,) float32
cloud (35400, 15, 15, 1) uint8
images (35400, 15, 15, 12) uint16
lat (35400, 15, 15, 1) float32
lon (35400, 15, 15, 1) float32
scl (35400, 15, 15, 1) uint8
(35400, 120) (35400, 1) (35400, 108) (35400, 1) (35400, 1) (35400, 9)
test data:
cloud (90, 15, 15, 1) uint8
images (90, 15, 15, 12) uint16
lat (90, 15, 15, 1) float32
lon (90, 15, 15, 1) float32
scl (90, 15, 15, 1) uint8
(90, 120) (90, 1) (90, 108) (90, 1) (90, 1) (90, 9)


In [None]:
# Cloud cover in the test set is very high, so contaminated pixels are masked out.
# Scene-classification (scl) codes treated as unusable. Each entry is annotated
# with what appears to be its display colour (hex) and its class name; codes
# that are commented out remain usable.
scl_exclude = [
    0,# presumably the "no data" class - TODO confirm
    1,# ff0004  Saturated or defective
    # 2,# 868686  Dark Area Pixels
    3,# 774b0a  Cloud Shadows
    # 4,# 10d22c  Vegetation
    # 5,# ffff52  Bare Soils
    # 6,# 0000ff  Water
    # 7,# 818181  Clouds Low Probability / Unclassified
    8,# c0c0c0  Clouds Medium Probability
    9,# f1f1f1  Clouds High Probability
    10,# bac5eb  Cirrus
    # 11,# 52fff9  Snow / Ice
]
# NOTE: this rebinds `slice_center` to a 2-D (row, col) window for single
# patches. The earlier 3-tuple with a leading sample axis is unaffected inside
# process_arr, which captured it as a default argument when it was defined.
slice_center = (slice(6, 9), slice(6, 9))
from scipy.ndimage import binary_dilation
def mat2df(
    arr_img, arr_scl, arr_cloud=None,
    bands=bands, slice_center=slice_center,
    scl_exclude=scl_exclude,
    cloud_threshold=20,
    dilation=3,
):
    """Average each band over the usable center pixels of every patch.

    A pixel is usable when its scl value is not listed in ``scl_exclude`` and,
    if ``arr_cloud`` is given, its cloud value is below ``cloud_threshold``.
    The unusable area is then grown by ``dilation`` binary-dilation iterations
    so that pixels bordering it are discarded as well.

    Parameters
    ----------
    arr_img : array indexed as [sample, band, row, col].
    arr_scl : array indexed by sample; each entry must squeeze to a 2-D
        [row, col] map.
    arr_cloud : optional array laid out like ``arr_scl``.
    bands : column names; ``bands[i]`` labels ``arr_img[:, i]``.
    slice_center : tuple of (row, col) slices selecting the window to average.
    scl_exclude : scl values marking a pixel as unusable.
    cloud_threshold : pixels with a cloud value at or above this are unusable.
    dilation : number of dilation iterations applied to the unusable mask.
        Values below 1 disable dilation (scipy would otherwise interpret them
        as "repeat until nothing changes" and wipe out the whole patch).

    Returns
    -------
    pandas.DataFrame
        One row per sample and one column per band. A row is NaN when no
        usable pixel remains inside the center window.
    """
    list_rec = []
    for i in range(arr_img.shape[0]):
        img = arr_img[i]
        scl = np.squeeze(arr_scl[i])

        # Start from "every pixel whose class is not excluded".
        valid = ~np.isin(scl, scl_exclude)
        if arr_cloud is not None:
            valid &= np.squeeze(arr_cloud[i]) < cloud_threshold

        # Grow the invalid area so that the fringes of clouds/shadows go too.
        if dilation > 0:
            valid = ~binary_dilation(~valid, iterations=dilation)

        center_valid = valid[slice_center]
        rec = {}
        for iband, band in enumerate(bands):
            values = img[iband][slice_center][center_valid]
            # Explicit NaN instead of np.mean([]) avoids the
            # "Mean of empty slice" RuntimeWarning with the same result.
            rec[band] = np.mean(values) if values.size else np.nan
        list_rec.append(rec)
    return pd.DataFrame(list_rec)

from pathlib import Path
# Load every HDF5 file under ./TestFiles into memory, keyed by the name of the
# single dataset each file contains.
dict_test = {}
for fn in Path('./TestFiles').glob('*.h5'):
    # The context manager closes the file handle once the data has been copied.
    with h5py.File(fn, "r") as fp:
        assert len(fp.keys()) == 1
        key = list(fp.keys())[0]
        arr = np.array(fp[key])
    dict_test[key] = arr

# Per-band means over the cloud-free center pixels of each test patch.
# The transpose moves the channel axis forward: [sample, band, row, col].
df_test = mat2df(
    dict_test['images'].transpose(0, 3, 1, 2),
    dict_test['scl'],
    dict_test['cloud'],
)

# Attach the center-window mean of each auxiliary variable as a new column.
for var in ['canopy_height', 'standard_deviation', 'lon', 'lat']:
    df_test[var] = [
        patch[:, :, 0][slice_center].mean() for patch in dict_test[var]
    ]

# Divide the test area into regions by location. Labels 1-5 come from the
# rules below; a patch that matches none of them keeps the fallback label 0.
lon_test = df_test['lon']
lat_test = df_test['lat']
region_rules = {
    1: lon_test < -7,
    2: (lon_test > -4) & (lat_test > 6.5),
    3: (lon_test > -4) & (lat_test < 6.5) & (lat_test > 6),
    4: (lon_test > -4) & (lat_test < 6) & (lon_test < -3.5),
    5: (lat_test < 6) & (lon_test > -3.5),
}

df_test['region'] = 0
for region_label, region_mask in region_rules.items():
    df_test.loc[region_mask, 'region'] = region_label

# Expand from one row per patch to one row per submission entry; the patch
# index is preserved in the 'index' column created by reset_index().
df_test = df_test.loc[df_idx['S2_idx']].reset_index()


# Start from the tree-ensemble estimate; 'refined' is overwritten below only
# for observations that are not considered clear.
df_test['rough'] = pred
df_test['refined'] = df_test['rough']

# simple criteria for cloudless observations
# (rows whose B04 is NaN, i.e. no usable center pixel, compare as not clear)
indi_clear = df_test['B04'] < 1000

# linear regression using cloudless observations and canopy height
# by region
for region in df_test['region'].unique():
    indi_cur = df_test['region'] == region
    indi_fit = indi_cur & indi_clear      # rows used to fit the regression
    indi_fill = indi_cur & ~indi_clear    # rows whose estimate is replaced

    # LinearRegression raises on zero-sample input, so skip regions with
    # nothing to fit on (the rough estimate is kept) or nothing to replace.
    if not indi_fit.any() or not indi_fill.any():
        continue

    reg = LinearRegression().fit(
        df_test.loc[indi_fit, ['canopy_height']],
        df_test.loc[indi_fit, 'rough'],
    )
    df_test.loc[indi_fill, 'refined'] = reg.predict(
        df_test.loc[indi_fill, ['canopy_height']]
    )

# Final submission table: one refined prediction per ID.
df_pred = df_idx[['ID']].copy()
df_pred['pred'] = df_test['refined']
df_pred.to_csv('pred.csv', index=False)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


pred.csv is the submitted file.

In [None]:
# Display the final submission table (the same frame that was written to pred.csv).
df_pred

Unnamed: 0,ID,pred
0,ID_1EB0DGFP07,119.268402
1,ID_844T2PSXTK,117.040779
2,ID_4MCV3S8MLN,99.508906
3,ID_L7441JV5F3,98.800858
4,ID_5GUVM4YEWZ,103.160095
...,...,...
85,ID_MEW6189J1B,124.680570
86,ID_TH9HRUXGTP,104.247086
87,ID_GPC7YS3JG8,104.976980
88,ID_1P7PJMPV0R,88.263031
