# Load Sample Subset from Extracted Data with dask

In [1]:
import fnmatch
import geopandas as gpd
import os
import pandas as pd
from pathlib import Path

from eobox.raster import extraction
from eobox import sampledata

%matplotlib inline

In [2]:
# Sample dataset shipped with eobox; the returned mapping provides the vector
# file and the list of raster files used below.
dataset = sampledata.get_dataset("s2l1c")

src_vector = dataset["vector_file"]
burn_attribute = "pid"  # should be unique for the polygons and not contain zero
# Inside an fnmatch character class every character is literal, so the former
# "[2,3,4,8]" also matched a ","; "[2348]" selects exactly B02, B03, B04, B08.
src_raster = fnmatch.filter(dataset["raster_files"], "*B0[2348]*")  # 10 m bands
# Destination name = file stem without its first "_"-separated token.
dst_names = ["_".join(Path(src).stem.split("_")[1:]) for src in src_raster]
extraction_dir = Path("./xxx_uncontrolled/s2l1c_ref__s2l1c/s2_l1c/10m")
extraction.extract(src_vector=src_vector,
                   burn_attribute=burn_attribute,
                   src_raster=src_raster,
                   dst_names=dst_names,
                   dst_dir=extraction_dir)
# Load only what matches "*pid.npy" (the output shows a single column,
# aux_vector_pid), which is enough to build a row mask cheaply.
df_extracted = extraction.load_extracted(extraction_dir, "*pid.npy")
print(df_extracted.shape)
display(df_extracted.head())
# Boolean mask of the rows belonging to polygon id 29; its sum is the row count.
index_29 = (df_extracted["aux_vector_pid"] == 29)
index_29.sum()

(3500, 1)


Unnamed: 0,aux_vector_pid
0,17
1,17
2,6
3,6
4,6


109

In [3]:
# Apply the boolean mask once, then report the size of the selection and show
# its first and last two rows.
pid_29_preview = df_extracted.loc[index_29]
print(pid_29_preview.shape)
for part in (pid_29_preview.head(2), pid_29_preview.tail(2)):
    display(part)

(109, 1)


Unnamed: 0,aux_vector_pid
839,29
840,29


Unnamed: 0,aux_vector_pid
946,29
947,29


In [4]:
# Load the extracted data again, this time passing the boolean mask as `index`
# so that only the masked rows are returned (no file pattern is given here).
df_extracted_29 = extraction.load_extracted(extraction_dir, index=index_29)
print(df_extracted_29.shape)
df_extracted_29.head()

(109, 7)


Unnamed: 0,20170216T102101_B02,20170216T102101_B03,20170216T102101_B04,20170216T102101_B08,aux_coord_x,aux_coord_y,aux_vector_pid
839,1456,1136,1056,1696,341675.0,5820745.0,29
840,1456,1136,1056,1632,341685.0,5820745.0,29
841,1456,1136,1056,1632,341695.0,5820745.0,29
842,1424,1136,1056,1632,341705.0,5820745.0,29
843,1488,1136,1056,1632,341715.0,5820745.0,29


## Load with dask - WIP

In [7]:
# Collect the paths of the extracted files in the extraction directory
# (presumably the *.npy arrays written by extraction.extract - TODO confirm).
npy_path_list = extraction.get_paths_of_extracted(extraction_dir)

In [15]:
# Reload edited modules automatically before each cell runs; useful while
# load_extracted_dask is still being developed.
%load_ext autoreload
%autoreload 2

In [22]:
# Build a dask dataframe from the extracted files without a row mask
# (index=None presumably means "all rows" - TODO confirm).
# The bare expression shows only the lazy structure (columns, dtypes, partitions).
ddf = extraction.load_extracted_dask(npy_path_list, index=None)
ddf

Unnamed: 0_level_0,20170216T102101_B02,20170216T102101_B03,20170216T102101_B04,20170216T102101_B08,aux_coord_x,aux_coord_y,aux_vector_pid
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,uint16,uint16,uint16,uint16,float64,float64,uint32
,...,...,...,...,...,...,...


In [24]:
# Same as above, but restricted with the boolean mask for polygon id 29.
# Nothing is computed yet; the output is the lazy dask dataframe structure.
ddf_29 = extraction.load_extracted_dask(npy_path_list, index=index_29)
ddf_29

Unnamed: 0_level_0,20170216T102101_B02,20170216T102101_B03,20170216T102101_B04,20170216T102101_B08,aux_coord_x,aux_coord_y,aux_vector_pid
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,uint16,uint16,uint16,uint16,float64,float64,uint32
,...,...,...,...,...,...,...


In [31]:
# Column labels of the pandas-loaded subset, for comparison with the dask result.
df_extracted_29.columns

Index(['20170216T102101_B02', '20170216T102101_B03', '20170216T102101_B04',
       '20170216T102101_B08', 'aux_coord_x', 'aux_coord_y', 'aux_vector_pid'],
      dtype='object')

In [33]:
# Evaluate the lazy dask dataframe and keep the computed result for the
# comparisons with df_extracted_29 below.
ddf_29_df = ddf_29.compute()
ddf_29_df.head()

Unnamed: 0,20170216T102101_B02,20170216T102101_B03,20170216T102101_B04,20170216T102101_B08,aux_coord_x,aux_coord_y,aux_vector_pid
0,1456,1136,1056,1696,341675.0,5820745.0,29
1,1456,1136,1056,1632,341685.0,5820745.0,29
2,1456,1136,1056,1632,341695.0,5820745.0,29
3,1424,1136,1056,1632,341705.0,5820745.0,29
4,1488,1136,1056,1632,341715.0,5820745.0,29


In [35]:
# Both loading routes must yield the same number of rows and columns.
assert ddf_29_df.shape == df_extracted_29.shape

True

In [39]:
# Column labels must agree in content and order. Index.equals returns a plain
# bool even when the two indexes differ in length, whereas an element-wise `==`
# would raise a ValueError instead of failing the assertion.
assert ddf_29_df.columns.equals(df_extracted_29.columns), "column labels differ"

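Note that the index does not match: the dask result is numbered from 0, while the pandas subset keeps its original row labels (839 to 947).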

In [41]:
# Element-wise comparison of the row labels of both frames; the result is one
# boolean per row.
ddf_29_df.index == df_extracted_29.index

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

And therefore we cannot compare the dataframes element-wise with `==`.

In [40]:
# An element-wise `==` between DataFrames requires identical labels, so the
# direct comparison raises; show the error instead of aborting the notebook.
try:
    ddf_29_df == df_extracted_29
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

# Compare by position instead: discard both indexes, then check every cell.
(ddf_29_df.reset_index(drop=True) == df_extracted_29.reset_index(drop=True)).all().all()

ValueError: Can only compare identically-labeled DataFrame objects