 # EOCubeSceneCollection - level-3 data from extracted data

Import required libraries, 
 get the sampledata,
 parameters,
 extract the data 
 and load the extracted data.

In [3]:
from eobox.raster.cube import create_virtual_time_series
import fnmatch
import geopandas as gpd
import numpy as np
import os
import pandas as pd
from pathlib import Path

from eobox.raster import extraction
from eobox.raster import cube
from eobox import sampledata

def get_sampledata(year):
    dataset = sampledata.get_dataset("lsts")
    layers_paths = [Path(p) for p in dataset["raster_files"]]
    layers_df = pd.Series([p.stem for p in layers_paths]).str.split("_", expand=True) \
    .rename({0: "sceneid", 1:"band"}, axis=1)

    layers_df["date"] = pd.to_datetime(layers_df.sceneid.str[9:16], format="%Y%j")
    layers_df["uname"] = layers_df.sceneid.str[:3] + "_" + layers_df.date.dt.strftime("%Y-%m-%d") + "_" + layers_df.band.str[::] 
    layers_df["path"] = layers_paths

    layers_df = layers_df.sort_values(["date", "band"])
    layers_df = layers_df.reset_index(drop=True)

    layers_df_year = layers_df[(layers_df.date >= str(year)) & (layers_df.date < str(year+1))]
    layers_df_year = layers_df_year.reset_index(drop=True)
    return layers_df_year


df_layers = get_sampledata(2010)
src_vector = Path(sampledata.get_dataset("lsts")["raster_files"][0]).parent.parent / "sample_locations.gpkg"
extraction_dir = "xxx_uncontrolled/extracted"

qa="fmask"
qa_valid=[0, 1]
variables=["b3", "b4", "b5"]

idx_virtual = pd.date_range(start="2020-01-01", end="2020-12-31", freq="2W")
percentiles=[.1, .25, .5, .75, .9]
iqr=True
diffs=True 

dst_pattern_vts = "ls2010_vts2w_{date}_{var}"
dst_pattern_sm = "ls2008_stats1yr_{metric}_{var}"

l3_raster_dir = "xxx_uncontrolled/l3_raster"
l3_extraction_dir = "xxx_uncontrolled/extracted_l3"


extraction.extract(src_vector=str(src_vector),
                   burn_attribute="locid",
                   src_raster=df_layers["path"],
                   dst_names=df_layers["uname"],
                   dst_dir=extraction_dir)
df_extracted = extraction.load_extracted(extraction_dir)


Get the masked time series data, calculate statistical metrics and virtual time series.

You can also get the unmasked data and the mask layer as dictionary entry by leaving 'qa' unspecified.

In [4]:
dfs_var = cube.scoll_df_to_var_dfs(df_extracted, df_layers, qa=qa, qa_valid=qa_valid, verbose=True)

l3_features = []
for var, df_var in dfs_var.items():
    vts = cube.create_virtual_time_series(df_var=df_var, idx_virtual=idx_virtual, colname_pattern=dst_pattern_vts.replace("{var}", var)) #.round(0).astype(int)
    l3_features.append(vts)

    sm = cube.create_statistical_metrics(df_var=df_var, percentiles=percentiles, iqr=iqr, diffs=diffs, colname_pattern=dst_pattern_sm.replace("{var}", var)) #.astype(int)
    l3_features.append(sm)

l3_features = pd.concat(l3_features, axis=1)
l3_features


VARIABLE: b3
VARIABLE: b4
VARIABLE: b5


Unnamed: 0,ls2010_vts2w_2020-01-05_b3,ls2010_vts2w_2020-01-19_b3,ls2010_vts2w_2020-02-02_b3,ls2010_vts2w_2020-02-16_b3,ls2010_vts2w_2020-03-01_b3,ls2010_vts2w_2020-03-15_b3,ls2010_vts2w_2020-03-29_b3,ls2010_vts2w_2020-04-12_b3,ls2010_vts2w_2020-04-26_b3,ls2010_vts2w_2020-05-10_b3,...,ls2008_stats1yr_min_b5,ls2008_stats1yr_p10_b5,ls2008_stats1yr_p25_b5,ls2008_stats1yr_p50_b5,ls2008_stats1yr_p75_b5,ls2008_stats1yr_p90_b5,ls2008_stats1yr_max_b5,ls2008_stats1yr_p75-p25_b5,ls2008_stats1yr_max-min_b5,ls2008_stats1yr_p90-p10_b5
0,539.0,539.0,539.0,539.0,539.0,539.0,539.0,539.0,539.0,539.0,...,908.0,1005.0,1280.5,1347.0,1437.0,1460.0,1957.0,156.5,1049.0,455.0
1,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,...,876.0,902.1,934.25,1060.5,1111.5,1151.8,1357.0,177.25,481.0,249.7
2,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,...,781.0,926.6,1019.75,1071.5,1248.75,1361.4,1416.0,229.0,635.0,434.8
3,1097.0,1097.0,1097.0,1097.0,1097.0,1097.0,1097.0,1097.0,1097.0,1097.0,...,1154.0,1408.7,1467.75,1513.5,1625.0,1843.8,2085.0,157.25,931.0,435.1
4,1096.0,1096.0,1096.0,1096.0,1096.0,1096.0,1096.0,1096.0,1096.0,1096.0,...,1821.0,1843.4,1873.25,1954.5,2113.5,2321.1,2368.0,240.25,547.0,477.7


Check if we get the same results for the l3 raster products.

In [6]:
scoll = cube.EOCubeSceneCollection(df_layers=df_layers,
    chunksize=50,
    variables=["b3", "b4", "b5"],
    qa="fmask", 
    qa_valid=[0, 1],
    wdir="xxx_uncontrolled/l3_raster")
scoll.create_statistical_metrics(percentiles=percentiles, iqr=iqr, diffs=diffs, dst_pattern=f"{l3_raster_dir}/{dst_pattern_sm}.vrt", dtypes="float32")
scoll.create_virtual_time_series(idx_virtual=idx_virtual, dst_pattern=f"{l3_raster_dir}/{dst_pattern_vts}.vrt", dtypes="float32")

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

b3: 4 / 4 chunks already processed and skipped.
b4: 4 / 4 chunks already processed and skipped.
b5: 4 / 4 chunks already processed and skipped.
b3: 4 / 4 chunks already processed and skipped.
b4: 4 / 4 chunks already processed and skipped.
b5: 4 / 4 chunks already processed and skipped.





In [7]:
src_raster = list(Path(l3_raster_dir).glob("*.vrt"))
dst_names = [pth.stem for pth in src_raster]

extraction.extract(src_vector=str(src_vector),
                   burn_attribute="locid",
                   src_raster=src_raster, 
                   dst_names=dst_names,
                   dst_dir=l3_extraction_dir)

df_extracted_l3 = extraction.load_extracted(l3_extraction_dir)

In [8]:
import numpy as np
for col in l3_features.columns:
    assert np.allclose(l3_features[col], df_extracted_l3[col], rtol=1e-05, atol=1e-08, equal_nan=False), col
