# Extract reference data - DEV

**TODO**: Create a Snakemake task.

In [39]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# - - - - - - - - - - - - - - - - - - - - 
# DEFAULT IMPORTS - IN ALL NOTEBOOKS
from src import configs

# Project configuration object; the cells below query it for parameters
# (`get`) and for input/output file paths (`get_path`,
# `get_paths_features_vts_regular_raster`).
prjconf = configs.ProjectConfigParser()

# - - - - - - - - - - - - - - - - - - - - 
# NOTEBOOK SPECIFIC IMPORTS
import numpy as np
from pathlib import Path
import pandas as pd
import rasterio
from tqdm import tqdm

from eobox.raster import extract
from eobox.raster.extraction import add_vector_data_attributes_to_extracted
from eobox.raster import gdalutils

# Tile identifiers are read from the "tiles" entry of the "Params" config section,
# where they are stored as a single whitespace-separated string.
# split() without an argument tolerates repeated/trailing spaces and line breaks,
# whereas split(" ") would yield empty-string tile names in those cases.
tilenames = prjconf.get("Params", "tiles").split()
tilenames

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


['32UNU', '32UPU', '32UQU', '33UUP']

## Inputs

### Parameters

In [40]:
# Scene collection: the name is "scoll" followed by the id zero-padded to two digits.
scoll_id = 1
scoll_name = f"scoll{scoll_id:02d}"

# Variables and date index that are later handed to the raster path lookup.
variables = ["Red", "NIR", "SWIR1", "SWIR2"]
idx_virtual = pd.date_range(start="2018-01-01", end="2018-12-31", freq="4W")

# Echo the settings, one per line, with labels left-aligned in a 12-character column.
for label, value in [
    ("scoll_name", scoll_name),
    ("variables", variables),
    ("idx_virtual", idx_virtual),
]:
    print(f"{label:12s}: {value}")


scoll_name  : scoll01
variables   : ['Red', 'NIR', 'SWIR1', 'SWIR2']
idx_virtual : DatetimeIndex(['2018-01-07', '2018-02-04', '2018-03-04', '2018-04-01',
               '2018-04-29', '2018-05-27', '2018-06-24', '2018-07-22',
               '2018-08-19', '2018-09-16', '2018-10-14', '2018-11-11',
               '2018-12-09'],
              dtype='datetime64[ns]', freq='4W-SUN')


### Files

In [41]:
# Reference vector file per tile ("clc_subset5" entry of the "Interim" paths).
# Each file has to exist already; its path is printed as a quick visual check.
path_ref_vector_all_tiles = {}
for tile in tilenames:
    ref_vector = prjconf.get_path("Interim", "clc_subset5", tile=tile)
    path_ref_vector_all_tiles[tile] = ref_vector
    assert ref_vector.exists()
    print(ref_vector)

# Raster paths per tile for the chosen scene collection, variables and dates.
# as_dict=False: the result is iterated and indexed positionally further down.
paths_raster_all_tiles = {
    tile_id: prjconf.get_paths_features_vts_regular_raster(
        scoll_name, tile_id, variables, idx_virtual, as_dict=False)
    for tile_id in tilenames
}

/home/ben/Devel/Projects/classify-hls/data/interim/clc/clc2018_32UNU_subset_500000.gpkg
/home/ben/Devel/Projects/classify-hls/data/interim/clc/clc2018_32UPU_subset_500000.gpkg
/home/ben/Devel/Projects/classify-hls/data/interim/clc/clc2018_32UQU_subset_500000.gpkg
/home/ben/Devel/Projects/classify-hls/data/interim/clc/clc2018_33UUP_subset_500000.gpkg


## Run task

Distance to raster border (``dist2rb``): to be used in a multi-tile project after extraction. This auxiliary layer, together with the coordinates, helps to decide which pixel to keep when pixels have been extracted multiple times in overlapping areas.
Reasoning: we want the pixels which are most distant from the raster border.

Create a distance to polygon border (``dist2pb``) layer, which is useful for filtering training samples, e.g. for selecting purer inner-polygon pixels.


In [43]:
# Settings shared by the extraction runs of all tiles.
# 4 is the value of GDT_UInt32 in GDAL's data type enum; presumably the data type
# used when rasterizing the burn attribute - TODO confirm against eobox docs.
GDAL_DTYPE = 4
N_JOBS = 6

for tile in tilenames:
    path_ref_vector = path_ref_vector_all_tiles[tile]
    paths_raster = paths_raster_all_tiles[tile]

    # Output folder: the tile's "extracted" directory plus a sub-folder named
    # after the reference vector file (without extension).
    dir_extracted = prjconf.get_path("Processed", "extracted", tile=tile) / path_ref_vector.stem
    print(dir_extracted)

    # Each extracted layer is named after its source raster file.
    dst_names = [Path(raster_path).stem for raster_path in paths_raster]

    extract(
        # inputs
        src_vector=str(path_ref_vector),
        burn_attribute="pid",
        src_raster=paths_raster,
        src_raster_template=str(paths_raster[0]),
        # outputs
        dst_names=dst_names,
        dst_dir=dir_extracted,
        # auxiliary layers: distance to polygon border / distance to raster border
        dist2pb=True,
        dist2rb=True,
        # technical settings
        gdal_dtype=GDAL_DTYPE,
        n_jobs=N_JOBS,
    )

    # Attach the attributes of the reference vector to the extracted data,
    # keyed by the same "pid" attribute; existing results are not overwritten.
    add_vector_data_attributes_to_extracted(
        ref_vector=path_ref_vector,
        pid="pid",
        dir_extracted=dir_extracted,
        overwrite=False,
    )

/home/ben/Devel/Projects/classify-hls/data/processed/L3/extracted/32UNU/clc2018_32UNU_subset_500000
Skipping column tile - datatype 'object' not (yet) supported.
/home/ben/Devel/Projects/classify-hls/data/processed/L3/extracted/32UPU/clc2018_32UPU_subset_500000


100%|██████████| 10728/10728 [00:00<00:00, 141601.71it/s]


Skipping column tile - datatype 'object' not (yet) supported.
/home/ben/Devel/Projects/classify-hls/data/processed/L3/extracted/32UQU/clc2018_32UQU_subset_500000


100%|██████████| 12467/12467 [00:00<00:00, 133770.93it/s]


Skipping column tile - datatype 'object' not (yet) supported.
/home/ben/Devel/Projects/classify-hls/data/processed/L3/extracted/33UUP/clc2018_33UUP_subset_500000


100%|██████████| 14260/14260 [00:00<00:00, 152930.01it/s]


Skipping column tile - datatype 'object' not (yet) supported.
